gscraper 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/README.txt +1 -1
- data/lib/gscraper/search/ajax_query.rb +2 -2
- data/lib/gscraper/search/query.rb +24 -8
- data/lib/gscraper/search/web_query.rb +8 -56
- data/lib/gscraper/sponsored_ad.rb +1 -3
- data/lib/gscraper/version.rb +1 -1
- data/spec/has_sponsored_links_examples.rb +2 -14
- data/spec/helpers/uri.rb +1 -2
- data/spec/search/page_has_results_examples.rb +2 -12
- data/spec/search/web_query_spec.rb +4 -0
- metadata +7 -4
data/History.txt
CHANGED
data/README.txt
CHANGED
@@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services.
|
|
42
42
|
|
43
43
|
q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
|
44
44
|
|
45
|
-
q.query #
|
45
|
+
q.query # => "ruby"
|
46
46
|
q.with_words # => "rails"
|
47
47
|
q.occurrs_within # => :title
|
48
48
|
q.rights # => :cc_by_nc
|
@@ -161,9 +161,9 @@ module GScraper
|
|
161
161
|
hash['results'].each_with_index do |result,index|
|
162
162
|
rank = rank_offset + (index + 1)
|
163
163
|
title = Hpricot(result['title']).inner_text
|
164
|
-
url = result['unescapedUrl']
|
164
|
+
url = URI(result['unescapedUrl'])
|
165
165
|
summary = Hpricot(result['content']).inner_text
|
166
|
-
cached_url = result['cacheUrl']
|
166
|
+
cached_url = URI(result['cacheUrl'])
|
167
167
|
|
168
168
|
new_page << Result.new(rank,title,url,summary,cached_url)
|
169
169
|
end
|
@@ -119,19 +119,15 @@ module GScraper
|
|
119
119
|
expr = []
|
120
120
|
|
121
121
|
append_modifier = lambda { |name|
|
122
|
-
modifier = instance_variable_get("@#{name}")
|
122
|
+
modifier = format_modifier(instance_variable_get("@#{name}"))
|
123
123
|
|
124
|
-
expr << "#{name}:#{modifier}"
|
124
|
+
expr << "#{name}:#{modifier}" unless modifier.empty?
|
125
125
|
}
|
126
126
|
|
127
127
|
append_options = lambda { |name|
|
128
|
-
ops = instance_variable_get("@#{name}")
|
128
|
+
ops = format_options(instance_variable_get("@#{name}"))
|
129
129
|
|
130
|
-
|
131
|
-
expr << "#{name}:#{ops.join(' ')}"
|
132
|
-
elsif ops
|
133
|
-
expr << "#{name}:#{ops}"
|
134
|
-
end
|
130
|
+
expr << "#{name}:#{ops}" unless ops.empty?
|
135
131
|
}
|
136
132
|
|
137
133
|
expr << @query if @query
|
@@ -168,6 +164,26 @@ module GScraper
|
|
168
164
|
return expr.join(' ')
|
169
165
|
end
|
170
166
|
|
167
|
+
protected
|
168
|
+
|
169
|
+
def format_modifier(value)
|
170
|
+
if value.kind_of?(Regexp)
|
171
|
+
return value.source
|
172
|
+
else
|
173
|
+
return value.to_s
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def format_options(value)
|
178
|
+
if value.kind_of?(Array)
|
179
|
+
return value.map { |element|
|
180
|
+
format_modifier(element)
|
181
|
+
}.join(' ')
|
182
|
+
else
|
183
|
+
return format_modifier(value)
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
171
187
|
end
|
172
188
|
end
|
173
189
|
end
|
@@ -50,51 +50,6 @@ module GScraper
|
|
50
50
|
# Results per-page
|
51
51
|
attr_accessor :results_per_page
|
52
52
|
|
53
|
-
# Search query
|
54
|
-
attr_accessor :query
|
55
|
-
|
56
|
-
# Search 'link' modifier
|
57
|
-
attr_accessor :link
|
58
|
-
|
59
|
-
# Search 'related' modifier
|
60
|
-
attr_accessor :related
|
61
|
-
|
62
|
-
# Search 'info' modifier
|
63
|
-
attr_accessor :info
|
64
|
-
|
65
|
-
# Search 'site' modifier
|
66
|
-
attr_accessor :site
|
67
|
-
|
68
|
-
# Search 'filetype' modifier
|
69
|
-
attr_accessor :filetype
|
70
|
-
|
71
|
-
# Search 'allintitle' modifier
|
72
|
-
attr_accessor :allintitle
|
73
|
-
|
74
|
-
# Search 'intitle' modifier
|
75
|
-
attr_accessor :intitle
|
76
|
-
|
77
|
-
# Search 'allinurl' modifier
|
78
|
-
attr_accessor :allinurl
|
79
|
-
|
80
|
-
# Search 'inurl' modifier
|
81
|
-
attr_accessor :inurl
|
82
|
-
|
83
|
-
# Search 'allintext' modifier
|
84
|
-
attr_accessor :allintext
|
85
|
-
|
86
|
-
# Search 'intext' modifier
|
87
|
-
attr_accessor :intext
|
88
|
-
|
89
|
-
# Search for results containing the exact phrase
|
90
|
-
attr_accessor :exact_phrase
|
91
|
-
|
92
|
-
# Search for results with the words
|
93
|
-
attr_accessor :with_words
|
94
|
-
|
95
|
-
# Search for results with-out the words
|
96
|
-
attr_accessor :without_words
|
97
|
-
|
98
53
|
# Search for results written in the language
|
99
54
|
attr_accessor :language
|
100
55
|
|
@@ -119,9 +74,6 @@ module GScraper
|
|
119
74
|
# Search for results within the past year
|
120
75
|
attr_accessor :within_past_year
|
121
76
|
|
122
|
-
# Search for results containing numbers between the range
|
123
|
-
attr_accessor :numeric_range
|
124
|
-
|
125
77
|
# Search for results where the query ocurrs within the area
|
126
78
|
attr_accessor :occurrs_within
|
127
79
|
|
@@ -386,7 +338,7 @@ module GScraper
|
|
386
338
|
def page(page_index)
|
387
339
|
Page.new do |new_page|
|
388
340
|
doc = @agent.get(page_url(page_index))
|
389
|
-
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
341
|
+
results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
|
390
342
|
|
391
343
|
rank_offset = result_offset_of(page_index)
|
392
344
|
|
@@ -394,24 +346,24 @@ module GScraper
|
|
394
346
|
rank = rank_offset + (index + 1)
|
395
347
|
link = result.at('//a.l')
|
396
348
|
title = link.inner_text
|
397
|
-
url = link.get_attribute('href')
|
349
|
+
url = URI(link.get_attribute('href'))
|
398
350
|
summary_text = ''
|
399
351
|
cached_url = nil
|
400
352
|
similar_url = nil
|
401
353
|
|
402
|
-
if (content = (result.at('//td.j//font
|
354
|
+
if (content = (result.at('//div.s|//td.j//font')))
|
403
355
|
content.children.each do |elem|
|
404
356
|
break if (!(elem.text?) && elem.name=='br')
|
405
357
|
|
406
358
|
summary_text << elem.inner_text
|
407
359
|
end
|
408
360
|
|
409
|
-
if (cached_link = result.at('
|
410
|
-
cached_url = cached_link.get_attribute('href')
|
361
|
+
if (cached_link = result.at('span.gl/a:first'))
|
362
|
+
cached_url = URI(cached_link.get_attribute('href'))
|
411
363
|
end
|
412
364
|
|
413
|
-
if (similar_link = result.at('
|
414
|
-
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
365
|
+
if (similar_link = result.at('span.gl/a:last'))
|
366
|
+
similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
|
415
367
|
end
|
416
368
|
end
|
417
369
|
|
@@ -445,7 +397,7 @@ module GScraper
|
|
445
397
|
# top and side ads
|
446
398
|
doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
|
447
399
|
title = link.inner_text
|
448
|
-
url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
|
400
|
+
url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
|
449
401
|
|
450
402
|
links << SponsoredAd.new(title,url)
|
451
403
|
end
|
data/lib/gscraper/version.rb
CHANGED
@@ -24,15 +24,9 @@ shared_examples_for "has Sponsored Links" do
|
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
-
it "should have non-empty URLs" do
|
28
|
-
@links.each_url do |url|
|
29
|
-
url.length.should_not == 0
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
27
|
it "should have valid URLs" do
|
34
28
|
@links.each_url do |url|
|
35
|
-
|
29
|
+
uri_should_be_valid(url)
|
36
30
|
end
|
37
31
|
end
|
38
32
|
|
@@ -42,15 +36,9 @@ shared_examples_for "has Sponsored Links" do
|
|
42
36
|
end
|
43
37
|
end
|
44
38
|
|
45
|
-
it "should have non-empty direct URLs" do
|
46
|
-
@links.each_direct_url do |url|
|
47
|
-
url.length.should_not == 0
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
39
|
it "should have valid direct URLs" do
|
52
40
|
@links.each_direct_url do |url|
|
53
|
-
|
41
|
+
uri_should_be_valid(url)
|
54
42
|
end
|
55
43
|
end
|
56
44
|
|
data/spec/helpers/uri.rb
CHANGED
@@ -28,24 +28,14 @@ shared_examples_for "Page has Search Results" do
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
it "should have non-empty URLs" do
|
32
|
-
@page.each_url do |url|
|
33
|
-
url.length.should_not == 0
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
31
|
it "should have valid URLs" do
|
38
32
|
@page.each_url do |url|
|
39
|
-
|
33
|
+
uri_should_be_valid(url)
|
40
34
|
end
|
41
35
|
end
|
42
36
|
|
43
37
|
it "should have atleast one cached URL" do
|
44
|
-
@page.cached_urls.should_not == 0
|
45
|
-
end
|
46
|
-
|
47
|
-
it "should have atleast one similar query URL" do
|
48
|
-
@page.similar_urls.should_not == 0
|
38
|
+
@page.cached_urls.length.should_not == 0
|
49
39
|
end
|
50
40
|
|
51
41
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern Modulus III
|
@@ -9,11 +9,12 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-08-27 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: hpricot
|
17
|
+
type: :runtime
|
17
18
|
version_requirement:
|
18
19
|
version_requirements: !ruby/object:Gem::Requirement
|
19
20
|
requirements:
|
@@ -23,6 +24,7 @@ dependencies:
|
|
23
24
|
version:
|
24
25
|
- !ruby/object:Gem::Dependency
|
25
26
|
name: mechanize
|
27
|
+
type: :runtime
|
26
28
|
version_requirement:
|
27
29
|
version_requirements: !ruby/object:Gem::Requirement
|
28
30
|
requirements:
|
@@ -32,12 +34,13 @@ dependencies:
|
|
32
34
|
version:
|
33
35
|
- !ruby/object:Gem::Dependency
|
34
36
|
name: hoe
|
37
|
+
type: :development
|
35
38
|
version_requirement:
|
36
39
|
version_requirements: !ruby/object:Gem::Requirement
|
37
40
|
requirements:
|
38
41
|
- - ">="
|
39
42
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
43
|
+
version: 1.7.0
|
41
44
|
version:
|
42
45
|
description: GScraper is a web-scraping interface to various Google Services.
|
43
46
|
email:
|
@@ -113,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
113
116
|
requirements: []
|
114
117
|
|
115
118
|
rubyforge_project: gscraper
|
116
|
-
rubygems_version: 1.
|
119
|
+
rubygems_version: 1.2.0
|
117
120
|
signing_key:
|
118
121
|
specification_version: 2
|
119
122
|
summary: GScraper is a web-scraping interface to various Google Services.
|