gscraper 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/README.txt +1 -1
- data/lib/gscraper/search/ajax_query.rb +2 -2
- data/lib/gscraper/search/query.rb +24 -8
- data/lib/gscraper/search/web_query.rb +8 -56
- data/lib/gscraper/sponsored_ad.rb +1 -3
- data/lib/gscraper/version.rb +1 -1
- data/spec/has_sponsored_links_examples.rb +2 -14
- data/spec/helpers/uri.rb +1 -2
- data/spec/search/page_has_results_examples.rb +2 -12
- data/spec/search/web_query_spec.rb +4 -0
- metadata +7 -4
data/History.txt
CHANGED
data/README.txt
CHANGED
@@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services.
|
|
42
42
|
|
43
43
|
q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
|
44
44
|
|
45
|
-
q.query #
|
45
|
+
q.query # => "ruby"
|
46
46
|
q.with_words # => "rails"
|
47
47
|
q.occurrs_within # => :title
|
48
48
|
q.rights # => :cc_by_nc
|
@@ -161,9 +161,9 @@ module GScraper
|
|
161
161
|
hash['results'].each_with_index do |result,index|
|
162
162
|
rank = rank_offset + (index + 1)
|
163
163
|
title = Hpricot(result['title']).inner_text
|
164
|
-
url = result['unescapedUrl']
|
164
|
+
url = URI(result['unescapedUrl'])
|
165
165
|
summary = Hpricot(result['content']).inner_text
|
166
|
-
cached_url = result['cacheUrl']
|
166
|
+
cached_url = URI(result['cacheUrl'])
|
167
167
|
|
168
168
|
new_page << Result.new(rank,title,url,summary,cached_url)
|
169
169
|
end
|
@@ -119,19 +119,15 @@ module GScraper
|
|
119
119
|
expr = []
|
120
120
|
|
121
121
|
append_modifier = lambda { |name|
|
122
|
-
modifier = instance_variable_get("@#{name}")
|
122
|
+
modifier = format_modifier(instance_variable_get("@#{name}"))
|
123
123
|
|
124
|
-
expr << "#{name}:#{modifier}"
|
124
|
+
expr << "#{name}:#{modifier}" unless modifier.empty?
|
125
125
|
}
|
126
126
|
|
127
127
|
append_options = lambda { |name|
|
128
|
-
ops = instance_variable_get("@#{name}")
|
128
|
+
ops = format_options(instance_variable_get("@#{name}"))
|
129
129
|
|
130
|
-
|
131
|
-
expr << "#{name}:#{ops.join(' ')}"
|
132
|
-
elsif ops
|
133
|
-
expr << "#{name}:#{ops}"
|
134
|
-
end
|
130
|
+
expr << "#{name}:#{ops}" unless ops.empty?
|
135
131
|
}
|
136
132
|
|
137
133
|
expr << @query if @query
|
@@ -168,6 +164,26 @@ module GScraper
|
|
168
164
|
return expr.join(' ')
|
169
165
|
end
|
170
166
|
|
167
|
+
protected
|
168
|
+
|
169
|
+
def format_modifier(value)
|
170
|
+
if value.kind_of?(Regexp)
|
171
|
+
return value.source
|
172
|
+
else
|
173
|
+
return value.to_s
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def format_options(value)
|
178
|
+
if value.kind_of?(Array)
|
179
|
+
return value.map { |element|
|
180
|
+
format_modifier(element)
|
181
|
+
}.join(' ')
|
182
|
+
else
|
183
|
+
return format_modifier(value)
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
171
187
|
end
|
172
188
|
end
|
173
189
|
end
|
@@ -50,51 +50,6 @@ module GScraper
|
|
50
50
|
# Results per-page
|
51
51
|
attr_accessor :results_per_page
|
52
52
|
|
53
|
-
# Search query
|
54
|
-
attr_accessor :query
|
55
|
-
|
56
|
-
# Search 'link' modifier
|
57
|
-
attr_accessor :link
|
58
|
-
|
59
|
-
# Search 'related' modifier
|
60
|
-
attr_accessor :related
|
61
|
-
|
62
|
-
# Search 'info' modifier
|
63
|
-
attr_accessor :info
|
64
|
-
|
65
|
-
# Search 'site' modifier
|
66
|
-
attr_accessor :site
|
67
|
-
|
68
|
-
# Search 'filetype' modifier
|
69
|
-
attr_accessor :filetype
|
70
|
-
|
71
|
-
# Search 'allintitle' modifier
|
72
|
-
attr_accessor :allintitle
|
73
|
-
|
74
|
-
# Search 'intitle' modifier
|
75
|
-
attr_accessor :intitle
|
76
|
-
|
77
|
-
# Search 'allinurl' modifier
|
78
|
-
attr_accessor :allinurl
|
79
|
-
|
80
|
-
# Search 'inurl' modifier
|
81
|
-
attr_accessor :inurl
|
82
|
-
|
83
|
-
# Search 'allintext' modifier
|
84
|
-
attr_accessor :allintext
|
85
|
-
|
86
|
-
# Search 'intext' modifier
|
87
|
-
attr_accessor :intext
|
88
|
-
|
89
|
-
# Search for results containing the exact phrase
|
90
|
-
attr_accessor :exact_phrase
|
91
|
-
|
92
|
-
# Search for results with the words
|
93
|
-
attr_accessor :with_words
|
94
|
-
|
95
|
-
# Search for results with-out the words
|
96
|
-
attr_accessor :without_words
|
97
|
-
|
98
53
|
# Search for results written in the language
|
99
54
|
attr_accessor :language
|
100
55
|
|
@@ -119,9 +74,6 @@ module GScraper
|
|
119
74
|
# Search for results within the past year
|
120
75
|
attr_accessor :within_past_year
|
121
76
|
|
122
|
-
# Search for results containing numbers between the range
|
123
|
-
attr_accessor :numeric_range
|
124
|
-
|
125
77
|
# Search for results where the query ocurrs within the area
|
126
78
|
attr_accessor :occurrs_within
|
127
79
|
|
@@ -386,7 +338,7 @@ module GScraper
|
|
386
338
|
def page(page_index)
|
387
339
|
Page.new do |new_page|
|
388
340
|
doc = @agent.get(page_url(page_index))
|
389
|
-
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
341
|
+
results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
|
390
342
|
|
391
343
|
rank_offset = result_offset_of(page_index)
|
392
344
|
|
@@ -394,24 +346,24 @@ module GScraper
|
|
394
346
|
rank = rank_offset + (index + 1)
|
395
347
|
link = result.at('//a.l')
|
396
348
|
title = link.inner_text
|
397
|
-
url = link.get_attribute('href')
|
349
|
+
url = URI(link.get_attribute('href'))
|
398
350
|
summary_text = ''
|
399
351
|
cached_url = nil
|
400
352
|
similar_url = nil
|
401
353
|
|
402
|
-
if (content = (result.at('//td.j//font
|
354
|
+
if (content = (result.at('//div.s|//td.j//font')))
|
403
355
|
content.children.each do |elem|
|
404
356
|
break if (!(elem.text?) && elem.name=='br')
|
405
357
|
|
406
358
|
summary_text << elem.inner_text
|
407
359
|
end
|
408
360
|
|
409
|
-
if (cached_link = result.at('
|
410
|
-
cached_url = cached_link.get_attribute('href')
|
361
|
+
if (cached_link = result.at('span.gl/a:first'))
|
362
|
+
cached_url = URI(cached_link.get_attribute('href'))
|
411
363
|
end
|
412
364
|
|
413
|
-
if (similar_link = result.at('
|
414
|
-
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
365
|
+
if (similar_link = result.at('span.gl/a:last'))
|
366
|
+
similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
|
415
367
|
end
|
416
368
|
end
|
417
369
|
|
@@ -445,7 +397,7 @@ module GScraper
|
|
445
397
|
# top and side ads
|
446
398
|
doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
|
447
399
|
title = link.inner_text
|
448
|
-
url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
|
400
|
+
url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
|
449
401
|
|
450
402
|
links << SponsoredAd.new(title,url)
|
451
403
|
end
|
data/lib/gscraper/version.rb
CHANGED
@@ -24,15 +24,9 @@ shared_examples_for "has Sponsored Links" do
|
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
-
it "should have non-empty URLs" do
|
28
|
-
@links.each_url do |url|
|
29
|
-
url.length.should_not == 0
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
27
|
it "should have valid URLs" do
|
34
28
|
@links.each_url do |url|
|
35
|
-
|
29
|
+
uri_should_be_valid(url)
|
36
30
|
end
|
37
31
|
end
|
38
32
|
|
@@ -42,15 +36,9 @@ shared_examples_for "has Sponsored Links" do
|
|
42
36
|
end
|
43
37
|
end
|
44
38
|
|
45
|
-
it "should have non-empty direct URLs" do
|
46
|
-
@links.each_direct_url do |url|
|
47
|
-
url.length.should_not == 0
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
39
|
it "should have valid direct URLs" do
|
52
40
|
@links.each_direct_url do |url|
|
53
|
-
|
41
|
+
uri_should_be_valid(url)
|
54
42
|
end
|
55
43
|
end
|
56
44
|
|
data/spec/helpers/uri.rb
CHANGED
@@ -28,24 +28,14 @@ shared_examples_for "Page has Search Results" do
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
-
it "should have non-empty URLs" do
|
32
|
-
@page.each_url do |url|
|
33
|
-
url.length.should_not == 0
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
31
|
it "should have valid URLs" do
|
38
32
|
@page.each_url do |url|
|
39
|
-
|
33
|
+
uri_should_be_valid(url)
|
40
34
|
end
|
41
35
|
end
|
42
36
|
|
43
37
|
it "should have atleast one cached URL" do
|
44
|
-
@page.cached_urls.should_not == 0
|
45
|
-
end
|
46
|
-
|
47
|
-
it "should have atleast one similar query URL" do
|
48
|
-
@page.similar_urls.should_not == 0
|
38
|
+
@page.cached_urls.length.should_not == 0
|
49
39
|
end
|
50
40
|
|
51
41
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern Modulus III
|
@@ -9,11 +9,12 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-08-27 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: hpricot
|
17
|
+
type: :runtime
|
17
18
|
version_requirement:
|
18
19
|
version_requirements: !ruby/object:Gem::Requirement
|
19
20
|
requirements:
|
@@ -23,6 +24,7 @@ dependencies:
|
|
23
24
|
version:
|
24
25
|
- !ruby/object:Gem::Dependency
|
25
26
|
name: mechanize
|
27
|
+
type: :runtime
|
26
28
|
version_requirement:
|
27
29
|
version_requirements: !ruby/object:Gem::Requirement
|
28
30
|
requirements:
|
@@ -32,12 +34,13 @@ dependencies:
|
|
32
34
|
version:
|
33
35
|
- !ruby/object:Gem::Dependency
|
34
36
|
name: hoe
|
37
|
+
type: :development
|
35
38
|
version_requirement:
|
36
39
|
version_requirements: !ruby/object:Gem::Requirement
|
37
40
|
requirements:
|
38
41
|
- - ">="
|
39
42
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
43
|
+
version: 1.7.0
|
41
44
|
version:
|
42
45
|
description: GScraper is a web-scraping interface to various Google Services.
|
43
46
|
email:
|
@@ -113,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
113
116
|
requirements: []
|
114
117
|
|
115
118
|
rubyforge_project: gscraper
|
116
|
-
rubygems_version: 1.
|
119
|
+
rubygems_version: 1.2.0
|
117
120
|
signing_key:
|
118
121
|
specification_version: 2
|
119
122
|
summary: GScraper is a web-scraping interface to various Google Services.
|