gscraper 0.2.0 → 0.2.1

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ == 0.2.1 / 2008-08-27
2
+
3
+ * Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
4
+ Search Result HTML schema.
5
+
1
6
  == 0.2.0 / 2008-05-10
2
7
 
3
8
  * Removed GScraper::WebAgent.
data/README.txt CHANGED
@@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services.
42
42
 
43
43
  q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
44
44
 
45
- q.query # =>; "ruby"
45
+ q.query # => "ruby"
46
46
  q.with_words # => "rails"
47
47
  q.occurrs_within # => :title
48
48
  q.rights # => :cc_by_nc
@@ -161,9 +161,9 @@ module GScraper
161
161
  hash['results'].each_with_index do |result,index|
162
162
  rank = rank_offset + (index + 1)
163
163
  title = Hpricot(result['title']).inner_text
164
- url = result['unescapedUrl']
164
+ url = URI(result['unescapedUrl'])
165
165
  summary = Hpricot(result['content']).inner_text
166
- cached_url = result['cacheUrl']
166
+ cached_url = URI(result['cacheUrl'])
167
167
 
168
168
  new_page << Result.new(rank,title,url,summary,cached_url)
169
169
  end
@@ -119,19 +119,15 @@ module GScraper
119
119
  expr = []
120
120
 
121
121
  append_modifier = lambda { |name|
122
- modifier = instance_variable_get("@#{name}")
122
+ modifier = format_modifier(instance_variable_get("@#{name}"))
123
123
 
124
- expr << "#{name}:#{modifier}" if modifier
124
+ expr << "#{name}:#{modifier}" unless modifier.empty?
125
125
  }
126
126
 
127
127
  append_options = lambda { |name|
128
- ops = instance_variable_get("@#{name}")
128
+ ops = format_options(instance_variable_get("@#{name}"))
129
129
 
130
- if ops.kind_of?(Array)
131
- expr << "#{name}:#{ops.join(' ')}"
132
- elsif ops
133
- expr << "#{name}:#{ops}"
134
- end
130
+ expr << "#{name}:#{ops}" unless ops.empty?
135
131
  }
136
132
 
137
133
  expr << @query if @query
@@ -168,6 +164,26 @@ module GScraper
168
164
  return expr.join(' ')
169
165
  end
170
166
 
167
+ protected
168
+
169
+ def format_modifier(value)
170
+ if value.kind_of?(Regexp)
171
+ return value.source
172
+ else
173
+ return value.to_s
174
+ end
175
+ end
176
+
177
+ def format_options(value)
178
+ if value.kind_of?(Array)
179
+ return value.map { |element|
180
+ format_modifier(element)
181
+ }.join(' ')
182
+ else
183
+ return format_modifier(value)
184
+ end
185
+ end
186
+
171
187
  end
172
188
  end
173
189
  end
@@ -50,51 +50,6 @@ module GScraper
50
50
  # Results per-page
51
51
  attr_accessor :results_per_page
52
52
 
53
- # Search query
54
- attr_accessor :query
55
-
56
- # Search 'link' modifier
57
- attr_accessor :link
58
-
59
- # Search 'related' modifier
60
- attr_accessor :related
61
-
62
- # Search 'info' modifier
63
- attr_accessor :info
64
-
65
- # Search 'site' modifier
66
- attr_accessor :site
67
-
68
- # Search 'filetype' modifier
69
- attr_accessor :filetype
70
-
71
- # Search 'allintitle' modifier
72
- attr_accessor :allintitle
73
-
74
- # Search 'intitle' modifier
75
- attr_accessor :intitle
76
-
77
- # Search 'allinurl' modifier
78
- attr_accessor :allinurl
79
-
80
- # Search 'inurl' modifier
81
- attr_accessor :inurl
82
-
83
- # Search 'allintext' modifier
84
- attr_accessor :allintext
85
-
86
- # Search 'intext' modifier
87
- attr_accessor :intext
88
-
89
- # Search for results containing the exact phrase
90
- attr_accessor :exact_phrase
91
-
92
- # Search for results with the words
93
- attr_accessor :with_words
94
-
95
- # Search for results with-out the words
96
- attr_accessor :without_words
97
-
98
53
  # Search for results written in the language
99
54
  attr_accessor :language
100
55
 
@@ -119,9 +74,6 @@ module GScraper
119
74
  # Search for results within the past year
120
75
  attr_accessor :within_past_year
121
76
 
122
- # Search for results containing numbers between the range
123
- attr_accessor :numeric_range
124
-
125
77
  # Search for results where the query occurs within the area
126
78
  attr_accessor :occurrs_within
127
79
 
@@ -386,7 +338,7 @@ module GScraper
386
338
  def page(page_index)
387
339
  Page.new do |new_page|
388
340
  doc = @agent.get(page_url(page_index))
389
- results = doc.search('//div.g')[0...@results_per_page.to_i]
341
+ results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
390
342
 
391
343
  rank_offset = result_offset_of(page_index)
392
344
 
@@ -394,24 +346,24 @@ module GScraper
394
346
  rank = rank_offset + (index + 1)
395
347
  link = result.at('//a.l')
396
348
  title = link.inner_text
397
- url = link.get_attribute('href')
349
+ url = URI(link.get_attribute('href'))
398
350
  summary_text = ''
399
351
  cached_url = nil
400
352
  similar_url = nil
401
353
 
402
- if (content = (result.at('//td.j//font|//td.j/div')))
354
+ if (content = (result.at('//div.s|//td.j//font')))
403
355
  content.children.each do |elem|
404
356
  break if (!(elem.text?) && elem.name=='br')
405
357
 
406
358
  summary_text << elem.inner_text
407
359
  end
408
360
 
409
- if (cached_link = result.at('nobr/a:first'))
410
- cached_url = cached_link.get_attribute('href')
361
+ if (cached_link = result.at('span.gl/a:first'))
362
+ cached_url = URI(cached_link.get_attribute('href'))
411
363
  end
412
364
 
413
- if (similar_link = result.at('nobr/a:last'))
414
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
365
+ if (similar_link = result.at('span.gl/a:last'))
366
+ similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
415
367
  end
416
368
  end
417
369
 
@@ -445,7 +397,7 @@ module GScraper
445
397
  # top and side ads
446
398
  doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
447
399
  title = link.inner_text
448
- url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
400
+ url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
449
401
 
450
402
  links << SponsoredAd.new(title,url)
451
403
  end
@@ -43,9 +43,7 @@ module GScraper
43
43
  # Returns the direct URL of the ad.
44
44
  #
45
45
  def direct_url
46
- uri = URI(@url)
47
-
48
- return (uri.query_params['adurl'] || uri.query_params['q'])
46
+ URI(@url.query_params['adurl'] || @url.query_params['q'])
49
47
  end
50
48
 
51
49
  #
@@ -21,5 +21,5 @@
21
21
  #
22
22
 
23
23
  module GScraper
24
- VERSION = '0.2.0'
24
+ VERSION = '0.2.1'
25
25
  end
@@ -24,15 +24,9 @@ shared_examples_for "has Sponsored Links" do
24
24
  end
25
25
  end
26
26
 
27
- it "should have non-empty URLs" do
28
- @links.each_url do |url|
29
- url.length.should_not == 0
30
- end
31
- end
32
-
33
27
  it "should have valid URLs" do
34
28
  @links.each_url do |url|
35
- url_should_be_valid(url)
29
+ uri_should_be_valid(url)
36
30
  end
37
31
  end
38
32
 
@@ -42,15 +36,9 @@ shared_examples_for "has Sponsored Links" do
42
36
  end
43
37
  end
44
38
 
45
- it "should have non-empty direct URLs" do
46
- @links.each_direct_url do |url|
47
- url.length.should_not == 0
48
- end
49
- end
50
-
51
39
  it "should have valid direct URLs" do
52
40
  @links.each_direct_url do |url|
53
- url_should_be_valid(url)
41
+ uri_should_be_valid(url)
54
42
  end
55
43
  end
56
44
 
data/spec/helpers/uri.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'uri'
2
2
 
3
- def url_should_be_valid(url)
4
- uri = URI(url)
3
+ def uri_should_be_valid(uri)
5
4
  uri.scheme.should_not be_nil
6
5
  uri.host.should_not be_nil
7
6
  uri.path.should_not be_nil
@@ -28,24 +28,14 @@ shared_examples_for "Page has Search Results" do
28
28
  end
29
29
  end
30
30
 
31
- it "should have non-empty URLs" do
32
- @page.each_url do |url|
33
- url.length.should_not == 0
34
- end
35
- end
36
-
37
31
  it "should have valid URLs" do
38
32
  @page.each_url do |url|
39
- url_should_be_valid(url)
33
+ uri_should_be_valid(url)
40
34
  end
41
35
  end
42
36
 
43
37
  it "should have atleast one cached URL" do
44
- @page.cached_urls.should_not == 0
45
- end
46
-
47
- it "should have atleast one similar query URL" do
48
- @page.similar_urls.should_not == 0
38
+ @page.cached_urls.length.should_not == 0
49
39
  end
50
40
 
51
41
  end
@@ -71,4 +71,8 @@ describe GScraper::Search::WebQuery do
71
71
 
72
72
  end
73
73
 
74
+ it "should have atleast one similar query URL" do
75
+ @page.similar_urls.length.should_not == 0
76
+ end
77
+
74
78
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern Modulus III
@@ -9,11 +9,12 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-06-21 00:00:00 -07:00
12
+ date: 2008-08-27 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: hpricot
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -23,6 +24,7 @@ dependencies:
23
24
  version:
24
25
  - !ruby/object:Gem::Dependency
25
26
  name: mechanize
27
+ type: :runtime
26
28
  version_requirement:
27
29
  version_requirements: !ruby/object:Gem::Requirement
28
30
  requirements:
@@ -32,12 +34,13 @@ dependencies:
32
34
  version:
33
35
  - !ruby/object:Gem::Dependency
34
36
  name: hoe
37
+ type: :development
35
38
  version_requirement:
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
38
41
  - - ">="
39
42
  - !ruby/object:Gem::Version
40
- version: 1.6.0
43
+ version: 1.7.0
41
44
  version:
42
45
  description: GScraper is a web-scraping interface to various Google Services.
43
46
  email:
@@ -113,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
113
116
  requirements: []
114
117
 
115
118
  rubyforge_project: gscraper
116
- rubygems_version: 1.1.1
119
+ rubygems_version: 1.2.0
117
120
  signing_key:
118
121
  specification_version: 2
119
122
  summary: GScraper is a web-scraping interface to various Google Services.