gscraper 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ == 0.2.1 / 2008-08-27
2
+
3
+ * Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
4
+ Search Result HTML schema.
5
+
1
6
  == 0.2.0 / 2008-05-10
2
7
 
3
8
  * Removed GScraper::WebAgent.
data/README.txt CHANGED
@@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services.
42
42
 
43
43
  q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
44
44
 
45
- q.query # =>; "ruby"
45
+ q.query # => "ruby"
46
46
  q.with_words # => "rails"
47
47
  q.occurrs_within # => :title
48
48
  q.rights # => :cc_by_nc
@@ -161,9 +161,9 @@ module GScraper
161
161
  hash['results'].each_with_index do |result,index|
162
162
  rank = rank_offset + (index + 1)
163
163
  title = Hpricot(result['title']).inner_text
164
- url = result['unescapedUrl']
164
+ url = URI(result['unescapedUrl'])
165
165
  summary = Hpricot(result['content']).inner_text
166
- cached_url = result['cacheUrl']
166
+ cached_url = URI(result['cacheUrl'])
167
167
 
168
168
  new_page << Result.new(rank,title,url,summary,cached_url)
169
169
  end
@@ -119,19 +119,15 @@ module GScraper
119
119
  expr = []
120
120
 
121
121
  append_modifier = lambda { |name|
122
- modifier = instance_variable_get("@#{name}")
122
+ modifier = format_modifier(instance_variable_get("@#{name}"))
123
123
 
124
- expr << "#{name}:#{modifier}" if modifier
124
+ expr << "#{name}:#{modifier}" unless modifier.empty?
125
125
  }
126
126
 
127
127
  append_options = lambda { |name|
128
- ops = instance_variable_get("@#{name}")
128
+ ops = format_options(instance_variable_get("@#{name}"))
129
129
 
130
- if ops.kind_of?(Array)
131
- expr << "#{name}:#{ops.join(' ')}"
132
- elsif ops
133
- expr << "#{name}:#{ops}"
134
- end
130
+ expr << "#{name}:#{ops}" unless ops.empty?
135
131
  }
136
132
 
137
133
  expr << @query if @query
@@ -168,6 +164,26 @@ module GScraper
168
164
  return expr.join(' ')
169
165
  end
170
166
 
167
+ protected
168
+
169
+ def format_modifier(value)
170
+ if value.kind_of?(Regexp)
171
+ return value.source
172
+ else
173
+ return value.to_s
174
+ end
175
+ end
176
+
177
+ def format_options(value)
178
+ if value.kind_of?(Array)
179
+ return value.map { |element|
180
+ format_modifier(element)
181
+ }.join(' ')
182
+ else
183
+ return format_modifier(value)
184
+ end
185
+ end
186
+
171
187
  end
172
188
  end
173
189
  end
@@ -50,51 +50,6 @@ module GScraper
50
50
  # Results per-page
51
51
  attr_accessor :results_per_page
52
52
 
53
- # Search query
54
- attr_accessor :query
55
-
56
- # Search 'link' modifier
57
- attr_accessor :link
58
-
59
- # Search 'related' modifier
60
- attr_accessor :related
61
-
62
- # Search 'info' modifier
63
- attr_accessor :info
64
-
65
- # Search 'site' modifier
66
- attr_accessor :site
67
-
68
- # Search 'filetype' modifier
69
- attr_accessor :filetype
70
-
71
- # Search 'allintitle' modifier
72
- attr_accessor :allintitle
73
-
74
- # Search 'intitle' modifier
75
- attr_accessor :intitle
76
-
77
- # Search 'allinurl' modifier
78
- attr_accessor :allinurl
79
-
80
- # Search 'inurl' modifier
81
- attr_accessor :inurl
82
-
83
- # Search 'allintext' modifier
84
- attr_accessor :allintext
85
-
86
- # Search 'intext' modifier
87
- attr_accessor :intext
88
-
89
- # Search for results containing the exact phrase
90
- attr_accessor :exact_phrase
91
-
92
- # Search for results with the words
93
- attr_accessor :with_words
94
-
95
- # Search for results with-out the words
96
- attr_accessor :without_words
97
-
98
53
  # Search for results written in the language
99
54
  attr_accessor :language
100
55
 
@@ -119,9 +74,6 @@ module GScraper
119
74
  # Search for results within the past year
120
75
  attr_accessor :within_past_year
121
76
 
122
- # Search for results containing numbers between the range
123
- attr_accessor :numeric_range
124
-
125
77
  # Search for results where the query occurs within the area
126
78
  attr_accessor :occurrs_within
127
79
 
@@ -386,7 +338,7 @@ module GScraper
386
338
  def page(page_index)
387
339
  Page.new do |new_page|
388
340
  doc = @agent.get(page_url(page_index))
389
- results = doc.search('//div.g')[0...@results_per_page.to_i]
341
+ results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
390
342
 
391
343
  rank_offset = result_offset_of(page_index)
392
344
 
@@ -394,24 +346,24 @@ module GScraper
394
346
  rank = rank_offset + (index + 1)
395
347
  link = result.at('//a.l')
396
348
  title = link.inner_text
397
- url = link.get_attribute('href')
349
+ url = URI(link.get_attribute('href'))
398
350
  summary_text = ''
399
351
  cached_url = nil
400
352
  similar_url = nil
401
353
 
402
- if (content = (result.at('//td.j//font|//td.j/div')))
354
+ if (content = (result.at('//div.s|//td.j//font')))
403
355
  content.children.each do |elem|
404
356
  break if (!(elem.text?) && elem.name=='br')
405
357
 
406
358
  summary_text << elem.inner_text
407
359
  end
408
360
 
409
- if (cached_link = result.at('nobr/a:first'))
410
- cached_url = cached_link.get_attribute('href')
361
+ if (cached_link = result.at('span.gl/a:first'))
362
+ cached_url = URI(cached_link.get_attribute('href'))
411
363
  end
412
364
 
413
- if (similar_link = result.at('nobr/a:last'))
414
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
365
+ if (similar_link = result.at('span.gl/a:last'))
366
+ similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
415
367
  end
416
368
  end
417
369
 
@@ -445,7 +397,7 @@ module GScraper
445
397
  # top and side ads
446
398
  doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
447
399
  title = link.inner_text
448
- url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
400
+ url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
449
401
 
450
402
  links << SponsoredAd.new(title,url)
451
403
  end
@@ -43,9 +43,7 @@ module GScraper
43
43
  # Returns the direct URL of the ad.
44
44
  #
45
45
  def direct_url
46
- uri = URI(@url)
47
-
48
- return (uri.query_params['adurl'] || uri.query_params['q'])
46
+ URI(@url.query_params['adurl'] || @url.query_params['q'])
49
47
  end
50
48
 
51
49
  #
@@ -21,5 +21,5 @@
21
21
  #
22
22
 
23
23
  module GScraper
24
- VERSION = '0.2.0'
24
+ VERSION = '0.2.1'
25
25
  end
@@ -24,15 +24,9 @@ shared_examples_for "has Sponsored Links" do
24
24
  end
25
25
  end
26
26
 
27
- it "should have non-empty URLs" do
28
- @links.each_url do |url|
29
- url.length.should_not == 0
30
- end
31
- end
32
-
33
27
  it "should have valid URLs" do
34
28
  @links.each_url do |url|
35
- url_should_be_valid(url)
29
+ uri_should_be_valid(url)
36
30
  end
37
31
  end
38
32
 
@@ -42,15 +36,9 @@ shared_examples_for "has Sponsored Links" do
42
36
  end
43
37
  end
44
38
 
45
- it "should have non-empty direct URLs" do
46
- @links.each_direct_url do |url|
47
- url.length.should_not == 0
48
- end
49
- end
50
-
51
39
  it "should have valid direct URLs" do
52
40
  @links.each_direct_url do |url|
53
- url_should_be_valid(url)
41
+ uri_should_be_valid(url)
54
42
  end
55
43
  end
56
44
 
data/spec/helpers/uri.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'uri'
2
2
 
3
- def url_should_be_valid(url)
4
- uri = URI(url)
3
+ def uri_should_be_valid(uri)
5
4
  uri.scheme.should_not be_nil
6
5
  uri.host.should_not be_nil
7
6
  uri.path.should_not be_nil
@@ -28,24 +28,14 @@ shared_examples_for "Page has Search Results" do
28
28
  end
29
29
  end
30
30
 
31
- it "should have non-empty URLs" do
32
- @page.each_url do |url|
33
- url.length.should_not == 0
34
- end
35
- end
36
-
37
31
  it "should have valid URLs" do
38
32
  @page.each_url do |url|
39
- url_should_be_valid(url)
33
+ uri_should_be_valid(url)
40
34
  end
41
35
  end
42
36
 
43
37
  it "should have atleast one cached URL" do
44
- @page.cached_urls.should_not == 0
45
- end
46
-
47
- it "should have atleast one similar query URL" do
48
- @page.similar_urls.should_not == 0
38
+ @page.cached_urls.length.should_not == 0
49
39
  end
50
40
 
51
41
  end
@@ -71,4 +71,8 @@ describe GScraper::Search::WebQuery do
71
71
 
72
72
  end
73
73
 
74
+ it "should have atleast one similar query URL" do
75
+ @page.similar_urls.length.should_not == 0
76
+ end
77
+
74
78
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern Modulus III
@@ -9,11 +9,12 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-06-21 00:00:00 -07:00
12
+ date: 2008-08-27 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: hpricot
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -23,6 +24,7 @@ dependencies:
23
24
  version:
24
25
  - !ruby/object:Gem::Dependency
25
26
  name: mechanize
27
+ type: :runtime
26
28
  version_requirement:
27
29
  version_requirements: !ruby/object:Gem::Requirement
28
30
  requirements:
@@ -32,12 +34,13 @@ dependencies:
32
34
  version:
33
35
  - !ruby/object:Gem::Dependency
34
36
  name: hoe
37
+ type: :development
35
38
  version_requirement:
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
38
41
  - - ">="
39
42
  - !ruby/object:Gem::Version
40
- version: 1.6.0
43
+ version: 1.7.0
41
44
  version:
42
45
  description: GScraper is a web-scraping interface to various Google Services.
43
46
  email:
@@ -113,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
113
116
  requirements: []
114
117
 
115
118
  rubyforge_project: gscraper
116
- rubygems_version: 1.1.1
119
+ rubygems_version: 1.2.0
117
120
  signing_key:
118
121
  specification_version: 2
119
122
  summary: GScraper is a web-scraping interface to various Google Services.