gscraper 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,13 @@
1
- == 0.2.1 / 2008-08-27
1
+ === 0.2.2 / 2009-01-14
2
+
3
+ * Updated GScraper::Search::WebQuery to use Nokogiri properly.
4
+
5
+ === 0.2.1 / 2008-08-27
2
6
 
3
7
  * Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
4
8
  Search Result HTML schema.
5
9
 
6
- == 0.2.0 / 2008-05-10
10
+ === 0.2.0 / 2008-05-10
7
11
 
8
12
  * Removed GScraper::WebAgent.
9
13
  * Added GScraper::Page and GScraper::HasPages.
@@ -18,13 +22,13 @@
18
22
  * Added GScraper::Search::AJAXQuery.
19
23
  * Replaced Unit Tests with Rspec specifications.
20
24
 
21
- == 0.1.8 / 2008-04-30
25
+ === 0.1.8 / 2008-04-30
22
26
 
23
27
  * Added the GScraper.user_agent_alias=(name) method.
24
28
  * Added URI::HTTP::QueryParams module.
25
29
  * Changed license from MIT to GPL-2.
26
30
 
27
- == 0.1.7 / 2008-04-28
31
+ === 0.1.7 / 2008-04-28
28
32
 
29
33
  * Added support for specifying Search modifiers.
30
34
 
@@ -32,7 +36,7 @@
32
36
 
33
37
  * Added the Search::Result#page method.
34
38
 
35
- == 0.1.6 / 2008-03-15
39
+ === 0.1.6 / 2008-03-15
36
40
 
37
41
  * Renamed GScraper.http_agent to GScraper.web_agent.
38
42
  * Added GScraper.proxy for global proxy configuration.
@@ -43,12 +47,12 @@
43
47
  * Added the methods Query#sponsored_links and Query#top_sponsored_link.
44
48
  * Added examples to README.txt.
45
49
 
46
- == 0.1.5 / 2007-12-29
50
+ === 0.1.5 / 2007-12-29
47
51
 
48
52
  * Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
49
53
  sanitybit.
50
54
 
51
- == 0.1.4 / 2007-12-23
55
+ === 0.1.4 / 2007-12-23
52
56
 
53
57
  * Added Search::Query#result_at for easier access of a single result at
54
58
  a given index.
@@ -63,22 +67,22 @@
63
67
  * Fixed various bugs in Search::Query uncovered during unit-testing.
64
68
  * Fixed typos in Search::Page's documentation.
65
69
 
66
- == 0.1.3 / 2007-12-22
70
+ === 0.1.3 / 2007-12-22
67
71
 
68
72
  * Added the Search::Page class, which contains many convenience methods
69
73
  for searching through the results within a Page.
70
74
 
71
- == 0.1.2 / 2007-12-22
75
+ === 0.1.2 / 2007-12-22
72
76
 
73
77
  * Fixed a bug related to extracting the correct content-rights from search
74
78
  query URLs.
75
79
  * Added GScraper.user_agent_aliases.
76
80
 
77
- == 0.1.1 / 2007-12-21
81
+ === 0.1.1 / 2007-12-21
78
82
 
79
83
  * Forgot to include lib/gscraper/version.rb.
80
84
 
81
- == 0.1.0 / 2007-12-20
85
+ === 0.1.0 / 2007-12-20
82
86
 
83
87
  * Initial release.
84
88
  * Supports the Google Search service.
data/README.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  = GScraper
2
2
 
3
- * http://rubyforge.org/projects/gscraper/
4
- * Postmodern Modulus III (postmodern.mod3@gmail.com)
3
+ * http://gscraper.rubyforge.org/
4
+ * Postmodern (postmodern.mod3 at gmail.com)
5
5
 
6
6
  == DESCRIPTION:
7
7
 
@@ -17,8 +17,7 @@ GScraper is a web-scraping interface to various Google Services.
17
17
 
18
18
  == REQUIREMENTS:
19
19
 
20
- * Hpricot
21
- * WWW::Mechanize
20
+ * mechanize >= 0.9.0
22
21
 
23
22
  == INSTALL:
24
23
 
data/Rakefile CHANGED
@@ -8,8 +8,9 @@ require './lib/gscraper/version.rb'
8
8
 
9
9
  Hoe.new('gscraper', GScraper::VERSION) do |p|
10
10
  p.rubyforge_name = 'gscraper'
11
- p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
12
- p.extra_deps = ['hpricot', 'mechanize']
11
+ p.developer('Postmodern', 'postmodern.mod3@gmail.com')
12
+ p.remote_rdoc_dir = ''
13
+ p.extra_deps = [['mechanize', '>=0.9.0']]
13
14
  end
14
15
 
15
16
  # vim: syntax=Ruby
@@ -48,10 +48,12 @@ module GScraper
48
48
  #
49
49
  def GScraper.proxy_uri(proxy_info=GScraper.proxy)
50
50
  if GScraper.proxy[:host]
51
- return URI::HTTP.build(:host => GScraper.proxy[:host],
52
- :port => GScraper.proxy[:port],
53
- :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
54
- :path => '/')
51
+ return URI::HTTP.build(
52
+ :host => GScraper.proxy[:host],
53
+ :port => GScraper.proxy[:port],
54
+ :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
55
+ :path => '/'
56
+ )
55
57
  end
56
58
  end
57
59
 
@@ -107,6 +107,9 @@ module GScraper
107
107
  ((rank.to_i - 1) % results_per_page.to_i)
108
108
  end
109
109
 
110
+ #
111
+ # The cache of previously requested pages.
112
+ #
110
113
  def page_cache
111
114
  @page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
112
115
  end
@@ -30,8 +30,6 @@ require 'gscraper/has_pages'
30
30
  require 'gscraper/licenses'
31
31
  require 'gscraper/gscraper'
32
32
 
33
- require 'hpricot'
34
-
35
33
  module GScraper
36
34
  module Search
37
35
  class WebQuery < Query
@@ -164,7 +162,11 @@ module GScraper
164
162
  def self.from_url(url,options={},&block)
165
163
  url = URI(url.to_s)
166
164
 
167
- options[:results_per_page] = url.query_params['num'].to_i
165
+ if url.query_params['num']
166
+ options[:results_per_page] = url.query_params['num'].to_i
167
+ else
168
+ options[:results_per_page] = RESULTS_PER_PAGE
169
+ end
168
170
 
169
171
  options[:query] = url.query_params['q']
170
172
  options[:exact_phrase] = url.query_params['as_epq']
@@ -338,33 +340,36 @@ module GScraper
338
340
  def page(page_index)
339
341
  Page.new do |new_page|
340
342
  doc = @agent.get(page_url(page_index))
341
- results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
343
+ results = doc.search('li.g','li/div.g')
342
344
 
343
345
  rank_offset = result_offset_of(page_index)
344
346
 
345
- results.each_with_index do |result,index|
347
+ (0...@results_per_page).each do |index|
348
+ result = results[index]
349
+
346
350
  rank = rank_offset + (index + 1)
347
- link = result.at('//a.l')
351
+ link = result.at('a.l')
348
352
  title = link.inner_text
349
353
  url = URI(link.get_attribute('href'))
350
354
  summary_text = ''
351
355
  cached_url = nil
352
356
  similar_url = nil
353
357
 
354
- if (content = (result.at('//div.s|//td.j//font')))
358
+ if (content = (result.at('div.s','td.j//font')))
355
359
  content.children.each do |elem|
356
360
  break if (!(elem.text?) && elem.name=='br')
357
361
 
358
362
  summary_text << elem.inner_text
359
363
  end
360
364
 
361
- if (cached_link = result.at('span.gl/a:first'))
362
- cached_url = URI(cached_link.get_attribute('href'))
363
- end
365
+ end
364
366
 
365
- if (similar_link = result.at('span.gl/a:last'))
366
- similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
367
- end
367
+ if (cached_link = result.at('span.gl/a:first'))
368
+ cached_url = URI(cached_link.get_attribute('href'))
369
+ end
370
+
371
+ if (similar_link = result.at('span.gl/a:last'))
372
+ similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
368
373
  end
369
374
 
370
375
  new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
@@ -395,7 +400,7 @@ module GScraper
395
400
  doc = @agent.get(search_url)
396
401
 
397
402
  # top and side ads
398
- doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
403
+ doc.search('#pa1', 'a[@id^="an"]').each do |link|
399
404
  title = link.inner_text
400
405
  url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
401
406
 
@@ -21,5 +21,5 @@
21
21
  #
22
22
 
23
23
  module GScraper
24
- VERSION = '0.2.1'
24
+ VERSION = '0.2.2'
25
25
  end
@@ -5,3 +5,5 @@ Spec::Rake::SpecTask.new(:spec) do |t|
5
5
  t.libs += ['lib', 'spec']
6
6
  t.spec_opts = ['--colour', '--format', 'specdoc']
7
7
  end
8
+
9
+ task :default => :spec
metadata CHANGED
@@ -1,27 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
- - Postmodern Modulus III
7
+ - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-08-27 00:00:00 -07:00
12
+ date: 2009-01-14 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: hpricot
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: "0"
24
- version:
25
15
  - !ruby/object:Gem::Dependency
26
16
  name: mechanize
27
17
  type: :runtime
@@ -30,7 +20,7 @@ dependencies:
30
20
  requirements:
31
21
  - - ">="
32
22
  - !ruby/object:Gem::Version
33
- version: "0"
23
+ version: 0.9.0
34
24
  version:
35
25
  - !ruby/object:Gem::Dependency
36
26
  name: hoe
@@ -40,7 +30,7 @@ dependencies:
40
30
  requirements:
41
31
  - - ">="
42
32
  - !ruby/object:Gem::Version
43
- version: 1.7.0
33
+ version: 1.8.2
44
34
  version:
45
35
  description: GScraper is a web-scraping interface to various Google Services.
46
36
  email:
@@ -94,7 +84,7 @@ files:
94
84
  - spec/search/web_query_spec.rb
95
85
  - spec/gscraper_spec.rb
96
86
  has_rdoc: true
97
- homepage: http://rubyforge.org/projects/gscraper/
87
+ homepage: http://gscraper.rubyforge.org/
98
88
  post_install_message:
99
89
  rdoc_options:
100
90
  - --main
@@ -116,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
106
  requirements: []
117
107
 
118
108
  rubyforge_project: gscraper
119
- rubygems_version: 1.2.0
109
+ rubygems_version: 1.3.1
120
110
  signing_key:
121
111
  specification_version: 2
122
112
  summary: GScraper is a web-scraping interface to various Google Services.