gscraper 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -11
- data/README.txt +3 -4
- data/Rakefile +3 -2
- data/lib/gscraper/gscraper.rb +6 -4
- data/lib/gscraper/has_pages.rb +3 -0
- data/lib/gscraper/search/web_query.rb +19 -14
- data/lib/gscraper/version.rb +1 -1
- data/tasks/spec.rb +2 -0
- metadata +7 -17
data/History.txt
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
-
|
1
|
+
=== 0.2.2 / 2009-01-14
|
2
|
+
|
3
|
+
* Updated GScraper::Search::WebQuery to use Nokogiri properly.
|
4
|
+
|
5
|
+
=== 0.2.1 / 2008-08-27
|
2
6
|
|
3
7
|
* Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
|
4
8
|
Search Result HTML schema.
|
5
9
|
|
6
|
-
|
10
|
+
=== 0.2.0 / 2008-05-10
|
7
11
|
|
8
12
|
* Removed GScraper::WebAgent.
|
9
13
|
* Added GScraper::Page and GScraper::HasPages.
|
@@ -18,13 +22,13 @@
|
|
18
22
|
* Added GScraper::Search::AJAXQuery.
|
19
23
|
* Replaced Unit Tests with Rspec specifications.
|
20
24
|
|
21
|
-
|
25
|
+
=== 0.1.8 / 2008-04-30
|
22
26
|
|
23
27
|
* Added the GScraper.user_agent_alias=(name) method.
|
24
28
|
* Added URI::HTTP::QueryParams module.
|
25
29
|
* Changed license from MIT to GPL-2.
|
26
30
|
|
27
|
-
|
31
|
+
=== 0.1.7 / 2008-04-28
|
28
32
|
|
29
33
|
* Added support for specifying Search modifiers.
|
30
34
|
|
@@ -32,7 +36,7 @@
|
|
32
36
|
|
33
37
|
* Added the Search::Result#page method.
|
34
38
|
|
35
|
-
|
39
|
+
=== 0.1.6 / 2008-03-15
|
36
40
|
|
37
41
|
* Renamed GScraper.http_agent to GScraper.web_agent.
|
38
42
|
* Added GScraper.proxy for global proxy configuration.
|
@@ -43,12 +47,12 @@
|
|
43
47
|
* Added the methods Query#sponsored_links and Query#top_sponsored_link.
|
44
48
|
* Added examples to README.txt.
|
45
49
|
|
46
|
-
|
50
|
+
=== 0.1.5 / 2007-12-29
|
47
51
|
|
48
52
|
* Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
|
49
53
|
sanitybit.
|
50
54
|
|
51
|
-
|
55
|
+
=== 0.1.4 / 2007-12-23
|
52
56
|
|
53
57
|
* Added Search::Query#result_at for easier access of a single result at
|
54
58
|
a given index.
|
@@ -63,22 +67,22 @@
|
|
63
67
|
* Fixed various bugs in Search::Query uncovered during unit-testing.
|
64
68
|
* Fixed typos in Search::Page's documentation.
|
65
69
|
|
66
|
-
|
70
|
+
=== 0.1.3 / 2007-12-22
|
67
71
|
|
68
72
|
* Added the Search::Page class, which contains many convenience methods
|
69
73
|
for searching through the results within a Page.
|
70
74
|
|
71
|
-
|
75
|
+
=== 0.1.2 / 2007-12-22
|
72
76
|
|
73
77
|
* Fixed a bug related to extracting the correct content-rights from search
|
74
78
|
query URLs.
|
75
79
|
* Added GScraper.user_agent_aliases.
|
76
80
|
|
77
|
-
|
81
|
+
=== 0.1.1 / 2007-12-21
|
78
82
|
|
79
83
|
* Forgot to include lib/gscraper/version.rb.
|
80
84
|
|
81
|
-
|
85
|
+
=== 0.1.0 / 2007-12-20
|
82
86
|
|
83
87
|
* Initial release.
|
84
88
|
* Supports the Google Search service.
|
data/README.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= GScraper
|
2
2
|
|
3
|
-
* http://rubyforge.org/
|
4
|
-
* Postmodern
|
3
|
+
* http://gscraper.rubyforge.org/
|
4
|
+
* Postmodern (postmodern.mod3 at gmail.com)
|
5
5
|
|
6
6
|
== DESCRIPTION:
|
7
7
|
|
@@ -17,8 +17,7 @@ GScraper is a web-scraping interface to various Google Services.
|
|
17
17
|
|
18
18
|
== REQUIREMENTS:
|
19
19
|
|
20
|
-
*
|
21
|
-
* WWW::Mechanize
|
20
|
+
* mechanize >= 0.9.0
|
22
21
|
|
23
22
|
== INSTALL:
|
24
23
|
|
data/Rakefile
CHANGED
@@ -8,8 +8,9 @@ require './lib/gscraper/version.rb'
|
|
8
8
|
|
9
9
|
Hoe.new('gscraper', GScraper::VERSION) do |p|
|
10
10
|
p.rubyforge_name = 'gscraper'
|
11
|
-
p.developer('Postmodern
|
12
|
-
p.
|
11
|
+
p.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
12
|
+
p.remote_rdoc_dir = ''
|
13
|
+
p.extra_deps = [['mechanize', '>=0.9.0']]
|
13
14
|
end
|
14
15
|
|
15
16
|
# vim: syntax=Ruby
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -48,10 +48,12 @@ module GScraper
|
|
48
48
|
#
|
49
49
|
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
|
50
50
|
if GScraper.proxy[:host]
|
51
|
-
return URI::HTTP.build(
|
52
|
-
|
53
|
-
|
54
|
-
|
51
|
+
return URI::HTTP.build(
|
52
|
+
:host => GScraper.proxy[:host],
|
53
|
+
:port => GScraper.proxy[:port],
|
54
|
+
:userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
|
55
|
+
:path => '/'
|
56
|
+
)
|
55
57
|
end
|
56
58
|
end
|
57
59
|
|
data/lib/gscraper/has_pages.rb
CHANGED
@@ -30,8 +30,6 @@ require 'gscraper/has_pages'
|
|
30
30
|
require 'gscraper/licenses'
|
31
31
|
require 'gscraper/gscraper'
|
32
32
|
|
33
|
-
require 'hpricot'
|
34
|
-
|
35
33
|
module GScraper
|
36
34
|
module Search
|
37
35
|
class WebQuery < Query
|
@@ -164,7 +162,11 @@ module GScraper
|
|
164
162
|
def self.from_url(url,options={},&block)
|
165
163
|
url = URI(url.to_s)
|
166
164
|
|
167
|
-
|
165
|
+
if url.query_params['num']
|
166
|
+
options[:results_per_page] = url.query_params['num'].to_i
|
167
|
+
else
|
168
|
+
options[:results_per_page] = RESULTS_PER_PAGE
|
169
|
+
end
|
168
170
|
|
169
171
|
options[:query] = url.query_params['q']
|
170
172
|
options[:exact_phrase] = url.query_params['as_epq']
|
@@ -338,33 +340,36 @@ module GScraper
|
|
338
340
|
def page(page_index)
|
339
341
|
Page.new do |new_page|
|
340
342
|
doc = @agent.get(page_url(page_index))
|
341
|
-
results = doc.search('
|
343
|
+
results = doc.search('li.g','li/div.g')
|
342
344
|
|
343
345
|
rank_offset = result_offset_of(page_index)
|
344
346
|
|
345
|
-
|
347
|
+
(0...@results_per_page).each do |index|
|
348
|
+
result = results[index]
|
349
|
+
|
346
350
|
rank = rank_offset + (index + 1)
|
347
|
-
link = result.at('
|
351
|
+
link = result.at('a.l')
|
348
352
|
title = link.inner_text
|
349
353
|
url = URI(link.get_attribute('href'))
|
350
354
|
summary_text = ''
|
351
355
|
cached_url = nil
|
352
356
|
similar_url = nil
|
353
357
|
|
354
|
-
if (content = (result.at('
|
358
|
+
if (content = (result.at('div.s','td.j//font')))
|
355
359
|
content.children.each do |elem|
|
356
360
|
break if (!(elem.text?) && elem.name=='br')
|
357
361
|
|
358
362
|
summary_text << elem.inner_text
|
359
363
|
end
|
360
364
|
|
361
|
-
|
362
|
-
cached_url = URI(cached_link.get_attribute('href'))
|
363
|
-
end
|
365
|
+
end
|
364
366
|
|
365
|
-
|
366
|
-
|
367
|
-
|
367
|
+
if (cached_link = result.at('span.gl/a:first'))
|
368
|
+
cached_url = URI(cached_link.get_attribute('href'))
|
369
|
+
end
|
370
|
+
|
371
|
+
if (similar_link = result.at('span.gl/a:last'))
|
372
|
+
similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
|
368
373
|
end
|
369
374
|
|
370
375
|
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
@@ -395,7 +400,7 @@ module GScraper
|
|
395
400
|
doc = @agent.get(search_url)
|
396
401
|
|
397
402
|
# top and side ads
|
398
|
-
doc.search('
|
403
|
+
doc.search('#pa1', 'a[@id^="an"]').each do |link|
|
399
404
|
title = link.inner_text
|
400
405
|
url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
|
401
406
|
|
data/lib/gscraper/version.rb
CHANGED
data/tasks/spec.rb
CHANGED
metadata
CHANGED
@@ -1,27 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Postmodern
|
7
|
+
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-01-14 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
|
-
- !ruby/object:Gem::Dependency
|
16
|
-
name: hpricot
|
17
|
-
type: :runtime
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: "0"
|
24
|
-
version:
|
25
15
|
- !ruby/object:Gem::Dependency
|
26
16
|
name: mechanize
|
27
17
|
type: :runtime
|
@@ -30,7 +20,7 @@ dependencies:
|
|
30
20
|
requirements:
|
31
21
|
- - ">="
|
32
22
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
23
|
+
version: 0.9.0
|
34
24
|
version:
|
35
25
|
- !ruby/object:Gem::Dependency
|
36
26
|
name: hoe
|
@@ -40,7 +30,7 @@ dependencies:
|
|
40
30
|
requirements:
|
41
31
|
- - ">="
|
42
32
|
- !ruby/object:Gem::Version
|
43
|
-
version: 1.
|
33
|
+
version: 1.8.2
|
44
34
|
version:
|
45
35
|
description: GScraper is a web-scraping interface to various Google Services.
|
46
36
|
email:
|
@@ -94,7 +84,7 @@ files:
|
|
94
84
|
- spec/search/web_query_spec.rb
|
95
85
|
- spec/gscraper_spec.rb
|
96
86
|
has_rdoc: true
|
97
|
-
homepage: http://rubyforge.org/
|
87
|
+
homepage: http://gscraper.rubyforge.org/
|
98
88
|
post_install_message:
|
99
89
|
rdoc_options:
|
100
90
|
- --main
|
@@ -116,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
106
|
requirements: []
|
117
107
|
|
118
108
|
rubyforge_project: gscraper
|
119
|
-
rubygems_version: 1.
|
109
|
+
rubygems_version: 1.3.1
|
120
110
|
signing_key:
|
121
111
|
specification_version: 2
|
122
112
|
summary: GScraper is a web-scraping interface to various Google Services.
|