gscraper 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +15 -11
- data/README.txt +3 -4
- data/Rakefile +3 -2
- data/lib/gscraper/gscraper.rb +6 -4
- data/lib/gscraper/has_pages.rb +3 -0
- data/lib/gscraper/search/web_query.rb +19 -14
- data/lib/gscraper/version.rb +1 -1
- data/tasks/spec.rb +2 -0
- metadata +7 -17
data/History.txt
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
-
|
1
|
+
=== 0.2.2 / 2009-01-14
|
2
|
+
|
3
|
+
* Updated GScraper::Search::WebQuery to use Nokogiri properly.
|
4
|
+
|
5
|
+
=== 0.2.1 / 2008-08-27
|
2
6
|
|
3
7
|
* Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
|
4
8
|
Search Result HTML schema.
|
5
9
|
|
6
|
-
|
10
|
+
=== 0.2.0 / 2008-05-10
|
7
11
|
|
8
12
|
* Removed GScraper::WebAgent.
|
9
13
|
* Added GScraper::Page and GScraper::HasPages.
|
@@ -18,13 +22,13 @@
|
|
18
22
|
* Added GScraper::Search::AJAXQuery.
|
19
23
|
* Replaced Unit Tests with Rspec specifications.
|
20
24
|
|
21
|
-
|
25
|
+
=== 0.1.8 / 2008-04-30
|
22
26
|
|
23
27
|
* Added the GScraper.user_agent_alias=(name) method.
|
24
28
|
* Added URI::HTTP::QueryParams module.
|
25
29
|
* Changed license from MIT to GPL-2.
|
26
30
|
|
27
|
-
|
31
|
+
=== 0.1.7 / 2008-04-28
|
28
32
|
|
29
33
|
* Added support for specifying Search modifiers.
|
30
34
|
|
@@ -32,7 +36,7 @@
|
|
32
36
|
|
33
37
|
* Added the Search::Result#page method.
|
34
38
|
|
35
|
-
|
39
|
+
=== 0.1.6 / 2008-03-15
|
36
40
|
|
37
41
|
* Renamed GScraper.http_agent to GScraper.web_agent.
|
38
42
|
* Added GScraper.proxy for global proxy configuration.
|
@@ -43,12 +47,12 @@
|
|
43
47
|
* Added the methods Query#sponsored_links and Query#top_sponsored_link.
|
44
48
|
* Added examples to README.txt.
|
45
49
|
|
46
|
-
|
50
|
+
=== 0.1.5 / 2007-12-29
|
47
51
|
|
48
52
|
* Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
|
49
53
|
sanitybit.
|
50
54
|
|
51
|
-
|
55
|
+
=== 0.1.4 / 2007-12-23
|
52
56
|
|
53
57
|
* Added Search::Query#result_at for easier access of a single result at
|
54
58
|
a given index.
|
@@ -63,22 +67,22 @@
|
|
63
67
|
* Fixed various bugs in Search::Query uncovered during unit-testing.
|
64
68
|
* Fixed typos in Search::Page's documentation.
|
65
69
|
|
66
|
-
|
70
|
+
=== 0.1.3 / 2007-12-22
|
67
71
|
|
68
72
|
* Added the Search::Page class, which contains many convenience methods
|
69
73
|
for searching through the results within a Page.
|
70
74
|
|
71
|
-
|
75
|
+
=== 0.1.2 / 2007-12-22
|
72
76
|
|
73
77
|
* Fixed a bug related to extracting the correct content-rights from search
|
74
78
|
query URLs.
|
75
79
|
* Added GScraper.user_agent_aliases.
|
76
80
|
|
77
|
-
|
81
|
+
=== 0.1.1 / 2007-12-21
|
78
82
|
|
79
83
|
* Forgot to include lib/gscraper/version.rb.
|
80
84
|
|
81
|
-
|
85
|
+
=== 0.1.0 / 2007-12-20
|
82
86
|
|
83
87
|
* Initial release.
|
84
88
|
* Supports the Google Search service.
|
data/README.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= GScraper
|
2
2
|
|
3
|
-
* http://rubyforge.org/
|
4
|
-
* Postmodern
|
3
|
+
* http://gscraper.rubyforge.org/
|
4
|
+
* Postmodern (postmodern.mod3 at gmail.com)
|
5
5
|
|
6
6
|
== DESCRIPTION:
|
7
7
|
|
@@ -17,8 +17,7 @@ GScraper is a web-scraping interface to various Google Services.
|
|
17
17
|
|
18
18
|
== REQUIREMENTS:
|
19
19
|
|
20
|
-
*
|
21
|
-
* WWW::Mechanize
|
20
|
+
* mechanize >= 0.9.0
|
22
21
|
|
23
22
|
== INSTALL:
|
24
23
|
|
data/Rakefile
CHANGED
@@ -8,8 +8,9 @@ require './lib/gscraper/version.rb'
|
|
8
8
|
|
9
9
|
Hoe.new('gscraper', GScraper::VERSION) do |p|
|
10
10
|
p.rubyforge_name = 'gscraper'
|
11
|
-
p.developer('Postmodern
|
12
|
-
p.
|
11
|
+
p.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
12
|
+
p.remote_rdoc_dir = ''
|
13
|
+
p.extra_deps = [['mechanize', '>=0.9.0']]
|
13
14
|
end
|
14
15
|
|
15
16
|
# vim: syntax=Ruby
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -48,10 +48,12 @@ module GScraper
|
|
48
48
|
#
|
49
49
|
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
|
50
50
|
if GScraper.proxy[:host]
|
51
|
-
return URI::HTTP.build(
|
52
|
-
|
53
|
-
|
54
|
-
|
51
|
+
return URI::HTTP.build(
|
52
|
+
:host => GScraper.proxy[:host],
|
53
|
+
:port => GScraper.proxy[:port],
|
54
|
+
:userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
|
55
|
+
:path => '/'
|
56
|
+
)
|
55
57
|
end
|
56
58
|
end
|
57
59
|
|
data/lib/gscraper/has_pages.rb
CHANGED
@@ -30,8 +30,6 @@ require 'gscraper/has_pages'
|
|
30
30
|
require 'gscraper/licenses'
|
31
31
|
require 'gscraper/gscraper'
|
32
32
|
|
33
|
-
require 'hpricot'
|
34
|
-
|
35
33
|
module GScraper
|
36
34
|
module Search
|
37
35
|
class WebQuery < Query
|
@@ -164,7 +162,11 @@ module GScraper
|
|
164
162
|
def self.from_url(url,options={},&block)
|
165
163
|
url = URI(url.to_s)
|
166
164
|
|
167
|
-
|
165
|
+
if url.query_params['num']
|
166
|
+
options[:results_per_page] = url.query_params['num'].to_i
|
167
|
+
else
|
168
|
+
options[:results_per_page] = RESULTS_PER_PAGE
|
169
|
+
end
|
168
170
|
|
169
171
|
options[:query] = url.query_params['q']
|
170
172
|
options[:exact_phrase] = url.query_params['as_epq']
|
@@ -338,33 +340,36 @@ module GScraper
|
|
338
340
|
def page(page_index)
|
339
341
|
Page.new do |new_page|
|
340
342
|
doc = @agent.get(page_url(page_index))
|
341
|
-
results = doc.search('
|
343
|
+
results = doc.search('li.g','li/div.g')
|
342
344
|
|
343
345
|
rank_offset = result_offset_of(page_index)
|
344
346
|
|
345
|
-
|
347
|
+
(0...@results_per_page).each do |index|
|
348
|
+
result = results[index]
|
349
|
+
|
346
350
|
rank = rank_offset + (index + 1)
|
347
|
-
link = result.at('
|
351
|
+
link = result.at('a.l')
|
348
352
|
title = link.inner_text
|
349
353
|
url = URI(link.get_attribute('href'))
|
350
354
|
summary_text = ''
|
351
355
|
cached_url = nil
|
352
356
|
similar_url = nil
|
353
357
|
|
354
|
-
if (content = (result.at('
|
358
|
+
if (content = (result.at('div.s','td.j//font')))
|
355
359
|
content.children.each do |elem|
|
356
360
|
break if (!(elem.text?) && elem.name=='br')
|
357
361
|
|
358
362
|
summary_text << elem.inner_text
|
359
363
|
end
|
360
364
|
|
361
|
-
|
362
|
-
cached_url = URI(cached_link.get_attribute('href'))
|
363
|
-
end
|
365
|
+
end
|
364
366
|
|
365
|
-
|
366
|
-
|
367
|
-
|
367
|
+
if (cached_link = result.at('span.gl/a:first'))
|
368
|
+
cached_url = URI(cached_link.get_attribute('href'))
|
369
|
+
end
|
370
|
+
|
371
|
+
if (similar_link = result.at('span.gl/a:last'))
|
372
|
+
similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
|
368
373
|
end
|
369
374
|
|
370
375
|
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
@@ -395,7 +400,7 @@ module GScraper
|
|
395
400
|
doc = @agent.get(search_url)
|
396
401
|
|
397
402
|
# top and side ads
|
398
|
-
doc.search('
|
403
|
+
doc.search('#pa1', 'a[@id^="an"]').each do |link|
|
399
404
|
title = link.inner_text
|
400
405
|
url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
|
401
406
|
|
data/lib/gscraper/version.rb
CHANGED
data/tasks/spec.rb
CHANGED
metadata
CHANGED
@@ -1,27 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Postmodern
|
7
|
+
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-01-14 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
|
-
- !ruby/object:Gem::Dependency
|
16
|
-
name: hpricot
|
17
|
-
type: :runtime
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: "0"
|
24
|
-
version:
|
25
15
|
- !ruby/object:Gem::Dependency
|
26
16
|
name: mechanize
|
27
17
|
type: :runtime
|
@@ -30,7 +20,7 @@ dependencies:
|
|
30
20
|
requirements:
|
31
21
|
- - ">="
|
32
22
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
23
|
+
version: 0.9.0
|
34
24
|
version:
|
35
25
|
- !ruby/object:Gem::Dependency
|
36
26
|
name: hoe
|
@@ -40,7 +30,7 @@ dependencies:
|
|
40
30
|
requirements:
|
41
31
|
- - ">="
|
42
32
|
- !ruby/object:Gem::Version
|
43
|
-
version: 1.
|
33
|
+
version: 1.8.2
|
44
34
|
version:
|
45
35
|
description: GScraper is a web-scraping interface to various Google Services.
|
46
36
|
email:
|
@@ -94,7 +84,7 @@ files:
|
|
94
84
|
- spec/search/web_query_spec.rb
|
95
85
|
- spec/gscraper_spec.rb
|
96
86
|
has_rdoc: true
|
97
|
-
homepage: http://rubyforge.org/
|
87
|
+
homepage: http://gscraper.rubyforge.org/
|
98
88
|
post_install_message:
|
99
89
|
rdoc_options:
|
100
90
|
- --main
|
@@ -116,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
106
|
requirements: []
|
117
107
|
|
118
108
|
rubyforge_project: gscraper
|
119
|
-
rubygems_version: 1.
|
109
|
+
rubygems_version: 1.3.1
|
120
110
|
signing_key:
|
121
111
|
specification_version: 2
|
122
112
|
summary: GScraper is a web-scraping interface to various Google Services.
|