gscraper 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +33 -21
- data/Manifest.txt +3 -0
- data/README.txt +107 -4
- data/lib/gscraper/gscraper.rb +92 -21
- data/lib/gscraper/licenses.rb +27 -4
- data/lib/gscraper/search/page.rb +9 -11
- data/lib/gscraper/search/query.rb +142 -104
- data/lib/gscraper/search/result.rb +13 -12
- data/lib/gscraper/search/search.rb +3 -3
- data/lib/gscraper/sponsored_ad.rb +35 -0
- data/lib/gscraper/sponsored_links.rb +151 -0
- data/lib/gscraper/version.rb +1 -1
- data/lib/gscraper/web_agent.rb +38 -0
- data/test/search/query_result.rb +1 -1
- data/test/test_gscraper.rb +1 -4
- metadata +73 -63
@@ -1,7 +1,12 @@
|
|
1
|
+
require 'gscraper/search/query'
|
2
|
+
require 'gscraper/web_agent'
|
3
|
+
|
1
4
|
module GScraper
|
2
5
|
module Search
|
3
6
|
class Result
|
4
7
|
|
8
|
+
include WebAgent
|
9
|
+
|
5
10
|
# Rank of the result page
|
6
11
|
attr_reader :rank
|
7
12
|
|
@@ -33,18 +38,6 @@ module GScraper
|
|
33
38
|
@similar_url = similar_url
|
34
39
|
end
|
35
40
|
|
36
|
-
#
|
37
|
-
# Opens the URL of the cached page for the Result. If _opts_ are
|
38
|
-
# given, they will be used in accessing the cached page URL.
|
39
|
-
#
|
40
|
-
# result.cached_page # => File
|
41
|
-
#
|
42
|
-
def cached_page(opts={})
|
43
|
-
if @cached_url
|
44
|
-
return GScraper.open(@cached_url,opts)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
41
|
#
|
49
42
|
# Create a new Query for results that are similar to the Result. If
|
50
43
|
# a _block_ is given, it will be passed the newly created Query
|
@@ -64,6 +57,14 @@ module GScraper
|
|
64
57
|
end
|
65
58
|
end
|
66
59
|
|
60
|
+
#
|
61
|
+
# Fetches the cached page of the result. If a _block_ is given it will
|
62
|
+
# be passed the cached page.
|
63
|
+
#
|
64
|
+
def cached_page(&block)
|
65
|
+
get_page(@cached_url,&block)
|
66
|
+
end
|
67
|
+
|
67
68
|
#
|
68
69
|
# Returns a string containing the result's title.
|
69
70
|
#
|
@@ -3,7 +3,7 @@ require 'gscraper/search/query'
|
|
3
3
|
module GScraper
|
4
4
|
module Search
|
5
5
|
#
|
6
|
-
# Returns a new Query object with the given
|
6
|
+
# Returns a new Query object with the given _options_. See Query.new.
|
7
7
|
#
|
8
8
|
# Search.query(:query => 'ruby', :with_words => 'sow rspec')
|
9
9
|
#
|
@@ -11,8 +11,8 @@ module GScraper
|
|
11
11
|
# q.within_past_week = true
|
12
12
|
# end
|
13
13
|
#
|
14
|
-
def Search.query(
|
15
|
-
Query.new(
|
14
|
+
def Search.query(options={},&block)
|
15
|
+
Query.new(options,&block)
|
16
16
|
end
|
17
17
|
|
18
18
|
#
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'gscraper/extensions/uri'
|
2
|
+
|
3
|
+
module GScraper
|
4
|
+
class SponsoredAd
|
5
|
+
|
6
|
+
# Title of the ad
|
7
|
+
attr_reader :title
|
8
|
+
|
9
|
+
# URL of the ad
|
10
|
+
attr_reader :url
|
11
|
+
|
12
|
+
#
|
13
|
+
# Creates a new SponsoredAd with the specified _title_ and _url_.
|
14
|
+
#
|
15
|
+
def initialize(title,url)
|
16
|
+
@title = title
|
17
|
+
@url = URI.parse(url)
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# Returns the direct URL of the ad.
|
22
|
+
#
|
23
|
+
def direct_url
|
24
|
+
@url.query_params['adurl'] || @url.query_params['q']
|
25
|
+
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Returns the title of the ad.
|
29
|
+
#
|
30
|
+
def to_s
|
31
|
+
@title.to_s
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'gscraper/sponsored_ad'
|
2
|
+
|
3
|
+
module GScraper
|
4
|
+
class SponsoredLinks < Array
|
5
|
+
#
|
6
|
+
# Creates a new SponsoredLinks object with the given _ads_.
|
7
|
+
#
|
8
|
+
def initialize(ads=[])
|
9
|
+
super(ads)
|
10
|
+
end
|
11
|
+
|
12
|
+
#
|
13
|
+
# Returns a mapped Array of the ads within the SponsoredLinks
|
14
|
+
# using the given _block_. If the _block_ is not given, the
|
15
|
+
# SponsoredLinks will be returned.
|
16
|
+
#
|
17
|
+
# sponsored.map # => SponsoredLinks
|
18
|
+
#
|
19
|
+
# sponsored.map { |ad| ad.url } # => [...]
|
20
|
+
#
|
21
|
+
def map(&block)
|
22
|
+
return self unless block
|
23
|
+
|
24
|
+
mapped = []
|
25
|
+
|
26
|
+
each { |ad| mapped << block.call(ad) }
|
27
|
+
return mapped
|
28
|
+
end
|
29
|
+
|
30
|
+
#
|
31
|
+
# Selects the ads within the SponsoredLinks which match the given _block_.
|
32
|
+
#
|
33
|
+
# sponsored.select { |ad| ad.title =~ /consume/i }
|
34
|
+
#
|
35
|
+
def select(&block)
|
36
|
+
SponsoredLinks.new(super(&block))
|
37
|
+
end
|
38
|
+
|
39
|
+
#
|
40
|
+
# Selects the ads using the specified _block_.
|
41
|
+
#
|
42
|
+
# sponsored.ads_with { |ad| ad.title =~ /status symbol/ }
|
43
|
+
#
|
44
|
+
def ads_with(&block)
|
45
|
+
select(&block)
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Selects the ads with the matching _title_. The _title_ may be
|
50
|
+
# either a String or a Regexp. If _block_ is given, each matching
|
51
|
+
# ad will be passed to the _block_.
|
52
|
+
#
|
53
|
+
# sponsored.ads_with_title('be attractive') #=> SponsoredLinks
|
54
|
+
#
|
55
|
+
# sponsored.ads_with_title(/buy me/) do |ad|
|
56
|
+
# puts ad.url
|
57
|
+
# end
|
58
|
+
#
|
59
|
+
def ads_with_title(title,&block)
|
60
|
+
if title.kind_of?(Regexp)
|
61
|
+
ads = ads_with { |ad| ad.title =~ title }
|
62
|
+
else
|
63
|
+
ads = ads_with { |ad| ad.title == title }
|
64
|
+
end
|
65
|
+
|
66
|
+
ads.each(&block) if block
|
67
|
+
return ads
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# Selects the ads with the matching _url_. The _url_ may be
|
72
|
+
# either a String or a Regexp. If _block_ is given, each matching
|
73
|
+
# ad will be passed to the _block_.
|
74
|
+
#
|
75
|
+
# sponsored.ads_with_url(/\.com/) # => SponsoredLinks
|
76
|
+
#
|
77
|
+
# sponsored.ads_with_url(/^https:\/\//) do |ad|
|
78
|
+
# puts ad.title
|
79
|
+
# end
|
80
|
+
#
|
81
|
+
def ads_with_url(url,&block)
|
82
|
+
if url.kind_of?(Regexp)
|
83
|
+
ads = ads_with { |ad| ad.url =~ url }
|
84
|
+
else
|
85
|
+
ads = ads_with { |ad| ad.url == url }
|
86
|
+
end
|
87
|
+
|
88
|
+
ads.each(&block) if block
|
89
|
+
return ads
|
90
|
+
end
|
91
|
+
|
92
|
+
#
|
93
|
+
# Returns an Array containing the titles of the ads within the
|
94
|
+
# SponsoredLinks.
|
95
|
+
#
|
96
|
+
# sponsored.titles # => [...]
|
97
|
+
#
|
98
|
+
def titles
|
99
|
+
map { |ad| ad.title }
|
100
|
+
end
|
101
|
+
|
102
|
+
#
|
103
|
+
# Returns an Array containing the URLs of the ads within the
|
104
|
+
# SponsoredLinks.
|
105
|
+
#
|
106
|
+
# sponsored.urls # => [...]
|
107
|
+
#
|
108
|
+
def urls
|
109
|
+
map { |ad| ad.url }
|
110
|
+
end
|
111
|
+
|
112
|
+
#
|
113
|
+
# Iterates over each ad's title within the SponsoredLinks, passing each to
|
114
|
+
# the given _block_.
|
115
|
+
#
|
116
|
+
# each_title { |title| puts title }
|
117
|
+
#
|
118
|
+
def each_title(&block)
|
119
|
+
titles.each(&block)
|
120
|
+
end
|
121
|
+
|
122
|
+
#
|
123
|
+
# Iterates over each ad's url within the SponsoredLinks, passing each to
|
124
|
+
# the given _block_.
|
125
|
+
#
|
126
|
+
# each_url { |url| puts url }
|
127
|
+
#
|
128
|
+
def each_url(&block)
|
129
|
+
urls.each(&block)
|
130
|
+
end
|
131
|
+
|
132
|
+
#
|
133
|
+
# Returns the titles of the ads that match the specified _block_.
|
134
|
+
#
|
135
|
+
# sponsored.titles_of { |ad| ad.url.include?('www') }
|
136
|
+
#
|
137
|
+
def titles_of(&block)
|
138
|
+
ads_with(&block).titles
|
139
|
+
end
|
140
|
+
|
141
|
+
#
|
142
|
+
# Returns the urls of the ads that match the specified _block_.
|
143
|
+
#
|
144
|
+
# sponsored.urls_of { |ad| ad.title =~ /buy these pants/ }
|
145
|
+
#
|
146
|
+
def urls_of(&block)
|
147
|
+
ads_with(&block).urls
|
148
|
+
end
|
149
|
+
|
150
|
+
end
|
151
|
+
end
|
data/lib/gscraper/version.rb
CHANGED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'gscraper/gscraper'
|
2
|
+
|
3
|
+
module GScraper
|
4
|
+
module WebAgent
|
5
|
+
protected
|
6
|
+
|
7
|
+
#
|
8
|
+
# Returns the WWW::Mechanize agent.
|
9
|
+
#
|
10
|
+
def web_agent(&block)
|
11
|
+
@web_agent ||= GScraper.web_agent
|
12
|
+
|
13
|
+
block.call(@web_agent) if block
|
14
|
+
return @web_agent
|
15
|
+
end
|
16
|
+
|
17
|
+
#
|
18
|
+
# Fetches the specified _url_, with the given _referer_ using the
|
19
|
+
# web_agent.
|
20
|
+
#
|
21
|
+
# get_page('http://www.hackety.org/')
|
22
|
+
#
|
23
|
+
def get_page(url,referer=nil,&block)
|
24
|
+
web_agent.get(url,referer,&block)
|
25
|
+
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Posts the specified _url_ and the given _query_ parameters using the
|
29
|
+
# web_agent.
|
30
|
+
#
|
31
|
+
# post_page('http://www.wired.com/', :q => 'the future')
|
32
|
+
#
|
33
|
+
def post_page(url,query={})
|
34
|
+
web_agent.post(url,query)
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
data/test/search/query_result.rb
CHANGED
@@ -10,7 +10,7 @@ class QueryResult < Test::Unit::TestCase
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def test_first_result
|
13
|
-
result = @query.
|
13
|
+
result = @query.top_result
|
14
14
|
|
15
15
|
assert_not_nil result, "The Query for 'ruby' has no first-result"
|
16
16
|
assert_equal result.rank, 1, "The first result for the Query 'ruby' does not have the rank of 1"
|
data/test/test_gscraper.rb
CHANGED
metadata
CHANGED
@@ -1,33 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.4
|
3
|
-
specification_version: 1
|
4
2
|
name: gscraper
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.
|
7
|
-
date: 2007-12-29 00:00:00 -08:00
|
8
|
-
summary: A ruby web-scraping interface to various Google Services
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: postmodern.mod3@gmail.com
|
12
|
-
homepage: " by Postmodern Modulus III"
|
13
|
-
rubyforge_project: gscraper
|
14
|
-
description: "== FEATURES/PROBLEMS: * Supports the Google Search service. * Provides HTTP access with custom User-Agent strings. == REQUIREMENTS: * Hpricot * Mechanize == INSTALL:"
|
15
|
-
autorequire:
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: 0.1.6
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Postmodern Modulus III
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-03-15 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: mechanize
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: "0"
|
32
|
+
version:
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: hoe
|
35
|
+
version_requirement:
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.5.1
|
41
|
+
version:
|
42
|
+
description: "== FEATURES/PROBLEMS: * Supports the Google Search service. * Provides access to search results and ranks. * Provides access to the Sponsored Links. * Provides HTTP access with custom User-Agent strings. * Provides proxy settings for HTTP access. == REQUIREMENTS: * Hpricot * WWW::Mechanize == INSTALL:"
|
43
|
+
email: postmodern.mod3@gmail.com
|
44
|
+
executables: []
|
45
|
+
|
46
|
+
extensions: []
|
47
|
+
|
48
|
+
extra_rdoc_files:
|
49
|
+
- History.txt
|
50
|
+
- LICENSE.txt
|
51
|
+
- Manifest.txt
|
52
|
+
- README.txt
|
31
53
|
files:
|
32
54
|
- History.txt
|
33
55
|
- LICENSE.txt
|
@@ -37,10 +59,13 @@ files:
|
|
37
59
|
- lib/gscraper.rb
|
38
60
|
- lib/gscraper/version.rb
|
39
61
|
- lib/gscraper/gscraper.rb
|
62
|
+
- lib/gscraper/web_agent.rb
|
40
63
|
- lib/gscraper/extensions/uri/http.rb
|
41
64
|
- lib/gscraper/extensions/uri.rb
|
42
65
|
- lib/gscraper/extensions.rb
|
43
66
|
- lib/gscraper/licenses.rb
|
67
|
+
- lib/gscraper/sponsored_ad.rb
|
68
|
+
- lib/gscraper/sponsored_links.rb
|
44
69
|
- lib/gscraper/search/result.rb
|
45
70
|
- lib/gscraper/search/page.rb
|
46
71
|
- lib/gscraper/search/query.rb
|
@@ -51,47 +76,32 @@ files:
|
|
51
76
|
- test/search/query_result.rb
|
52
77
|
- test/search/query_pages.rb
|
53
78
|
- test/search/page_results.rb
|
54
|
-
|
55
|
-
|
79
|
+
has_rdoc: true
|
80
|
+
homepage: " by Postmodern Modulus III"
|
81
|
+
post_install_message:
|
56
82
|
rdoc_options:
|
57
83
|
- --main
|
58
84
|
- README.txt
|
59
|
-
|
60
|
-
-
|
61
|
-
|
62
|
-
|
63
|
-
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
85
|
+
require_paths:
|
86
|
+
- lib
|
87
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: "0"
|
92
|
+
version:
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: "0"
|
98
|
+
version:
|
68
99
|
requirements: []
|
69
100
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
- !ruby/object:Gem::Version
|
78
|
-
version: 0.0.0
|
79
|
-
version:
|
80
|
-
- !ruby/object:Gem::Dependency
|
81
|
-
name: mechanize
|
82
|
-
version_requirement:
|
83
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
84
|
-
requirements:
|
85
|
-
- - ">"
|
86
|
-
- !ruby/object:Gem::Version
|
87
|
-
version: 0.0.0
|
88
|
-
version:
|
89
|
-
- !ruby/object:Gem::Dependency
|
90
|
-
name: hoe
|
91
|
-
version_requirement:
|
92
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
93
|
-
requirements:
|
94
|
-
- - ">="
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
version: 1.4.0
|
97
|
-
version:
|
101
|
+
rubyforge_project: gscraper
|
102
|
+
rubygems_version: 1.0.1
|
103
|
+
signing_key:
|
104
|
+
specification_version: 2
|
105
|
+
summary: A ruby web-scraping interface to various Google Services
|
106
|
+
test_files:
|
107
|
+
- test/test_gscraper.rb
|