gscraper 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +122 -0
- data/Gemfile +25 -0
- data/{README.txt → README.md} +25 -24
- data/Rakefile +32 -10
- data/gscraper.gemspec +112 -0
- data/lib/gscraper.rb +0 -2
- data/lib/gscraper/extensions.rb +0 -2
- data/lib/gscraper/extensions/uri.rb +0 -2
- data/lib/gscraper/extensions/uri/http.rb +0 -2
- data/lib/gscraper/extensions/uri/query_params.rb +18 -5
- data/lib/gscraper/gscraper.rb +61 -70
- data/lib/gscraper/has_pages.rb +76 -20
- data/lib/gscraper/licenses.rb +0 -2
- data/lib/gscraper/page.rb +45 -16
- data/lib/gscraper/search.rb +0 -2
- data/lib/gscraper/search/ajax_query.rb +75 -22
- data/lib/gscraper/search/page.rb +328 -122
- data/lib/gscraper/search/query.rb +100 -7
- data/lib/gscraper/search/result.rb +27 -6
- data/lib/gscraper/search/search.rb +59 -9
- data/lib/gscraper/search/web_query.rb +120 -37
- data/lib/gscraper/sponsored_ad.rb +19 -6
- data/lib/gscraper/sponsored_links.rb +260 -92
- data/lib/gscraper/version.rb +2 -3
- data/spec/extensions/uri/query_params_spec.rb +8 -0
- data/spec/gscraper_spec.rb +9 -4
- data/spec/has_pages_examples.rb +0 -2
- data/spec/has_sponsored_links_examples.rb +2 -1
- data/spec/helpers/query.rb +3 -1
- data/spec/helpers/uri.rb +6 -4
- data/spec/page_has_results_examples.rb +0 -2
- data/spec/search/ajax_query_spec.rb +6 -11
- data/spec/search/page_has_results_examples.rb +0 -2
- data/spec/search/web_query_spec.rb +6 -11
- data/spec/spec_helper.rb +10 -4
- metadata +147 -54
- data/History.txt +0 -101
- data/Manifest.txt +0 -38
- data/tasks/spec.rb +0 -9
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
module GScraper
|
data/lib/gscraper/page.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,47 +16,77 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
21
|
+
require 'enumerator'
|
22
|
+
|
23
23
|
module GScraper
|
24
24
|
class Page < Array
|
25
25
|
|
26
26
|
#
|
27
|
-
# Creates a new Page object
|
28
|
-
#
|
27
|
+
# Creates a new Page object.
|
28
|
+
#
|
29
|
+
# @param [Array] elements
|
30
|
+
# The elements to populate the page with.
|
31
|
+
#
|
32
|
+
# @yield [page]
|
33
|
+
# If a block is given, it will be passed the newly created page.
|
34
|
+
#
|
35
|
+
# @yieldparam [Page] page
|
36
|
+
# The newly created page.
|
29
37
|
#
|
30
|
-
def initialize(elements=[]
|
38
|
+
def initialize(elements=[])
|
31
39
|
super(elements)
|
32
40
|
|
33
|
-
|
41
|
+
yield self if block_given?
|
34
42
|
end
|
35
43
|
|
36
44
|
#
|
37
|
-
#
|
38
|
-
# given _block_. If the _block_ is not given, the page will be
|
39
|
-
# returned.
|
45
|
+
# Maps the elements within the page.
|
40
46
|
#
|
41
|
-
#
|
47
|
+
# @yield [element]
|
48
|
+
# The given block will be passed each element in the page.
|
42
49
|
#
|
43
|
-
#
|
50
|
+
# @return [Array, Enumerator]
|
51
|
+
# The mapped result. If no block was given, an Enumerator object will
|
52
|
+
# be returned.
|
44
53
|
#
|
45
|
-
|
46
|
-
|
54
|
+
# @example
|
55
|
+
# page.map
|
56
|
+
# # => Enumerator
|
57
|
+
#
|
58
|
+
# @example
|
59
|
+
# page.map { |element| element.field }
|
60
|
+
# # => [...]
|
61
|
+
#
|
62
|
+
def map
|
63
|
+
return enum_for(:map) unless block_given?
|
47
64
|
|
48
65
|
mapped = []
|
49
66
|
|
50
|
-
each { |element| mapped <<
|
67
|
+
each { |element| mapped << yield(element) }
|
51
68
|
return mapped
|
52
69
|
end
|
53
70
|
|
54
71
|
#
|
55
|
-
# Selects the elements within the
|
72
|
+
# Selects the elements within the page.
|
73
|
+
#
|
74
|
+
# @yield [element]
|
75
|
+
# The given block will be passed each element in the page.
|
76
|
+
#
|
77
|
+
# @return [Array, Enumerator]
|
78
|
+
# The selected elements. If no block was given, an Enumerator object
|
79
|
+
# is returned.
|
56
80
|
#
|
81
|
+
# @example
|
57
82
|
# page.select { |element| element.field =~ /ruby/i }
|
58
83
|
#
|
59
84
|
def select(&block)
|
60
|
-
|
85
|
+
unless block
|
86
|
+
enum_for(:select)
|
87
|
+
else
|
88
|
+
self.class.new(super(&block))
|
89
|
+
end
|
61
90
|
end
|
62
91
|
|
63
92
|
end
|
data/lib/gscraper/search.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/search/web_query'
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/search/result'
|
@@ -32,6 +30,9 @@ require 'nokogiri'
|
|
32
30
|
|
33
31
|
module GScraper
|
34
32
|
module Search
|
33
|
+
#
|
34
|
+
# Represents a Query through the Google AJAX search API.
|
35
|
+
#
|
35
36
|
class AJAXQuery < Query
|
36
37
|
|
37
38
|
include HasPages
|
@@ -70,16 +71,28 @@ module GScraper
|
|
70
71
|
attr_accessor :version
|
71
72
|
|
72
73
|
#
|
73
|
-
# Creates a new
|
74
|
-
# given it will be passed the newly created AJAXQuery object.
|
74
|
+
# Creates a new AJAX query.
|
75
75
|
#
|
76
|
-
#
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
82
|
-
#
|
76
|
+
# @param [Hash] options
|
77
|
+
# Query options.
|
78
|
+
#
|
79
|
+
# @option options [Symbol] :language (:en)
|
80
|
+
# The search language.
|
81
|
+
#
|
82
|
+
# @option options [String] :sig ('582c1116317355adf613a6a843f19ece')
|
83
|
+
# The search signature.
|
84
|
+
#
|
85
|
+
# @option options [Symbol] :key (:notsupplied)
|
86
|
+
# The search key.
|
87
|
+
#
|
88
|
+
# @option options [Float] :version (1.0)
|
89
|
+
# The desired API version.
|
90
|
+
#
|
91
|
+
# @yield [query]
|
92
|
+
# If a block is given, the new AJAX query will be passed to it.
|
93
|
+
#
|
94
|
+
# @yieldparam [AJAXQuery] query
|
95
|
+
# The new AJAX query.
|
83
96
|
#
|
84
97
|
def initialize(options={},&block)
|
85
98
|
@agent = GScraper.web_agent(options)
|
@@ -94,10 +107,26 @@ module GScraper
|
|
94
107
|
end
|
95
108
|
|
96
109
|
#
|
97
|
-
# Creates a new
|
98
|
-
#
|
110
|
+
# Creates a new AJAX query from the specified URL.
|
111
|
+
#
|
112
|
+
# @param [URI::HTTP, String] url
|
113
|
+
# The URL to create the query from.
|
114
|
+
#
|
115
|
+
# @param [Hash] options
|
116
|
+
# Additional query options.
|
117
|
+
#
|
118
|
+
# @yield [query]
|
119
|
+
# If a block is given, it will be passed the new AJAX query.
|
120
|
+
#
|
121
|
+
# @yieldparam [AJAXQuery] query
|
122
|
+
# The new AJAX query.
|
123
|
+
#
|
124
|
+
# @return [AJAXQuery]
|
125
|
+
# The new AJAX query.
|
99
126
|
#
|
100
|
-
|
127
|
+
# @see AJAXQuery.new
|
128
|
+
#
|
129
|
+
def AJAXQuery.from_url(url,options={},&block)
|
101
130
|
url = URI(url.to_s)
|
102
131
|
|
103
132
|
options[:language] = url.query_params['hl']
|
@@ -111,14 +140,22 @@ module GScraper
|
|
111
140
|
end
|
112
141
|
|
113
142
|
#
|
114
|
-
#
|
143
|
+
# The results per page.
|
144
|
+
#
|
145
|
+
# @return [Integer]
|
146
|
+
# The number of results per page.
|
147
|
+
#
|
148
|
+
# @see RESULTS_PER_PAGE
|
115
149
|
#
|
116
150
|
def results_per_page
|
117
151
|
RESULTS_PER_PAGE
|
118
152
|
end
|
119
153
|
|
120
154
|
#
|
121
|
-
#
|
155
|
+
# The URL that represents the query.
|
156
|
+
#
|
157
|
+
# @return [URI::HTTP]
|
158
|
+
# The URL for the query.
|
122
159
|
#
|
123
160
|
def search_url
|
124
161
|
search_url = URI(API_URL)
|
@@ -134,8 +171,13 @@ module GScraper
|
|
134
171
|
end
|
135
172
|
|
136
173
|
#
|
137
|
-
#
|
138
|
-
#
|
174
|
+
# The URL that represents the query at a specific page index.
|
175
|
+
#
|
176
|
+
# @param [Integer] page_index
|
177
|
+
# The page index to create the URL for.
|
178
|
+
#
|
179
|
+
# @return [URI::HTTP]
|
180
|
+
# The query URL for the given page index.
|
139
181
|
#
|
140
182
|
def page_url(page_index)
|
141
183
|
url = search_url
|
@@ -148,8 +190,13 @@ module GScraper
|
|
148
190
|
end
|
149
191
|
|
150
192
|
#
|
151
|
-
#
|
152
|
-
#
|
193
|
+
# A page containing results at the specified page index.
|
194
|
+
#
|
195
|
+
# @param [Integer] page_index
|
196
|
+
# The index of the page.
|
197
|
+
#
|
198
|
+
# @return [Page<Result>]
|
199
|
+
# A page object.
|
153
200
|
#
|
154
201
|
def page(page_index)
|
155
202
|
Page.new do |new_page|
|
@@ -162,8 +209,14 @@ module GScraper
|
|
162
209
|
hash['results'].each_with_index do |result,index|
|
163
210
|
rank = rank_offset + (index + 1)
|
164
211
|
title = Nokogiri::HTML(result['title']).inner_text
|
165
|
-
url = URI(result['unescapedUrl'])
|
166
|
-
|
212
|
+
url = URI(URI.escape(result['unescapedUrl']))
|
213
|
+
|
214
|
+
unless result['content'].empty?
|
215
|
+
summary = Nokogiri::HTML(result['content']).inner_text
|
216
|
+
else
|
217
|
+
summary = ''
|
218
|
+
end
|
219
|
+
|
167
220
|
cached_url = URI(result['cacheUrl'])
|
168
221
|
|
169
222
|
new_page << Result.new(rank,title,url,summary,cached_url)
|
data/lib/gscraper/search/page.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/search/result'
|
@@ -27,224 +25,376 @@ module GScraper
|
|
27
25
|
module Search
|
28
26
|
class Page < GScraper::Page
|
29
27
|
|
28
|
+
alias results_with select
|
29
|
+
|
30
30
|
#
|
31
|
-
# Selects the results
|
31
|
+
# Selects the results with the matching title.
|
32
32
|
#
|
33
|
-
#
|
33
|
+
# @param [String, Regexp] title
|
34
|
+
# The title to search for.
|
34
35
|
#
|
35
|
-
|
36
|
-
|
37
|
-
end
|
38
|
-
|
36
|
+
# @yield [result]
|
37
|
+
# The given block will be passed each matching result.
|
39
38
|
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
# result will be passed to the _block_.
|
39
|
+
# @yieldparam [Result] result
|
40
|
+
# A result with the matching title.
|
43
41
|
#
|
42
|
+
# @return [Array<Result>]
|
43
|
+
# The results with the matching title.
|
44
|
+
#
|
45
|
+
# @example
|
44
46
|
# page.results_with_title('hackety org') #=> Enumerator
|
45
47
|
#
|
48
|
+
# @example
|
46
49
|
# page.results_with_title(/awesome/) do |result|
|
47
50
|
# puts result.url
|
48
51
|
# end
|
49
52
|
#
|
50
|
-
def results_with_title(title
|
51
|
-
|
52
|
-
|
53
|
+
def results_with_title(title)
|
54
|
+
unless block_given?
|
55
|
+
enum_for(:results_with_title,title)
|
53
56
|
else
|
54
|
-
|
55
|
-
|
57
|
+
results_with do |result|
|
58
|
+
if result.title.match(title)
|
59
|
+
yield result
|
56
60
|
|
57
|
-
|
58
|
-
|
61
|
+
true
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
59
65
|
end
|
60
66
|
|
61
67
|
#
|
62
|
-
# Selects the results with the matching
|
63
|
-
# either a String or a Regexp. If _block_ is given, each matching
|
64
|
-
# result will be passed to the _block_.
|
68
|
+
# Selects the results with the matching URL.
|
65
69
|
#
|
70
|
+
# @param [String, Regexp] url
|
71
|
+
# The URL to search for.
|
72
|
+
#
|
73
|
+
# @yield [result]
|
74
|
+
# The given block will be passed each matching result.
|
75
|
+
#
|
76
|
+
# @yieldparam [Result] result
|
77
|
+
# A result with the matching URL.
|
78
|
+
#
|
79
|
+
# @return [Array<Result>]
|
80
|
+
# The results with the matching URL.
|
81
|
+
#
|
82
|
+
# @example
|
66
83
|
# page.results_with_url(/\.com/) # => Enumerator
|
67
84
|
#
|
85
|
+
# @example
|
68
86
|
# page.results_with_url(/^https:\/\//) do |result|
|
69
87
|
# puts result.title
|
70
88
|
# end
|
71
89
|
#
|
72
|
-
def results_with_url(url
|
73
|
-
|
74
|
-
|
90
|
+
def results_with_url(url)
|
91
|
+
unless block_given?
|
92
|
+
enum_for(:results_with_url,url)
|
75
93
|
else
|
76
|
-
|
77
|
-
|
94
|
+
results_with do |result|
|
95
|
+
if result.url.match(url)
|
96
|
+
yield result
|
78
97
|
|
79
|
-
|
80
|
-
|
98
|
+
true
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
81
102
|
end
|
82
103
|
|
83
104
|
#
|
84
|
-
# Selects the results with the matching
|
85
|
-
# be either a String or a Regexp. If _block_ is given, each matching
|
86
|
-
# result will be passed to the _block_.
|
105
|
+
# Selects the results with the matching summary.
|
87
106
|
#
|
107
|
+
# @param [String, Regexp] summary
|
108
|
+
# The summary to search for.
|
109
|
+
#
|
110
|
+
# @yield [result]
|
111
|
+
# The given block will be passed each matching result.
|
112
|
+
#
|
113
|
+
# @yieldparam [Result] result
|
114
|
+
# A result with the matching summary.
|
115
|
+
#
|
116
|
+
# @return [Array<Result>]
|
117
|
+
# The results with the matching summary.
|
118
|
+
#
|
119
|
+
# @example
|
88
120
|
# page.results_with_summary(/cheese cake/) # => Enumerator
|
89
121
|
#
|
122
|
+
# @example
|
90
123
|
# page.results_with_summary(/Scientifically/) do |result|
|
91
124
|
# puts result.url
|
92
125
|
# end
|
93
126
|
#
|
94
|
-
def results_with_summary(summary
|
95
|
-
|
96
|
-
|
127
|
+
def results_with_summary(summary)
|
128
|
+
unless block_given?
|
129
|
+
enum_for(:results_with_summary,summary)
|
97
130
|
else
|
98
|
-
|
99
|
-
|
131
|
+
results_with do |result|
|
132
|
+
if result.summary.match(summary)
|
133
|
+
yield result
|
100
134
|
|
101
|
-
|
102
|
-
|
135
|
+
true
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
103
139
|
end
|
104
140
|
|
105
141
|
#
|
106
|
-
#
|
107
|
-
# Page.
|
142
|
+
# Iterates over each result's rank within the page.
|
108
143
|
#
|
109
|
-
#
|
144
|
+
# @yield [rank]
|
145
|
+
# The given block will be passed the rank of each result in
|
146
|
+
# the page.
|
110
147
|
#
|
111
|
-
|
112
|
-
|
148
|
+
# @yieldparam [Integer] rank
|
149
|
+
# The rank of a result in the page.
|
150
|
+
#
|
151
|
+
# @return [Enumerator]
|
152
|
+
# If no block is given, an Enumerator object will be returned.
|
153
|
+
#
|
154
|
+
# @example
|
155
|
+
# each_rank { |rank| puts rank }
|
156
|
+
#
|
157
|
+
def each_rank
|
158
|
+
unless block_given?
|
159
|
+
enum_for(:each_rank)
|
160
|
+
else
|
161
|
+
each { |result| yield result.rank }
|
162
|
+
end
|
113
163
|
end
|
114
164
|
|
115
165
|
#
|
116
|
-
#
|
117
|
-
# Page.
|
166
|
+
# Iterates over each result's title within the page.
|
118
167
|
#
|
119
|
-
#
|
168
|
+
# @yield [title]
|
169
|
+
# The given block will be passed the title of each result in
|
170
|
+
# the page.
|
171
|
+
#
|
172
|
+
# @yieldparam [String] title
|
173
|
+
# The title of a result in the page.
|
120
174
|
#
|
121
|
-
|
122
|
-
|
175
|
+
# @return [Enumerator]
|
176
|
+
# If no block is given, an Enumerator object will be returned.
|
177
|
+
#
|
178
|
+
# @example
|
179
|
+
# each_title { |title| puts title }
|
180
|
+
#
|
181
|
+
def each_title
|
182
|
+
unless block_given?
|
183
|
+
enum_for(:each_title)
|
184
|
+
else
|
185
|
+
each { |result| yield result.title }
|
186
|
+
end
|
123
187
|
end
|
124
188
|
|
125
189
|
#
|
126
|
-
#
|
127
|
-
# Page.
|
190
|
+
# Iterates over each result's URL within the page.
|
128
191
|
#
|
129
|
-
#
|
192
|
+
# @yield [url]
|
193
|
+
# The given block will be passed the URL of each result in
|
194
|
+
# the page.
|
195
|
+
#
|
196
|
+
# @yieldparam [URI::HTTP] url
|
197
|
+
# The URL of a result in the page.
|
130
198
|
#
|
131
|
-
|
132
|
-
|
199
|
+
# @return [Enumerator]
|
200
|
+
# If no block is given, an Enumerator object will be returned.
|
201
|
+
#
|
202
|
+
# @example
|
203
|
+
# each_url { |url| puts url }
|
204
|
+
#
|
205
|
+
def each_url
|
206
|
+
unless block_given?
|
207
|
+
enum_for(:each_url)
|
208
|
+
else
|
209
|
+
each { |result| yield result.url }
|
210
|
+
end
|
133
211
|
end
|
134
212
|
|
135
213
|
#
|
136
|
-
#
|
137
|
-
# Page.
|
214
|
+
# Iterates over each result's summary within the page.
|
138
215
|
#
|
139
|
-
#
|
216
|
+
# @yield [summary]
|
217
|
+
# The given block will be passed the summary of each result in
|
218
|
+
# the page.
|
219
|
+
#
|
220
|
+
# @yieldparam [String] summary
|
221
|
+
# The summary of a result in the page.
|
140
222
|
#
|
141
|
-
|
142
|
-
|
223
|
+
# @return [Enumerator]
|
224
|
+
# If no block is given, an Enumerator object will be returned.
|
225
|
+
#
|
226
|
+
# @example
|
227
|
+
# each_summary { |summary| puts summary }
|
228
|
+
#
|
229
|
+
def each_summary
|
230
|
+
unless block_given?
|
231
|
+
enum_for(:each_summary)
|
232
|
+
else
|
233
|
+
each { |result| yield result.summary }
|
234
|
+
end
|
143
235
|
end
|
144
236
|
|
145
237
|
#
|
146
|
-
#
|
147
|
-
# the Page.
|
238
|
+
# Iterates over each result's cached URLs within the page.
|
148
239
|
#
|
149
|
-
#
|
240
|
+
# @yield [cached_url]
|
241
|
+
# The given block will be passed the Cached URL of each result in
|
242
|
+
# the page.
|
243
|
+
#
|
244
|
+
# @yieldparam [URI::HTTP] cached_url
|
245
|
+
# The Cached URL of a result in the page.
|
150
246
|
#
|
151
|
-
|
152
|
-
|
247
|
+
# @return [Enumerator]
|
248
|
+
# If no block is given, an Enumerator object will be returned.
|
249
|
+
#
|
250
|
+
# @example
|
251
|
+
# each_cached_url { |cached_url| puts cached_url }
|
252
|
+
#
|
253
|
+
def each_cached_url
|
254
|
+
unless block_given?
|
255
|
+
enum_for(:each_cached_url)
|
256
|
+
else
|
257
|
+
each do |result|
|
258
|
+
yield result.cached_url if result.cached_url
|
259
|
+
end
|
260
|
+
end
|
153
261
|
end
|
154
262
|
|
155
263
|
#
|
156
|
-
#
|
157
|
-
# the Page.
|
264
|
+
# Iterates over each result's cached pages within the page.
|
158
265
|
#
|
159
|
-
#
|
266
|
+
# @yield [cached_page]
|
267
|
+
# The given block will be passed the Cached Page of each result in
|
268
|
+
# the page.
|
269
|
+
#
|
270
|
+
# @yieldparam [Mechanize::Page] cached_page
|
271
|
+
# The Cached Page of a result in the page.
|
160
272
|
#
|
161
|
-
|
162
|
-
|
273
|
+
# @return [Enumerator]
|
274
|
+
# If no block is given, an Enumerator object will be returned.
|
275
|
+
#
|
276
|
+
# @example
|
277
|
+
# each_cached_page { |page| puts page.readlines }
|
278
|
+
#
|
279
|
+
def each_cached_page
|
280
|
+
unless block_given?
|
281
|
+
enum_for(:each_cached_page)
|
282
|
+
else
|
283
|
+
each do |result|
|
284
|
+
yield result.cached_page if result.cached_page
|
285
|
+
end
|
286
|
+
end
|
163
287
|
end
|
164
288
|
|
165
289
|
#
|
166
|
-
#
|
167
|
-
# within the Page.
|
290
|
+
# Iterates over each result's similar Query URLs within the page.
|
168
291
|
#
|
169
|
-
#
|
292
|
+
# @yield [similar_url]
|
293
|
+
# The given block will be passed the Similar Query URL of each
|
294
|
+
# result in the page.
|
295
|
+
#
|
296
|
+
# @yieldparam [URI::HTTP] similar_url
|
297
|
+
# The Similar Query URL of a result in the page.
|
170
298
|
#
|
171
|
-
|
172
|
-
|
299
|
+
# @return [Enumerator]
|
300
|
+
# If no block is given, an Enumerator object will be returned.
|
301
|
+
#
|
302
|
+
# @example
|
303
|
+
# each_similar_url { |similar_url| puts similar_url }
|
304
|
+
#
|
305
|
+
def each_similar_url
|
306
|
+
unless block_given?
|
307
|
+
enum_for(:each_similar_url)
|
308
|
+
else
|
309
|
+
each do |result|
|
310
|
+
yield result.similar_url if result.similar_url
|
311
|
+
end
|
312
|
+
end
|
173
313
|
end
|
174
314
|
|
175
315
|
#
|
176
|
-
#
|
177
|
-
# the given _block_.
|
316
|
+
# Returns the ranks of the results in the page.
|
178
317
|
#
|
179
|
-
#
|
318
|
+
# @return [Array<Integer>]
|
319
|
+
# The ranks of the results.
|
180
320
|
#
|
181
|
-
def
|
182
|
-
|
321
|
+
def ranks
|
322
|
+
each_rank.to_a
|
183
323
|
end
|
184
324
|
|
185
325
|
#
|
186
|
-
#
|
187
|
-
# the given _block_.
|
326
|
+
# Returns the titles of the results in the page.
|
188
327
|
#
|
189
|
-
#
|
328
|
+
# @return [Array<String>]
|
329
|
+
# The titles of the results.
|
190
330
|
#
|
191
|
-
def
|
192
|
-
|
331
|
+
def titles
|
332
|
+
each_title.to_a
|
193
333
|
end
|
194
334
|
|
195
335
|
#
|
196
|
-
#
|
197
|
-
# the given _block_.
|
336
|
+
# Returns the URLs of the results in the page.
|
198
337
|
#
|
199
|
-
#
|
338
|
+
# @return [Array<URI::HTTP>]
|
339
|
+
# The URLs of the results.
|
200
340
|
#
|
201
|
-
def
|
202
|
-
|
341
|
+
def urls
|
342
|
+
each_url.to_a
|
203
343
|
end
|
204
344
|
|
205
345
|
#
|
206
|
-
#
|
207
|
-
# to the given _block_.
|
346
|
+
# Returns the summaries of the results in the page.
|
208
347
|
#
|
209
|
-
#
|
348
|
+
# @return [Array<String>]
|
349
|
+
# The summaries of the results.
|
210
350
|
#
|
211
|
-
def
|
212
|
-
|
351
|
+
def summaries
|
352
|
+
each_summary.to_a
|
213
353
|
end
|
214
354
|
|
215
355
|
#
|
216
|
-
#
|
217
|
-
# each to the given _block_.
|
356
|
+
# Returns the Cached URLs of the results in the page.
|
218
357
|
#
|
219
|
-
#
|
358
|
+
# @return [Array<URI::HTTP>]
|
359
|
+
# The Cached URLs of the results.
|
220
360
|
#
|
221
|
-
def
|
222
|
-
|
361
|
+
def cached_urls
|
362
|
+
each_cached_url.to_a
|
223
363
|
end
|
224
364
|
|
225
365
|
#
|
226
|
-
#
|
227
|
-
# each to the given _block_.
|
366
|
+
# Returns the Cached Pages of the results in the page.
|
228
367
|
#
|
229
|
-
#
|
368
|
+
# @return [Array<Mechanize::Page>]
|
369
|
+
# The Cached Pages of the results.
|
230
370
|
#
|
231
|
-
def
|
232
|
-
|
371
|
+
def cached_pages
|
372
|
+
each_cached_page.to_a
|
233
373
|
end
|
234
374
|
|
235
375
|
#
|
236
|
-
#
|
237
|
-
# passing each to the given _block_.
|
376
|
+
# Returns the Similar Query URLs of the results in the page.
|
238
377
|
#
|
239
|
-
#
|
378
|
+
# @return [Array<URI::HTTP>]
|
379
|
+
# The Similar Query URLs of the results.
|
240
380
|
#
|
241
|
-
def
|
242
|
-
|
381
|
+
def similar_urls
|
382
|
+
each_similar_url.to_a
|
243
383
|
end
|
244
384
|
|
245
385
|
#
|
246
|
-
# Returns the ranks of the results that match the
|
386
|
+
# Returns the ranks of the results that match the given block.
|
387
|
+
#
|
388
|
+
# @yield [result]
|
389
|
+
# The given block will be used to filter the results in the page.
|
247
390
|
#
|
391
|
+
# @yieldparam [Result] result
|
392
|
+
# A result in the page.
|
393
|
+
#
|
394
|
+
# @return [Array<Integer>]
|
395
|
+
# The ranks of the results which match the given block.
|
396
|
+
#
|
397
|
+
# @example
|
248
398
|
# page.ranks_of { |result| result.title =~ /awesome/ }
|
249
399
|
#
|
250
400
|
def ranks_of(&block)
|
@@ -252,8 +402,18 @@ module GScraper
|
|
252
402
|
end
|
253
403
|
|
254
404
|
#
|
255
|
-
# Returns the titles of the results that match the
|
405
|
+
# Returns the titles of the results that match the given block.
|
406
|
+
#
|
407
|
+
# @yield [result]
|
408
|
+
# The given block will be used to filter the results in the page.
|
256
409
|
#
|
410
|
+
# @yieldparam [Result] result
|
411
|
+
# A result in the page.
|
412
|
+
#
|
413
|
+
# @return [Array<String>]
|
414
|
+
# The titles of the results which match the given block.
|
415
|
+
#
|
416
|
+
# @example
|
257
417
|
# page.titles_of { |result| result.url.include?('www') }
|
258
418
|
#
|
259
419
|
def titles_of(&block)
|
@@ -261,8 +421,18 @@ module GScraper
|
|
261
421
|
end
|
262
422
|
|
263
423
|
#
|
264
|
-
# Returns the urls of the results that match the
|
424
|
+
# Returns the URLs of the results that match the given block.
|
425
|
+
#
|
426
|
+
# @yield [result]
|
427
|
+
# The given block will be used to filter the results in the page.
|
265
428
|
#
|
429
|
+
# @yieldparam [Result] result
|
430
|
+
# A result in the page.
|
431
|
+
#
|
432
|
+
# @return [Array<URI::HTTP>]
|
433
|
+
# The URLs of the results which match the given block.
|
434
|
+
#
|
435
|
+
# @example
|
266
436
|
# page.urls_of { |result| result.summary =~ /awesome pants/ }
|
267
437
|
#
|
268
438
|
def urls_of(&block)
|
@@ -270,9 +440,18 @@ module GScraper
|
|
270
440
|
end
|
271
441
|
|
272
442
|
#
|
273
|
-
# Returns the summaries of the results that match the
|
274
|
-
#
|
443
|
+
# Returns the summaries of the results that match the given block.
|
444
|
+
#
|
445
|
+
# @yield [result]
|
446
|
+
# The given block will be used to filter the results in the page.
|
275
447
|
#
|
448
|
+
# @yieldparam [Result] result
|
449
|
+
# A result in the page.
|
450
|
+
#
|
451
|
+
# @return [Array<String>]
|
452
|
+
# The summaries of the results which match the given block.
|
453
|
+
#
|
454
|
+
# @example
|
276
455
|
# page.summaries_of { |result| result.title =~ /what if/ }
|
277
456
|
#
|
278
457
|
def summaries_of(&block)
|
@@ -280,9 +459,18 @@ module GScraper
|
|
280
459
|
end
|
281
460
|
|
282
461
|
#
|
283
|
-
# Returns the
|
284
|
-
#
|
462
|
+
# Returns the Cached URLs of the results that match the given block.
|
463
|
+
#
|
464
|
+
# @yield [result]
|
465
|
+
# The given block will be used to filter the results in the page.
|
285
466
|
#
|
467
|
+
# @yieldparam [Result] result
|
468
|
+
# A result in the page.
|
469
|
+
#
|
470
|
+
# @return [Array<URI::HTTP>]
|
471
|
+
# The Cached URLs of the results which match the given block.
|
472
|
+
#
|
473
|
+
# @example
|
286
474
|
# page.cached_urls_of { |result| result.title =~ /howdy/ }
|
287
475
|
#
|
288
476
|
def cached_urls_of(&block)
|
@@ -290,20 +478,38 @@ module GScraper
|
|
290
478
|
end
|
291
479
|
|
292
480
|
#
|
293
|
-
# Returns the cached pages of the results that match the
|
294
|
-
#
|
295
|
-
#
|
481
|
+
# Returns the cached pages of the results that match the given block.
|
482
|
+
#
|
483
|
+
# @yield [result]
|
484
|
+
# The given block will be used to filter the results in the page.
|
296
485
|
#
|
486
|
+
# @yieldparam [Result] result
|
487
|
+
# A result in the page.
|
488
|
+
#
|
489
|
+
# @return [Array<Mechanize::Page>]
|
490
|
+
# The Cached Pages of the results which match the given block.
|
491
|
+
#
|
492
|
+
# @example
|
297
493
|
# page.cached_pages_of { |result| result.title =~ /dude/ }
|
298
494
|
#
|
299
|
-
def cached_pages_of(
|
300
|
-
results_with(&block).cached_pages
|
495
|
+
def cached_pages_of(&block)
|
496
|
+
results_with(&block).cached_pages
|
301
497
|
end
|
302
498
|
|
303
499
|
#
|
304
|
-
# Returns the
|
305
|
-
#
|
500
|
+
# Returns the Similar Query URLs of the results that match the given
|
501
|
+
# block.
|
502
|
+
#
|
503
|
+
# @yield [result]
|
504
|
+
# The given block will be used to filter the results in the page.
|
505
|
+
#
|
506
|
+
# @yieldparam [Result] result
|
507
|
+
# A result in the page.
|
508
|
+
#
|
509
|
+
# @return [Array<URI::HTTP>]
|
510
|
+
# The Similar Query URLs of the results which match the given block.
|
306
511
|
#
|
512
|
+
# @example
|
307
513
|
# page.similar_urls_of { |result| result.title =~ /what if/ }
|
308
514
|
#
|
309
515
|
def similar_urls_of(&block)
|