gscraper 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +122 -0
- data/Gemfile +25 -0
- data/{README.txt → README.md} +25 -24
- data/Rakefile +32 -10
- data/gscraper.gemspec +112 -0
- data/lib/gscraper.rb +0 -2
- data/lib/gscraper/extensions.rb +0 -2
- data/lib/gscraper/extensions/uri.rb +0 -2
- data/lib/gscraper/extensions/uri/http.rb +0 -2
- data/lib/gscraper/extensions/uri/query_params.rb +18 -5
- data/lib/gscraper/gscraper.rb +61 -70
- data/lib/gscraper/has_pages.rb +76 -20
- data/lib/gscraper/licenses.rb +0 -2
- data/lib/gscraper/page.rb +45 -16
- data/lib/gscraper/search.rb +0 -2
- data/lib/gscraper/search/ajax_query.rb +75 -22
- data/lib/gscraper/search/page.rb +328 -122
- data/lib/gscraper/search/query.rb +100 -7
- data/lib/gscraper/search/result.rb +27 -6
- data/lib/gscraper/search/search.rb +59 -9
- data/lib/gscraper/search/web_query.rb +120 -37
- data/lib/gscraper/sponsored_ad.rb +19 -6
- data/lib/gscraper/sponsored_links.rb +260 -92
- data/lib/gscraper/version.rb +2 -3
- data/spec/extensions/uri/query_params_spec.rb +8 -0
- data/spec/gscraper_spec.rb +9 -4
- data/spec/has_pages_examples.rb +0 -2
- data/spec/has_sponsored_links_examples.rb +2 -1
- data/spec/helpers/query.rb +3 -1
- data/spec/helpers/uri.rb +6 -4
- data/spec/page_has_results_examples.rb +0 -2
- data/spec/search/ajax_query_spec.rb +6 -11
- data/spec/search/page_has_results_examples.rb +0 -2
- data/spec/search/web_query_spec.rb +6 -11
- data/spec/spec_helper.rb +10 -4
- metadata +147 -54
- data/History.txt +0 -101
- data/Manifest.txt +0 -38
- data/tasks/spec.rb +0 -9
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,7 +16,6 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/extensions/uri'
|
@@ -32,7 +30,13 @@ module GScraper
|
|
32
30
|
attr_reader :url
|
33
31
|
|
34
32
|
#
|
35
|
-
# Creates a new SponsoredAd
|
33
|
+
# Creates a new SponsoredAd.
|
34
|
+
#
|
35
|
+
# @param [String] title
|
36
|
+
# The title of the ad.
|
37
|
+
#
|
38
|
+
# @param [URI::HTTP] url
|
39
|
+
# The URL of the ad.
|
36
40
|
#
|
37
41
|
def initialize(title,url)
|
38
42
|
@title = title
|
@@ -40,21 +44,30 @@ module GScraper
|
|
40
44
|
end
|
41
45
|
|
42
46
|
#
|
43
|
-
#
|
47
|
+
# The direct link of the ad.
|
48
|
+
#
|
49
|
+
# @return [String]
|
50
|
+
# The direct link.
|
44
51
|
#
|
45
52
|
def direct_link
|
46
53
|
@url.query_params['adurl'] || @url.query_params['q']
|
47
54
|
end
|
48
55
|
|
49
56
|
#
|
50
|
-
#
|
57
|
+
# The direct URI of the ad.
|
58
|
+
#
|
59
|
+
# @return [URI::HTTP]
|
60
|
+
# The direct URI.
|
51
61
|
#
|
52
62
|
def direct_url
|
53
63
|
URI(URI.escape(direct_link))
|
54
64
|
end
|
55
65
|
|
56
66
|
#
|
57
|
-
#
|
67
|
+
# The title of the ad.
|
68
|
+
#
|
69
|
+
# @return [String]
|
70
|
+
# The title.
|
58
71
|
#
|
59
72
|
def to_s
|
60
73
|
@title.to_s
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#
|
2
|
-
#--
|
3
2
|
# GScraper - A web-scraping interface to various Google Services.
|
4
3
|
#
|
5
4
|
# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
|
@@ -17,181 +16,330 @@
|
|
17
16
|
# You should have received a copy of the GNU General Public License
|
18
17
|
# along with this program; if not, write to the Free Software
|
19
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
-
#++
|
21
19
|
#
|
22
20
|
|
23
21
|
require 'gscraper/sponsored_ad'
|
24
22
|
|
23
|
+
require 'enumerator'
|
24
|
+
|
25
25
|
module GScraper
|
26
26
|
class SponsoredLinks < Array
|
27
|
+
|
28
|
+
#
|
29
|
+
# Creates a new SponsoredLinks object.
|
27
30
|
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
# object.
|
31
|
+
# @param [Array] ads
|
32
|
+
# The ads to populate the sponsored links object with.
|
31
33
|
#
|
32
|
-
|
34
|
+
# @yield [links]
|
35
|
+
# If a block is given, it will be passed the new sponsored links
|
36
|
+
# object.
|
37
|
+
#
|
38
|
+
# @yieldparam [SponsoredLinks] links
|
39
|
+
# The new sponsored links object.
|
40
|
+
#
|
41
|
+
def initialize(ads=[])
|
33
42
|
super(ads)
|
34
43
|
|
35
|
-
|
44
|
+
yield self if block_given?
|
36
45
|
end
|
37
46
|
|
38
47
|
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
48
|
+
# Maps the sponsored ads.
|
49
|
+
#
|
50
|
+
# @yield [ad]
|
51
|
+
# The given block will be passed each ad.
|
42
52
|
#
|
43
|
-
#
|
53
|
+
# @yieldparam [SponsoredAd] ad
|
54
|
+
# The sponsored ad.
|
44
55
|
#
|
45
|
-
#
|
56
|
+
# @return [Array, Enumerator]
|
57
|
+
# The mapped result. If no block was given, an Enumerator object will
|
58
|
+
# be returned.
|
46
59
|
#
|
47
|
-
|
48
|
-
|
60
|
+
# @example
|
61
|
+
# sponsored.map
|
62
|
+
# # => Enumerator
|
63
|
+
#
|
64
|
+
# @example
|
65
|
+
# sponsored.map { |ad| ad.url }
|
66
|
+
# # => [...]
|
67
|
+
#
|
68
|
+
def map
|
69
|
+
return enum_for(:map) unless block_given?
|
49
70
|
|
50
71
|
mapped = []
|
51
72
|
|
52
|
-
each { |ad| mapped <<
|
73
|
+
each { |ad| mapped << yield(ad) }
|
53
74
|
return mapped
|
54
75
|
end
|
55
76
|
|
56
77
|
#
|
57
|
-
# Selects the ads within the
|
78
|
+
# Selects the ads within the sponsored links.
|
79
|
+
#
|
80
|
+
# @yield [ad]
|
81
|
+
# The given block will determine which ads to select.
|
82
|
+
#
|
83
|
+
# @yieldparam [SponsoredAd] ad
|
84
|
+
# A sponsored ad.
|
85
|
+
#
|
86
|
+
# @return [Array, Enumerator]
|
87
|
+
# The selected ads. If no block is given, an Enumerator object will
|
88
|
+
# be returned.
|
58
89
|
#
|
90
|
+
# @example
|
59
91
|
# sponsored.select { |ad| ad.title =~ /consume/i }
|
60
92
|
#
|
61
93
|
def select(&block)
|
62
|
-
|
94
|
+
unless block
|
95
|
+
enum_for(:select)
|
96
|
+
else
|
97
|
+
SponsoredLinks.new(super(&block))
|
98
|
+
end
|
63
99
|
end
|
64
100
|
|
101
|
+
alias ads_with select
|
102
|
+
|
65
103
|
#
|
66
|
-
# Selects the ads
|
104
|
+
# Selects the ads with the matching title.
|
67
105
|
#
|
68
|
-
#
|
106
|
+
# @param [String, Regexp] title
|
107
|
+
# The title to search for.
|
69
108
|
#
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
109
|
+
# @yield [ad]
|
110
|
+
# Each matching ad will be passed to the given block.
|
111
|
+
#
|
112
|
+
# @yieldparam [SponsoredAd] ad
|
113
|
+
# A sponsored ad with the matching title.
|
74
114
|
#
|
75
|
-
#
|
76
|
-
#
|
77
|
-
#
|
115
|
+
# @return [Array, Enumerator]
|
116
|
+
# The sponsored ads with the matching title. If no block is given,
|
117
|
+
# an Enumerator object will be returned.
|
78
118
|
#
|
79
|
-
#
|
119
|
+
# @example
|
120
|
+
# sponsored.ads_with_title('be attractive')
|
121
|
+
# # => Enumerator
|
80
122
|
#
|
123
|
+
# @example
|
81
124
|
# sponsored.ads_with_title(/buy me/) do |ad|
|
82
125
|
# puts ad.url
|
83
126
|
# end
|
84
127
|
#
|
85
|
-
def ads_with_title(title
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
128
|
+
def ads_with_title(title)
|
129
|
+
return enum_for(:ads_with_title,title) unless block_given?
|
130
|
+
|
131
|
+
comparitor = if title.kind_of?(Regexp)
|
132
|
+
lambda { |ad| ad.title =~ title }
|
133
|
+
else
|
134
|
+
lambda { |ad| ad.title == title }
|
135
|
+
end
|
91
136
|
|
92
|
-
|
93
|
-
|
137
|
+
return ads_with do |ad|
|
138
|
+
if comparitor.call(ad)
|
139
|
+
yield ad
|
140
|
+
|
141
|
+
true
|
142
|
+
end
|
143
|
+
end
|
94
144
|
end
|
95
145
|
|
96
146
|
#
|
97
|
-
# Selects the ads with the matching
|
98
|
-
# either a String or a Regexp. If _block_ is given, each matching
|
99
|
-
# ad will be passed to the _block_.
|
147
|
+
# Selects the ads with the matching URL.
|
100
148
|
#
|
101
|
-
#
|
149
|
+
# @param [String, Regexp] url
|
150
|
+
# The URL to search for.
|
102
151
|
#
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
152
|
+
# @yield [ad]
|
153
|
+
# Each matching ad will be passed to the given block.
|
154
|
+
#
|
155
|
+
# @yieldparam [SponsoredAd] ad
|
156
|
+
# A sponsored ad with the matching URL.
|
157
|
+
#
|
158
|
+
# @return [Array, Enumerator]
|
159
|
+
# The sponsored ads with the matching URL. If no block is given,
|
160
|
+
# an Enumerator object will be returned.
|
161
|
+
#
|
162
|
+
# @example
|
163
|
+
# sponsored.ads_with_url(/\.com/)
|
164
|
+
# # => Enumerator
|
165
|
+
#
|
166
|
+
def ads_with_url(url)
|
167
|
+
return enum_for(:ads_with_url,url) unless block_given?
|
109
168
|
|
110
|
-
|
111
|
-
|
169
|
+
comparitor = if url.kind_of?(Regexp)
|
170
|
+
lambda { |ad| ad.url =~ url }
|
171
|
+
else
|
172
|
+
lambda { |ad| ad.url == url }
|
173
|
+
end
|
174
|
+
|
175
|
+
return ads_with do |ad|
|
176
|
+
if comparitor.call(ad)
|
177
|
+
yield ad
|
178
|
+
|
179
|
+
true
|
180
|
+
end
|
181
|
+
end
|
112
182
|
end
|
113
183
|
|
114
184
|
#
|
115
|
-
# Selects the ads with the matching
|
116
|
-
# be either a String or a Regexp. If _block_ is given, each matching
|
117
|
-
# ad will be passed to the _block_.
|
185
|
+
# Selects the ads with the matching direct URL.
|
118
186
|
#
|
119
|
-
#
|
187
|
+
# @param [String, Regexp] direct_url
|
188
|
+
# The direct URL to search for.
|
120
189
|
#
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
190
|
+
# @yield [ad]
|
191
|
+
# Each matching ad will be passed to the given block.
|
192
|
+
#
|
193
|
+
# @yieldparam [SponsoredAd] ad
|
194
|
+
# A sponsored ad with the matching direct URL.
|
195
|
+
#
|
196
|
+
# @return [Array, Enumerator]
|
197
|
+
# The sponsored ads with the matching direct URL. If no block is given,
|
198
|
+
# an Enumerator object will be returned.
|
199
|
+
#
|
200
|
+
# @example
|
201
|
+
# sponsored.ads_with_direct_url(/\.com/)
|
202
|
+
# # => Enumerator
|
203
|
+
#
|
204
|
+
def ads_with_direct_url(direct_url)
|
205
|
+
return enum_for(:ads_with_direct_url,direct_url) unless block_given?
|
206
|
+
|
207
|
+
comparitor = if direct_url.kind_of?(Regexp)
|
208
|
+
lambda { |ad| ad.direct_url =~ direct_url }
|
209
|
+
else
|
210
|
+
lambda { |ad| ad.direct_url == direct_url }
|
211
|
+
end
|
127
212
|
|
128
|
-
|
129
|
-
|
213
|
+
return ads_with do |ad|
|
214
|
+
if comparitor.call(ad)
|
215
|
+
yield ad
|
216
|
+
|
217
|
+
true
|
218
|
+
end
|
219
|
+
end
|
130
220
|
end
|
131
221
|
|
132
222
|
#
|
133
|
-
#
|
134
|
-
# SponsoredLinks.
|
223
|
+
# Iterates over the titles of each ad.
|
135
224
|
#
|
136
|
-
#
|
225
|
+
# @yield [title]
|
226
|
+
# The given block will be passed each title.
|
137
227
|
#
|
138
|
-
|
139
|
-
|
228
|
+
# @yieldparam [String] title
|
229
|
+
# A title of an ad.
|
230
|
+
#
|
231
|
+
# @return [Enumerator]
|
232
|
+
# If no block is given, an Enumerator object will be returned.
|
233
|
+
#
|
234
|
+
# @example
|
235
|
+
# each_title { |title| puts title }
|
236
|
+
#
|
237
|
+
def each_title
|
238
|
+
unless block_given?
|
239
|
+
enum_for(:each_title)
|
240
|
+
else
|
241
|
+
each { |ad| yield ad.title }
|
242
|
+
end
|
140
243
|
end
|
141
244
|
|
142
245
|
#
|
143
|
-
#
|
144
|
-
# SponsoredLinks.
|
246
|
+
# Iterates over the URLs of each ad.
|
145
247
|
#
|
146
|
-
#
|
248
|
+
# @yield [url]
|
249
|
+
# The given block will be passed each URL.
|
147
250
|
#
|
148
|
-
|
149
|
-
|
251
|
+
# @yieldparam [URI::HTTP] url
|
252
|
+
# A URL of an ad.
|
253
|
+
#
|
254
|
+
# @return [Enumerator]
|
255
|
+
# If no block is given, an Enumerator object will be returned.
|
256
|
+
#
|
257
|
+
# @example
|
258
|
+
# each_url { |url| puts url }
|
259
|
+
#
|
260
|
+
def each_url
|
261
|
+
unless block_given?
|
262
|
+
enum_for(:each_url)
|
263
|
+
else
|
264
|
+
each { |ad| yield ad.url }
|
265
|
+
end
|
150
266
|
end
|
151
267
|
|
152
268
|
#
|
153
|
-
#
|
154
|
-
# SponsoredLinks.
|
269
|
+
# Iterates over the direct URLs of each ad.
|
155
270
|
#
|
156
|
-
#
|
271
|
+
# @yield [direct_url]
|
272
|
+
# The given block will be passed each direct URL.
|
157
273
|
#
|
158
|
-
|
159
|
-
|
274
|
+
# @yieldparam [URI::HTTP] direct_url
|
275
|
+
# A direct URL of an ad.
|
276
|
+
#
|
277
|
+
# @return [Enumerator]
|
278
|
+
# If no block is given, an Enumerator object will be returned.
|
279
|
+
#
|
280
|
+
# @example
|
281
|
+
# each_direct_url { |url| puts url }
|
282
|
+
#
|
283
|
+
def each_direct_url
|
284
|
+
unless block_given?
|
285
|
+
enum_for(:each_direct_url)
|
286
|
+
else
|
287
|
+
each { |ad| yield ad.direct_url }
|
288
|
+
end
|
160
289
|
end
|
161
290
|
|
162
291
|
#
|
163
|
-
#
|
164
|
-
# the given _block_.
|
292
|
+
# The titles for the ads.
|
165
293
|
#
|
166
|
-
#
|
294
|
+
# @return [Array<String>]
|
295
|
+
# The titles for the ads.
|
296
|
+
#
|
297
|
+
# @example
|
298
|
+
# sponsored.titles # => [...]
|
167
299
|
#
|
168
|
-
def
|
169
|
-
|
300
|
+
def titles
|
301
|
+
each_title.to_a
|
170
302
|
end
|
171
303
|
|
172
304
|
#
|
173
|
-
#
|
174
|
-
# the given _block_.
|
305
|
+
# The URLs for the ads.
|
175
306
|
#
|
176
|
-
#
|
307
|
+
# @return [Array<URI::HTTP>]
|
308
|
+
# The URLs for the ads.
|
177
309
|
#
|
178
|
-
|
179
|
-
|
310
|
+
# @example
|
311
|
+
# sponsored.urls # => [...]
|
312
|
+
#
|
313
|
+
def urls
|
314
|
+
each_url.to_a
|
180
315
|
end
|
181
316
|
|
182
317
|
#
|
183
|
-
#
|
184
|
-
# each to the given _block_.
|
318
|
+
# The direct URLs for the ads.
|
185
319
|
#
|
186
|
-
#
|
320
|
+
# @return [Array<URI::HTTP>]
|
321
|
+
# The direct URLs for the ads.
|
322
|
+
#
|
323
|
+
# @example
|
324
|
+
# sponsored.direct_urls # => [...]
|
187
325
|
#
|
188
|
-
def
|
189
|
-
|
326
|
+
def direct_urls
|
327
|
+
each_direct_url.to_a
|
190
328
|
end
|
191
329
|
|
192
330
|
#
|
193
|
-
#
|
331
|
+
# The titles of the selected ads.
|
332
|
+
#
|
333
|
+
# @yield [ad]
|
334
|
+
# The given block will be passed each ad to be selected.
|
335
|
+
#
|
336
|
+
# @yieldparam [SponsoredAd] ad
|
337
|
+
# An ad to be selected.
|
194
338
|
#
|
339
|
+
# @return [Array<String>]
|
340
|
+
# The titles of the selected ads.
|
341
|
+
#
|
342
|
+
# @example
|
195
343
|
# sponsored.titles_of { |ad| ad.url.include?('www') }
|
196
344
|
#
|
197
345
|
def titles_of(&block)
|
@@ -199,8 +347,18 @@ module GScraper
|
|
199
347
|
end
|
200
348
|
|
201
349
|
#
|
202
|
-
#
|
350
|
+
# The URLs of the selected ads.
|
351
|
+
#
|
352
|
+
# @yield [ad]
|
353
|
+
# The given block will be passed each ad to be selected.
|
203
354
|
#
|
355
|
+
# @yieldparam [SponsoredAd] ad
|
356
|
+
# An ad to be selected.
|
357
|
+
#
|
358
|
+
# @return [Array<URI::HTTP>]
|
359
|
+
# The URLs of the selected ads.
|
360
|
+
#
|
361
|
+
# @example
|
204
362
|
# sponsored.urls_of { |ad| ad.title =~ /buy these pants/ }
|
205
363
|
#
|
206
364
|
def urls_of(&block)
|
@@ -208,8 +366,18 @@ module GScraper
|
|
208
366
|
end
|
209
367
|
|
210
368
|
#
|
211
|
-
#
|
369
|
+
# The direct URLs of the selected ads.
|
370
|
+
#
|
371
|
+
# @yield [ad]
|
372
|
+
# The given block will be passed each ad to be selected.
|
373
|
+
#
|
374
|
+
# @yieldparam [SponsoredAd] ad
|
375
|
+
# An ad to be selected.
|
376
|
+
#
|
377
|
+
# @return [Array<URI::HTTP>]
|
378
|
+
# The direct URLs of the selected ads.
|
212
379
|
#
|
380
|
+
# @example
|
213
381
|
# sponsored.direct_urls_of { |ad| ad.title =~ /buy these pants/ }
|
214
382
|
#
|
215
383
|
def direct_urls_of(&block)
|