gscraper 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. data/.gitignore +8 -0
  2. data/.specopts +1 -0
  3. data/.yardopts +1 -0
  4. data/ChangeLog.md +122 -0
  5. data/Gemfile +25 -0
  6. data/{README.txt → README.md} +25 -24
  7. data/Rakefile +32 -10
  8. data/gscraper.gemspec +112 -0
  9. data/lib/gscraper.rb +0 -2
  10. data/lib/gscraper/extensions.rb +0 -2
  11. data/lib/gscraper/extensions/uri.rb +0 -2
  12. data/lib/gscraper/extensions/uri/http.rb +0 -2
  13. data/lib/gscraper/extensions/uri/query_params.rb +18 -5
  14. data/lib/gscraper/gscraper.rb +61 -70
  15. data/lib/gscraper/has_pages.rb +76 -20
  16. data/lib/gscraper/licenses.rb +0 -2
  17. data/lib/gscraper/page.rb +45 -16
  18. data/lib/gscraper/search.rb +0 -2
  19. data/lib/gscraper/search/ajax_query.rb +75 -22
  20. data/lib/gscraper/search/page.rb +328 -122
  21. data/lib/gscraper/search/query.rb +100 -7
  22. data/lib/gscraper/search/result.rb +27 -6
  23. data/lib/gscraper/search/search.rb +59 -9
  24. data/lib/gscraper/search/web_query.rb +120 -37
  25. data/lib/gscraper/sponsored_ad.rb +19 -6
  26. data/lib/gscraper/sponsored_links.rb +260 -92
  27. data/lib/gscraper/version.rb +2 -3
  28. data/spec/extensions/uri/query_params_spec.rb +8 -0
  29. data/spec/gscraper_spec.rb +9 -4
  30. data/spec/has_pages_examples.rb +0 -2
  31. data/spec/has_sponsored_links_examples.rb +2 -1
  32. data/spec/helpers/query.rb +3 -1
  33. data/spec/helpers/uri.rb +6 -4
  34. data/spec/page_has_results_examples.rb +0 -2
  35. data/spec/search/ajax_query_spec.rb +6 -11
  36. data/spec/search/page_has_results_examples.rb +0 -2
  37. data/spec/search/web_query_spec.rb +6 -11
  38. data/spec/spec_helper.rb +10 -4
  39. metadata +147 -54
  40. data/History.txt +0 -101
  41. data/Manifest.txt +0 -38
  42. data/tasks/spec.rb +0 -9
@@ -1,5 +1,4 @@
1
1
  #
2
- #--
3
2
  # GScraper - A web-scraping interface to various Google Services.
4
3
  #
5
4
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
17
16
  # You should have received a copy of the GNU General Public License
18
17
  # along with this program; if not, write to the Free Software
19
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
- #++
21
19
  #
22
20
 
23
21
  require 'gscraper/extensions/uri'
@@ -32,7 +30,13 @@ module GScraper
32
30
  attr_reader :url
33
31
 
34
32
  #
35
- # Creates a new SponsoredAd with the specified _title_ and _url_.
33
+ # Creates a new SponsoredAd.
34
+ #
35
+ # @param [String] title
36
+ # The title of the ad.
37
+ #
38
+ # @param [URI::HTTP] url
39
+ # The URL of the ad.
36
40
  #
37
41
  def initialize(title,url)
38
42
  @title = title
@@ -40,21 +44,30 @@ module GScraper
40
44
  end
41
45
 
42
46
  #
43
- # Returns the direct link of the ad.
47
+ # The direct link of the ad.
48
+ #
49
+ # @return [String]
50
+ # The direct link.
44
51
  #
45
52
  def direct_link
46
53
  @url.query_params['adurl'] || @url.query_params['q']
47
54
  end
48
55
 
49
56
  #
50
- # Returns the direct URL of the ad.
57
+ # The direct URI of the ad.
58
+ #
59
+ # @return [URI::HTTP]
60
+ # The direct URI.
51
61
  #
52
62
  def direct_url
53
63
  URI(URI.escape(direct_link))
54
64
  end
55
65
 
56
66
  #
57
- # Returns the title of the ad.
67
+ # The title of the ad.
68
+ #
69
+ # @return [String]
70
+ # The title.
58
71
  #
59
72
  def to_s
60
73
  @title.to_s
@@ -1,5 +1,4 @@
1
1
  #
2
- #--
3
2
  # GScraper - A web-scraping interface to various Google Services.
4
3
  #
5
4
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,181 +16,330 @@
17
16
  # You should have received a copy of the GNU General Public License
18
17
  # along with this program; if not, write to the Free Software
19
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
- #++
21
19
  #
22
20
 
23
21
  require 'gscraper/sponsored_ad'
24
22
 
23
+ require 'enumerator'
24
+
25
25
  module GScraper
26
26
  class SponsoredLinks < Array
27
+
28
+ #
29
+ # Creates a new SponsoredLinks object.
27
30
  #
28
- # Creates a new SponsoredLinks object with the given _ads_. If a
29
- # _block_ is given, it will be passed the newly created SponsoredLinks
30
- # object.
31
+ # @param [Array] ads
32
+ # The ads to populate the sponsored links object with.
31
33
  #
32
- def initialize(ads=[],&block)
34
+ # @yield [links]
35
+ # If a block is given, it will be passed the new sponsored links
36
+ # object.
37
+ #
38
+ # @yieldparam [SponsoredLinks] links
39
+ # The new sponsored links object.
40
+ #
41
+ def initialize(ads=[])
33
42
  super(ads)
34
43
 
35
- block.call(self) if block
44
+ yield self if block_given?
36
45
  end
37
46
 
38
47
  #
39
- # Returns a mapped Array of the ads within the SponsoredLinks
40
- # using the given _block_. If the _block_ is not given, the
41
- # SponsoredLinks will be returned.
48
+ # Maps the sponsored ads.
49
+ #
50
+ # @yield [ad]
51
+ # The given block will be passed each ad.
42
52
  #
43
- # sponsored.map # => SponsoredLinks
53
+ # @yieldparam [SponsoredAd] ad
54
+ # The sponsored ad.
44
55
  #
45
- # sponsored.map { |ad| ad.url } # => [...]
56
+ # @return [Array, Enumerator]
57
+ # The mapped result. If no block was given, an Enumerator object will
58
+ # be returned.
46
59
  #
47
- def map(&block)
48
- return self unless block
60
+ # @example
61
+ # sponsored.map
62
+ # # => Enumerator
63
+ #
64
+ # @example
65
+ # sponsored.map { |ad| ad.url }
66
+ # # => [...]
67
+ #
68
+ def map
69
+ return enum_for(:map) unless block_given?
49
70
 
50
71
  mapped = []
51
72
 
52
- each { |ad| mapped << block.call(ad) }
73
+ each { |ad| mapped << yield(ad) }
53
74
  return mapped
54
75
  end
55
76
 
56
77
  #
57
- # Selects the ads within the SponsoredLinks which match the given _block_.
78
+ # Selects the ads within the sponsored links.
79
+ #
80
+ # @yield [ad]
81
+ # The given block will determine which ads to select.
82
+ #
83
+ # @yieldparam [SponsoredAd] ad
84
+ # A sponsored ad.
85
+ #
86
+ # @return [Array, Enumerator]
87
+ # The selected ads. If no block is given, an Enumerator object will
88
+ # be returned.
58
89
  #
90
+ # @example
59
91
  # sponsored.select { |ad| ad.title =~ /consume/i }
60
92
  #
61
93
  def select(&block)
62
- SponsoredLinks.new(super(&block))
94
+ unless block
95
+ enum_for(:select)
96
+ else
97
+ SponsoredLinks.new(super(&block))
98
+ end
63
99
  end
64
100
 
101
+ alias ads_with select
102
+
65
103
  #
66
- # Selects the ads using the specified _block_.
104
+ # Selects the ads with the matching title.
67
105
  #
68
- # sponsored.ads_with { |ad| ad.title =~ /status symbol/ }
106
+ # @param [String, Regexp] title
107
+ # The title to search for.
69
108
  #
70
- def ads_with(&block)
71
- select(&block)
72
- end
73
-
109
+ # @yield [ad]
110
+ # Each matching ad will be passed to the given block.
111
+ #
112
+ # @yieldparam [SponsoredAd] ad
113
+ # A sponsored ad with the matching title.
74
114
  #
75
- # Selects the ads with the matching _title_. The _title_ may be
76
- # either a String or a Regexp. If _block_ is given, each matching
77
- # ad will be passed to the _block_.
115
+ # @return [Array, Enumerator]
116
+ # The sponsored ads with the matching title. If no block is given,
117
+ # an Enumerator object will be returned.
78
118
  #
79
- # sponsored.ads_with_title('be attractive') #=> SponsoredLinks
119
+ # @example
120
+ # sponsored.ads_with_title('be attractive')
121
+ # # => Enumerator
80
122
  #
123
+ # @example
81
124
  # sponsored.ads_with_title(/buy me/) do |ad|
82
125
  # puts ad.url
83
126
  # end
84
127
  #
85
- def ads_with_title(title,&block)
86
- if title.kind_of?(Regexp)
87
- ads = ads_with { |ad| ad.title =~ title }
88
- else
89
- ads = ads_with { |ad| ad.title == title }
90
- end
128
+ def ads_with_title(title)
129
+ return enum_for(:ads_with_title,title) unless block_given?
130
+
131
+ comparitor = if title.kind_of?(Regexp)
132
+ lambda { |ad| ad.title =~ title }
133
+ else
134
+ lambda { |ad| ad.title == title }
135
+ end
91
136
 
92
- ads.each(&block) if block
93
- return ads
137
+ return ads_with do |ad|
138
+ if comparitor.call(ad)
139
+ yield ad
140
+
141
+ true
142
+ end
143
+ end
94
144
  end
95
145
 
96
146
  #
97
- # Selects the ads with the matching _url_. The _url_ may be
98
- # either a String or a Regexp. If _block_ is given, each matching
99
- # ad will be passed to the _block_.
147
+ # Selects the ads with the matching URL.
100
148
  #
101
- # sponsored.ads_with_url(/\.com/) # => SponsoredLinks
149
+ # @param [String, Regexp] url
150
+ # The URL to search for.
102
151
  #
103
- def ads_with_url(url,&block)
104
- if url.kind_of?(Regexp)
105
- ads = ads_with { |ad| ad.url =~ url }
106
- else
107
- ads = ads_with { |ad| ad.url == url }
108
- end
152
+ # @yield [ad]
153
+ # Each matching ad will be passed to the given block.
154
+ #
155
+ # @yieldparam [SponsoredAd] ad
156
+ # A sponsored ad with the matching URL.
157
+ #
158
+ # @return [Array, Enumerator]
159
+ # The sponsored ads with the matching URL. If no block is given,
160
+ # an Enumerator object will be returned.
161
+ #
162
+ # @example
163
+ # sponsored.ads_with_url(/\.com/)
164
+ # # => Enumerator
165
+ #
166
+ def ads_with_url(url)
167
+ return enum_for(:ads_with_url,url) unless block_given?
109
168
 
110
- ads.each(&block) if block
111
- return ads
169
+ comparitor = if url.kind_of?(Regexp)
170
+ lambda { |ad| ad.url =~ url }
171
+ else
172
+ lambda { |ad| ad.url == url }
173
+ end
174
+
175
+ return ads_with do |ad|
176
+ if comparitor.call(ad)
177
+ yield ad
178
+
179
+ true
180
+ end
181
+ end
112
182
  end
113
183
 
114
184
  #
115
- # Selects the ads with the matching _direct_url_. The _direct_url_ may
116
- # be either a String or a Regexp. If _block_ is given, each matching
117
- # ad will be passed to the _block_.
185
+ # Selects the ads with the matching direct URL.
118
186
  #
119
- # sponsored.ads_with_direct_url(/\.com/) # => SponsoredLinks
187
+ # @param [String, Regexp] direct_url
188
+ # The direct URL to search for.
120
189
  #
121
- def ads_with_direct_url(direct_url,&block)
122
- if direct_url.kind_of?(Regexp)
123
- ads = ads_with { |ad| ad.direct_url =~ direct_url }
124
- else
125
- ads = ads_with { |ad| ad.direct_url == direct_url }
126
- end
190
+ # @yield [ad]
191
+ # Each matching ad will be passed to the given block.
192
+ #
193
+ # @yieldparam [SponsoredAd] ad
194
+ # A sponsored ad with the matching direct URL.
195
+ #
196
+ # @return [Array, Enumerator]
197
+ # The sponsored ads with the matching URL. If no block is given,
198
+ # an Enumerator object will be returned.
199
+ #
200
+ # @example
201
+ # sponsored.ads_with_direct_url(/\.com/)
202
+ # # => Enumerator
203
+ #
204
+ def ads_with_direct_url(direct_url)
205
+ return enum_for(:ads_with_direct_url,direct_url) unless block_given?
206
+
207
+ comparitor = if direct_url.kind_of?(Regexp)
208
+ lambda { |ad| ad.direct_url =~ direct_url }
209
+ else
210
+ lambda { |ad| ad.direct_url == direct_url }
211
+ end
127
212
 
128
- ads.each(&block) if block
129
- return ads
213
+ return ads_with do |ad|
214
+ if comparitor.call(ad)
215
+ yield ad
216
+
217
+ true
218
+ end
219
+ end
130
220
  end
131
221
 
132
222
  #
133
- # Returns an Array containing the titles of the ads within the
134
- # SponsoredLinks.
223
+ # Iterates over the titles of each ad.
135
224
  #
136
- # sponsored.titles # => [...]
225
+ # @yield [title]
226
+ # The given block will be passed each title.
137
227
  #
138
- def titles
139
- map { |ad| ad.title }
228
+ # @yieldparam [String] title
229
+ # A title of an ad.
230
+ #
231
+ # @return [Enumerator]
232
+ # If no block is given, an Enumerator object will be returned.
233
+ #
234
+ # @example
235
+ # each_title { |title| puts title }
236
+ #
237
+ def each_title
238
+ unless block_given?
239
+ enum_for(:each_title)
240
+ else
241
+ each { |ad| yield ad.title }
242
+ end
140
243
  end
141
244
 
142
245
  #
143
- # Returns an Array containing the URLs of the ads within the
144
- # SponsoredLinks.
246
+ # Iterates over the URLs of each ad.
145
247
  #
146
- # sponsored.urls # => [...]
248
+ # @yield [url]
249
+ # The given block will be passed each URL.
147
250
  #
148
- def urls
149
- map { |ad| ad.url }
251
+ # @yieldparam [URI::HTTP] url
252
+ # A URL of an ad.
253
+ #
254
+ # @return [Enumerator]
255
+ # If no block is given, an Enumerator object will be returned.
256
+ #
257
+ # @example
258
+ # each_url { |url| puts url }
259
+ #
260
+ def each_url
261
+ unless block_given?
262
+ enum_for(:each_url)
263
+ else
264
+ each { |ad| yield ad.url }
265
+ end
150
266
  end
151
267
 
152
268
  #
153
- # Returns an Array containing the direct URLs of the ads within the
154
- # SponsoredLinks.
269
+ # Iterates over the direct URLs of each ad.
155
270
  #
156
- # sponsored.direct_urls # => [...]
271
+ # @yield [direct_url]
272
+ # The given block will be passed each direct URL.
157
273
  #
158
- def direct_urls
159
- map { |ad| ad.direct_url }
274
+ # @yieldparam [URI::HTTP] direct_url
275
+ # A direct URL of an ad.
276
+ #
277
+ # @return [Enumerator]
278
+ # If no block is given, an Enumerator object will be returned.
279
+ #
280
+ # @example
281
+ # each_direct_url { |url| puts url }
282
+ #
283
+ def each_direct_url
284
+ unless block_given?
285
+ enum_for(:each_direct_url)
286
+ else
287
+ each { |ad| yield ad.direct_url }
288
+ end
160
289
  end
161
290
 
162
291
  #
163
- # Iterates over each ad's title within the SponsoredLinks, passing each to
164
- # the given _block_.
292
+ # The titles for the ads.
165
293
  #
166
- # each_title { |title| puts title }
294
+ # @return [Array<String>]
295
+ # The titles for the ads.
296
+ #
297
+ # @example
298
+ # sponsored.titles # => [...]
167
299
  #
168
- def each_title(&block)
169
- titles.each(&block)
300
+ def titles
301
+ each_title.to_a
170
302
  end
171
303
 
172
304
  #
173
- # Iterates over each ad's URL within the SponsoredLinks, passing each to
174
- # the given _block_.
305
+ # The URLs for the ads.
175
306
  #
176
- # each_url { |url| puts url }
307
+ # @return [Array<URI::HTTP>]
308
+ # The URLs for the ads.
177
309
  #
178
- def each_url(&block)
179
- urls.each(&block)
310
+ # @example
311
+ # sponsored.urls # => [...]
312
+ #
313
+ def urls
314
+ each_url.to_a
180
315
  end
181
316
 
182
317
  #
183
- # Iterates over each ad's direct URL within the SponsoredLinks, passing
184
- # each to the given _block_.
318
+ # The direct URLs for the ads.
185
319
  #
186
- # each_direct_url { |url| puts url }
320
+ # @return [Array<URI::HTTP>]
321
+ # The direct URLs for the ads.
322
+ #
323
+ # @example
324
+ # sponsored.direct_urls # => [...]
187
325
  #
188
- def each_direct_url(&block)
189
- direct_urls.each(&block)
326
+ def direct_urls
327
+ each_direct_url.to_a
190
328
  end
191
329
 
192
330
  #
193
- # Returns the titles of the ads that match the specified _block_.
331
+ # The titles of the selected ads.
332
+ #
333
+ # @yield [ad]
334
+ # The given block will be passed each ad to be selected.
335
+ #
336
+ # @yieldparam [SponsoredAd] ad
337
+ # An ad to be selected.
194
338
  #
339
+ # @return [Array<String>]
340
+ # The titles of the selected ads.
341
+ #
342
+ # @example
195
343
  # sponsored.titles_of { |ad| ad.url.include?('www') }
196
344
  #
197
345
  def titles_of(&block)
@@ -199,8 +347,18 @@ module GScraper
199
347
  end
200
348
 
201
349
  #
202
- # Returns the URLs of the ads that match the specified _block_.
350
+ # The URLs of the selected ads.
351
+ #
352
+ # @yield [ad]
353
+ # The given block will be passed each ad to be selected.
203
354
  #
355
+ # @yieldparam [SponsoredAd] ad
356
+ # An ad to be selected.
357
+ #
358
+ # @return [Array<String>]
359
+ # The URLs of the selected ads.
360
+ #
361
+ # @example
204
362
  # sponsored.urls_of { |ad| ad.title =~ /buy these pants/ }
205
363
  #
206
364
  def urls_of(&block)
@@ -208,8 +366,18 @@ module GScraper
208
366
  end
209
367
 
210
368
  #
211
- # Returns the direct URLs of the ads that match the specified _block_.
369
+ # The direct URLs of the selected ads.
370
+ #
371
+ # @yield [ad]
372
+ # The given block will be passed each ad to be selected.
373
+ #
374
+ # @yieldparam [SponsoredAd] ad
375
+ # An ad to be selected.
376
+ #
377
+ # @return [Array<String>]
378
+ # The direct URLs of the selected ads.
212
379
  #
380
+ # @example
213
381
  # sponsored.direct_urls_of { |ad| ad.title =~ /buy these pants/ }
214
382
  #
215
383
  def direct_urls_of(&block)