gscraper 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. data/.gitignore +8 -0
  2. data/.specopts +1 -0
  3. data/.yardopts +1 -0
  4. data/ChangeLog.md +122 -0
  5. data/Gemfile +25 -0
  6. data/{README.txt → README.md} +25 -24
  7. data/Rakefile +32 -10
  8. data/gscraper.gemspec +112 -0
  9. data/lib/gscraper.rb +0 -2
  10. data/lib/gscraper/extensions.rb +0 -2
  11. data/lib/gscraper/extensions/uri.rb +0 -2
  12. data/lib/gscraper/extensions/uri/http.rb +0 -2
  13. data/lib/gscraper/extensions/uri/query_params.rb +18 -5
  14. data/lib/gscraper/gscraper.rb +61 -70
  15. data/lib/gscraper/has_pages.rb +76 -20
  16. data/lib/gscraper/licenses.rb +0 -2
  17. data/lib/gscraper/page.rb +45 -16
  18. data/lib/gscraper/search.rb +0 -2
  19. data/lib/gscraper/search/ajax_query.rb +75 -22
  20. data/lib/gscraper/search/page.rb +328 -122
  21. data/lib/gscraper/search/query.rb +100 -7
  22. data/lib/gscraper/search/result.rb +27 -6
  23. data/lib/gscraper/search/search.rb +59 -9
  24. data/lib/gscraper/search/web_query.rb +120 -37
  25. data/lib/gscraper/sponsored_ad.rb +19 -6
  26. data/lib/gscraper/sponsored_links.rb +260 -92
  27. data/lib/gscraper/version.rb +2 -3
  28. data/spec/extensions/uri/query_params_spec.rb +8 -0
  29. data/spec/gscraper_spec.rb +9 -4
  30. data/spec/has_pages_examples.rb +0 -2
  31. data/spec/has_sponsored_links_examples.rb +2 -1
  32. data/spec/helpers/query.rb +3 -1
  33. data/spec/helpers/uri.rb +6 -4
  34. data/spec/page_has_results_examples.rb +0 -2
  35. data/spec/search/ajax_query_spec.rb +6 -11
  36. data/spec/search/page_has_results_examples.rb +0 -2
  37. data/spec/search/web_query_spec.rb +6 -11
  38. data/spec/spec_helper.rb +10 -4
  39. metadata +147 -54
  40. data/History.txt +0 -101
  41. data/Manifest.txt +0 -38
  42. data/tasks/spec.rb +0 -9
@@ -1,5 +1,4 @@
1
1
  #
2
- #--
3
2
  # GScraper - A web-scraping interface to various Google Services.
4
3
  #
5
4
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
17
16
  # You should have received a copy of the GNU General Public License
18
17
  # along with this program; if not, write to the Free Software
19
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
- #++
21
19
  #
22
20
 
23
21
  require 'gscraper/search/result'
@@ -82,10 +80,77 @@ module GScraper
82
80
  attr_accessor :numeric_range
83
81
 
84
82
  #
85
- # Creates a new Query object from the given search options. If a
86
- # block is given, it will be passed the newly created Query object.
83
+ # Creates a new query.
87
84
  #
88
- def initialize(options={},&block)
85
+ # @param [Hash] options
86
+ # Additional options.
87
+ #
88
+ # @option options [String] :query
89
+ # The search query.
90
+ #
91
+ # @option options [String] :link
92
+ # Search for results which link to the specified URI.
93
+ #
94
+ # @option options [String] :related
95
+ # Search for results which relate to the specified URI.
96
+ #
97
+ # @option options [String] :info
98
+ # Return information about the specified URI.
99
+ #
100
+ # @option options [String] :site
101
+ # Limit results to the specified site.
102
+ #
103
+ # @option options [String] :filetype
104
+ # Limit results to those with the specified file-type.
105
+ #
106
+ # @option options [String, Array] :allintitle
107
+ # Search for results with all of the keywords appearing in the
108
+ # title.
109
+ #
110
+ # @option options [String] :intitle
111
+ # Search for results with the keyword appearing in the title.
112
+ #
113
+ # @option options [String, Array] :allintext
114
+ # Search for results with all of the keywords appearing in the text.
115
+ #
116
+ # @option options [String] :intext
117
+ # Search for results with the keyword appearing in the text.
118
+ #
119
+ # @option options [String, Array] :allinanchor
120
+ # Search for results with all of the keywords appearing in the
121
+ # text of links.
122
+ #
123
+ # @option options [String] :inanchor
124
+ # Search for results with the keyword appearing in the text of
125
+ # links.
126
+ #
127
+ # @option options [String] :exact_phrase
128
+ # Search for results containing the specified exact phrase.
129
+ #
130
+ # @option options [String, Array] :with_words
131
+ # Search for results containing all of the specified words.
132
+ #
133
+ # @option options [String, Array] :without_words
134
+ # Search for results not containing any of the specified words.
135
+ #
136
+ # @option options [Range] :numeric_range
137
+ # Search for results containing numbers that fall within the
138
+ # specified Range.
139
+ #
140
+ # @option options [String] :define
141
+ # Search for results containing the definition of the specified
142
+ # keyword.
143
+ #
144
+ # @yield [query]
145
+ # If a block is given, it will be passed the new query.
146
+ #
147
+ # @yieldparam [Query] query
148
+ # The new query.
149
+ #
150
+ # @return [Query]
151
+ # The new query.
152
+ #
153
+ def initialize(options={})
89
154
  @query = options[:query]
90
155
 
91
156
  @link = options[:link]
@@ -100,18 +165,24 @@ module GScraper
100
165
  @inurl = options[:inurl]
101
166
  @allintext = options[:allintext]
102
167
  @intext = options[:intext]
168
+ @allinanchor = options[:allinanchor]
169
+ @inanchor = options[:inanchor]
103
170
 
104
171
  @exact_phrase = options[:exact_phrase]
105
172
  @with_words = options[:with_words]
106
173
  @without_words = options[:without_words]
107
174
 
108
175
  @numeric_range = options[:numeric_range]
176
+ @define = options[:define]
109
177
 
110
- block.call(self) if block
178
+ yield self if block_given?
111
179
  end
112
180
 
113
181
  #
114
- # Returns the query expression.
182
+ # The query expression.
183
+ #
184
+ # @return [String]
185
+ # The expression representing the query.
115
186
  #
116
187
  def expression
117
188
  expr = []
@@ -142,6 +213,10 @@ module GScraper
142
213
  append_modifier.call(:inurl)
143
214
  append_options.call(:allintext)
144
215
  append_modifier.call(:intext)
216
+ append_options.call(:allinanchor)
217
+ append_modifier.call(:inanchor)
218
+
219
+ append_modifier.call(:define)
145
220
 
146
221
  if @exact_phrase
147
222
  expr << "\"#{@exact_phrase}\""
@@ -164,6 +239,15 @@ module GScraper
164
239
 
165
240
  protected
166
241
 
242
+ #
243
+ # Formats the value for a search modifier.
244
+ #
245
+ # @param [Regexp, String] value
246
+ # The value for the search modifier.
247
+ #
248
+ # @return [String]
249
+ # The formatted value.
250
+ #
167
251
  def format_modifier(value)
168
252
  if value.kind_of?(Regexp)
169
253
  return value.source
@@ -172,6 +256,15 @@ module GScraper
172
256
  end
173
257
  end
174
258
 
259
+ #
260
+ # Formats the value(s) for a search option.
261
+ #
262
+ # @param [Array, Regexp, String] value
263
+ # The value(s) for the search option.
264
+ #
265
+ # @return [String]
266
+ # The formatted value.
267
+ #
175
268
  def format_options(value)
176
269
  if value.kind_of?(Array)
177
270
  return value.map { |element|
@@ -1,5 +1,4 @@
1
1
  #
2
- #--
3
2
  # GScraper - A web-scraping interface to various Google Services.
4
3
  #
5
4
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
17
16
  # You should have received a copy of the GNU General Public License
18
17
  # along with this program; if not, write to the Free Software
19
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
- #++
21
19
  #
22
20
 
23
21
  require 'gscraper/search/query'
@@ -46,8 +44,22 @@ module GScraper
46
44
  attr_reader :similar_url
47
45
 
48
46
  #
49
- # Creates a new Result object with the given _rank_, _title_
50
- # _summary_, _url_, _size_, _cache_url_ and _similar_url_.
47
+ # Creates a new {Result} object.
48
+ #
49
+ # @param [Integer] rank
50
+ # The rank of the result.
51
+ #
52
+ # @param [String] title
53
+ # The title of the result.
54
+ #
55
+ # @param [String] summary
56
+ # The summary of the result.
57
+ #
58
+ # @param [URI::HTTP] cached_url
59
+ # The Cached URL for the result.
60
+ #
61
+ # @param [URI::HTTP] similar_url
62
+ # The Similar Query URL for the result.
51
63
  #
52
64
  def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
53
65
  @agent = GScraper.web_agent
@@ -63,12 +75,18 @@ module GScraper
63
75
  #
64
76
  # Fetches the page of the result.
65
77
  #
78
+ # @return [Mechanize::Page]
79
+ # The page the result represents.
80
+ #
66
81
  def page
67
82
  @agent.get(@url)
68
83
  end
69
84
 
70
85
  #
71
- # Fetches the cached page of the result.
86
+ # Fetches the Cached Page of the result.
87
+ #
88
+ # @return [Mechanize::Page]
89
+ # The Cached Page for the result.
72
90
  #
73
91
  def cached_page
74
92
  if @cached_url
@@ -77,7 +95,10 @@ module GScraper
77
95
  end
78
96
 
79
97
  #
80
- # Returns a string containing the result's title.
98
+ # The result's title.
99
+ #
100
+ # @return [String]
101
+ # The title of the result.
81
102
  #
82
103
  def to_s
83
104
  @title.to_s
@@ -1,5 +1,4 @@
1
1
  #
2
- #--
3
2
  # GScraper - A web-scraping interface to various Google Services.
4
3
  #
5
4
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
17
16
  # You should have received a copy of the GNU General Public License
18
17
  # along with this program; if not, write to the Free Software
19
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
- #++
21
19
  #
22
20
 
23
21
  require 'gscraper/search/web_query'
@@ -26,46 +24,98 @@ require 'gscraper/search/ajax_query'
26
24
  module GScraper
27
25
  module Search
28
26
  #
29
- # Returns a new Query object with the given _options_. See Query.new.
27
+ # Creates a new web-query.
30
28
  #
29
+ # @param [Hash] options
30
+ # Additional options.
31
+ #
32
+ # @yield [query]
33
+ # If a block is given, it will be passed the new web-query.
34
+ #
35
+ # @yieldparam [WebQuery] query
36
+ # The new web query.
37
+ #
38
+ # @return [WebQuery]
39
+ # The new web-query.
40
+ #
41
+ # @example
31
42
  # Search.query(:query => 'ruby', :with_words => 'sow rspec')
32
43
  #
44
+ # @example
33
45
  # Search.query(:exact_phrase => 'fluent interfaces') do |q|
34
46
  # q.within_past_week = true
35
47
  # end
36
48
  #
49
+ # @see WebQuery#initialize
50
+ #
37
51
  def Search.query(options={},&block)
38
52
  WebQuery.new(options,&block)
39
53
  end
40
54
 
41
55
  #
42
- # Returns the Query object that represents the specified _url_.
43
- # See Query.from_url.
56
+ # Creates a web-query from a search URL.
57
+ #
58
+ # @param [String] url
59
+ # The search URL.
60
+ #
61
+ # @yield [query]
62
+ # If a block is given, it will be passed the new web-query.
63
+ #
64
+ # @yieldparam [WebQuery] query
65
+ # The new web query.
66
+ #
67
+ # @return [WebQuery]
68
+ # The new web-query.
44
69
  #
70
+ # @example
45
71
  # Search.query_from_url('http://www.google.com/search?q=ruby+zen')
46
72
  #
73
+ # @example
47
74
  # Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
48
75
  # q.within_last_month = true
49
76
  # q.occurrs_within = :title
50
77
  # end
51
78
  #
79
+ # @see WebQuery.from_url
80
+ #
52
81
  def Search.query_from_url(url,&block)
53
82
  WebQuery.from_url(url,&block)
54
83
  end
55
84
 
56
85
  #
57
- # Returns a new AJAXQuery object with the given _options_. See
58
- # AJAXQuery.new.
86
+ # Creates a new AJAX query.
87
+ #
88
+ # @param [Hash] options
89
+ # Additional options.
90
+ #
91
+ # @yield [query]
92
+ # If a block is given, the new AJAX query will be passed to it.
59
93
  #
94
+ # @yieldparam [AJAXQuery] query
95
+ # The new AJAX query.
96
+ #
97
+ # @example
60
98
  # Search.ajax_query(:query => 'ruby')
61
99
  #
100
+ # @see AJAXQuery#initialize
101
+ #
62
102
  def Search.ajax_query(options={},&block)
63
103
  AJAXQuery.new(options,&block)
64
104
  end
65
105
 
66
106
  #
67
- # Returns the AJAXQuery object that represents the specified _url_.
68
- # See AJAXQuery.from_url.
107
+ # Creates an AJAX query from a given search URL.
108
+ #
109
+ # @param [URI::HTTP] url
110
+ # The search URL.
111
+ #
112
+ # @yield [query]
113
+ # If a block is given, the new AJAX query will be passed to it.
114
+ #
115
+ # @yieldparam [AJAXQuery] query
116
+ # The new AJAX query.
117
+ #
118
+ # @see AJAXQuery.from_url
69
119
  #
70
120
  def Search.ajax_query_from_url(url,&block)
71
121
  AJAXQuery.from_url(url,&block)
@@ -1,5 +1,4 @@
1
1
  #
2
- #--
3
2
  # GScraper - A web-scraping interface to various Google Services.
4
3
  #
5
4
  # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
@@ -17,7 +16,6 @@
17
16
  # You should have received a copy of the GNU General Public License
18
17
  # along with this program; if not, write to the Free Software
19
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
- #++
21
19
  #
22
20
 
23
21
  require 'gscraper/search/result'
@@ -87,18 +85,56 @@ module GScraper
87
85
  # Filter the search results
88
86
  attr_accessor :filtered
89
87
 
90
- # Search for results similar to the page
91
- attr_accessor :similar_to
92
-
93
- # Search for results linking to the page
94
- attr_accessor :links_to
95
-
96
88
  #
97
- # Creates a new WebQuery object from the given search options. If a
98
- # block is given, it will be passed the newly created query object.
89
+ # Creates a new Web query.
90
+ #
91
+ # @param [Hash] options
92
+ # Additional options.
93
+ #
94
+ # @option options [Integer] :results_per_page
95
+ # Specifies the number of results for each page.
96
+ #
97
+ # @option options [String] :language
98
+ # Search for results in the specified language.
99
+ #
100
+ # @option options [String] :region
101
+ # Search for results from the specified region.
102
+ #
103
+ # @option options [Boolean] :within_past_day
104
+ # Search for results that were created within the past day.
105
+ #
106
+ # @option options [Boolean] :within_past_week
107
+ # Search for results that were created within the past week.
108
+ #
109
+ # @option options [Boolean] :within_past_month
110
+ # Search for results that were created within the past month.
111
+ #
112
+ # @option options [Boolean] :within_past_year
113
+ # Search for results that were created within the past year.
99
114
  #
115
+ # @option options [:title, :body, :url] :occurrs_within
116
+ # Searches for results where the keywords occur within a specific
117
+ # part of the result page.
118
+ #
119
+ # @option options [Symbol] :rights
120
+ # Search for results licensed under the specified license.
121
+ #
122
+ # @option options [Boolean] :filtered
123
+ # Specifies whether or not to use SafeSearch.
124
+ #
125
+ # @yield [query]
126
+ # If a block is given, it will be passed the new Web query.
127
+ #
128
+ # @yieldparam [WebQuery] query
129
+ # The new Web query.
130
+ #
131
+ # @return [WebQuery]
132
+ # The new Web query.
133
+ #
134
+ # @example
100
135
  # WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')
101
136
  #
137
+ # @example
102
138
  # WebQuery.new(:exact_phrase => 'fluent interfaces') do |q|
103
139
  # q.within_past_week = true
104
140
  # end
@@ -142,18 +178,37 @@ module GScraper
142
178
  @rights = options[:rights]
143
179
  @filtered = options[:filtered]
144
180
 
145
- @similar_to = options[:similar_to]
146
- @links_to = options[:links_to]
147
-
148
181
  super(options,&block)
149
182
  end
150
183
 
184
+ alias similar_to related
185
+ alias similar_to= related=
186
+
187
+ alias links_to link
188
+ alias links_to= link=
189
+
151
190
  #
152
- # Creates a new WebQuery object from the specified URL. If a block is
153
- # given, it will be passed the newly created WebQuery object.
191
+ # Creates a new Web query from a search URL.
154
192
  #
193
+ # @param [URI::HTTP, String] url
194
+ # The search URL.
195
+ #
196
+ # @param [Hash] options
197
+ # Additional options.
198
+ #
199
+ # @yield [query]
200
+ # If a block is given, it will be passed the new Web query.
201
+ #
202
+ # @yieldparam [WebQuery] query
203
+ # The new web query.
204
+ #
205
+ # @return [WebQuery]
206
+ # The new Web query.
207
+ #
208
+ # @example
155
209
  # WebQuery.from_url('http://www.google.com/search?q=ruby+zen')
156
210
  #
211
+ # @example
157
212
  # WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
158
213
  # q.within_past_month = true
159
214
  # q.occurrs_within = :title
@@ -198,8 +253,10 @@ module GScraper
198
253
  end
199
254
 
200
255
  if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
201
- options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,
202
- url.query_params['as_nhi'].to_i)
256
+ options[:numeric_range] = Range.new(
257
+ url.query_params['as_nlo'].to_i,
258
+ url.query_params['as_nhi'].to_i
259
+ )
203
260
  end
204
261
 
205
262
  case url.query_params['as_occt']
@@ -231,16 +288,19 @@ module GScraper
231
288
  end
232
289
 
233
290
  if url.query_params['as_rq']
234
- options[:similar_to] = url.query_params['as_rq']
291
+ options[:related] = url.query_params['as_rq']
235
292
  elsif url.query_params['as_lq']
236
- options[:links_to] = url.query_params['as_lq']
293
+ options[:link] = url.query_params['as_lq']
237
294
  end
238
295
 
239
296
  return self.new(options,&block)
240
297
  end
241
298
 
242
299
  #
243
- # Returns the URL that represents the query.
300
+ # The URL that represents the query.
301
+ #
302
+ # @return [URI::HTTP]
303
+ # The URL for the query.
244
304
  #
245
305
  def search_url
246
306
  url = URI(SEARCH_URL)
@@ -311,18 +371,17 @@ module GScraper
311
371
 
312
372
  url.query_params['safe'] = 'active' if @filtered
313
373
 
314
- if @similar_to
315
- url.query_params['as_rq'] = @similar_to
316
- elsif @links_to
317
- url.query_params['as_lq'] = @links_to
318
- end
319
-
320
374
  return url
321
375
  end
322
376
 
323
377
  #
324
- # Returns the URL that represents the query at the specific
325
- # _page_index_.
378
+ # Returns the URL that represents the query at a specific page index.
379
+ #
380
+ # @param [Integer] page_index
381
+ # The page index to create the URL for.
382
+ #
383
+ # @return [URI::HTTP]
384
+ # The URL for a query at the given page index.
326
385
  #
327
386
  def page_url(page_index)
328
387
  url = search_url
@@ -334,8 +393,13 @@ module GScraper
334
393
  end
335
394
 
336
395
  #
337
- # Returns a Page object containing Result objects at the specified
338
- # _page_index_.
396
+ # Returns a page containing results at the specific page index.
397
+ #
398
+ # @param [Integer] page_index
399
+ # The page index to query.
400
+ #
401
+ # @return [Page<Result>]
402
+ # The page at the given index for the query.
339
403
  #
340
404
  def page(page_index)
341
405
  Page.new do |new_page|
@@ -379,22 +443,30 @@ module GScraper
379
443
  end
380
444
 
381
445
  #
382
- # Returns the first Result on the first_page.
446
+ # Returns the first result on the first page.
447
+ #
448
+ # @return [Result]
449
+ # The first result.
383
450
  #
384
451
  def top_result
385
452
  first_page.first
386
453
  end
387
454
 
388
455
  #
389
- # Returns the Result at the specified _index_.
456
+ # Returns the result at the specified index.
457
+ #
458
+ # @param [Integer] index
459
+ # The index of the result.
390
460
  #
391
461
  def result_at(index)
392
462
  page(page_index_of(index))[result_index_of(index)]
393
463
  end
394
464
 
395
465
  #
396
- # Returns a SponsoredLinks object containing SponsoredAd objects of
397
- # the query.
466
+ # Returns the sponsored links for the query.
467
+ #
468
+ # @return [SponsoredLinks<SponsoredAd>]
469
+ # The sponsored links for the query.
398
470
  #
399
471
  def sponsored_links
400
472
  SponsoredLinks.new do |links|
@@ -411,15 +483,26 @@ module GScraper
411
483
  end
412
484
 
413
485
  #
414
- # Returns the first sponsored link on the first page of results.
486
+ # Returns the first sponsored ad on the first page of results.
487
+ #
488
+ # @return [SponsoredAd]
489
+ # The first sponsored ad.
415
490
  #
416
491
  def top_sponsored_link
417
492
  top_sponsored_links.first
418
493
  end
419
494
 
420
495
  #
421
- # Iterates over the sponsored links on the first page of
422
- # results passing each to the specified _block_.
496
+ # Iterates over the sponsored ads on the first page.
497
+ #
498
+ # @yield [ad]
499
+ # The given block will be passed each sponsored ad.
500
+ #
501
+ # @yieldparam [SponsoredAd] ad
502
+ # A sponsored ad on the first page.
503
+ #
504
+ # @return [Enumerator]
505
+ # If no block is given, an Enumerator object will be returned.
423
506
  #
424
507
  def each_sponsored_link(&block)
425
508
  sponsored_links.each(&block)