gscraper 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,4 +18,4 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/extensions/uri'
21
+ require 'gscraper/search/exceptions/blocked'
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,4 +18,12 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/extensions/uri/http'
21
+ module GScraper
22
+ module Search
23
+ #
24
+ # @since 0.4.0
25
+ #
26
+ class Blocked < RuntimeError
27
+ end
28
+ end
29
+ end
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -51,15 +51,13 @@ module GScraper
51
51
  # end
52
52
  #
53
53
  def results_with_title(title)
54
- unless block_given?
55
- enum_for(:results_with_title,title)
56
- else
57
- results_with do |result|
58
- if result.title.match(title)
59
- yield result
60
-
61
- true
62
- end
54
+ return enum_for(:results_with_title,title) unless block_given?
55
+
56
+ results_with do |result|
57
+ if result.title.match(title)
58
+ yield result
59
+
60
+ true
63
61
  end
64
62
  end
65
63
  end
@@ -88,15 +86,13 @@ module GScraper
88
86
  # end
89
87
  #
90
88
  def results_with_url(url)
91
- unless block_given?
92
- enum_for(:results_with_url,url)
93
- else
94
- results_with do |result|
95
- if result.url.match(url)
96
- yield result
97
-
98
- true
99
- end
89
+ return enum_for(:results_with_url,url) unless block_given?
90
+
91
+ results_with do |result|
92
+ if result.url.match(url)
93
+ yield result
94
+
95
+ true
100
96
  end
101
97
  end
102
98
  end
@@ -125,15 +121,13 @@ module GScraper
125
121
  # end
126
122
  #
127
123
  def results_with_summary(summary)
128
- unless block_given?
129
- enum_for(:results_with_summary,summary)
130
- else
131
- results_with do |result|
132
- if result.summary.match(summary)
133
- yield result
134
-
135
- true
136
- end
124
+ return enum_for(:results_with_summary,summary) unless block_given?
125
+
126
+ results_with do |result|
127
+ if result.summary.match(summary)
128
+ yield result
129
+
130
+ true
137
131
  end
138
132
  end
139
133
  end
@@ -155,11 +149,9 @@ module GScraper
155
149
  # each_rank { |rank| puts rank }
156
150
  #
157
151
  def each_rank
158
- unless block_given?
159
- enum_for(:each_rank)
160
- else
161
- each { |result| yield result.rank }
162
- end
152
+ return enum_for(:each_rank) unless block_given?
153
+
154
+ each { |result| yield result.rank }
163
155
  end
164
156
 
165
157
  #
@@ -179,11 +171,9 @@ module GScraper
179
171
  # each_title { |title| puts title }
180
172
  #
181
173
  def each_title
182
- unless block_given?
183
- enum_for(:each_title)
184
- else
185
- each { |result| yield result.title }
186
- end
174
+ return enum_for(:each_title) unless block_given?
175
+
176
+ each { |result| yield result.title }
187
177
  end
188
178
 
189
179
  #
@@ -203,11 +193,9 @@ module GScraper
203
193
  # each_url { |url| puts url }
204
194
  #
205
195
  def each_url
206
- unless block_given?
207
- enum_for(:each_url)
208
- else
209
- each { |result| yield result.url }
210
- end
196
+ return enum_for(:each_url) unless block_given?
197
+
198
+ each { |result| yield result.url }
211
199
  end
212
200
 
213
201
  #
@@ -227,11 +215,9 @@ module GScraper
227
215
  # each_summary { |summary| puts summary }
228
216
  #
229
217
  def each_summary
230
- unless block_given?
231
- enum_for(:each_summary)
232
- else
233
- each { |result| yield result.summary }
234
- end
218
+ return enum_for(:each_summary) unless block_given?
219
+
220
+ each { |result| yield result.summary }
235
221
  end
236
222
 
237
223
  #
@@ -251,12 +237,10 @@ module GScraper
251
237
  # each_cached_url { |cached_url| puts cached_url }
252
238
  #
253
239
  def each_cached_url
254
- unless block_given?
255
- enum_for(:each_cached_url)
256
- else
257
- each do |result|
258
- yield result.cached_url if result.cached_url
259
- end
240
+ return enum_for(:each_cached_url) unless block_given?
241
+
242
+ each do |result|
243
+ yield result.cached_url if result.cached_url
260
244
  end
261
245
  end
262
246
 
@@ -277,12 +261,10 @@ module GScraper
277
261
  # each_cached_page { |page| puts page.readlines }
278
262
  #
279
263
  def each_cached_page
280
- unless block_given?
281
- enum_for(:each_cached_page)
282
- else
283
- each do |result|
284
- yield result.cached_page if result.cached_page
285
- end
264
+ return enum_for(:each_cached_page) unless block_given?
265
+
266
+ each do |result|
267
+ yield result.cached_page if result.cached_page
286
268
  end
287
269
  end
288
270
 
@@ -303,12 +285,10 @@ module GScraper
303
285
  # each_similar_url { |similar_url| puts similar_url }
304
286
  #
305
287
  def each_similar_url
306
- unless block_given?
307
- enum_for(:each_similar_url)
308
- else
309
- each do |result|
310
- yield result.similar_url if result.similar_url
311
- end
288
+ return enum_for(:each_similar_url) unless block_given?
289
+
290
+ each do |result|
291
+ yield result.similar_url if result.similar_url
312
292
  end
313
293
  end
314
294
 
@@ -421,7 +401,7 @@ module GScraper
421
401
  end
422
402
 
423
403
  #
424
- # Returns the urls of the results that match the given block.
404
+ # Returns the URLs of the results that match the given block.
425
405
  #
426
406
  # @yield [result]
427
407
  # The given block will be used to filter the results in the page.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,12 +18,8 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/search/result'
22
- require 'gscraper/search/page'
23
- require 'gscraper/sponsored_ad'
24
- require 'gscraper/sponsored_links'
25
- require 'gscraper/extensions/uri'
26
- require 'gscraper/has_pages'
21
+ require 'gscraper/hosts'
22
+ require 'gscraper/languages'
27
23
  require 'gscraper/licenses'
28
24
  require 'gscraper/gscraper'
29
25
 
@@ -31,9 +27,21 @@ module GScraper
31
27
  module Search
32
28
  class Query
33
29
 
30
+ # Web Search sub-domain
31
+ SUB_DOMAIN = 'www'
32
+
33
+ # Default host to submit queries to
34
+ DEFAULT_HOST = "#{SUB_DOMAIN}.#{Hosts::PRIMARY_DOMAIN}"
35
+
36
+ # The host to submit queries to
37
+ attr_writer :search_host
38
+
34
39
  # Search query
35
40
  attr_accessor :query
36
41
 
42
+ # The search language
43
+ attr_accessor :language
44
+
37
45
  # Search 'link' modifier
38
46
  attr_accessor :link
39
47
 
@@ -79,15 +87,24 @@ module GScraper
79
87
  # Search for results containing numbers between the range
80
88
  attr_accessor :numeric_range
81
89
 
90
+ # Search for results containing the definitions of the keywords
91
+ attr_accessor :define
92
+
82
93
  #
83
94
  # Creates a new query.
84
95
  #
85
96
  # @param [Hash] options
86
97
  # Additional options.
87
98
  #
99
+ # @option options [String] :search_host (www.google.com)
100
+ # The host to submit queries to.
101
+ #
88
102
  # @option options [String] :query
89
103
  # The search query.
90
104
  #
105
+ # @option options [Symbol, String] :language (Languages.native)
106
+ # The search language.
107
+ #
91
108
  # @option options [String] :link
92
109
  # Search for results which link to the specified URI.
93
110
  #
@@ -103,20 +120,20 @@ module GScraper
103
120
  # @option options [String] :filetype
104
121
  # Limit results to those with the specified file-type.
105
122
  #
106
- # @option options [String, Array] :allintitle
123
+ # @option options [Array, String] :allintitle
107
124
  # Search for results with all of the keywords appearing in the
108
125
  # title.
109
126
  #
110
127
  # @option options [String] :intitle
111
128
  # Search for results with the keyword appearing in the title.
112
129
  #
113
- # @option options [String, Array] :allintext
130
+ # @option options [Array, String] :allintext
114
131
  # Search for results with all of the keywords appearing in the text.
115
132
  #
116
133
  # @option options [String] :intext
117
134
  # Search for results with the keyword appearing in the text.
118
135
  #
119
- # @option options [String, Array] :allinanchor
136
+ # @option options [Array, String] :allinanchor
120
137
  # Search for results with all of the keywords appearing in the
121
138
  # text of links.
122
139
  #
@@ -127,13 +144,13 @@ module GScraper
127
144
  # @option options [String] :exact_phrase
128
145
  # Search for results containing the specified exact phrase.
129
146
  #
130
- # @option options [String, Array] :with_words
147
+ # @option options [Array, String] :with_words
131
148
  # Search for results containing all of the specified words.
132
149
  #
133
- # @option options [String, Array] :without_words
150
+ # @option options [Array, String] :without_words
134
151
  # Search for results not containing any of the specified words.
135
152
  #
136
- # @option options [Range] :numeric_range
153
+ # @option options [Range, Array, String] :numeric_range
137
154
  # Search for results contain numbers that fall within the
138
155
  # specified Range.
139
156
  #
@@ -141,6 +158,10 @@ module GScraper
141
158
  # Search for results containing the definition of the specified
142
159
  # keyword.
143
160
  #
161
+ # @option options [Boolean] :load_balance (false)
162
+ # Specifies whether to distribute queries across multiple Google
163
+ # domains.
164
+ #
144
165
  # @yield [query]
145
166
  # If a block is given, it will be passed the new query.
146
167
  #
@@ -151,33 +172,54 @@ module GScraper
151
172
  # The new query.
152
173
  #
153
174
  def initialize(options={})
154
- @query = options[:query]
175
+ @search_host = options.fetch(:search_host,DEFAULT_HOST)
176
+
177
+ @query = options[:query]
178
+ @language = options.fetch(:language,Languages.native)
155
179
 
156
- @link = options[:link]
157
- @related = options[:related]
158
- @info = options[:info]
159
- @site = options[:site]
180
+ @link = options[:link]
181
+ @related = options[:related]
182
+ @info = options[:info]
183
+ @site = options[:site]
160
184
  @filetype = options[:filetype]
161
185
 
162
- @allintitle = options[:allintitle]
163
- @intitle = options[:intitle]
164
- @allinurl = options[:allinurl]
165
- @inurl = options[:inurl]
166
- @allintext = options[:allintext]
167
- @intext = options[:intext]
186
+ @allintitle = options[:allintitle]
187
+ @intitle = options[:intitle]
188
+ @allinurl = options[:allinurl]
189
+ @inurl = options[:inurl]
190
+ @allintext = options[:allintext]
191
+ @intext = options[:intext]
168
192
  @allinanchor = options[:allinanchor]
169
- @inanchor = options[:inanchor]
193
+ @inanchor = options[:inanchor]
170
194
 
171
- @exact_phrase = options[:exact_phrase]
172
- @with_words = options[:with_words]
195
+ @exact_phrase = options[:exact_phrase]
196
+ @with_words = options[:with_words]
173
197
  @without_words = options[:without_words]
174
198
 
175
199
  @numeric_range = options[:numeric_range]
176
- @define = options[:define]
200
+ @define = options[:define]
201
+
202
+ @load_balance = options.fetch(:load_balance,false)
177
203
 
178
204
  yield self if block_given?
179
205
  end
180
206
 
207
+ #
208
+ # The host to submit queries to.
209
+ #
210
+ # @return [String]
211
+ # The host to submit queries to.
212
+ #
213
+ # @since 0.4.0
214
+ #
215
+ def search_host
216
+ if @load_balance
217
+ Hosts::DOMAINS[rand(Hosts::DOMAINS.length)]
218
+ else
219
+ @search_host
220
+ end
221
+ end
222
+
181
223
  #
182
224
  # The query expression.
183
225
  #
@@ -222,16 +264,25 @@ module GScraper
222
264
  expr << "\"#{@exact_phrase}\""
223
265
  end
224
266
 
225
- if @with_words.kind_of?(Array)
267
+ case @with_words
268
+ when String
269
+ expr << @with_words
270
+ when Enumerable
226
271
  expr << @with_words.join(' OR ')
227
272
  end
228
-
229
- if @without_words.kind_of?(Array)
273
+
274
+ case @without_words
275
+ when String
276
+ expr << @without_words
277
+ when Enumerable
230
278
  expr << @without_words.map { |word| "-#{word}" }.join(' ')
231
279
  end
232
280
 
233
- if @numeric_range.kind_of?(Range)
234
- expr << "#{@numeric_range.begin}..#{@numeric_range.end}"
281
+ case @numeric_range
282
+ when String
283
+ expr << @numeric_range
284
+ when Range, Array
285
+ expr << "#{@numeric_range.first}..#{@numeric_range.last}"
235
286
  end
236
287
 
237
288
  return expr.join(' ')
@@ -240,7 +291,7 @@ module GScraper
240
291
  protected
241
292
 
242
293
  #
243
- # Formats the value for a search modifer.
294
+ # Formats the value for a search modifier.
244
295
  #
245
296
  # @param [Regexp, String] value
246
297
  # The value for the search modifier.
@@ -249,10 +300,11 @@ module GScraper
249
300
  # The formatted value.
250
301
  #
251
302
  def format_modifier(value)
252
- if value.kind_of?(Regexp)
253
- return value.source
303
+ case value
304
+ when Range
305
+ value.source
254
306
  else
255
- return value.to_s
307
+ value.to_s
256
308
  end
257
309
  end
258
310
 
@@ -266,13 +318,7 @@ module GScraper
266
318
  # The formatted value.
267
319
  #
268
320
  def format_options(value)
269
- if value.kind_of?(Array)
270
- return value.map { |element|
271
- format_modifier(element)
272
- }.join(' ')
273
- else
274
- return format_modifier(value)
275
- end
321
+ Array(value).map(&method(:format_modifier)).join(' ')
276
322
  end
277
323
 
278
324
  end