gscraper 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,4 +18,4 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/extensions/uri'
21
+ require 'gscraper/search/exceptions/blocked'
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,4 +18,12 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/extensions/uri/http'
21
+ module GScraper
22
+ module Search
23
+ #
24
+ # @since 0.4.0
25
+ #
26
+ class Blocked < RuntimeError
27
+ end
28
+ end
29
+ end
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -51,15 +51,13 @@ module GScraper
51
51
  # end
52
52
  #
53
53
  def results_with_title(title)
54
- unless block_given?
55
- enum_for(:results_with_title,title)
56
- else
57
- results_with do |result|
58
- if result.title.match(title)
59
- yield result
60
-
61
- true
62
- end
54
+ return enum_for(:results_with_title,title) unless block_given?
55
+
56
+ results_with do |result|
57
+ if result.title.match(title)
58
+ yield result
59
+
60
+ true
63
61
  end
64
62
  end
65
63
  end
@@ -88,15 +86,13 @@ module GScraper
88
86
  # end
89
87
  #
90
88
  def results_with_url(url)
91
- unless block_given?
92
- enum_for(:results_with_url,url)
93
- else
94
- results_with do |result|
95
- if result.url.match(url)
96
- yield result
97
-
98
- true
99
- end
89
+ return enum_for(:results_with_url,url) unless block_given?
90
+
91
+ results_with do |result|
92
+ if result.url.match(url)
93
+ yield result
94
+
95
+ true
100
96
  end
101
97
  end
102
98
  end
@@ -125,15 +121,13 @@ module GScraper
125
121
  # end
126
122
  #
127
123
  def results_with_summary(summary)
128
- unless block_given?
129
- enum_for(:results_with_summary,summary)
130
- else
131
- results_with do |result|
132
- if result.summary.match(summary)
133
- yield result
134
-
135
- true
136
- end
124
+ return enum_for(:results_with_summary,summary) unless block_given?
125
+
126
+ results_with do |result|
127
+ if result.summary.match(summary)
128
+ yield result
129
+
130
+ true
137
131
  end
138
132
  end
139
133
  end
@@ -155,11 +149,9 @@ module GScraper
155
149
  # each_rank { |rank| puts rank }
156
150
  #
157
151
  def each_rank
158
- unless block_given?
159
- enum_for(:each_rank)
160
- else
161
- each { |result| yield result.rank }
162
- end
152
+ return enum_for(:each_rank) unless block_given?
153
+
154
+ each { |result| yield result.rank }
163
155
  end
164
156
 
165
157
  #
@@ -179,11 +171,9 @@ module GScraper
179
171
  # each_title { |title| puts title }
180
172
  #
181
173
  def each_title
182
- unless block_given?
183
- enum_for(:each_title)
184
- else
185
- each { |result| yield result.title }
186
- end
174
+ return enum_for(:each_title) unless block_given?
175
+
176
+ each { |result| yield result.title }
187
177
  end
188
178
 
189
179
  #
@@ -203,11 +193,9 @@ module GScraper
203
193
  # each_url { |url| puts url }
204
194
  #
205
195
  def each_url
206
- unless block_given?
207
- enum_for(:each_url)
208
- else
209
- each { |result| yield result.url }
210
- end
196
+ return enum_for(:each_url) unless block_given?
197
+
198
+ each { |result| yield result.url }
211
199
  end
212
200
 
213
201
  #
@@ -227,11 +215,9 @@ module GScraper
227
215
  # each_summary { |summary| puts summary }
228
216
  #
229
217
  def each_summary
230
- unless block_given?
231
- enum_for(:each_summary)
232
- else
233
- each { |result| yield result.summary }
234
- end
218
+ return enum_for(:each_summary) unless block_given?
219
+
220
+ each { |result| yield result.summary }
235
221
  end
236
222
 
237
223
  #
@@ -251,12 +237,10 @@ module GScraper
251
237
  # each_cached_url { |cached_url| puts cached_url }
252
238
  #
253
239
  def each_cached_url
254
- unless block_given?
255
- enum_for(:each_cached_url)
256
- else
257
- each do |result|
258
- yield result.cached_url if result.cached_url
259
- end
240
+ return enum_for(:each_cached_url) unless block_given?
241
+
242
+ each do |result|
243
+ yield result.cached_url if result.cached_url
260
244
  end
261
245
  end
262
246
 
@@ -277,12 +261,10 @@ module GScraper
277
261
  # each_cached_page { |page| puts page.readlines }
278
262
  #
279
263
  def each_cached_page
280
- unless block_given?
281
- enum_for(:each_cached_page)
282
- else
283
- each do |result|
284
- yield result.cached_page if result.cached_page
285
- end
264
+ return enum_for(:each_cached_page) unless block_given?
265
+
266
+ each do |result|
267
+ yield result.cached_page if result.cached_page
286
268
  end
287
269
  end
288
270
 
@@ -303,12 +285,10 @@ module GScraper
303
285
  # each_similar_url { |similar_url| puts similar_url }
304
286
  #
305
287
  def each_similar_url
306
- unless block_given?
307
- enum_for(:each_similar_url)
308
- else
309
- each do |result|
310
- yield result.similar_url if result.similar_url
311
- end
288
+ return enum_for(:each_similar_url) unless block_given?
289
+
290
+ each do |result|
291
+ yield result.similar_url if result.similar_url
312
292
  end
313
293
  end
314
294
 
@@ -421,7 +401,7 @@ module GScraper
421
401
  end
422
402
 
423
403
  #
424
- # Returns the urls of the results that match the given block.
404
+ # Returns the URLs of the results that match the given block.
425
405
  #
426
406
  # @yield [result]
427
407
  # The given block will be used to filter the results in the page.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,12 +18,8 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/search/result'
22
- require 'gscraper/search/page'
23
- require 'gscraper/sponsored_ad'
24
- require 'gscraper/sponsored_links'
25
- require 'gscraper/extensions/uri'
26
- require 'gscraper/has_pages'
21
+ require 'gscraper/hosts'
22
+ require 'gscraper/languages'
27
23
  require 'gscraper/licenses'
28
24
  require 'gscraper/gscraper'
29
25
 
@@ -31,9 +27,21 @@ module GScraper
31
27
  module Search
32
28
  class Query
33
29
 
30
+ # Web Search sub-domain
31
+ SUB_DOMAIN = 'www'
32
+
33
+ # Default host to submit queries to
34
+ DEFAULT_HOST = "#{SUB_DOMAIN}.#{Hosts::PRIMARY_DOMAIN}"
35
+
36
+ # The host to submit queries to
37
+ attr_writer :search_host
38
+
34
39
  # Search query
35
40
  attr_accessor :query
36
41
 
42
+ # The search language
43
+ attr_accessor :language
44
+
37
45
  # Search 'link' modifier
38
46
  attr_accessor :link
39
47
 
@@ -79,15 +87,24 @@ module GScraper
79
87
  # Search for results containing numbers between the range
80
88
  attr_accessor :numeric_range
81
89
 
90
+ # Search for results containing the definitions of the keywords
91
+ attr_accessor :define
92
+
82
93
  #
83
94
  # Creates a new query.
84
95
  #
85
96
  # @param [Hash] options
86
97
  # Additional options.
87
98
  #
99
+ # @option options [String] :search_host (www.google.com)
100
+ # The host to submit queries to.
101
+ #
88
102
  # @option options [String] :query
89
103
  # The search query.
90
104
  #
105
+ # @option options [Symbol, String] :language (Languages.native)
106
+ # The search language.
107
+ #
91
108
  # @option options [String] :link
92
109
  # Search for results which link to the specified URI.
93
110
  #
@@ -103,20 +120,20 @@ module GScraper
103
120
  # @option options [String] :filetype
104
121
  # Limit results to those with the specified file-type.
105
122
  #
106
- # @option options [String, Array] :allintitle
123
+ # @option options [Array, String] :allintitle
107
124
  # Search for results with all of the keywords appearing in the
108
125
  # title.
109
126
  #
110
127
  # @option options [String] :intitle
111
128
  # Search for results with the keyword appearing in the title.
112
129
  #
113
- # @option options [String, Array] :allintext
130
+ # @option options [Array, String] :allintext
114
131
  # Search for results with all of the keywords appearing in the text.
115
132
  #
116
133
  # @option options [String] :intext
117
134
  # Search for results with the keyword appearing in the text.
118
135
  #
119
- # @option options [String, Array] :allinanchor
136
+ # @option options [Array, String] :allinanchor
120
137
  # Search for results with all of the keywords appearing in the
121
138
  # text of links.
122
139
  #
@@ -127,13 +144,13 @@ module GScraper
127
144
  # @option options [String] :exact_phrase
128
145
  # Search for results containing the specified exact phrase.
129
146
  #
130
- # @option options [String, Array] :with_words
147
+ # @option options [Array, String] :with_words
131
148
  # Search for results containing all of the specified words.
132
149
  #
133
- # @option options [String, Array] :without_words
150
+ # @option options [Array, String] :without_words
134
151
  # Search for results not containing any of the specified words.
135
152
  #
136
- # @option options [Range] :numeric_range
153
+ # @option options [Range, Array, String] :numeric_range
137
154
  # Search for results containing numbers that fall within the
138
155
  # specified Range.
139
156
  #
@@ -141,6 +158,10 @@ module GScraper
141
158
  # Search for results containing the definition of the specified
142
159
  # keyword.
143
160
  #
161
+ # @option options [Boolean] :load_balance (false)
162
+ # Specifies whether to distribute queries across multiple Google
163
+ # domains.
164
+ #
144
165
  # @yield [query]
145
166
  # If a block is given, it will be passed the new query.
146
167
  #
@@ -151,33 +172,54 @@ module GScraper
151
172
  # The new query.
152
173
  #
153
174
  def initialize(options={})
154
- @query = options[:query]
175
+ @search_host = options.fetch(:search_host,DEFAULT_HOST)
176
+
177
+ @query = options[:query]
178
+ @language = options.fetch(:language,Languages.native)
155
179
 
156
- @link = options[:link]
157
- @related = options[:related]
158
- @info = options[:info]
159
- @site = options[:site]
180
+ @link = options[:link]
181
+ @related = options[:related]
182
+ @info = options[:info]
183
+ @site = options[:site]
160
184
  @filetype = options[:filetype]
161
185
 
162
- @allintitle = options[:allintitle]
163
- @intitle = options[:intitle]
164
- @allinurl = options[:allinurl]
165
- @inurl = options[:inurl]
166
- @allintext = options[:allintext]
167
- @intext = options[:intext]
186
+ @allintitle = options[:allintitle]
187
+ @intitle = options[:intitle]
188
+ @allinurl = options[:allinurl]
189
+ @inurl = options[:inurl]
190
+ @allintext = options[:allintext]
191
+ @intext = options[:intext]
168
192
  @allinanchor = options[:allinanchor]
169
- @inanchor = options[:inanchor]
193
+ @inanchor = options[:inanchor]
170
194
 
171
- @exact_phrase = options[:exact_phrase]
172
- @with_words = options[:with_words]
195
+ @exact_phrase = options[:exact_phrase]
196
+ @with_words = options[:with_words]
173
197
  @without_words = options[:without_words]
174
198
 
175
199
  @numeric_range = options[:numeric_range]
176
- @define = options[:define]
200
+ @define = options[:define]
201
+
202
+ @load_balance = options.fetch(:load_balance,false)
177
203
 
178
204
  yield self if block_given?
179
205
  end
180
206
 
207
+ #
208
+ # The host to submit queries to.
209
+ #
210
+ # @return [String]
211
+ # The host to submit queries to.
212
+ #
213
+ # @since 0.4.0
214
+ #
215
+ def search_host
216
+ if @load_balance
217
+ Hosts::DOMAINS[rand(Hosts::DOMAINS.length)]
218
+ else
219
+ @search_host
220
+ end
221
+ end
222
+
181
223
  #
182
224
  # The query expression.
183
225
  #
@@ -222,16 +264,25 @@ module GScraper
222
264
  expr << "\"#{@exact_phrase}\""
223
265
  end
224
266
 
225
- if @with_words.kind_of?(Array)
267
+ case @with_words
268
+ when String
269
+ expr << @with_words
270
+ when Enumerable
226
271
  expr << @with_words.join(' OR ')
227
272
  end
228
-
229
- if @without_words.kind_of?(Array)
273
+
274
+ case @without_words
275
+ when String
276
+ expr << @without_words
277
+ when Enumerable
230
278
  expr << @without_words.map { |word| "-#{word}" }.join(' ')
231
279
  end
232
280
 
233
- if @numeric_range.kind_of?(Range)
234
- expr << "#{@numeric_range.begin}..#{@numeric_range.end}"
281
+ case @numeric_range
282
+ when String
283
+ expr << @numeric_range
284
+ when Range, Array
285
+ expr << "#{@numeric_range.first}..#{@numeric_range.last}"
235
286
  end
236
287
 
237
288
  return expr.join(' ')
@@ -240,7 +291,7 @@ module GScraper
240
291
  protected
241
292
 
242
293
  #
243
- # Formats the value for a search modifer.
294
+ # Formats the value for a search modifier.
244
295
  #
245
296
  # @param [Regexp, String] value
246
297
  # The value for the search modifier.
@@ -249,10 +300,11 @@ module GScraper
249
300
  # The formatted value.
250
301
  #
251
302
  def format_modifier(value)
252
- if value.kind_of?(Regexp)
253
- return value.source
303
+ case value
304
+ when Regexp
305
+ value.source
254
306
  else
255
- return value.to_s
307
+ value.to_s
256
308
  end
257
309
  end
258
310
 
@@ -266,13 +318,7 @@ module GScraper
266
318
  # The formatted value.
267
319
  #
268
320
  def format_options(value)
269
- if value.kind_of?(Array)
270
- return value.map { |element|
271
- format_modifier(element)
272
- }.join(' ')
273
- else
274
- return format_modifier(value)
275
- end
321
+ Array(value).map(&method(:format_modifier)).join(' ')
276
322
  end
277
323
 
278
324
  end