gscraper 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,4 +18,4 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'gscraper/
|
21
|
+
require 'gscraper/search/exceptions/blocked'
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,4 +18,12 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
|
21
|
+
module GScraper
|
22
|
+
module Search
|
23
|
+
#
|
24
|
+
# @since 0.4.0
|
25
|
+
#
|
26
|
+
class Blocked < RuntimeError
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/gscraper/search/page.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -51,15 +51,13 @@ module GScraper
|
|
51
51
|
# end
|
52
52
|
#
|
53
53
|
def results_with_title(title)
|
54
|
-
unless block_given?
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
true
|
62
|
-
end
|
54
|
+
return enum_for(:results_with_title,title) unless block_given?
|
55
|
+
|
56
|
+
results_with do |result|
|
57
|
+
if result.title.match(title)
|
58
|
+
yield result
|
59
|
+
|
60
|
+
true
|
63
61
|
end
|
64
62
|
end
|
65
63
|
end
|
@@ -88,15 +86,13 @@ module GScraper
|
|
88
86
|
# end
|
89
87
|
#
|
90
88
|
def results_with_url(url)
|
91
|
-
unless block_given?
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
true
|
99
|
-
end
|
89
|
+
return enum_for(:results_with_url,url) unless block_given?
|
90
|
+
|
91
|
+
results_with do |result|
|
92
|
+
if result.url.match(url)
|
93
|
+
yield result
|
94
|
+
|
95
|
+
true
|
100
96
|
end
|
101
97
|
end
|
102
98
|
end
|
@@ -125,15 +121,13 @@ module GScraper
|
|
125
121
|
# end
|
126
122
|
#
|
127
123
|
def results_with_summary(summary)
|
128
|
-
unless block_given?
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
true
|
136
|
-
end
|
124
|
+
return enum_for(:results_with_summary,summary) unless block_given?
|
125
|
+
|
126
|
+
results_with do |result|
|
127
|
+
if result.summary.match(summary)
|
128
|
+
yield result
|
129
|
+
|
130
|
+
true
|
137
131
|
end
|
138
132
|
end
|
139
133
|
end
|
@@ -155,11 +149,9 @@ module GScraper
|
|
155
149
|
# each_rank { |rank| puts rank }
|
156
150
|
#
|
157
151
|
def each_rank
|
158
|
-
unless block_given?
|
159
|
-
|
160
|
-
|
161
|
-
each { |result| yield result.rank }
|
162
|
-
end
|
152
|
+
return enum_for(:each_rank) unless block_given?
|
153
|
+
|
154
|
+
each { |result| yield result.rank }
|
163
155
|
end
|
164
156
|
|
165
157
|
#
|
@@ -179,11 +171,9 @@ module GScraper
|
|
179
171
|
# each_title { |title| puts title }
|
180
172
|
#
|
181
173
|
def each_title
|
182
|
-
unless block_given?
|
183
|
-
|
184
|
-
|
185
|
-
each { |result| yield result.title }
|
186
|
-
end
|
174
|
+
return enum_for(:each_title) unless block_given?
|
175
|
+
|
176
|
+
each { |result| yield result.title }
|
187
177
|
end
|
188
178
|
|
189
179
|
#
|
@@ -203,11 +193,9 @@ module GScraper
|
|
203
193
|
# each_url { |url| puts url }
|
204
194
|
#
|
205
195
|
def each_url
|
206
|
-
unless block_given?
|
207
|
-
|
208
|
-
|
209
|
-
each { |result| yield result.url }
|
210
|
-
end
|
196
|
+
return enum_for(:each_url) unless block_given?
|
197
|
+
|
198
|
+
each { |result| yield result.url }
|
211
199
|
end
|
212
200
|
|
213
201
|
#
|
@@ -227,11 +215,9 @@ module GScraper
|
|
227
215
|
# each_summary { |summary| puts summary }
|
228
216
|
#
|
229
217
|
def each_summary
|
230
|
-
unless block_given?
|
231
|
-
|
232
|
-
|
233
|
-
each { |result| yield result.summary }
|
234
|
-
end
|
218
|
+
return enum_for(:each_summary) unless block_given?
|
219
|
+
|
220
|
+
each { |result| yield result.summary }
|
235
221
|
end
|
236
222
|
|
237
223
|
#
|
@@ -251,12 +237,10 @@ module GScraper
|
|
251
237
|
# each_cached_url { |cached_url| puts cached_url }
|
252
238
|
#
|
253
239
|
def each_cached_url
|
254
|
-
unless block_given?
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
yield result.cached_url if result.cached_url
|
259
|
-
end
|
240
|
+
return enum_for(:each_cached_url) unless block_given?
|
241
|
+
|
242
|
+
each do |result|
|
243
|
+
yield result.cached_url if result.cached_url
|
260
244
|
end
|
261
245
|
end
|
262
246
|
|
@@ -277,12 +261,10 @@ module GScraper
|
|
277
261
|
# each_cached_page { |page| puts page.readlines }
|
278
262
|
#
|
279
263
|
def each_cached_page
|
280
|
-
unless block_given?
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
yield result.cached_page if result.cached_page
|
285
|
-
end
|
264
|
+
return enum_for(:each_cached_page) unless block_given?
|
265
|
+
|
266
|
+
each do |result|
|
267
|
+
yield result.cached_page if result.cached_page
|
286
268
|
end
|
287
269
|
end
|
288
270
|
|
@@ -303,12 +285,10 @@ module GScraper
|
|
303
285
|
# each_similar_url { |similar_url| puts similar_url }
|
304
286
|
#
|
305
287
|
def each_similar_url
|
306
|
-
unless block_given?
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
yield result.similar_url if result.similar_url
|
311
|
-
end
|
288
|
+
return enum_for(:each_similar_url) unless block_given?
|
289
|
+
|
290
|
+
each do |result|
|
291
|
+
yield result.similar_url if result.similar_url
|
312
292
|
end
|
313
293
|
end
|
314
294
|
|
@@ -421,7 +401,7 @@ module GScraper
|
|
421
401
|
end
|
422
402
|
|
423
403
|
#
|
424
|
-
# Returns the
|
404
|
+
# Returns the URLs of the results that match the given block.
|
425
405
|
#
|
426
406
|
# @yield [result]
|
427
407
|
# The given block will be used to filter the results in the page.
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,12 +18,8 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'gscraper/
|
22
|
-
require 'gscraper/
|
23
|
-
require 'gscraper/sponsored_ad'
|
24
|
-
require 'gscraper/sponsored_links'
|
25
|
-
require 'gscraper/extensions/uri'
|
26
|
-
require 'gscraper/has_pages'
|
21
|
+
require 'gscraper/hosts'
|
22
|
+
require 'gscraper/languages'
|
27
23
|
require 'gscraper/licenses'
|
28
24
|
require 'gscraper/gscraper'
|
29
25
|
|
@@ -31,9 +27,21 @@ module GScraper
|
|
31
27
|
module Search
|
32
28
|
class Query
|
33
29
|
|
30
|
+
# Web Search sub-domain
|
31
|
+
SUB_DOMAIN = 'www'
|
32
|
+
|
33
|
+
# Default host to submit queries to
|
34
|
+
DEFAULT_HOST = "#{SUB_DOMAIN}.#{Hosts::PRIMARY_DOMAIN}"
|
35
|
+
|
36
|
+
# The host to submit queries to
|
37
|
+
attr_writer :search_host
|
38
|
+
|
34
39
|
# Search query
|
35
40
|
attr_accessor :query
|
36
41
|
|
42
|
+
# The search language
|
43
|
+
attr_accessor :language
|
44
|
+
|
37
45
|
# Search 'link' modifier
|
38
46
|
attr_accessor :link
|
39
47
|
|
@@ -79,15 +87,24 @@ module GScraper
|
|
79
87
|
# Search for results containing numbers between the range
|
80
88
|
attr_accessor :numeric_range
|
81
89
|
|
90
|
+
# Search for results containing the definitions of the keywords
|
91
|
+
attr_accessor :define
|
92
|
+
|
82
93
|
#
|
83
94
|
# Creates a new query.
|
84
95
|
#
|
85
96
|
# @param [Hash] options
|
86
97
|
# Additional options.
|
87
98
|
#
|
99
|
+
# @option options [String] :search_host (www.google.com)
|
100
|
+
# The host to submit queries to.
|
101
|
+
#
|
88
102
|
# @option options [String] :query
|
89
103
|
# The search query.
|
90
104
|
#
|
105
|
+
# @option options [Symbol, String] :language (Languages.native)
|
106
|
+
# The search language.
|
107
|
+
#
|
91
108
|
# @option options [String] :link
|
92
109
|
# Search for results which link to the specified URI.
|
93
110
|
#
|
@@ -103,20 +120,20 @@ module GScraper
|
|
103
120
|
# @option options [String] :filetype
|
104
121
|
# Limit results to those with the specified file-type.
|
105
122
|
#
|
106
|
-
# @option options [
|
123
|
+
# @option options [Array, String] :allintitle
|
107
124
|
# Search for results with all of the keywords appearing in the
|
108
125
|
# title.
|
109
126
|
#
|
110
127
|
# @option options [String] :intitle
|
111
128
|
# Search for results with the keyword appearing in the title.
|
112
129
|
#
|
113
|
-
# @option options [
|
130
|
+
# @option options [Array, String] :allintext
|
114
131
|
# Search for results with all of the keywords appearing in the text.
|
115
132
|
#
|
116
133
|
# @option options [String] :intext
|
117
134
|
# Search for results with the keyword appearing in the text.
|
118
135
|
#
|
119
|
-
# @option options [
|
136
|
+
# @option options [Array, String] :allinanchor
|
120
137
|
# Search for results with all of the keywords appearing in the
|
121
138
|
# text of links.
|
122
139
|
#
|
@@ -127,13 +144,13 @@ module GScraper
|
|
127
144
|
# @option options [String] :exact_phrase
|
128
145
|
# Search for results containing the specified exact phrase.
|
129
146
|
#
|
130
|
-
# @option options [
|
147
|
+
# @option options [Array, String] :with_words
|
131
148
|
# Search for results containing all of the specified words.
|
132
149
|
#
|
133
|
-
# @option options [
|
150
|
+
# @option options [Array, String] :without_words
|
134
151
|
# Search for results not containing any of the specified words.
|
135
152
|
#
|
136
|
-
# @option options [Range] :numeric_range
|
153
|
+
# @option options [Range, Array, String] :numeric_range
|
137
154
|
# Search for results containing numbers that fall within the
|
138
155
|
# specified Range.
|
139
156
|
#
|
@@ -141,6 +158,10 @@ module GScraper
|
|
141
158
|
# Search for results containing the definition of the specified
|
142
159
|
# keyword.
|
143
160
|
#
|
161
|
+
# @option options [Boolean] :load_balance (false)
|
162
|
+
# Specifies whether to distribute queries across multiple Google
|
163
|
+
# domains.
|
164
|
+
#
|
144
165
|
# @yield [query]
|
145
166
|
# If a block is given, it will be passed the new query.
|
146
167
|
#
|
@@ -151,33 +172,54 @@ module GScraper
|
|
151
172
|
# The new query.
|
152
173
|
#
|
153
174
|
def initialize(options={})
|
154
|
-
@
|
175
|
+
@search_host = options.fetch(:search_host,DEFAULT_HOST)
|
176
|
+
|
177
|
+
@query = options[:query]
|
178
|
+
@language = options.fetch(:language,Languages.native)
|
155
179
|
|
156
|
-
@link
|
157
|
-
@related
|
158
|
-
@info
|
159
|
-
@site
|
180
|
+
@link = options[:link]
|
181
|
+
@related = options[:related]
|
182
|
+
@info = options[:info]
|
183
|
+
@site = options[:site]
|
160
184
|
@filetype = options[:filetype]
|
161
185
|
|
162
|
-
@allintitle
|
163
|
-
@intitle
|
164
|
-
@allinurl
|
165
|
-
@inurl
|
166
|
-
@allintext
|
167
|
-
@intext
|
186
|
+
@allintitle = options[:allintitle]
|
187
|
+
@intitle = options[:intitle]
|
188
|
+
@allinurl = options[:allinurl]
|
189
|
+
@inurl = options[:inurl]
|
190
|
+
@allintext = options[:allintext]
|
191
|
+
@intext = options[:intext]
|
168
192
|
@allinanchor = options[:allinanchor]
|
169
|
-
@inanchor
|
193
|
+
@inanchor = options[:inanchor]
|
170
194
|
|
171
|
-
@exact_phrase
|
172
|
-
@with_words
|
195
|
+
@exact_phrase = options[:exact_phrase]
|
196
|
+
@with_words = options[:with_words]
|
173
197
|
@without_words = options[:without_words]
|
174
198
|
|
175
199
|
@numeric_range = options[:numeric_range]
|
176
|
-
@define
|
200
|
+
@define = options[:define]
|
201
|
+
|
202
|
+
@load_balance = options.fetch(:load_balance,false)
|
177
203
|
|
178
204
|
yield self if block_given?
|
179
205
|
end
|
180
206
|
|
207
|
+
#
|
208
|
+
# The host to submit queries to.
|
209
|
+
#
|
210
|
+
# @return [String]
|
211
|
+
# The host to submit queries to.
|
212
|
+
#
|
213
|
+
# @since 0.4.0
|
214
|
+
#
|
215
|
+
def search_host
|
216
|
+
if @load_balance
|
217
|
+
Hosts::DOMAINS[rand(Hosts::DOMAINS.length)]
|
218
|
+
else
|
219
|
+
@search_host
|
220
|
+
end
|
221
|
+
end
|
222
|
+
|
181
223
|
#
|
182
224
|
# The query expression.
|
183
225
|
#
|
@@ -222,16 +264,25 @@ module GScraper
|
|
222
264
|
expr << "\"#{@exact_phrase}\""
|
223
265
|
end
|
224
266
|
|
225
|
-
|
267
|
+
case @with_words
|
268
|
+
when String
|
269
|
+
expr << @with_words
|
270
|
+
when Enumerable
|
226
271
|
expr << @with_words.join(' OR ')
|
227
272
|
end
|
228
|
-
|
229
|
-
|
273
|
+
|
274
|
+
case @without_words
|
275
|
+
when String
|
276
|
+
expr << @without_words
|
277
|
+
when Enumerable
|
230
278
|
expr << @without_words.map { |word| "-#{word}" }.join(' ')
|
231
279
|
end
|
232
280
|
|
233
|
-
|
234
|
-
|
281
|
+
case @numeric_range
|
282
|
+
when String
|
283
|
+
expr << @numeric_range
|
284
|
+
when Range, Array
|
285
|
+
expr << "#{@numeric_range.first}..#{@numeric_range.last}"
|
235
286
|
end
|
236
287
|
|
237
288
|
return expr.join(' ')
|
@@ -240,7 +291,7 @@ module GScraper
|
|
240
291
|
protected
|
241
292
|
|
242
293
|
#
|
243
|
-
# Formats the value for a search
|
294
|
+
# Formats the value for a search modifier.
|
244
295
|
#
|
245
296
|
# @param [Regexp, String]
|
246
297
|
# The value for the search modifier.
|
@@ -249,10 +300,11 @@ module GScraper
|
|
249
300
|
# The formatted value.
|
250
301
|
#
|
251
302
|
def format_modifier(value)
|
252
|
-
|
253
|
-
|
303
|
+
case value
|
304
|
+
when Regexp
|
305
|
+
value.source
|
254
306
|
else
|
255
|
-
|
307
|
+
value.to_s
|
256
308
|
end
|
257
309
|
end
|
258
310
|
|
@@ -266,13 +318,7 @@ module GScraper
|
|
266
318
|
# The formatted value.
|
267
319
|
#
|
268
320
|
def format_options(value)
|
269
|
-
|
270
|
-
return value.map { |element|
|
271
|
-
format_modifier(element)
|
272
|
-
}.join(' ')
|
273
|
-
else
|
274
|
-
return format_modifier(value)
|
275
|
-
end
|
321
|
+
Array(value).map(&method(:format_modifier)).join(' ')
|
276
322
|
end
|
277
323
|
|
278
324
|
end
|