gscraper 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,4 +18,4 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'gscraper/
|
21
|
+
require 'gscraper/search/exceptions/blocked'
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,4 +18,12 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
|
21
|
+
module GScraper
|
22
|
+
module Search
|
23
|
+
#
|
24
|
+
# @since 0.4.0
|
25
|
+
#
|
26
|
+
class Blocked < RuntimeError
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/gscraper/search/page.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -51,15 +51,13 @@ module GScraper
|
|
51
51
|
# end
|
52
52
|
#
|
53
53
|
def results_with_title(title)
|
54
|
-
unless block_given?
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
true
|
62
|
-
end
|
54
|
+
return enum_for(:results_with_title,title) unless block_given?
|
55
|
+
|
56
|
+
results_with do |result|
|
57
|
+
if result.title.match(title)
|
58
|
+
yield result
|
59
|
+
|
60
|
+
true
|
63
61
|
end
|
64
62
|
end
|
65
63
|
end
|
@@ -88,15 +86,13 @@ module GScraper
|
|
88
86
|
# end
|
89
87
|
#
|
90
88
|
def results_with_url(url)
|
91
|
-
unless block_given?
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
true
|
99
|
-
end
|
89
|
+
return enum_for(:results_with_url,url) unless block_given?
|
90
|
+
|
91
|
+
results_with do |result|
|
92
|
+
if result.url.match(url)
|
93
|
+
yield result
|
94
|
+
|
95
|
+
true
|
100
96
|
end
|
101
97
|
end
|
102
98
|
end
|
@@ -125,15 +121,13 @@ module GScraper
|
|
125
121
|
# end
|
126
122
|
#
|
127
123
|
def results_with_summary(summary)
|
128
|
-
unless block_given?
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
true
|
136
|
-
end
|
124
|
+
return enum_for(:results_with_summary,summary) unless block_given?
|
125
|
+
|
126
|
+
results_with do |result|
|
127
|
+
if result.summary.match(summary)
|
128
|
+
yield result
|
129
|
+
|
130
|
+
true
|
137
131
|
end
|
138
132
|
end
|
139
133
|
end
|
@@ -155,11 +149,9 @@ module GScraper
|
|
155
149
|
# each_rank { |rank| puts rank }
|
156
150
|
#
|
157
151
|
def each_rank
|
158
|
-
unless block_given?
|
159
|
-
|
160
|
-
|
161
|
-
each { |result| yield result.rank }
|
162
|
-
end
|
152
|
+
return enum_for(:each_rank) unless block_given?
|
153
|
+
|
154
|
+
each { |result| yield result.rank }
|
163
155
|
end
|
164
156
|
|
165
157
|
#
|
@@ -179,11 +171,9 @@ module GScraper
|
|
179
171
|
# each_title { |title| puts title }
|
180
172
|
#
|
181
173
|
def each_title
|
182
|
-
unless block_given?
|
183
|
-
|
184
|
-
|
185
|
-
each { |result| yield result.title }
|
186
|
-
end
|
174
|
+
return enum_for(:each_title) unless block_given?
|
175
|
+
|
176
|
+
each { |result| yield result.title }
|
187
177
|
end
|
188
178
|
|
189
179
|
#
|
@@ -203,11 +193,9 @@ module GScraper
|
|
203
193
|
# each_url { |url| puts url }
|
204
194
|
#
|
205
195
|
def each_url
|
206
|
-
unless block_given?
|
207
|
-
|
208
|
-
|
209
|
-
each { |result| yield result.url }
|
210
|
-
end
|
196
|
+
return enum_for(:each_url) unless block_given?
|
197
|
+
|
198
|
+
each { |result| yield result.url }
|
211
199
|
end
|
212
200
|
|
213
201
|
#
|
@@ -227,11 +215,9 @@ module GScraper
|
|
227
215
|
# each_summary { |summary| puts summary }
|
228
216
|
#
|
229
217
|
def each_summary
|
230
|
-
unless block_given?
|
231
|
-
|
232
|
-
|
233
|
-
each { |result| yield result.summary }
|
234
|
-
end
|
218
|
+
return enum_for(:each_summary) unless block_given?
|
219
|
+
|
220
|
+
each { |result| yield result.summary }
|
235
221
|
end
|
236
222
|
|
237
223
|
#
|
@@ -251,12 +237,10 @@ module GScraper
|
|
251
237
|
# each_cached_url { |cached_url| puts cached_url }
|
252
238
|
#
|
253
239
|
def each_cached_url
|
254
|
-
unless block_given?
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
yield result.cached_url if result.cached_url
|
259
|
-
end
|
240
|
+
return enum_for(:each_cached_url) unless block_given?
|
241
|
+
|
242
|
+
each do |result|
|
243
|
+
yield result.cached_url if result.cached_url
|
260
244
|
end
|
261
245
|
end
|
262
246
|
|
@@ -277,12 +261,10 @@ module GScraper
|
|
277
261
|
# each_cached_page { |page| puts page.readlines }
|
278
262
|
#
|
279
263
|
def each_cached_page
|
280
|
-
unless block_given?
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
yield result.cached_page if result.cached_page
|
285
|
-
end
|
264
|
+
return enum_for(:each_cached_page) unless block_given?
|
265
|
+
|
266
|
+
each do |result|
|
267
|
+
yield result.cached_page if result.cached_page
|
286
268
|
end
|
287
269
|
end
|
288
270
|
|
@@ -303,12 +285,10 @@ module GScraper
|
|
303
285
|
# each_similar_url { |similar_url| puts similar_url }
|
304
286
|
#
|
305
287
|
def each_similar_url
|
306
|
-
unless block_given?
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
yield result.similar_url if result.similar_url
|
311
|
-
end
|
288
|
+
return enum_for(:each_similar_url) unless block_given?
|
289
|
+
|
290
|
+
each do |result|
|
291
|
+
yield result.similar_url if result.similar_url
|
312
292
|
end
|
313
293
|
end
|
314
294
|
|
@@ -421,7 +401,7 @@ module GScraper
|
|
421
401
|
end
|
422
402
|
|
423
403
|
#
|
424
|
-
# Returns the
|
404
|
+
# Returns the URLs of the results that match the given block.
|
425
405
|
#
|
426
406
|
# @yield [result]
|
427
407
|
# The given block will be used to filter the results in the page.
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,12 +18,8 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require 'gscraper/
|
22
|
-
require 'gscraper/
|
23
|
-
require 'gscraper/sponsored_ad'
|
24
|
-
require 'gscraper/sponsored_links'
|
25
|
-
require 'gscraper/extensions/uri'
|
26
|
-
require 'gscraper/has_pages'
|
21
|
+
require 'gscraper/hosts'
|
22
|
+
require 'gscraper/languages'
|
27
23
|
require 'gscraper/licenses'
|
28
24
|
require 'gscraper/gscraper'
|
29
25
|
|
@@ -31,9 +27,21 @@ module GScraper
|
|
31
27
|
module Search
|
32
28
|
class Query
|
33
29
|
|
30
|
+
# Web Search sub-domain
|
31
|
+
SUB_DOMAIN = 'www'
|
32
|
+
|
33
|
+
# Default host to submit queries to
|
34
|
+
DEFAULT_HOST = "#{SUB_DOMAIN}.#{Hosts::PRIMARY_DOMAIN}"
|
35
|
+
|
36
|
+
# The host to submit queries to
|
37
|
+
attr_writer :search_host
|
38
|
+
|
34
39
|
# Search query
|
35
40
|
attr_accessor :query
|
36
41
|
|
42
|
+
# The search language
|
43
|
+
attr_accessor :language
|
44
|
+
|
37
45
|
# Search 'link' modifier
|
38
46
|
attr_accessor :link
|
39
47
|
|
@@ -79,15 +87,24 @@ module GScraper
|
|
79
87
|
# Search for results containing numbers between the range
|
80
88
|
attr_accessor :numeric_range
|
81
89
|
|
90
|
+
# Search for results containing the definitions of the keywords
|
91
|
+
attr_accessor :define
|
92
|
+
|
82
93
|
#
|
83
94
|
# Creates a new query.
|
84
95
|
#
|
85
96
|
# @param [Hash] options
|
86
97
|
# Additional options.
|
87
98
|
#
|
99
|
+
# @option options [String] :search_host (www.google.com)
|
100
|
+
# The host to submit queries to.
|
101
|
+
#
|
88
102
|
# @option options [String] :query
|
89
103
|
# The search query.
|
90
104
|
#
|
105
|
+
# @option options [Symbol, String] :language (Languages.native)
|
106
|
+
# The search language.
|
107
|
+
#
|
91
108
|
# @option options [String] :link
|
92
109
|
# Search for results which link to the specified URI.
|
93
110
|
#
|
@@ -103,20 +120,20 @@ module GScraper
|
|
103
120
|
# @option options [String] :filetype
|
104
121
|
# Limit results to those with the specified file-type.
|
105
122
|
#
|
106
|
-
# @option options [
|
123
|
+
# @option options [Array, String] :allintitle
|
107
124
|
# Search for results with all of the keywords appearing in the
|
108
125
|
# title.
|
109
126
|
#
|
110
127
|
# @option options [String] :intitle
|
111
128
|
# Search for results with the keyword appearing in the title.
|
112
129
|
#
|
113
|
-
# @option options [
|
130
|
+
# @option options [Array, String] :allintext
|
114
131
|
# Search for results with all of the keywords appearing in the text.
|
115
132
|
#
|
116
133
|
# @option options [String] :intext
|
117
134
|
# Search for results with the keyword appearing in the text.
|
118
135
|
#
|
119
|
-
# @option options [
|
136
|
+
# @option options [Array, String] :allinanchor
|
120
137
|
# Search for results with all of the keywords appearing in the
|
121
138
|
# text of links.
|
122
139
|
#
|
@@ -127,13 +144,13 @@ module GScraper
|
|
127
144
|
# @option options [String] :exact_phrase
|
128
145
|
# Search for results containing the specified exact phrase.
|
129
146
|
#
|
130
|
-
# @option options [
|
147
|
+
# @option options [Array, String] :with_words
|
131
148
|
# Search for results containing all of the specified words.
|
132
149
|
#
|
133
|
-
# @option options [
|
150
|
+
# @option options [Array, String] :without_words
|
134
151
|
# Search for results not containing any of the specified words.
|
135
152
|
#
|
136
|
-
# @option options [Range] :numeric_range
|
153
|
+
# @option options [Range, Array, String] :numeric_range
|
137
154
|
# Search for results containing numbers that fall within the
|
138
155
|
# specified Range.
|
139
156
|
#
|
@@ -141,6 +158,10 @@ module GScraper
|
|
141
158
|
# Search for results containing the definition of the specified
|
142
159
|
# keyword.
|
143
160
|
#
|
161
|
+
# @option options [Boolean] :load_balance (false)
|
162
|
+
# Specifies whether to distribute queries across multiple Google
|
163
|
+
# domains.
|
164
|
+
#
|
144
165
|
# @yield [query]
|
145
166
|
# If a block is given, it will be passed the new query.
|
146
167
|
#
|
@@ -151,33 +172,54 @@ module GScraper
|
|
151
172
|
# The new query.
|
152
173
|
#
|
153
174
|
def initialize(options={})
|
154
|
-
@
|
175
|
+
@search_host = options.fetch(:search_host,DEFAULT_HOST)
|
176
|
+
|
177
|
+
@query = options[:query]
|
178
|
+
@language = options.fetch(:language,Languages.native)
|
155
179
|
|
156
|
-
@link
|
157
|
-
@related
|
158
|
-
@info
|
159
|
-
@site
|
180
|
+
@link = options[:link]
|
181
|
+
@related = options[:related]
|
182
|
+
@info = options[:info]
|
183
|
+
@site = options[:site]
|
160
184
|
@filetype = options[:filetype]
|
161
185
|
|
162
|
-
@allintitle
|
163
|
-
@intitle
|
164
|
-
@allinurl
|
165
|
-
@inurl
|
166
|
-
@allintext
|
167
|
-
@intext
|
186
|
+
@allintitle = options[:allintitle]
|
187
|
+
@intitle = options[:intitle]
|
188
|
+
@allinurl = options[:allinurl]
|
189
|
+
@inurl = options[:inurl]
|
190
|
+
@allintext = options[:allintext]
|
191
|
+
@intext = options[:intext]
|
168
192
|
@allinanchor = options[:allinanchor]
|
169
|
-
@inanchor
|
193
|
+
@inanchor = options[:inanchor]
|
170
194
|
|
171
|
-
@exact_phrase
|
172
|
-
@with_words
|
195
|
+
@exact_phrase = options[:exact_phrase]
|
196
|
+
@with_words = options[:with_words]
|
173
197
|
@without_words = options[:without_words]
|
174
198
|
|
175
199
|
@numeric_range = options[:numeric_range]
|
176
|
-
@define
|
200
|
+
@define = options[:define]
|
201
|
+
|
202
|
+
@load_balance = options.fetch(:load_balance,false)
|
177
203
|
|
178
204
|
yield self if block_given?
|
179
205
|
end
|
180
206
|
|
207
|
+
#
|
208
|
+
# The host to submit queries to.
|
209
|
+
#
|
210
|
+
# @return [String]
|
211
|
+
# The host to submit queries to.
|
212
|
+
#
|
213
|
+
# @since 0.4.0
|
214
|
+
#
|
215
|
+
def search_host
|
216
|
+
if @load_balance
|
217
|
+
Hosts::DOMAINS[rand(Hosts::DOMAINS.length)]
|
218
|
+
else
|
219
|
+
@search_host
|
220
|
+
end
|
221
|
+
end
|
222
|
+
|
181
223
|
#
|
182
224
|
# The query expression.
|
183
225
|
#
|
@@ -222,16 +264,25 @@ module GScraper
|
|
222
264
|
expr << "\"#{@exact_phrase}\""
|
223
265
|
end
|
224
266
|
|
225
|
-
|
267
|
+
case @with_words
|
268
|
+
when String
|
269
|
+
expr << @with_words
|
270
|
+
when Enumerable
|
226
271
|
expr << @with_words.join(' OR ')
|
227
272
|
end
|
228
|
-
|
229
|
-
|
273
|
+
|
274
|
+
case @without_words
|
275
|
+
when String
|
276
|
+
expr << @without_words
|
277
|
+
when Enumerable
|
230
278
|
expr << @without_words.map { |word| "-#{word}" }.join(' ')
|
231
279
|
end
|
232
280
|
|
233
|
-
|
234
|
-
|
281
|
+
case @numeric_range
|
282
|
+
when String
|
283
|
+
expr << @numeric_range
|
284
|
+
when Range, Array
|
285
|
+
expr << "#{@numeric_range.first}..#{@numeric_range.last}"
|
235
286
|
end
|
236
287
|
|
237
288
|
return expr.join(' ')
|
@@ -240,7 +291,7 @@ module GScraper
|
|
240
291
|
protected
|
241
292
|
|
242
293
|
#
|
243
|
-
# Formats the value for a search
|
294
|
+
# Formats the value for a search modifier.
|
244
295
|
#
|
245
296
|
# @param [Regexp, String] value
|
246
297
|
# The value for the search modifier.
|
@@ -249,10 +300,11 @@ module GScraper
|
|
249
300
|
# The formatted value.
|
250
301
|
#
|
251
302
|
def format_modifier(value)
|
252
|
-
|
253
|
-
|
303
|
+
case value
|
304
|
+
when Regexp
|
305
|
+
value.source
|
254
306
|
else
|
255
|
-
|
307
|
+
value.to_s
|
256
308
|
end
|
257
309
|
end
|
258
310
|
|
@@ -266,13 +318,7 @@ module GScraper
|
|
266
318
|
# The formatted value.
|
267
319
|
#
|
268
320
|
def format_options(value)
|
269
|
-
|
270
|
-
return value.map { |element|
|
271
|
-
format_modifier(element)
|
272
|
-
}.join(' ')
|
273
|
-
else
|
274
|
-
return format_modifier(value)
|
275
|
-
end
|
321
|
+
Array(value).map(&method(:format_modifier)).join(' ')
|
276
322
|
end
|
277
323
|
|
278
324
|
end
|