gscraper 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +17 -2
- data/Manifest.txt +3 -0
- data/Rakefile +1 -0
- data/lib/gscraper/extensions/uri/http.rb +8 -2
- data/lib/gscraper/gscraper.rb +4 -3
- data/lib/gscraper/search/page.rb +181 -43
- data/lib/gscraper/search/query.rb +85 -19
- data/lib/gscraper/search/result.rb +41 -2
- data/lib/gscraper/version.rb +1 -1
- data/test/search/page_results.rb +103 -0
- data/test/search/query_from_url.rb +4 -4
- data/test/search/query_pages.rb +32 -0
- data/test/search/query_result.rb +30 -0
- data/test/test_gscraper.rb +3 -0
- metadata +23 -2
data/History.txt
CHANGED
@@ -1,7 +1,22 @@
|
|
1
|
+
== 0.1.4 / 2007-12-23
|
2
|
+
|
3
|
+
* Added Search::Query#result_at for easier access of a single result at
|
4
|
+
a given index.
|
5
|
+
* Added scraping of the "Cached" and "Similar Pages" URLs of Search
|
6
|
+
Results.
|
7
|
+
* Added methods to Search::Page for accessing cached URLs, cached pages,
|
8
|
+
similar query URLs and similar Queries en masse.
|
9
|
+
* Search::Query#page and Search::Query#first_page now can receive blocks.
|
10
|
+
* Improved the formatting of URL query parameters.
|
11
|
+
* Added more unit-tests.
|
12
|
+
* Fixed scraping of Search Result summaries.
|
13
|
+
* Fixed various bugs in Search::Query uncovered during unit-testing.
|
14
|
+
* Fixed typos in Search::Page's documentation.
|
15
|
+
|
1
16
|
== 0.1.3 / 2007-12-22
|
2
17
|
|
3
|
-
* Added the Page class, which contains many of convenance methods
|
4
|
-
searching through the results within a Page.
|
18
|
+
* Added the Search::Page class, which contains many convenience methods
|
19
|
+
for searching through the results within a Page.
|
5
20
|
|
6
21
|
== 0.1.2 / 2007-12-22
|
7
22
|
|
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -12,6 +12,7 @@ Hoe.new('gscraper', GScraper::VERSION) do |p|
|
|
12
12
|
p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
|
13
13
|
p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
|
14
14
|
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
15
|
+
p.extra_deps = ['hpricot', 'mechanize']
|
15
16
|
end
|
16
17
|
|
17
18
|
# vim: syntax=Ruby
|
@@ -4,12 +4,14 @@ module URI
|
|
4
4
|
# Query parameters
|
5
5
|
attr_reader :query_params
|
6
6
|
|
7
|
+
alias_method :old_initialize, :initialize
|
8
|
+
|
7
9
|
#
|
8
10
|
# Creates a new URI::HTTP object and initializes query_params as a
|
9
11
|
# new Hash.
|
10
12
|
#
|
11
13
|
def initialize(*args)
|
12
|
-
|
14
|
+
old_initialize(*args)
|
13
15
|
|
14
16
|
@query_params = {}
|
15
17
|
parse_query_params
|
@@ -57,7 +59,11 @@ module URI
|
|
57
59
|
if value==true
|
58
60
|
"#{name}=active"
|
59
61
|
elsif value
|
60
|
-
|
62
|
+
if value.kind_of?(Array)
|
63
|
+
"#{name}=#{URI.encode(value.join(' '))}"
|
64
|
+
else
|
65
|
+
"#{name}=#{URI.encode(value.to_s)}"
|
66
|
+
end
|
61
67
|
else
|
62
68
|
"#{name}="
|
63
69
|
end
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -13,7 +13,7 @@ module GScraper
|
|
13
13
|
# Returns the GScraper User-Agent
|
14
14
|
#
|
15
15
|
def GScraper.user_agent
|
16
|
-
@user_agent
|
16
|
+
@user_agent ||= nil
|
17
17
|
end
|
18
18
|
|
19
19
|
#
|
@@ -24,10 +24,11 @@ module GScraper
|
|
24
24
|
end
|
25
25
|
|
26
26
|
#
|
27
|
-
# Opens the _uri_ with the given _opts_. The contents of the _uri_ will
|
28
|
-
#
|
27
|
+
# Opens the _uri_ with the given _opts_. The contents of the _uri_ will be
|
28
|
+
# returned.
|
29
29
|
#
|
30
30
|
# GScraper.open('http://www.hackety.org/')
|
31
|
+
#
|
31
32
|
# GScraper.open('http://tenderlovemaking.com/',
|
32
33
|
# :user_agent_alias => 'Linux Mozilla')
|
33
34
|
# GScraper.open('http://www.wired.com/', :user_agent => 'the future')
|
data/lib/gscraper/search/page.rb
CHANGED
@@ -115,77 +115,174 @@ module GScraper
|
|
115
115
|
|
116
116
|
#
|
117
117
|
# Returns an Array containing the ranks of the results within the
|
118
|
-
# Page.
|
118
|
+
# Page.
|
119
119
|
#
|
120
120
|
# page.ranks # => [...]
|
121
121
|
#
|
122
|
-
|
123
|
-
|
124
|
-
# end
|
125
|
-
#
|
126
|
-
def ranks(&block)
|
127
|
-
mapped = map { |result| result.rank }
|
128
|
-
|
129
|
-
mapped.each(&block) if block
|
130
|
-
return mapped
|
122
|
+
def ranks
|
123
|
+
map { |result| result.rank }
|
131
124
|
end
|
132
125
|
|
133
126
|
#
|
134
127
|
# Returns an Array containing the titles of the results within the
|
135
|
-
# Page.
|
128
|
+
# Page.
|
136
129
|
#
|
137
130
|
# page.titles # => [...]
|
138
131
|
#
|
139
|
-
|
140
|
-
|
141
|
-
# end
|
142
|
-
#
|
143
|
-
def titles(&block)
|
144
|
-
mapped = map { |result| result.title }
|
145
|
-
|
146
|
-
mapped.each(&block) if block
|
147
|
-
return mapped
|
132
|
+
def titles
|
133
|
+
map { |result| result.title }
|
148
134
|
end
|
149
135
|
|
150
136
|
#
|
151
137
|
# Returns an Array containing the URLs of the results within the
|
152
|
-
# Page.
|
138
|
+
# Page.
|
153
139
|
#
|
154
140
|
# page.urls # => [...]
|
155
141
|
#
|
156
|
-
|
157
|
-
|
158
|
-
# end
|
159
|
-
#
|
160
|
-
def urls(&block)
|
161
|
-
mapped = map { |result| result.url }
|
162
|
-
|
163
|
-
mapped.each(&block) if block
|
164
|
-
return mapped
|
142
|
+
def urls
|
143
|
+
map { |result| result.url }
|
165
144
|
end
|
166
145
|
|
167
146
|
#
|
168
147
|
# Returns an Array containing the summaries of the results within the
|
169
|
-
# Page.
|
170
|
-
# _block_.
|
148
|
+
# Page.
|
171
149
|
#
|
172
150
|
# page.summaries # => [...]
|
173
151
|
#
|
174
|
-
|
175
|
-
|
176
|
-
|
152
|
+
def summaries
|
153
|
+
map { |result| result.summary }
|
154
|
+
end
|
155
|
+
|
156
|
+
#
|
157
|
+
# Returns an Array containing the cached URLs of the results within
|
158
|
+
# the Page.
|
159
|
+
#
|
160
|
+
# page.cached_urls # => [...]
|
177
161
|
#
|
178
|
-
def
|
179
|
-
|
162
|
+
def cached_urls
|
163
|
+
map { |result| result.cached_url }
|
164
|
+
end
|
180
165
|
|
181
|
-
|
182
|
-
|
166
|
+
#
|
167
|
+
# Returns an Array containing the cached pages of the results within
|
168
|
+
# the Page. If _opts_ are given, they will be used in accessing the
|
169
|
+
# cached page.
|
170
|
+
#
|
171
|
+
# page.cached_pages # => [...]
|
172
|
+
#
|
173
|
+
def cached_pages(opts={})
|
174
|
+
map { |result| result.cached_page(opts) }
|
175
|
+
end
|
176
|
+
|
177
|
+
#
|
178
|
+
# Returns an Array containing the similar Query URLs of the results
|
179
|
+
# within the Page.
|
180
|
+
#
|
181
|
+
# page.similar_urls # => [...]
|
182
|
+
#
|
183
|
+
def similar_urls
|
184
|
+
map { |result| result.similar_url }
|
185
|
+
end
|
186
|
+
|
187
|
+
#
|
188
|
+
# Returns an Array containing the similar Queries of the results
|
189
|
+
# within the Page.
|
190
|
+
#
|
191
|
+
# page.similar_queries # => [...]
|
192
|
+
#
|
193
|
+
def similar_queries
|
194
|
+
map { |result| result.similar_query }
|
195
|
+
end
|
196
|
+
|
197
|
+
#
|
198
|
+
# Iterates over each result's rank within the Page, passing each to
|
199
|
+
# the given _block_.
|
200
|
+
#
|
201
|
+
# each_rank { |rank| puts rank }
|
202
|
+
#
|
203
|
+
def each_rank(&block)
|
204
|
+
ranks.each(&block)
|
205
|
+
end
|
206
|
+
|
207
|
+
#
|
208
|
+
# Iterates over each result's title within the Page, passing each to
|
209
|
+
# the given _block_.
|
210
|
+
#
|
211
|
+
# each_title { |title| puts title }
|
212
|
+
#
|
213
|
+
def each_title(&block)
|
214
|
+
titles.each(&block)
|
215
|
+
end
|
216
|
+
|
217
|
+
#
|
218
|
+
# Iterates over each result's url within the Page, passing each to
|
219
|
+
# the given _block_.
|
220
|
+
#
|
221
|
+
# each_url { |url| puts url }
|
222
|
+
#
|
223
|
+
def each_url(&block)
|
224
|
+
urls.each(&block)
|
225
|
+
end
|
226
|
+
|
227
|
+
#
|
228
|
+
# Iterates over each result's summary within the Page, passing each
|
229
|
+
# to the given _block_.
|
230
|
+
#
|
231
|
+
# each_summary { |summary| puts summary }
|
232
|
+
#
|
233
|
+
def each_summary(&block)
|
234
|
+
summaries.each(&block)
|
235
|
+
end
|
236
|
+
|
237
|
+
#
|
238
|
+
# Iterates over each result's cached URLs within the Page, passing
|
239
|
+
# each to the given _block_.
|
240
|
+
#
|
241
|
+
# each_cached_url { |url| puts url }
|
242
|
+
#
|
243
|
+
def each_cached_url(&block)
|
244
|
+
cached_urls.each(&block)
|
245
|
+
end
|
246
|
+
|
247
|
+
#
|
248
|
+
# Iterates over each result's cached pages within the Page, passing
|
249
|
+
# each to the given _block_. If _opts_ are given, they will be used
|
250
|
+
# in accessing the cached pages.
|
251
|
+
#
|
252
|
+
# each_cached_page { |page| puts page.readlines }
|
253
|
+
#
|
254
|
+
def each_cached_page(opts={},&block)
|
255
|
+
cached_pages(opts).each(&block)
|
256
|
+
end
|
257
|
+
|
258
|
+
#
|
259
|
+
# Iterates over each result's similar Query URLs within the Page,
|
260
|
+
# passing each to the given _block_.
|
261
|
+
#
|
262
|
+
# each_similar_url { |url| puts url }
|
263
|
+
#
|
264
|
+
def each_similar_url(&block)
|
265
|
+
similar_urls.each(&block)
|
266
|
+
end
|
267
|
+
|
268
|
+
#
|
269
|
+
# Iterates over each result's similar Query within the Page, passing
|
270
|
+
# each to the given _block_.
|
271
|
+
#
|
272
|
+
# each_similar_query do |q|
|
273
|
+
# q.first_page do |page|
|
274
|
+
# puts page.urls.join("\n")
|
275
|
+
# end
|
276
|
+
# end
|
277
|
+
#
|
278
|
+
def each_similar_query(&block)
|
279
|
+
similar_queries.each(&block)
|
183
280
|
end
|
184
281
|
|
185
282
|
#
|
186
283
|
# Returns the ranks of the results that match the specified _block_.
|
187
284
|
#
|
188
|
-
# page.ranks_of { |result result.title =~ /awesome/ }
|
285
|
+
# page.ranks_of { |result| result.title =~ /awesome/ }
|
189
286
|
#
|
190
287
|
def ranks_of(&block)
|
191
288
|
results_with(&block).ranks
|
@@ -194,7 +291,7 @@ module GScraper
|
|
194
291
|
#
|
195
292
|
# Returns the titles of the results that match the specified _block_.
|
196
293
|
#
|
197
|
-
# page.titles_of { |result result.url.include?('www') }
|
294
|
+
# page.titles_of { |result| result.url.include?('www') }
|
198
295
|
#
|
199
296
|
def titles_of(&block)
|
200
297
|
results_with(&block).titles
|
@@ -203,7 +300,7 @@ module GScraper
|
|
203
300
|
#
|
204
301
|
# Returns the urls of the results that match the specified _block_.
|
205
302
|
#
|
206
|
-
# page.urls_of { |result result.summary =~ /awesome pants/ }
|
303
|
+
# page.urls_of { |result| result.summary =~ /awesome pants/ }
|
207
304
|
#
|
208
305
|
def urls_of(&block)
|
209
306
|
results_with(&block).urls
|
@@ -213,12 +310,53 @@ module GScraper
|
|
213
310
|
# Returns the summaries of the results that match the specified
|
214
311
|
# _block_.
|
215
312
|
#
|
216
|
-
# page.summaries_of { |result result.title =~ /what if/ }
|
313
|
+
# page.summaries_of { |result| result.title =~ /what if/ }
|
217
314
|
#
|
218
315
|
def summaries_of(&block)
|
219
316
|
results_with(&block).summaries
|
220
317
|
end
|
221
318
|
|
319
|
+
#
|
320
|
+
# Returns the cached URLs of the results that match the specified
|
321
|
+
# _block_.
|
322
|
+
#
|
323
|
+
# page.cached_urls_of { |result| result.title =~ /howdy/ }
|
324
|
+
#
|
325
|
+
def cached_urls_of(&block)
|
326
|
+
results_with(&block).cached_urls
|
327
|
+
end
|
328
|
+
|
329
|
+
#
|
330
|
+
# Returns the cached pages of the results that match the specified
|
331
|
+
# _block_. If _opts_ are given, they will be used in accessing
|
332
|
+
# the cached pages.
|
333
|
+
#
|
334
|
+
# page.cached_pages_of { |result| result.title =~ /dude/ }
|
335
|
+
#
|
336
|
+
def cached_pages_of(opts={},&block)
|
337
|
+
results_with(&block).cached_pages(opts)
|
338
|
+
end
|
339
|
+
|
340
|
+
#
|
341
|
+
# Returns the similar query URLs of the results that match the
|
342
|
+
# specified _block_.
|
343
|
+
#
|
344
|
+
# page.similar_urls_of { |result| result.title =~ /what if/ }
|
345
|
+
#
|
346
|
+
def similar_urls_of(&block)
|
347
|
+
results_with(&block).similar_urls
|
348
|
+
end
|
349
|
+
|
350
|
+
#
|
351
|
+
# Returns the similar Queries of the results that match the
|
352
|
+
# specified _block_.
|
353
|
+
#
|
354
|
+
# page.similar_queries_of { |result| result.title =~ /hackety/ }
|
355
|
+
#
|
356
|
+
def similar_queries_of(&block)
|
357
|
+
results_with(&block).similar_queries
|
358
|
+
end
|
359
|
+
|
222
360
|
end
|
223
361
|
end
|
224
362
|
end
|
@@ -10,7 +10,8 @@ module GScraper
|
|
10
10
|
module Search
|
11
11
|
class Query
|
12
12
|
|
13
|
-
|
13
|
+
SEARCH_HOST = 'www.google.com'
|
14
|
+
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
14
15
|
|
15
16
|
RESULTS_PER_PAGE = 10
|
16
17
|
|
@@ -90,7 +91,7 @@ module GScraper
|
|
90
91
|
def initialize(opts={},&block)
|
91
92
|
super()
|
92
93
|
|
93
|
-
@results_per_page = opts[:results_per_page] || RESULTS_PER_PAGE
|
94
|
+
@results_per_page = (opts[:results_per_page] || RESULTS_PER_PAGE)
|
94
95
|
|
95
96
|
@query = opts[:query]
|
96
97
|
@exact_phrase = opts[:exact_phrase]
|
@@ -104,12 +105,29 @@ module GScraper
|
|
104
105
|
|
105
106
|
if opts[:within_past_day]
|
106
107
|
@within_past_day = opts[:within_past_day]
|
108
|
+
@within_past_week = false
|
109
|
+
@within_past_months = false
|
110
|
+
@within_past_year = false
|
107
111
|
elsif opts[:within_past_week]
|
112
|
+
@within_past_day = false
|
108
113
|
@within_past_week = opts[:within_past_week]
|
114
|
+
@within_past_months = false
|
115
|
+
@within_past_year = false
|
109
116
|
elsif opts[:within_past_months]
|
117
|
+
@within_past_day = false
|
118
|
+
@within_past_week = false
|
110
119
|
@within_past_months = opts[:within_past_months]
|
120
|
+
@within_past_year = false
|
111
121
|
elsif opts[:within_past_year]
|
122
|
+
@within_past_day = false
|
123
|
+
@within_past_week = false
|
124
|
+
@within_past_months = false
|
112
125
|
@within_past_year = opts[:within_past_year]
|
126
|
+
else
|
127
|
+
@within_past_day = false
|
128
|
+
@within_past_week = false
|
129
|
+
@within_past_months = false
|
130
|
+
@within_past_year = false
|
113
131
|
end
|
114
132
|
|
115
133
|
@numeric_range = opts[:numeric_range]
|
@@ -318,7 +336,7 @@ module GScraper
|
|
318
336
|
def page_url(page_index)
|
319
337
|
url = search_url
|
320
338
|
|
321
|
-
url.query_params['start'] =
|
339
|
+
url.query_params['start'] = page_result_offset(page_index)
|
322
340
|
url.query_params['sa'] = 'N'
|
323
341
|
|
324
342
|
return url
|
@@ -327,33 +345,67 @@ module GScraper
|
|
327
345
|
#
|
328
346
|
# Returns a Page object containing Result objects at the specified
|
329
347
|
# _page_index_. If _opts_ are given, they will be used in accessing
|
330
|
-
# the SEARCH_URL.
|
348
|
+
# the SEARCH_URL. If a _block_ is given, it will be passed the newly
|
349
|
+
# created Page.
|
331
350
|
#
|
332
|
-
def page(page_index,opts={})
|
351
|
+
def page(page_index,opts={},&block)
|
333
352
|
doc = Hpricot(GScraper.open(page_url(page_index),opts))
|
353
|
+
|
334
354
|
new_page = Page.new
|
355
|
+
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
356
|
+
|
357
|
+
results.each_with_index do |result,index|
|
358
|
+
rank = page_result_offset(page_index) + (index + 1)
|
359
|
+
title = result.at('//h2.r').inner_text
|
360
|
+
url = result.at('//h2.r/a').get_attribute('href')
|
361
|
+
|
362
|
+
summary = result.at('//td.j//font').children[0...-3].inject('') do |accum,elem|
|
363
|
+
accum + elem.inner_text
|
364
|
+
end
|
365
|
+
|
366
|
+
cached_url = nil
|
367
|
+
similar_url = nil
|
335
368
|
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
url = result.search('//h2.r/a').first.get_attribute('href')
|
340
|
-
# TODO: exclude URL and Links from summary text
|
341
|
-
summary = result.search('//td.j').first.inner_text
|
369
|
+
if (cached_link = result.at('//td.j//font/nobr/a:first'))
|
370
|
+
cached_url = cached_link.get_attribute('href')
|
371
|
+
end
|
342
372
|
|
343
|
-
|
373
|
+
if (similar_link = result.at('//td.j//font/nobr/a:last'))
|
374
|
+
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
375
|
+
end
|
344
376
|
|
345
|
-
new_page << Result.new(rank,title,url,summary)
|
377
|
+
new_page << Result.new(rank,title,url,summary,cached_url,similar_url)
|
346
378
|
end
|
347
379
|
|
380
|
+
block.call(new_page) if block
|
348
381
|
return new_page
|
349
382
|
end
|
350
383
|
|
351
384
|
#
|
352
|
-
# Returns the
|
353
|
-
# will be used in accessing the SEARCH_URL.
|
385
|
+
# Returns the Results on the first page. If _opts_ are given, they
|
386
|
+
# will be used in accessing the SEARCH_URL. If a _block_ is given
|
387
|
+
# it will be passed the newly created Page.
|
388
|
+
#
|
389
|
+
def first_page(opts={},&block)
|
390
|
+
page(1,opts,&block)
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Returns the Result at the specified _index_. If _opts_ are given,
|
395
|
+
# they will be used in accessing the Page containing the requested
|
396
|
+
# Result.
|
354
397
|
#
|
355
|
-
def
|
356
|
-
page(
|
398
|
+
def result_at(index,opts={})
|
399
|
+
page(result_page_index(index),opts)[page_result_index(index)]
|
400
|
+
end
|
401
|
+
|
402
|
+
#
|
403
|
+
# Returns the first Result of the Query. If _opts_ are
|
404
|
+
# given, they will be used in accessing the Page containing the
|
405
|
+
# requested Result.
|
406
|
+
#
|
407
|
+
def first_result(opts={})
|
408
|
+
result_at(1,opts)
|
357
409
|
end
|
358
410
|
|
359
411
|
#
|
@@ -387,8 +439,22 @@ module GScraper
|
|
387
439
|
#
|
388
440
|
# Returns the rank offset for the specified _page_index_.
|
389
441
|
#
|
390
|
-
def
|
391
|
-
(page_index.to_i - 1) * @
|
442
|
+
def page_result_offset(page_index)
|
443
|
+
(page_index.to_i - 1) * @results_per_page.to_i
|
444
|
+
end
|
445
|
+
|
446
|
+
#
|
447
|
+
# Returns the in-Page index of the _result_index_.
|
448
|
+
#
|
449
|
+
def page_result_index(result_index)
|
450
|
+
(result_index.to_i - 1) % @results_per_page.to_i
|
451
|
+
end
|
452
|
+
|
453
|
+
#
|
454
|
+
# Returns the page index for the specified _result_index_.
|
455
|
+
#
|
456
|
+
def result_page_index(result_index)
|
457
|
+
((result_index.to_i - 1) / @results_per_page.to_i) + 1
|
392
458
|
end
|
393
459
|
|
394
460
|
end
|
@@ -14,15 +14,54 @@ module GScraper
|
|
14
14
|
# Summary from the result page
|
15
15
|
attr_reader :summary
|
16
16
|
|
17
|
+
# URL of the cached result page
|
18
|
+
attr_reader :cached_url
|
19
|
+
|
20
|
+
# URL of the similar results Query
|
21
|
+
attr_reader :similar_url
|
22
|
+
|
17
23
|
#
|
18
24
|
# Creates a new Result object with the given _rank_, _title_
|
19
|
-
# _summary_, _url_ and
|
25
|
+
# _url_, _summary_, _cached_url_ and _similar_url_.
|
20
26
|
#
|
21
|
-
def initialize(rank,title,url,summary)
|
27
|
+
def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
|
22
28
|
@rank = rank
|
23
29
|
@title = title
|
24
30
|
@url = url
|
25
31
|
@summary = summary
|
32
|
+
@cached_url = cached_url
|
33
|
+
@similar_url = similar_url
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# Opens the URL of the cached page for the Result. If _opts_ are
|
38
|
+
# given, they will be used in accessing the cached page URL.
|
39
|
+
#
|
40
|
+
# result.cached_page # => File
|
41
|
+
#
|
42
|
+
def cached_page(opts={})
|
43
|
+
if @cached_url
|
44
|
+
return GScraper.open(@cached_url,opts)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Create a new Query for results that are similar to the Result. If
|
50
|
+
# a _block_ is given, it will be passed the newly created Query
|
51
|
+
# object.
|
52
|
+
#
|
53
|
+
# result.similar_query # => Query
|
54
|
+
#
|
55
|
+
# result.similar_query do |q|
|
56
|
+
# q.first_page.each_url do |url|
|
57
|
+
# puts url
|
58
|
+
# end
|
59
|
+
# end
|
60
|
+
#
|
61
|
+
def similar_query(&block)
|
62
|
+
if @similar_url
|
63
|
+
return Query.from_url(@similar_url,&block)
|
64
|
+
end
|
26
65
|
end
|
27
66
|
|
28
67
|
#
|
data/lib/gscraper/version.rb
CHANGED
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gscraper/search/page'
|
3
|
+
require 'gscraper/search/query'
|
4
|
+
|
5
|
+
class PageResults < Test::Unit::TestCase
|
6
|
+
|
7
|
+
include GScraper
|
8
|
+
|
9
|
+
def setup
|
10
|
+
@query = Search::Query.new(:query => 'ruby')
|
11
|
+
@page = @query.first_page
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_results_per_page
|
15
|
+
assert_equal @page.length, @query.results_per_page
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_first_result
|
19
|
+
assert_not_nil @page[0], "First Page for Query 'ruby' does not have a first Result"
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_last_result
|
23
|
+
assert_not_nil @page[-1], "First Page for Query 'ruby' does not have a last Result"
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_ranks
|
27
|
+
ranks = @page.ranks
|
28
|
+
|
29
|
+
assert_not_nil ranks, "First Page for Query 'ruby' does not have any ranks"
|
30
|
+
|
31
|
+
assert_equal ranks.class, Array, "The ranks of a Page must be an Array"
|
32
|
+
|
33
|
+
assert_equal ranks.empty?, false, "The ranks of the First Page are empty"
|
34
|
+
|
35
|
+
assert_equal ranks.length, @page.length
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_titles
|
39
|
+
titles = @page.titles
|
40
|
+
|
41
|
+
assert_not_nil titles, "First Page for Query 'ruby' does not have any titles"
|
42
|
+
|
43
|
+
assert_equal titles.class, Array, "The titles of a Page must be an Array"
|
44
|
+
|
45
|
+
assert_equal titles.empty?, false, "The titles of the First Page are empty"
|
46
|
+
|
47
|
+
assert_equal titles.length, @page.length
|
48
|
+
end
|
49
|
+
|
50
|
+
def test_urls
|
51
|
+
urls = @page.urls
|
52
|
+
|
53
|
+
assert_not_nil urls, "First Page for Query 'ruby' does not have any urls"
|
54
|
+
|
55
|
+
assert_equal urls.class, Array, "The urls of a Page must be an Array"
|
56
|
+
|
57
|
+
assert_equal urls.empty?, false, "The urls of the First Page are empty"
|
58
|
+
|
59
|
+
assert_equal urls.length, @page.length
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_summaries
|
63
|
+
summaries = @page.summaries
|
64
|
+
|
65
|
+
assert_not_nil summaries, "First Page for Query 'ruby' does not have any summaries"
|
66
|
+
|
67
|
+
assert_equal summaries.class, Array, "The summaries of a Page must be an Array"
|
68
|
+
|
69
|
+
assert_equal summaries.empty?, false, "The summaries of the First Page are empty"
|
70
|
+
|
71
|
+
assert_equal summaries.length, @page.length
|
72
|
+
end
|
73
|
+
|
74
|
+
def test_cached_urls
|
75
|
+
cached_urls = @page.cached_urls
|
76
|
+
|
77
|
+
assert_not_nil cached_urls, "First Page for Query 'ruby' does not have any cached_urls"
|
78
|
+
|
79
|
+
assert_equal cached_urls.class, Array, "The cached_urls of a Page must be an Array"
|
80
|
+
|
81
|
+
assert_equal cached_urls.empty?, false, "The cached_urls of the First Page are empty"
|
82
|
+
|
83
|
+
assert_equal cached_urls.length, @page.length
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_similar_urls
|
87
|
+
similar_urls = @page.similar_urls
|
88
|
+
|
89
|
+
assert_not_nil similar_urls, "First Page for Query 'ruby' does not have any similar URLs"
|
90
|
+
|
91
|
+
assert_equal similar_urls.class, Array, "The similar URLs of a Page must be an Array"
|
92
|
+
|
93
|
+
assert_equal similar_urls.empty?, false, "The similar URLs of the First Page are empty"
|
94
|
+
|
95
|
+
assert_equal similar_urls.length, @page.length
|
96
|
+
end
|
97
|
+
|
98
|
+
def teardown
|
99
|
+
@page = nil
|
100
|
+
@query = nil
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
@@ -11,10 +11,6 @@ class QueryFromURL < Test::Unit::TestCase
|
|
11
11
|
@query = Search::Query.from_url(QUERY_URL)
|
12
12
|
end
|
13
13
|
|
14
|
-
def teardown
|
15
|
-
@query = nil
|
16
|
-
end
|
17
|
-
|
18
14
|
def test_query
|
19
15
|
assert_equal @query.query, 'test'
|
20
16
|
end
|
@@ -47,4 +43,8 @@ class QueryFromURL < Test::Unit::TestCase
|
|
47
43
|
assert_nil @query.links_to
|
48
44
|
end
|
49
45
|
|
46
|
+
def teardown
|
47
|
+
@query = nil
|
48
|
+
end
|
49
|
+
|
50
50
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gscraper/search/query'
|
3
|
+
|
4
|
+
class QueryPages < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include GScraper
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@query = Search::Query.new(:query => 'ruby')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_first_page
|
13
|
+
page = @query.first_page
|
14
|
+
|
15
|
+
assert_not_nil page
|
16
|
+
assert_equal page.empty?, false, "Query of 'ruby' has zero results"
|
17
|
+
assert_equal page.length, @query.results_per_page
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_second_page
|
21
|
+
page = @query.page(2)
|
22
|
+
|
23
|
+
assert_not_nil page
|
24
|
+
assert_equal page.empty?, false, "Query of 'ruby' has zero results"
|
25
|
+
assert_equal page.length, @query.results_per_page
|
26
|
+
end
|
27
|
+
|
28
|
+
def teardown
|
29
|
+
@query = nil
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gscraper/search/query'
|
3
|
+
|
4
|
+
class QueryResult < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include GScraper
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@query = Search::Query.new(:query => 'ruby')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_first_result
|
13
|
+
result = @query.first_result
|
14
|
+
|
15
|
+
assert_not_nil result, "The Query for 'ruby' has no first-result"
|
16
|
+
assert_equal result.rank, 1, "The first result for the Query 'ruby' does not have the rank of 1"
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_second_result
|
20
|
+
result = @query.result_at(2)
|
21
|
+
|
22
|
+
assert_not_nil result, "The Query for 'ruby' has no second-result"
|
23
|
+
assert_equal result.rank, 2, "The second result for the Query 'ruby' does not have the rank of 2"
|
24
|
+
end
|
25
|
+
|
26
|
+
def teardown
|
27
|
+
@query = nil
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
data/test/test_gscraper.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: gscraper
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.
|
7
|
-
date: 2007-12-
|
6
|
+
version: 0.1.4
|
7
|
+
date: 2007-12-22 00:00:00 -08:00
|
8
8
|
summary: A ruby web-scraping interface to various Google Services
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -48,6 +48,9 @@ files:
|
|
48
48
|
- lib/gscraper/search.rb
|
49
49
|
- test/test_gscraper.rb
|
50
50
|
- test/search/query_from_url.rb
|
51
|
+
- test/search/query_result.rb
|
52
|
+
- test/search/query_pages.rb
|
53
|
+
- test/search/page_results.rb
|
51
54
|
test_files:
|
52
55
|
- test/test_gscraper.rb
|
53
56
|
rdoc_options:
|
@@ -65,6 +68,24 @@ extensions: []
|
|
65
68
|
requirements: []
|
66
69
|
|
67
70
|
dependencies:
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: hpricot
|
73
|
+
version_requirement:
|
74
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">"
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: 0.0.0
|
79
|
+
version:
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: mechanize
|
82
|
+
version_requirement:
|
83
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">"
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: 0.0.0
|
88
|
+
version:
|
68
89
|
- !ruby/object:Gem::Dependency
|
69
90
|
name: hoe
|
70
91
|
version_requirement:
|