gscraper 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +17 -2
- data/Manifest.txt +3 -0
- data/Rakefile +1 -0
- data/lib/gscraper/extensions/uri/http.rb +8 -2
- data/lib/gscraper/gscraper.rb +4 -3
- data/lib/gscraper/search/page.rb +181 -43
- data/lib/gscraper/search/query.rb +85 -19
- data/lib/gscraper/search/result.rb +41 -2
- data/lib/gscraper/version.rb +1 -1
- data/test/search/page_results.rb +103 -0
- data/test/search/query_from_url.rb +4 -4
- data/test/search/query_pages.rb +32 -0
- data/test/search/query_result.rb +30 -0
- data/test/test_gscraper.rb +3 -0
- metadata +23 -2
data/History.txt
CHANGED
@@ -1,7 +1,22 @@
|
|
1
|
+
== 0.1.4 / 2007-12-23
|
2
|
+
|
3
|
+
* Added Search::Query#result_at for easier access of a single result at
|
4
|
+
a given index.
|
5
|
+
* Added scraping of the "Cached" and "Similar Pages" URLs of Search
|
6
|
+
Results.
|
7
|
+
* Added methods to Search::Page for accessing cached URLs, cached pages,
|
8
|
+
similar query URLs and similar Queries in mass.
|
9
|
+
* Search::Query#page and Search::Query#first_page now can receive blocks.
|
10
|
+
* Improved the formatting of URL query parameters.
|
11
|
+
* Added more unit-tests.
|
12
|
+
* Fixed scraping of Search Result summaries.
|
13
|
+
* Fixed various bugs in Search::Query uncovered during unit-testing.
|
14
|
+
* Fixed typos in Search::Page's documentation.
|
15
|
+
|
1
16
|
== 0.1.3 / 2007-12-22
|
2
17
|
|
3
|
-
* Added the Page class, which contains many of convenance methods
|
4
|
-
searching through the results within a Page.
|
18
|
+
* Added the Search::Page class, which contains many convenience methods
|
19
|
+
for searching through the results within a Page.
|
5
20
|
|
6
21
|
== 0.1.2 / 2007-12-22
|
7
22
|
|
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -12,6 +12,7 @@ Hoe.new('gscraper', GScraper::VERSION) do |p|
|
|
12
12
|
p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
|
13
13
|
p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
|
14
14
|
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
15
|
+
p.extra_deps = ['hpricot', 'mechanize']
|
15
16
|
end
|
16
17
|
|
17
18
|
# vim: syntax=Ruby
|
@@ -4,12 +4,14 @@ module URI
|
|
4
4
|
# Query parameters
|
5
5
|
attr_reader :query_params
|
6
6
|
|
7
|
+
alias_method :old_initialize, :initialize
|
8
|
+
|
7
9
|
#
|
8
10
|
# Creates a new URI::HTTP object and initializes query_params as a
|
9
11
|
# new Hash.
|
10
12
|
#
|
11
13
|
def initialize(*args)
|
12
|
-
|
14
|
+
old_initialize(*args)
|
13
15
|
|
14
16
|
@query_params = {}
|
15
17
|
parse_query_params
|
@@ -57,7 +59,11 @@ module URI
|
|
57
59
|
if value==true
|
58
60
|
"#{name}=active"
|
59
61
|
elsif value
|
60
|
-
|
62
|
+
if value.kind_of?(Array)
|
63
|
+
"#{name}=#{URI.encode(value.join(' '))}"
|
64
|
+
else
|
65
|
+
"#{name}=#{URI.encode(value.to_s)}"
|
66
|
+
end
|
61
67
|
else
|
62
68
|
"#{name}="
|
63
69
|
end
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -13,7 +13,7 @@ module GScraper
|
|
13
13
|
# Returns the GScraper User-Agent
|
14
14
|
#
|
15
15
|
def GScraper.user_agent
|
16
|
-
@user_agent
|
16
|
+
@user_agent ||= nil
|
17
17
|
end
|
18
18
|
|
19
19
|
#
|
@@ -24,10 +24,11 @@ module GScraper
|
|
24
24
|
end
|
25
25
|
|
26
26
|
#
|
27
|
-
# Opens the _uri_ with the given _opts_. The contents of the _uri_ will
|
28
|
-
#
|
27
|
+
# Opens the _uri_ with the given _opts_. The contents of the _uri_ will be
|
28
|
+
# returned.
|
29
29
|
#
|
30
30
|
# GScraper.open('http://www.hackety.org/')
|
31
|
+
#
|
31
32
|
# GScraper.open('http://tenderlovemaking.com/',
|
32
33
|
# :user_agent_alias => 'Linux Mozilla')
|
33
34
|
# GScraper.open('http://www.wired.com/', :user_agent => 'the future')
|
data/lib/gscraper/search/page.rb
CHANGED
@@ -115,77 +115,174 @@ module GScraper
|
|
115
115
|
|
116
116
|
#
|
117
117
|
# Returns an Array containing the ranks of the results within the
|
118
|
-
# Page.
|
118
|
+
# Page.
|
119
119
|
#
|
120
120
|
# page.ranks # => [...]
|
121
121
|
#
|
122
|
-
|
123
|
-
|
124
|
-
# end
|
125
|
-
#
|
126
|
-
def ranks(&block)
|
127
|
-
mapped = map { |result| result.rank }
|
128
|
-
|
129
|
-
mapped.each(&block) if block
|
130
|
-
return mapped
|
122
|
+
def ranks
|
123
|
+
map { |result| result.rank }
|
131
124
|
end
|
132
125
|
|
133
126
|
#
|
134
127
|
# Returns an Array containing the titles of the results within the
|
135
|
-
# Page.
|
128
|
+
# Page.
|
136
129
|
#
|
137
130
|
# page.titles # => [...]
|
138
131
|
#
|
139
|
-
|
140
|
-
|
141
|
-
# end
|
142
|
-
#
|
143
|
-
def titles(&block)
|
144
|
-
mapped = map { |result| result.title }
|
145
|
-
|
146
|
-
mapped.each(&block) if block
|
147
|
-
return mapped
|
132
|
+
def titles
|
133
|
+
map { |result| result.title }
|
148
134
|
end
|
149
135
|
|
150
136
|
#
|
151
137
|
# Returns an Array containing the URLs of the results within the
|
152
|
-
# Page.
|
138
|
+
# Page.
|
153
139
|
#
|
154
140
|
# page.urls # => [...]
|
155
141
|
#
|
156
|
-
|
157
|
-
|
158
|
-
# end
|
159
|
-
#
|
160
|
-
def urls(&block)
|
161
|
-
mapped = map { |result| result.url }
|
162
|
-
|
163
|
-
mapped.each(&block) if block
|
164
|
-
return mapped
|
142
|
+
def urls
|
143
|
+
map { |result| result.url }
|
165
144
|
end
|
166
145
|
|
167
146
|
#
|
168
147
|
# Returns an Array containing the summaries of the results within the
|
169
|
-
# Page.
|
170
|
-
# _block_.
|
148
|
+
# Page.
|
171
149
|
#
|
172
150
|
# page.summaries # => [...]
|
173
151
|
#
|
174
|
-
|
175
|
-
|
176
|
-
|
152
|
+
def summaries
|
153
|
+
map { |result| result.summary }
|
154
|
+
end
|
155
|
+
|
156
|
+
#
|
157
|
+
# Returns an Array containing the cached URLs of the results within
|
158
|
+
# the Page.
|
159
|
+
#
|
160
|
+
# page.cached_urls # => [...]
|
177
161
|
#
|
178
|
-
def
|
179
|
-
|
162
|
+
def cached_urls
|
163
|
+
map { |result| result.cached_url }
|
164
|
+
end
|
180
165
|
|
181
|
-
|
182
|
-
|
166
|
+
#
|
167
|
+
# Returns an Array containing the cached pages of the results within
|
168
|
+
# the Page. If _opts_ are given, they will be used in accessing the
|
169
|
+
# cached page.
|
170
|
+
#
|
171
|
+
# page.cached_pages # => [...]
|
172
|
+
#
|
173
|
+
def cached_pages(opts={})
|
174
|
+
map { |result| result.cached_page(opts) }
|
175
|
+
end
|
176
|
+
|
177
|
+
#
|
178
|
+
# Returns an Array containing the similar Query URLs of the results
|
179
|
+
# within the Page.
|
180
|
+
#
|
181
|
+
# page.similar_urls # => [...]
|
182
|
+
#
|
183
|
+
def similar_urls
|
184
|
+
map { |result| result.similar_url }
|
185
|
+
end
|
186
|
+
|
187
|
+
#
|
188
|
+
# Returns an Array containing the similar Queries of the results
|
189
|
+
# within the Page.
|
190
|
+
#
|
191
|
+
# page.similar_queries # => [...]
|
192
|
+
#
|
193
|
+
def similar_queries
|
194
|
+
map { |result| result.similar_query }
|
195
|
+
end
|
196
|
+
|
197
|
+
#
|
198
|
+
# Iterates over each result's rank within the Page, passing each to
|
199
|
+
# the given _block_.
|
200
|
+
#
|
201
|
+
# each_rank { |rank| puts rank }
|
202
|
+
#
|
203
|
+
def each_rank(&block)
|
204
|
+
ranks.each(&block)
|
205
|
+
end
|
206
|
+
|
207
|
+
#
|
208
|
+
# Iterates over each result's title within the Page, passing each to
|
209
|
+
# the given _block_.
|
210
|
+
#
|
211
|
+
# each_title { |title| puts title }
|
212
|
+
#
|
213
|
+
def each_title(&block)
|
214
|
+
titles.each(&block)
|
215
|
+
end
|
216
|
+
|
217
|
+
#
|
218
|
+
# Iterates over each result's url within the Page, passing each to
|
219
|
+
# the given _block_.
|
220
|
+
#
|
221
|
+
# each_url { |url| puts url }
|
222
|
+
#
|
223
|
+
def each_url(&block)
|
224
|
+
urls.each(&block)
|
225
|
+
end
|
226
|
+
|
227
|
+
#
|
228
|
+
# Iterates over each result's summary within the Page, passing each
|
229
|
+
# to the given _block_.
|
230
|
+
#
|
231
|
+
# each_summary { |summary| puts summary }
|
232
|
+
#
|
233
|
+
def each_summary(&block)
|
234
|
+
summaries.each(&block)
|
235
|
+
end
|
236
|
+
|
237
|
+
#
|
238
|
+
# Iterates over each result's cached URLs within the Page, passing
|
239
|
+
# each to the given _block_.
|
240
|
+
#
|
241
|
+
# each_cached_url { |url| puts url }
|
242
|
+
#
|
243
|
+
def each_cached_url(&block)
|
244
|
+
cached_urls.each(&block)
|
245
|
+
end
|
246
|
+
|
247
|
+
#
|
248
|
+
# Iterates over each result's cached pages within the Page, passing
|
249
|
+
# each to the given _block_. If _opts_ are given, they will be used
|
250
|
+
# in accessing the cached pages.
|
251
|
+
#
|
252
|
+
# each_cached_page { |page| puts page.readlines }
|
253
|
+
#
|
254
|
+
def each_cached_page(opts={},&block)
|
255
|
+
cached_pages(opts).each(&block)
|
256
|
+
end
|
257
|
+
|
258
|
+
#
|
259
|
+
# Iterates over each result's similar Query URLs within the Page,
|
260
|
+
# passing each to the given _block_.
|
261
|
+
#
|
262
|
+
# each_similar_url { |url| puts url }
|
263
|
+
#
|
264
|
+
def each_similar_url(&block)
|
265
|
+
similar_urls.each(&block)
|
266
|
+
end
|
267
|
+
|
268
|
+
#
|
269
|
+
# Iterates over each result's similar Query within the Page, passing
|
270
|
+
# each to the given _block_.
|
271
|
+
#
|
272
|
+
# each_similar_query do |q|
|
273
|
+
# q.first_page do |page|
|
274
|
+
# puts page.urls.join("\n")
|
275
|
+
# end
|
276
|
+
# end
|
277
|
+
#
|
278
|
+
def each_similar_query(&block)
|
279
|
+
similar_queries.each(&block)
|
183
280
|
end
|
184
281
|
|
185
282
|
#
|
186
283
|
# Returns the ranks of the results that match the specified _block_.
|
187
284
|
#
|
188
|
-
# page.ranks_of { |result result.title =~ /awesome/ }
|
285
|
+
# page.ranks_of { |result| result.title =~ /awesome/ }
|
189
286
|
#
|
190
287
|
def ranks_of(&block)
|
191
288
|
results_with(&block).ranks
|
@@ -194,7 +291,7 @@ module GScraper
|
|
194
291
|
#
|
195
292
|
# Returns the titles of the results that match the specified _block_.
|
196
293
|
#
|
197
|
-
# page.titles_of { |result result.url.include?('www') }
|
294
|
+
# page.titles_of { |result| result.url.include?('www') }
|
198
295
|
#
|
199
296
|
def titles_of(&block)
|
200
297
|
results_with(&block).titles
|
@@ -203,7 +300,7 @@ module GScraper
|
|
203
300
|
#
|
204
301
|
# Returns the urls of the results that match the specified _block_.
|
205
302
|
#
|
206
|
-
# page.urls_of { |result result.summary =~ /awesome pants/ }
|
303
|
+
# page.urls_of { |result| result.summary =~ /awesome pants/ }
|
207
304
|
#
|
208
305
|
def urls_of(&block)
|
209
306
|
results_with(&block).urls
|
@@ -213,12 +310,53 @@ module GScraper
|
|
213
310
|
# Returns the summaries of the results that match the specified
|
214
311
|
# _block_.
|
215
312
|
#
|
216
|
-
# page.summaries_of { |result result.title =~ /what if/ }
|
313
|
+
# page.summaries_of { |result| result.title =~ /what if/ }
|
217
314
|
#
|
218
315
|
def summaries_of(&block)
|
219
316
|
results_with(&block).summaries
|
220
317
|
end
|
221
318
|
|
319
|
+
#
|
320
|
+
# Returns the cached URLs of the results that match the specified
|
321
|
+
# _block_.
|
322
|
+
#
|
323
|
+
# page.cached_urls_of { |result| result.title =~ /howdy/ }
|
324
|
+
#
|
325
|
+
def cached_urls_of(&block)
|
326
|
+
results_with(&block).cached_urls
|
327
|
+
end
|
328
|
+
|
329
|
+
#
|
330
|
+
# Returns the cached pages of the results that match the specified
|
331
|
+
# _block_. If _opts_ are given, they will be used in accessing
|
332
|
+
# the cached pages.
|
333
|
+
#
|
334
|
+
# page.cached_pages_of { |result| result.title =~ /dude/ }
|
335
|
+
#
|
336
|
+
def cached_pages_of(opts={},&block)
|
337
|
+
results_with(&block).cached_pages(opts)
|
338
|
+
end
|
339
|
+
|
340
|
+
#
|
341
|
+
# Returns the similar query URLs of the results that match the
|
342
|
+
# specified _block_.
|
343
|
+
#
|
344
|
+
# page.similar_urls_of { |result| result.title =~ /what if/ }
|
345
|
+
#
|
346
|
+
def similar_urls_of(&block)
|
347
|
+
results_with(&block).similar_urls
|
348
|
+
end
|
349
|
+
|
350
|
+
#
|
351
|
+
# Returns the similar Queries of the results that match the
|
352
|
+
# specified _block_.
|
353
|
+
#
|
354
|
+
# page.similar_queries_of { |result| result.title =~ /hackety/ }
|
355
|
+
#
|
356
|
+
def similar_queries_of(&block)
|
357
|
+
results_with(&block).similar_queries
|
358
|
+
end
|
359
|
+
|
222
360
|
end
|
223
361
|
end
|
224
362
|
end
|
@@ -10,7 +10,8 @@ module GScraper
|
|
10
10
|
module Search
|
11
11
|
class Query
|
12
12
|
|
13
|
-
|
13
|
+
SEARCH_HOST = 'www.google.com'
|
14
|
+
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
14
15
|
|
15
16
|
RESULTS_PER_PAGE = 10
|
16
17
|
|
@@ -90,7 +91,7 @@ module GScraper
|
|
90
91
|
def initialize(opts={},&block)
|
91
92
|
super()
|
92
93
|
|
93
|
-
@results_per_page = opts[:results_per_page] || RESULTS_PER_PAGE
|
94
|
+
@results_per_page = (opts[:results_per_page] || RESULTS_PER_PAGE)
|
94
95
|
|
95
96
|
@query = opts[:query]
|
96
97
|
@exact_phrase = opts[:exact_phrase]
|
@@ -104,12 +105,29 @@ module GScraper
|
|
104
105
|
|
105
106
|
if opts[:within_past_day]
|
106
107
|
@within_past_day = opts[:within_past_day]
|
108
|
+
@within_past_week = false
|
109
|
+
@within_past_months = false
|
110
|
+
@within_past_year = false
|
107
111
|
elsif opts[:within_past_week]
|
112
|
+
@within_past_day = false
|
108
113
|
@within_past_week = opts[:within_past_week]
|
114
|
+
@within_past_months = false
|
115
|
+
@within_past_year = false
|
109
116
|
elsif opts[:within_past_months]
|
117
|
+
@within_past_day = false
|
118
|
+
@within_past_week = false
|
110
119
|
@within_past_months = opts[:within_past_months]
|
120
|
+
@within_past_year = false
|
111
121
|
elsif opts[:within_past_year]
|
122
|
+
@within_past_day = false
|
123
|
+
@within_past_week = false
|
124
|
+
@within_past_months = false
|
112
125
|
@within_past_year = opts[:within_past_year]
|
126
|
+
else
|
127
|
+
@within_past_day = false
|
128
|
+
@within_past_week = false
|
129
|
+
@within_past_months = false
|
130
|
+
@within_past_year = false
|
113
131
|
end
|
114
132
|
|
115
133
|
@numeric_range = opts[:numeric_range]
|
@@ -318,7 +336,7 @@ module GScraper
|
|
318
336
|
def page_url(page_index)
|
319
337
|
url = search_url
|
320
338
|
|
321
|
-
url.query_params['start'] =
|
339
|
+
url.query_params['start'] = page_result_offset(page_index)
|
322
340
|
url.query_params['sa'] = 'N'
|
323
341
|
|
324
342
|
return url
|
@@ -327,33 +345,67 @@ module GScraper
|
|
327
345
|
#
|
328
346
|
# Returns a Page object containing Result objects at the specified
|
329
347
|
# _page_index_. If _opts_ are given, they will be used in accessing
|
330
|
-
# the SEARCH_URL.
|
348
|
+
# the SEARCH_URL. If a _block_ is given, it will be passed the newly
|
349
|
+
# created Page.
|
331
350
|
#
|
332
|
-
def page(page_index,opts={})
|
351
|
+
def page(page_index,opts={},&block)
|
333
352
|
doc = Hpricot(GScraper.open(page_url(page_index),opts))
|
353
|
+
|
334
354
|
new_page = Page.new
|
355
|
+
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
356
|
+
|
357
|
+
results.each_with_index do |result,index|
|
358
|
+
rank = page_result_offset(page_index) + (index + 1)
|
359
|
+
title = result.at('//h2.r').inner_text
|
360
|
+
url = result.at('//h2.r/a').get_attribute('href')
|
361
|
+
|
362
|
+
summary = result.at('//td.j//font').children[0...-3].inject('') do |accum,elem|
|
363
|
+
accum + elem.inner_text
|
364
|
+
end
|
365
|
+
|
366
|
+
cached_url = nil
|
367
|
+
similar_url = nil
|
335
368
|
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
url = result.search('//h2.r/a').first.get_attribute('href')
|
340
|
-
# TODO: exclude URL and Links from summary text
|
341
|
-
summary = result.search('//td.j').first.inner_text
|
369
|
+
if (cached_link = result.at('//td.j//font/nobr/a:first'))
|
370
|
+
cached_url = cached_link.get_attribute('href')
|
371
|
+
end
|
342
372
|
|
343
|
-
|
373
|
+
if (similar_link = result.at('//td.j//font/nobr/a:last'))
|
374
|
+
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
375
|
+
end
|
344
376
|
|
345
|
-
new_page << Result.new(rank,title,url,summary)
|
377
|
+
new_page << Result.new(rank,title,url,summary,cached_url,similar_url)
|
346
378
|
end
|
347
379
|
|
380
|
+
block.call(new_page) if block
|
348
381
|
return new_page
|
349
382
|
end
|
350
383
|
|
351
384
|
#
|
352
|
-
# Returns the
|
353
|
-
# will be used in accessing the SEARCH_URL.
|
385
|
+
# Returns the Results on the first page. If _opts_ are given, they
|
386
|
+
# will be used in accessing the SEARCH_URL. If a _block_ is given
|
387
|
+
# it will be passed the newly created Page.
|
388
|
+
#
|
389
|
+
def first_page(opts={},&block)
|
390
|
+
page(1,opts,&block)
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Returns the Result at the specified _index_. If _opts_ are given,
|
395
|
+
# they will be used in accessing the Page containing the requested
|
396
|
+
# Result.
|
354
397
|
#
|
355
|
-
def
|
356
|
-
page(
|
398
|
+
def result_at(index,opts={})
|
399
|
+
page(result_page_index(index),opts)[page_result_index(index)]
|
400
|
+
end
|
401
|
+
|
402
|
+
#
|
403
|
+
# Returns the first Result of the Query. If _opts_ are
|
404
|
+
# given, they will be used in accessing the Page containing the
|
405
|
+
# requested Result.
|
406
|
+
#
|
407
|
+
def first_result(opts={})
|
408
|
+
result_at(1,opts)
|
357
409
|
end
|
358
410
|
|
359
411
|
#
|
@@ -387,8 +439,22 @@ module GScraper
|
|
387
439
|
#
|
388
440
|
# Returns the rank offset for the specified _page_index_.
|
389
441
|
#
|
390
|
-
def
|
391
|
-
(page_index.to_i - 1) * @
|
442
|
+
def page_result_offset(page_index)
|
443
|
+
(page_index.to_i - 1) * @results_per_page.to_i
|
444
|
+
end
|
445
|
+
|
446
|
+
#
|
447
|
+
# Returns the in-Page index of the _result_index_.
|
448
|
+
#
|
449
|
+
def page_result_index(result_index)
|
450
|
+
(result_index.to_i - 1) % @results_per_page.to_i
|
451
|
+
end
|
452
|
+
|
453
|
+
#
|
454
|
+
# Returns the page index for the specified _result_index_
|
455
|
+
#
|
456
|
+
def result_page_index(result_index)
|
457
|
+
((result_index.to_i - 1) / @results_per_page.to_i) + 1
|
392
458
|
end
|
393
459
|
|
394
460
|
end
|
@@ -14,15 +14,54 @@ module GScraper
|
|
14
14
|
# Summary from the result page
|
15
15
|
attr_reader :summary
|
16
16
|
|
17
|
+
# URL of the cached result page
|
18
|
+
attr_reader :cached_url
|
19
|
+
|
20
|
+
# URL of the similar results Query
|
21
|
+
attr_reader :similar_url
|
22
|
+
|
17
23
|
#
|
18
24
|
# Creates a new Result object with the given _rank_, _title_
|
19
|
-
# _summary_, _url_ and
|
25
|
+
# _url_, _summary_, _cached_url_ and _similar_url_.
|
20
26
|
#
|
21
|
-
def initialize(rank,title,url,summary)
|
27
|
+
def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
|
22
28
|
@rank = rank
|
23
29
|
@title = title
|
24
30
|
@url = url
|
25
31
|
@summary = summary
|
32
|
+
@cached_url = cached_url
|
33
|
+
@similar_url = similar_url
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# Opens the URL of the cached page for the Result. If _opts_ are
|
38
|
+
# given, they will be used in accessing the cached page URL.
|
39
|
+
#
|
40
|
+
# result.cached_page # => File
|
41
|
+
#
|
42
|
+
def cached_page(opts={})
|
43
|
+
if @cached_url
|
44
|
+
return GScraper.open(@cached_url,opts)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Create a new Query for results that are similar to the Result. If
|
50
|
+
# a _block_ is given, it will be passed the newly created Query
|
51
|
+
# object.
|
52
|
+
#
|
53
|
+
# result.similar_query # => Query
|
54
|
+
#
|
55
|
+
# result.similar_query do |q|
|
56
|
+
# q.first_page.each_url do |url|
|
57
|
+
# puts url
|
58
|
+
# end
|
59
|
+
# end
|
60
|
+
#
|
61
|
+
def similar_query(&block)
|
62
|
+
if @similar_url
|
63
|
+
return Query.from_url(@similar_url,&block)
|
64
|
+
end
|
26
65
|
end
|
27
66
|
|
28
67
|
#
|
data/lib/gscraper/version.rb
CHANGED
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gscraper/search/page'
|
3
|
+
require 'gscraper/search/query'
|
4
|
+
|
5
|
+
class PageResults < Test::Unit::TestCase
|
6
|
+
|
7
|
+
include GScraper
|
8
|
+
|
9
|
+
def setup
|
10
|
+
@query = Search::Query.new(:query => 'ruby')
|
11
|
+
@page = @query.first_page
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_results_per_page
|
15
|
+
assert_equal @page.length, @query.results_per_page
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_first_result
|
19
|
+
assert_not_nil @page[0], "First Page for Query 'ruby' does not have a first Result"
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_last_result
|
23
|
+
assert_not_nil @page[-1], "First Page for Query 'ruby' does not have a last Result"
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_ranks
|
27
|
+
ranks = @page.ranks
|
28
|
+
|
29
|
+
assert_not_nil ranks, "First Page for Query 'ruby' does not have any ranks"
|
30
|
+
|
31
|
+
assert_equal ranks.class, Array, "The ranks of a Page must be an Array"
|
32
|
+
|
33
|
+
assert_equal ranks.empty?, false, "The ranks of the First Page are empty"
|
34
|
+
|
35
|
+
assert_equal ranks.length, @page.length
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_titles
|
39
|
+
titles = @page.titles
|
40
|
+
|
41
|
+
assert_not_nil titles, "First Page for Query 'ruby' does not have any titles"
|
42
|
+
|
43
|
+
assert_equal titles.class, Array, "The titles of a Page must be an Array"
|
44
|
+
|
45
|
+
assert_equal titles.empty?, false, "The titles of the First Page are empty"
|
46
|
+
|
47
|
+
assert_equal titles.length, @page.length
|
48
|
+
end
|
49
|
+
|
50
|
+
def test_urls
|
51
|
+
urls = @page.urls
|
52
|
+
|
53
|
+
assert_not_nil urls, "First Page for Query 'ruby' does not have any urls"
|
54
|
+
|
55
|
+
assert_equal urls.class, Array, "The urls of a Page must be an Array"
|
56
|
+
|
57
|
+
assert_equal urls.empty?, false, "The urls of the First Page are empty"
|
58
|
+
|
59
|
+
assert_equal urls.length, @page.length
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_summaries
|
63
|
+
summaries = @page.summaries
|
64
|
+
|
65
|
+
assert_not_nil summaries, "First Page for Query 'ruby' does not have any summaries"
|
66
|
+
|
67
|
+
assert_equal summaries.class, Array, "The summaries of a Page must be an Array"
|
68
|
+
|
69
|
+
assert_equal summaries.empty?, false, "The summaries of the First Page are empty"
|
70
|
+
|
71
|
+
assert_equal summaries.length, @page.length
|
72
|
+
end
|
73
|
+
|
74
|
+
def test_cached_urls
|
75
|
+
cached_urls = @page.cached_urls
|
76
|
+
|
77
|
+
assert_not_nil cached_urls, "First Page for Query 'ruby' does not have any cached_urls"
|
78
|
+
|
79
|
+
assert_equal cached_urls.class, Array, "The cached_urls of a Page must be an Array"
|
80
|
+
|
81
|
+
assert_equal cached_urls.empty?, false, "The cached_urls of the First Page are empty"
|
82
|
+
|
83
|
+
assert_equal cached_urls.length, @page.length
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_similar_urls
|
87
|
+
similar_urls = @page.similar_urls
|
88
|
+
|
89
|
+
assert_not_nil similar_urls, "First Page for Query 'ruby' does not have any similar URLs"
|
90
|
+
|
91
|
+
assert_equal similar_urls.class, Array, "The similar URLs of a Page must be an Array"
|
92
|
+
|
93
|
+
assert_equal similar_urls.empty?, false, "The similar URLs of the First Page are empty"
|
94
|
+
|
95
|
+
assert_equal similar_urls.length, @page.length
|
96
|
+
end
|
97
|
+
|
98
|
+
def teardown
|
99
|
+
@page = nil
|
100
|
+
@query = nil
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
@@ -11,10 +11,6 @@ class QueryFromURL < Test::Unit::TestCase
|
|
11
11
|
@query = Search::Query.from_url(QUERY_URL)
|
12
12
|
end
|
13
13
|
|
14
|
-
def teardown
|
15
|
-
@query = nil
|
16
|
-
end
|
17
|
-
|
18
14
|
def test_query
|
19
15
|
assert_equal @query.query, 'test'
|
20
16
|
end
|
@@ -47,4 +43,8 @@ class QueryFromURL < Test::Unit::TestCase
|
|
47
43
|
assert_nil @query.links_to
|
48
44
|
end
|
49
45
|
|
46
|
+
def teardown
|
47
|
+
@query = nil
|
48
|
+
end
|
49
|
+
|
50
50
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gscraper/search/query'
|
3
|
+
|
4
|
+
class QueryPages < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include GScraper
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@query = Search::Query.new(:query => 'ruby')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_first_page
|
13
|
+
page = @query.first_page
|
14
|
+
|
15
|
+
assert_not_nil page
|
16
|
+
assert_equal page.empty?, false, "Query of 'ruby' has zero results"
|
17
|
+
assert_equal page.length, @query.results_per_page
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_second_page
|
21
|
+
page = @query.page(2)
|
22
|
+
|
23
|
+
assert_not_nil page
|
24
|
+
assert_equal page.empty?, false, "Query of 'ruby' has zero results"
|
25
|
+
assert_equal page.length, @query.results_per_page
|
26
|
+
end
|
27
|
+
|
28
|
+
def teardown
|
29
|
+
@query = nil
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gscraper/search/query'
|
3
|
+
|
4
|
+
class QueryResult < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include GScraper
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@query = Search::Query.new(:query => 'ruby')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_first_result
|
13
|
+
result = @query.first_result
|
14
|
+
|
15
|
+
assert_not_nil result, "The Query for 'ruby' has no first-result"
|
16
|
+
assert_equal result.rank, 1, "The first result for the Query 'ruby' does not have the rank of 1"
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_second_result
|
20
|
+
result = @query.result_at(2)
|
21
|
+
|
22
|
+
assert_not_nil result, "The Query for 'ruby' has no second-result"
|
23
|
+
assert_equal result.rank, 2, "The second result for the Query 'ruby' does not have the rank of 2"
|
24
|
+
end
|
25
|
+
|
26
|
+
def teardown
|
27
|
+
@query = nil
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
data/test/test_gscraper.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: gscraper
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.
|
7
|
-
date: 2007-12-
|
6
|
+
version: 0.1.4
|
7
|
+
date: 2007-12-22 00:00:00 -08:00
|
8
8
|
summary: A ruby web-scraping interface to various Google Services
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -48,6 +48,9 @@ files:
|
|
48
48
|
- lib/gscraper/search.rb
|
49
49
|
- test/test_gscraper.rb
|
50
50
|
- test/search/query_from_url.rb
|
51
|
+
- test/search/query_result.rb
|
52
|
+
- test/search/query_pages.rb
|
53
|
+
- test/search/page_results.rb
|
51
54
|
test_files:
|
52
55
|
- test/test_gscraper.rb
|
53
56
|
rdoc_options:
|
@@ -65,6 +68,24 @@ extensions: []
|
|
65
68
|
requirements: []
|
66
69
|
|
67
70
|
dependencies:
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: hpricot
|
73
|
+
version_requirement:
|
74
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">"
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: 0.0.0
|
79
|
+
version:
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: mechanize
|
82
|
+
version_requirement:
|
83
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">"
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: 0.0.0
|
88
|
+
version:
|
68
89
|
- !ruby/object:Gem::Dependency
|
69
90
|
name: hoe
|
70
91
|
version_requirement:
|