gscraper 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
@@ -1,12 +1,32 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/search/query'
2
- require 'gscraper/web_agent'
24
+ require 'gscraper/gscraper'
3
25
 
4
26
  module GScraper
5
27
  module Search
6
28
  class Result
7
29
 
8
- include WebAgent
9
-
10
30
  # Rank of the result page
11
31
  attr_reader :rank
12
32
 
@@ -30,6 +50,8 @@ module GScraper
30
50
  # _summary_, _url_, _size_, _cached_url_ and _similar_url_.
31
51
  #
32
52
  def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
53
+ @agent = GScraper.web_agent
54
+
33
55
  @rank = rank
34
56
  @title = title
35
57
  @url = url
@@ -39,40 +61,21 @@ module GScraper
39
61
  end
40
62
 
41
63
  #
42
- # Fetches the page of the result. If a _block_ is given it will be
43
- # passed the page.
64
+ # Fetches the page of the result.
44
65
  #
45
- def page(&block)
46
- get_page(@url,&block)
66
+ def page
67
+ @agent.get(@url)
47
68
  end
48
69
 
49
70
  #
50
- # Create a new Query for results that are similar to the Result. If
51
- # a _block_ is given, it will be passed the newly created Query
52
- # object.
53
- #
54
- # result.similar_query # => Query
55
- #
56
- # result.similar_query do |q|
57
- # q.first_page.each_url do |url|
58
- # puts url
59
- # end
60
- # end
71
+ # Fetches the cached page of the result.
61
72
  #
62
- def similar_query(&block)
63
- if @similar_url
64
- return Query.from_url(@similar_url,&block)
73
+ def cached_page
74
+ if @cached_url
75
+ return @agent.get(@cached_url)
65
76
  end
66
77
  end
67
78
 
68
- #
69
- # Fetches the cached page of the result. If a _block_ is given it will
70
- # be passed the cached page.
71
- #
72
- def cached_page(&block)
73
- get_page(@cached_url,&block)
74
- end
75
-
76
79
  #
77
80
  # Returns a string containing the result's title.
78
81
  #
@@ -1,4 +1,27 @@
1
- require 'gscraper/search/query'
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ require 'gscraper/search/web_query'
24
+ require 'gscraper/search/ajax_query'
2
25
 
3
26
  module GScraper
4
27
  module Search
@@ -12,7 +35,7 @@ module GScraper
12
35
  # end
13
36
  #
14
37
  def Search.query(options={},&block)
15
- Query.new(options,&block)
38
+ WebQuery.new(options,&block)
16
39
  end
17
40
 
18
41
  #
@@ -27,7 +50,25 @@ module GScraper
27
50
  # end
28
51
  #
29
52
  def Search.query_from_url(url,&block)
30
- Query.from_url(url,&block)
53
+ WebQuery.from_url(url,&block)
54
+ end
55
+
56
+ #
57
+ # Returns a new AJAXQuery object with the given _options_. See
58
+ # AJAXQuery.new.
59
+ #
60
+ # Search.ajax_query(:query => 'ruby')
61
+ #
62
+ def Search.ajax_query(options={},&block)
63
+ AJAXQuery.new(options,&block)
64
+ end
65
+
66
+ #
67
+ # Returns the AJAXQuery object that represents the specified _url_.
68
+ # See AJAXQuery.from_url.
69
+ #
70
+ def Search.ajax_query_from_url(url,&block)
71
+ AJAXQuery.from_url(url,&block)
31
72
  end
32
73
  end
33
74
  end
@@ -0,0 +1,472 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ require 'gscraper/search/result'
24
+ require 'gscraper/search/page'
25
+ require 'gscraper/search/query'
26
+ require 'gscraper/sponsored_ad'
27
+ require 'gscraper/sponsored_links'
28
+ require 'gscraper/extensions/uri'
29
+ require 'gscraper/has_pages'
30
+ require 'gscraper/licenses'
31
+ require 'gscraper/gscraper'
32
+
33
+ require 'hpricot'
34
+
35
+ module GScraper
36
+ module Search
37
+ class WebQuery < Query
38
+
39
+ include HasPages
40
+
41
+ # Search host
42
+ SEARCH_HOST = 'www.google.com'
43
+
44
+ # Search URL
45
+ SEARCH_URL = "http://#{SEARCH_HOST}/search"
46
+
47
+ # Default results per-page
48
+ RESULTS_PER_PAGE = 10
49
+
50
+ # Results per-page
51
+ attr_accessor :results_per_page
52
+
53
+ # Search query
54
+ attr_accessor :query
55
+
56
+ # Search 'link' modifier
57
+ attr_accessor :link
58
+
59
+ # Search 'related' modifier
60
+ attr_accessor :related
61
+
62
+ # Search 'info' modifier
63
+ attr_accessor :info
64
+
65
+ # Search 'site' modifier
66
+ attr_accessor :site
67
+
68
+ # Search 'filetype' modifier
69
+ attr_accessor :filetype
70
+
71
+ # Search 'allintitle' modifier
72
+ attr_accessor :allintitle
73
+
74
+ # Search 'intitle' modifier
75
+ attr_accessor :intitle
76
+
77
+ # Search 'allinurl' modifier
78
+ attr_accessor :allinurl
79
+
80
+ # Search 'inurl' modifier
81
+ attr_accessor :inurl
82
+
83
+ # Search 'allintext' modifier
84
+ attr_accessor :allintext
85
+
86
+ # Search 'intext' modifier
87
+ attr_accessor :intext
88
+
89
+ # Search for results containing the exact phrase
90
+ attr_accessor :exact_phrase
91
+
92
+ # Search for results with the words
93
+ attr_accessor :with_words
94
+
95
+ # Search for results without the words
96
+ attr_accessor :without_words
97
+
98
+ # Search for results written in the language
99
+ attr_accessor :language
100
+
101
+ # Search for results from the region
102
+ attr_accessor :region
103
+
104
+ # Search for results in the format
105
+ attr_accessor :in_format
106
+
107
+ # Search for results not in the format
108
+ attr_accessor :not_in_format
109
+
110
+ # Search for results within the past day
111
+ attr_accessor :within_past_day
112
+
113
+ # Search for results within the past week
114
+ attr_accessor :within_past_week
115
+
116
+ # Search for results within the past months
117
+ attr_accessor :within_past_months
118
+
119
+ # Search for results within the past year
120
+ attr_accessor :within_past_year
121
+
122
+ # Search for results containing numbers between the range
123
+ attr_accessor :numeric_range
124
+
125
+ # Search for results where the query occurs within the area
126
+ attr_accessor :occurrs_within
127
+
128
+ # Search for results inside the domain
129
+ attr_accessor :inside_domain
130
+
131
+ # Search for results outside the domain
132
+ attr_accessor :outside_domain
133
+
134
+ # Search for results which have the rights
135
+ attr_accessor :rights
136
+
137
+ # Filter the search results
138
+ attr_accessor :filtered
139
+
140
+ # Search for results similar to the page
141
+ attr_accessor :similar_to
142
+
143
+ # Search for results linking to the page
144
+ attr_accessor :links_to
145
+
146
+ #
147
+ # Creates a new WebQuery object from the given search options. If a
148
+ # block is given, it will be passed the newly created query object.
149
+ #
150
+ # WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')
151
+ #
152
+ # WebQuery.new(:exact_phrase => 'fluent interfaces') do |q|
153
+ # q.within_past_week = true
154
+ # end
155
+ #
156
+ def initialize(options={},&block)
157
+ @agent = GScraper.web_agent(options)
158
+
159
+ @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
160
+
161
+ @language = options[:language]
162
+ @region = options[:region]
163
+
164
+ if options[:within_past_day]
165
+ @within_past_day = options[:within_past_day]
166
+ @within_past_week = false
167
+ @within_past_months = false
168
+ @within_past_year = false
169
+ elsif options[:within_past_week]
170
+ @within_past_day = false
171
+ @within_past_week = options[:within_past_week]
172
+ @within_past_months = false
173
+ @within_past_year = false
174
+ elsif options[:within_past_months]
175
+ @within_past_day = false
176
+ @within_past_week = false
177
+ @within_past_months = options[:within_past_months]
178
+ @within_past_year = false
179
+ elsif options[:within_past_year]
180
+ @within_past_day = false
181
+ @within_past_week = false
182
+ @within_past_months = false
183
+ @within_past_year = options[:within_past_year]
184
+ else
185
+ @within_past_day = false
186
+ @within_past_week = false
187
+ @within_past_months = false
188
+ @within_past_year = false
189
+ end
190
+
191
+ @occurrs_within = options[:occurrs_within]
192
+ @rights = options[:rights]
193
+ @filtered = options[:filtered]
194
+
195
+ @similar_to = options[:similar_to]
196
+ @links_to = options[:links_to]
197
+
198
+ super(options,&block)
199
+ end
200
+
201
+ #
202
+ # Creates a new WebQuery object from the specified URL. If a block is
203
+ # given, it will be passed the newly created WebQuery object.
204
+ #
205
+ # WebQuery.from_url('http://www.google.com/search?q=ruby+zen')
206
+ #
207
+ # WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
208
+ # q.within_past_months = 1
209
+ # q.occurrs_within = :title
210
+ # end
211
+ #
212
+ def self.from_url(url,options={},&block)
213
+ url = URI(url.to_s)
214
+
215
+ options[:results_per_page] = url.query_params['num'].to_i if url.query_params['num']
216
+
217
+ options[:query] = url.query_params['q']
218
+ options[:exact_phrase] = url.query_params['as_epq']
219
+ options[:with_words] = url.query_params['as_oq']
220
+ options[:without_words] = url.query_params['as_eq']
221
+
222
+ options[:language] = url.query_params['lr']
223
+ options[:region] = url.query_params['cr']
224
+
225
+ if url.query_params['as_filetype']
226
+ options[:filetype] = url.query_params['as_filetype']
227
+ end
228
+
229
+ case url.query_params['as_qdr']
230
+ when 'd'
231
+ options[:within_past_day] = true
232
+ when 'w'
233
+ options[:within_past_week] = true
234
+ when 'm'
235
+ options[:within_past_months] = 1
236
+ when 'm2'
237
+ options[:within_past_months] = 2
238
+ when 'm3'
239
+ options[:within_past_months] = 3
240
+ when 'm6'
241
+ options[:within_past_months] = 6
242
+ when 'y'
243
+ options[:within_past_year] = true
244
+ end
245
+
246
+ if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
247
+ options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,
248
+ url.query_params['as_nhi'].to_i)
249
+ end
250
+
251
+ case url.query_params['as_occt']
252
+ when 'title'
253
+ options[:occurrs_within] = :title
254
+ when 'body'
255
+ options[:occurrs_within] = :body
256
+ when 'url'
257
+ options[:occurrs_within] = :url
258
+ when 'links'
259
+ options[:occurrs_within] = :links
260
+ end
261
+
262
+ options[:site] = url.query_params['as_sitesearch']
263
+
264
+ case url.query_params['as_rights']
265
+ when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
266
+ options[:rights] = Licenses::CC_BY_NC_ND
267
+ when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
268
+ options[:rights] = Licenses::CC_BY_SA
269
+ when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
270
+ options[:rights] = Licenses::CC_BY_NC
271
+ when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
272
+ options[:rights] = Licenses::CC_BY
273
+ end
274
+
275
+ if url.query_params['safe'] == 'active'
276
+ options[:filtered] = true
277
+ end
278
+
279
+ if url.query_params['as_rq']
280
+ options[:similar_to] = url.query_params['as_rq']
281
+ elsif url.query_params['as_lq']
282
+ options[:links_to] = url.query_params['as_lq']
283
+ end
284
+
285
+ return self.new(options,&block)
286
+ end
287
+
288
+ #
289
+ # Returns the URL that represents the query.
290
+ #
291
+ def search_url
292
+ url = URI(SEARCH_URL)
293
+ query_expr = []
294
+
295
+ set_param = lambda { |param,value|
296
+ url.query_params[param.to_s] = value if value
297
+ }
298
+
299
+ set_param.call('num',@results_per_page)
300
+ set_param.call('q',expression)
301
+ set_param.call('as_epq',@exact_phrase)
302
+ set_param.call('as_oq',@with_words)
303
+ set_param.call('as_eq',@without_words)
304
+
305
+ set_param.call('lr',@language)
306
+ set_param.call('cr',@region)
307
+
308
+ set_param.call('as_filetype',@filetype)
309
+
310
+ if @within_past_day
311
+ url.query_params['as_qdr'] = 'd'
312
+ elsif @within_past_week
313
+ url.query_params['as_qdr'] = 'w'
314
+ elsif @within_past_months
315
+ case @within_past_months
316
+ when 1
317
+ url.query_params['as_qdr'] = 'm'
318
+ when 2
319
+ url.query_params['as_qdr'] = 'm2'
320
+ when 3
321
+ url.query_params['as_qdr'] = 'm3'
322
+ when 6
323
+ url.query_params['as_qdr'] = 'm6'
324
+ end
325
+ elsif @within_past_year
326
+ url.query_params['as_qdr'] = 'y'
327
+ end
328
+
329
+ if @numeric_range.kind_of?(Range)
330
+ url.query_params['as_nlo'] = @numeric_range.begin
331
+ url.query_params['as_nhi'] = @numeric_range.end
332
+ end
333
+
334
+ case @occurrs_within
335
+ when :title, 'title'
336
+ url.query_params['as_occt'] = 'title'
337
+ when :body, 'body'
338
+ url.query_params['as_occt'] = 'body'
339
+ when :url, 'url'
340
+ url.query_params['as_occt'] = 'url'
341
+ when :links, 'links'
342
+ url.query_params['as_occt'] = 'links'
343
+ end
344
+
345
+ set_param.call('as_sitesearch',@site)
346
+
347
+ case @rights
348
+ when Licenses::CC_BY_NC_ND
349
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
350
+ when Licenses::CC_BY_SA
351
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
352
+ when Licenses::CC_BY_ND
353
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
354
+ when Licenses::CC_BY
355
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
356
+ end
357
+
358
+ url.query_params['safe'] = 'active' if @filtered
359
+
360
+ if @similar_to
361
+ url.query_params['as_rq'] = @similar_to
362
+ elsif @links_to
363
+ url.query_params['as_lq'] = @links_to
364
+ end
365
+
366
+ return url
367
+ end
368
+
369
+ #
370
+ # Returns the URL that represents the query at the specific
371
+ # _page_index_.
372
+ #
373
+ def page_url(page_index)
374
+ url = search_url
375
+
376
+ url.query_params['start'] = result_offset_of(page_index)
377
+ url.query_params['sa'] = 'N'
378
+
379
+ return url
380
+ end
381
+
382
+ #
383
+ # Returns a Page object containing Result objects at the specified
384
+ # _page_index_.
385
+ #
386
+ def page(page_index)
387
+ Page.new do |new_page|
388
+ doc = @agent.get(page_url(page_index))
389
+ results = doc.search('//div.g')[0...@results_per_page.to_i]
390
+
391
+ rank_offset = result_offset_of(page_index)
392
+
393
+ results.each_with_index do |result,index|
394
+ rank = rank_offset + (index + 1)
395
+ link = result.at('//a.l')
396
+ title = link.inner_text
397
+ url = link.get_attribute('href')
398
+ summary_text = ''
399
+ cached_url = nil
400
+ similar_url = nil
401
+
402
+ if (content = (result.at('//td.j//font|//td.j/div')))
403
+ content.children.each do |elem|
404
+ break if (!(elem.text?) && elem.name=='br')
405
+
406
+ summary_text << elem.inner_text
407
+ end
408
+
409
+ if (cached_link = result.at('nobr/a:first'))
410
+ cached_url = cached_link.get_attribute('href')
411
+ end
412
+
413
+ if (similar_link = result.at('nobr/a:last'))
414
+ similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
415
+ end
416
+ end
417
+
418
+ new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
419
+ end
420
+ end
421
+ end
422
+
423
+ #
424
+ # Returns the first Result on the first_page.
425
+ #
426
+ def top_result
427
+ first_page.first
428
+ end
429
+
430
+ #
431
+ # Returns the Result at the specified _index_.
432
+ #
433
+ def result_at(index)
434
+ page(page_index_of(index))[result_index_of(index)]
435
+ end
436
+
437
+ #
438
+ # Returns a SponsoredLinks object containing SponsoredAd objects of
439
+ # the query.
440
+ #
441
+ def sponsored_links
442
+ SponsoredLinks.new do |links|
443
+ doc = @agent.get(search_url)
444
+
445
+ # top and side ads
446
+ doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
447
+ title = link.inner_text
448
+ url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
449
+
450
+ links << SponsoredAd.new(title,url)
451
+ end
452
+ end
453
+ end
454
+
455
+ #
456
+ # Returns the first sponsored link on the first page of results.
457
+ #
458
+ def top_sponsored_link
459
+ sponsored_links.first
460
+ end
461
+
462
+ #
463
+ # Iterates over the sponsored links on the first page of
464
+ # results passing each to the specified _block_.
465
+ #
466
+ def each_sponsored_link(&block)
467
+ sponsored_links.each(&block)
468
+ end
469
+
470
+ end
471
+ end
472
+ end