gscraper 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
data/lib/gscraper/search/result.rb

@@ -1,12 +1,32 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
  require 'gscraper/search/query'
- require 'gscraper/web_agent'
+ require 'gscraper/gscraper'

  module GScraper
    module Search
      class Result

-       include WebAgent
-
        # Rank of the result page
        attr_reader :rank

@@ -30,6 +50,8 @@ module GScraper
        # _summary_, _url_, _size_, _cache_url_ and _similar_url_.
        #
        def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
+         @agent = GScraper.web_agent
+
          @rank = rank
          @title = title
          @url = url
@@ -39,40 +61,21 @@ module GScraper
        end

        #
-       # Fetches the page of the result. If a _block_ is given it will be
-       # passed the page.
+       # Fetches the page of the result.
        #
-       def page(&block)
-         get_page(@url,&block)
+       def page
+         @agent.get(@url)
        end

        #
-       # Create a new Query for results that are similar to the Result. If
-       # a _block_ is given, it will be passed the newly created Query
-       # object.
-       #
-       #   result.similar_query # => Query
-       #
-       #   result.similar_query do |q|
-       #     q.first_page.each_url do |url|
-       #       puts url
-       #     end
-       #   end
+       # Fetches the cached page of the result.
        #
-       def similar_query(&block)
-         if @similar_url
-           return Query.from_url(@similar_url,&block)
+       def cached_page
+         if @cached_url
+           return @agent.get(@cached_url)
          end
        end

-       #
-       # Fetches the cached page of the result. If a _block_ is given it will
-       # be passed the cached page.
-       #
-       def cached_page(&block)
-         get_page(@cached_url,&block)
-       end
-
        #
        # Returns a string containing the result's title.
        #
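
The reworked Result class above drops the old WebAgent mixin and block-passing helpers in favor of a shared agent obtained from GScraper.web_agent. A minimal usage sketch (assuming the usual attribute readers on Result and that iterating a page yields Result objects, as the rest of the diff suggests):

    require 'gscraper'

    query = GScraper::Search.query(:query => 'ruby')

    query.first_page.each do |result|
      puts "#{result.rank}. #{result.title}"

      live_page   = result.page        # fetched through the shared web agent
      cached_page = result.cached_page # nil when no cached URL was scraped
    end
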
data/lib/gscraper/search/search.rb

@@ -1,4 +1,27 @@
- require 'gscraper/search/query'
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
+ require 'gscraper/search/web_query'
+ require 'gscraper/search/ajax_query'

  module GScraper
    module Search
@@ -12,7 +35,7 @@ module GScraper
      #   end
      #
      def Search.query(options={},&block)
-       Query.new(options,&block)
+       WebQuery.new(options,&block)
      end

      #
@@ -27,7 +50,25 @@
      #   end
      #
      def Search.query_from_url(url,&block)
-       Query.from_url(url,&block)
+       WebQuery.from_url(url,&block)
+     end
+
+     #
+     # Returns a new AJAXQuery object with the given _options_. See
+     # AJAXQuery.new.
+     #
+     #   Search.ajax_query(:query => 'ruby')
+     #
+     def Search.ajax_query(options={},&block)
+       AJAXQuery.new(options,&block)
+     end
+
+     #
+     # Returns the AJAXQuery object that represents the specified _url_.
+     # See AJAXQuery.from_url.
+     #
+     def Search.ajax_query_from_url(url,&block)
+       AJAXQuery.from_url(url,&block)
      end
    end
  end
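
With the old Query class split into WebQuery and AJAXQuery, the Search module above now fronts both. A short sketch of the new entry points (the :query option and block form appear in the diff; anything beyond that is illustrative):

    require 'gscraper'

    # scraped web search (WebQuery under the hood)
    web = GScraper::Search.query(:query => 'ruby') do |q|
      q.within_past_week = true
    end

    # rebuild a query from an existing search URL
    from_url = GScraper::Search.query_from_url('http://www.google.com/search?q=ruby')

    # query backed by the AJAX search API (AJAXQuery)
    ajax = GScraper::Search.ajax_query(:query => 'ruby')
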
data/lib/gscraper/search/web_query.rb (new file)

@@ -0,0 +1,472 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
+ require 'gscraper/search/result'
+ require 'gscraper/search/page'
+ require 'gscraper/search/query'
+ require 'gscraper/sponsored_ad'
+ require 'gscraper/sponsored_links'
+ require 'gscraper/extensions/uri'
+ require 'gscraper/has_pages'
+ require 'gscraper/licenses'
+ require 'gscraper/gscraper'
+
+ require 'hpricot'
+
+ module GScraper
+   module Search
+     class WebQuery < Query
+
+       include HasPages
+
+       # Search host
+       SEARCH_HOST = 'www.google.com'
+
+       # Search URL
+       SEARCH_URL = "http://#{SEARCH_HOST}/search"
+
+       # Default results per-page
+       RESULTS_PER_PAGE = 10
+
+       # Results per-page
+       attr_accessor :results_per_page
+
+       # Search query
+       attr_accessor :query
+
+       # Search 'link' modifier
+       attr_accessor :link
+
+       # Search 'related' modifier
+       attr_accessor :related
+
+       # Search 'info' modifier
+       attr_accessor :info
+
+       # Search 'site' modifier
+       attr_accessor :site
+
+       # Search 'filetype' modifier
+       attr_accessor :filetype
+
+       # Search 'allintitle' modifier
+       attr_accessor :allintitle
+
+       # Search 'intitle' modifier
+       attr_accessor :intitle
+
+       # Search 'allinurl' modifier
+       attr_accessor :allinurl
+
+       # Search 'inurl' modifier
+       attr_accessor :inurl
+
+       # Search 'allintext' modifier
+       attr_accessor :allintext
+
+       # Search 'intext' modifier
+       attr_accessor :intext
+
+       # Search for results containing the exact phrase
+       attr_accessor :exact_phrase
+
+       # Search for results with the words
+       attr_accessor :with_words
+
+       # Search for results with-out the words
+       attr_accessor :without_words
+
+       # Search for results written in the language
+       attr_accessor :language
+
+       # Search for results from the region
+       attr_accessor :region
+
+       # Search for results in the format
+       attr_accessor :in_format
+
+       # Search for results not in the format
+       attr_accessor :not_in_format
+
+       # Search for results within the past day
+       attr_accessor :within_past_day
+
+       # Search for results within the past week
+       attr_accessor :within_past_week
+
+       # Search for results within the past months
+       attr_accessor :within_past_months
+
+       # Search for results within the past year
+       attr_accessor :within_past_year
+
+       # Search for results containing numbers between the range
+       attr_accessor :numeric_range
+
+       # Search for results where the query ocurrs within the area
+       attr_accessor :occurrs_within
+
+       # Search for results inside the domain
+       attr_accessor :inside_domain
+
+       # Search for results outside the domain
+       attr_accessor :outside_domain
+
+       # Search for results which have the rights
+       attr_accessor :rights
+
+       # Filter the search results
+       attr_accessor :filtered
+
+       # Search for results similar to the page
+       attr_accessor :similar_to
+
+       # Search for results linking to the page
+       attr_accessor :links_to
+
+       #
+       # Creates a new WebQuery object from the given search options. If a
+       # block is given, it will be passed the newly created query object.
+       #
+       #   WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')
+       #
+       #   WebQuery.new(:exact_phrase => 'fluent interfaces') do |q|
+       #     q.within_past_week = true
+       #   end
+       #
+       def initialize(options={},&block)
+         @agent = GScraper.web_agent(options)
+
+         @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
+
+         @language = options[:language]
+         @region = options[:region]
+
+         if options[:within_past_day]
+           @within_past_day = options[:within_past_day]
+           @within_past_week = false
+           @within_past_months = false
+           @within_past_year = false
+         elsif options[:within_past_week]
+           @within_past_day = false
+           @within_past_week = options[:within_past_week]
+           @within_past_months = false
+           @within_past_year = false
+         elsif options[:within_past_months]
+           @within_past_day = false
+           @within_past_week = false
+           @within_past_months = options[:within_past_months]
+           @within_past_year = false
+         elsif options[:within_past_year]
+           @within_past_day = false
+           @within_past_week = false
+           @within_past_months = false
+           @within_past_year = options[:within_past_year]
+         else
+           @within_past_day = false
+           @within_past_week = false
+           @within_past_months = false
+           @within_past_year = false
+         end
+
+         @occurrs_within = options[:occurrs_within]
+         @rights = options[:rights]
+         @filtered = options[:filtered]
+
+         @similar_to = options[:similar_to]
+         @links_to = options[:links_to]
+
+         super(options,&block)
+       end
+
+       #
+       # Creates a new WebQuery object from the specified URL. If a block is
+       # given, it will be passed the newly created WebQuery object.
+       #
+       #   WebQuery.from_url('http://www.google.com/search?q=ruby+zen')
+       #
+       #   WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
+       #     q.within_last_month = true
+       #     q.occurrs_within = :title
+       #   end
+       #
+       def self.from_url(url,options={},&block)
+         url = URI(url.to_s)
+
+         options[:results_per_page] = url.query_params['num'].to_i
+
+         options[:query] = url.query_params['q']
+         options[:exact_phrase] = url.query_params['as_epq']
+         options[:with_words] = url.query_params['as_oq']
+         options[:without_words] = url.query_params['as_eq']
+
+         options[:language] = url.query_params['lr']
+         options[:region] = url.query_params['cr']
+
+         if url.query_params['as_filetype']
+           options[:filetype] = url.query_params['as_filetype']
+         end
+
+         case url.query_params['as_qdr']
+         when 'd'
+           options[:within_past_day] = true
+         when 'w'
+           options[:within_past_week] = true
+         when 'm'
+           options[:within_past_months] = 1
+         when 'm2'
+           options[:within_past_months] = 2
+         when 'm3'
+           options[:within_past_months] = 3
+         when 'm6'
+           options[:within_past_months] = 6
+         when 'y'
+           options[:within_past_year] = true
+         end
+
+         if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
+           options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,
+                                               url.query_params['as_nhi'].to_i)
+         end
+
+         case url.query_params['as_occt']
+         when 'title'
+           options[:occurrs_within] = :title
+         when 'body'
+           options[:occurrs_within] = :body
+         when 'url'
+           options[:occurrs_within] = :url
+         when 'links'
+           options[:occurrs_within] = :links
+         end
+
+         options[:site] = url.query_params['as_sitesearch']
+
+         case url.query_params['as_rights']
+         when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
+           options[:rights] = Licenses::CC_BY_NC_ND
+         when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
+           options[:rights] = Licenses::CC_BY_SA
+         when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
+           options[:rights] = Licenses::CC_BY_NC
+         when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
+           options[:rights] = Licenses::CC_BY
+         end
+
+         if url.query_params[:safe] == 'active'
+           options[:filtered] = true
+         end
+
+         if url.query_params['as_rq']
+           options[:similar_to] = url.query_params['as_rq']
+         elsif url.query_params['as_lq']
+           options[:links_to] = url.query_params['as_lq']
+         end
+
+         return self.new(options,&block)
+       end
+
+       #
+       # Returns the URL that represents the query.
+       #
+       def search_url
+         url = URI(SEARCH_URL)
+         query_expr = []
+
+         set_param = lambda { |param,value|
+           url.query_params[param.to_s] = value if value
+         }
+
+         set_param.call('num',@results_per_page)
+         set_param.call('q',expression)
+         set_param.call('as_epq',@exact_phrase)
+         set_param.call('as_oq',@with_words)
+         set_param.call('as_eq',@without_words)
+
+         set_param.call('lr',@language)
+         set_param.call('cr',@region)
+
+         set_param.call('as_filetype',@filetype)
+
+         if @within_past_day
+           url.query_params['as_qdr'] = 'd'
+         elsif @within_past_week
+           url.query_params['as_qdr'] = 'w'
+         elsif @within_past_months
+           case @within_past_months
+           when 1
+             url.query_params['as_qdr'] = 'm'
+           when 2
+             url.query_params['as_qdr'] = 'm2'
+           when 3
+             url.query_params['as_qdr'] = 'm3'
+           when 6
+             url.query_params['as_qdr'] = 'm6'
+           end
+         elsif @within_past_year
+           url.query_params['as_qdr'] = 'y'
+         end
+
+         if @numeric_range.kind_of?(Range)
+           url.query_params['as_nlo'] = @numeric_range.begin
+           url.query_params['as_nhi'] = @numeric_range.end
+         end
+
+         case @occurrs_within
+         when :title, 'title'
+           url.query_params['as_occt'] = 'title'
+         when :body, 'body'
+           url.query_params['as_occt'] = 'body'
+         when :url, 'url'
+           url.query_params['as_occt'] = 'url'
+         when :links, 'links'
+           url.query_params['as_occt'] = 'links'
+         end
+
+         set_param.call('as_sitesearch',@site)
+
+         case @rights
+         when Licenses::CC_BY_NC_ND
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
+         when Licenses::CC_BY_SA
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
+         when Licenses::CC_BY_ND
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
+         when Licenses::CC_BY
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
+         end
+
+         url.query_params['safe'] = 'active' if @filtered
+
+         if @similar_to
+           url.query_params['as_rq'] = @similar_to
+         elsif @links_to
+           url.query_params['as_lq'] = @links_to
+         end
+
+         return url
+       end
+
+       #
+       # Returns the URL that represents the query at the specific
+       # _page_index_.
+       #
+       def page_url(page_index)
+         url = search_url
+
+         url.query_params['start'] = result_offset_of(page_index)
+         url.query_params['sa'] = 'N'
+
+         return url
+       end
+
+       #
+       # Returns a Page object containing Result objects at the specified
+       # _page_index_.
+       #
+       def page(page_index)
+         Page.new do |new_page|
+           doc = @agent.get(page_url(page_index))
+           results = doc.search('//div.g')[0...@results_per_page.to_i]
+
+           rank_offset = result_offset_of(page_index)
+
+           results.each_with_index do |result,index|
+             rank = rank_offset + (index + 1)
+             link = result.at('//a.l')
+             title = link.inner_text
+             url = link.get_attribute('href')
+             summary_text = ''
+             cached_url = nil
+             similar_url = nil
+
+             if (content = (result.at('//td.j//font|//td.j/div')))
+               content.children.each do |elem|
+                 break if (!(elem.text?) && elem.name=='br')
+
+                 summary_text << elem.inner_text
+               end
+
+               if (cached_link = result.at('nobr/a:first'))
+                 cached_url = cached_link.get_attribute('href')
+               end
+
+               if (similar_link = result.at('nobr/a:last'))
+                 similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
+               end
+             end
+
+             new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
+           end
+         end
+       end
+
+       #
+       # Returns the first Result on the first_page.
+       #
+       def top_result
+         first_page.first
+       end
+
+       #
+       # Returns the Result at the specified _index_.
+       #
+       def result_at(index)
+         page(page_index_of(index))[result_index_of(index)]
+       end
+
+       #
+       # Returns a SponsoredLinks object containing SponsoredAd objects of
+       # the query.
+       #
+       def sponsored_links
+         SponsoredLinks.new do |links|
+           doc = @agent.get(search_url)
+
+           # top and side ads
+           doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
+             title = link.inner_text
+             url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
+
+             links << SponsoredAd.new(title,url)
+           end
+         end
+       end
+
+       #
+       # Returns the first sponsored link on the first page of results.
+       #
+       def top_sponsored_link
+         top_sponsored_links.first
+       end
+
+       #
+       # Iterates over the sponsored links on the first page of
+       # results passing each to the specified _block_.
+       #
+       def each_sponsored_link(&block)
+         sponsored_links.each(&block)
+       end
+
+     end
+   end
+ end
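
Pulling the WebQuery pieces together, a rough end-to-end sketch (method and option names are taken from the diff above; the attribute readers on Result and SponsoredAd are assumed, and the printed output is illustrative only):

    require 'gscraper/search/web_query'

    q = GScraper::Search::WebQuery.new(
      :query            => 'ruby',
      :exact_phrase     => 'fluent interfaces',
      :results_per_page => 10,
      :within_past_week => true
    )

    puts q.search_url      # http://www.google.com/search?... with all modifiers applied
    puts q.page_url(2)     # same URL plus the start/sa paging parameters

    q.page(1).each do |result|        # Page of Result objects scraped with Hpricot
      puts "#{result.rank}: #{result.url}"
    end

    q.sponsored_links.each do |ad|    # SponsoredAd objects from the top/side ads
      puts ad.title
    end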