gscraper 0.1.7 → 0.2.0

Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
@@ -1,3 +1,25 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
  module GScraper
  module Licenses
  # Any desired license
@@ -0,0 +1,64 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
+ module GScraper
+ class Page < Array
+
+ #
+ # Creates a new Page object with the given _elements_. If a _block_
+ # is given, it will be passed the newly created Page object.
+ #
+ def initialize(elements=[],&block)
+ super(elements)
+
+ block.call(self) if block
+ end
+
+ #
+ # Returns a mapped Array of the elements within the Page using the
+ # given _block_. If the _block_ is not given, the page will be
+ # returned.
+ #
+ # page.map # => Page
+ #
+ # page.map { |element| element.field } # => [...]
+ #
+ def map(&block)
+ return self unless block
+
+ mapped = []
+
+ each { |element| mapped << block.call(element) }
+ return mapped
+ end
+
+ #
+ # Selects the elements within the Page which match the given _block_.
+ #
+ # page.select { |element| element.field =~ /ruby/i }
+ #
+ def select(&block)
+ self.class.new(super(&block))
+ end
+
+ end
+ end
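
For reference, a minimal usage sketch of the new GScraper::Page collection added in data/lib/gscraper/page.rb (the `field` accessor mentioned in its comments is only illustrative; Page itself just wraps an Array):

    require 'gscraper/page'

    # map with no block returns the Page itself; select returns another
    # Page instead of a plain Array.
    page = GScraper::Page.new([1, 2, 3])

    page.map                     # => the same Page
    page.map { |n| n * 2 }       # => [2, 4, 6]
    page.select { |n| n.odd? }   # => GScraper::Page containing 1 and 3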
@@ -1 +1,25 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
+ require 'gscraper/search/web_query'
+ require 'gscraper/search/ajax_query'
  require 'gscraper/search/search'
@@ -0,0 +1,176 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
+ require 'gscraper/search/result'
+ require 'gscraper/search/page'
+ require 'gscraper/search/query'
+ require 'gscraper/extensions/uri'
+ require 'gscraper/has_pages'
+ require 'gscraper/gscraper'
+
+ require 'json'
+
+ module GScraper
+ module Search
+ class AJAXQuery < Query
+
+ include HasPages
+
+ # Maximum results per-page
+ RESULTS_PER_PAGE = 8
+
+ # AJAX API host
+ API_HOST = 'www.google.com'
+
+ # AJAX API URL
+ API_URL = "http://#{API_HOST}/uds/GwebSearch?callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large"
+
+ # Default language
+ DEFAULT_LANGUAGE = 'en'
+
+ # Default signature
+ DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'
+
+ # Default key
+ DEFAULT_KEY = 'notsupplied'
+
+ # Default version
+ DEFAULT_VERSION = '1.0'
+
+ # The search language
+ attr_accessor :language
+
+ # The search signature
+ attr_accessor :sig
+
+ # The search key
+ attr_accessor :key
+
+ # The API version
+ attr_accessor :version
+
+ #
+ # Creates a new AJAXQuery with the given _options_. If a _block_ is
+ # given it will be passed the newly created AJAXQuery object.
+ #
+ # _options_ may contain the following keys:
+ # <tt>:language</tt>:: The search language. Defaults to <tt>:en</tt>.
+ # <tt>:sig</tt>:: The search signature. Defaults to
+ # +582c1116317355adf613a6a843f19ece+.
+ # <tt>:key</tt>:: The search key. Defaults to <tt>:notsupplied</tt>.
+ # <tt>:version</tt>:: The desired API version. Defaults to
+ # <tt>1.0</tt>.
+ #
+ def initialize(options={},&block)
+ @agent = GScraper.web_agent(options)
+
+ @language = (options[:language] || DEFAULT_LANGUAGE)
+
+ @sig = (options[:sig] || DEFAULT_SIG)
+ @key = (options[:key] || DEFAULT_KEY)
+ @version = (options[:version] || DEFAULT_VERSION)
+
+ super(options,&block)
+ end
+
+ #
+ # Creates a new AJAXQuery object from the specified URL. If a block is
+ # given, it will be passed the newly created AJAXQuery object.
+ #
+ def self.from_url(url,options={},&block)
+ url = URI(url.to_s)
+
+ options[:language] = url.query_params['hl']
+ options[:query] = url.query_params['q']
+
+ options[:sig] = url.query_params['sig']
+ options[:key] = url.query_params['key']
+ options[:version] = url.query_params['v']
+
+ return self.new(options,&block)
+ end
+
+ #
+ # Returns +RESULTS_PER_PAGE+.
+ #
+ def results_per_page
+ RESULTS_PER_PAGE
+ end
+
+ #
+ # Returns the URL that represents the query.
+ #
+ def search_url
+ search_url = URI(API_URL)
+
+ search_url.query_params['hl'] = @language
+ search_url.query_params['gss'] = '.com'
+ search_url.query_params['q'] = expression
+ search_url.query_params['sig'] = @sig
+ search_url.query_params['key'] = @key
+ search_url.query_params['v'] = @version
+
+ return search_url
+ end
+
+ #
+ # Returns the URL that represents the query of a specific
+ # _page_index_.
+ #
+ def page_url(page_index)
+ url = search_url
+
+ if page_index > 1
+ url.query_params['start'] = result_offset_of(page_index)
+ end
+
+ return url
+ end
+
+ #
+ # Returns a Page object containing Result objects at the specified
+ # _page_index_.
+ #
+ def page(page_index)
+ Page.new do |new_page|
+ body = @agent.get(page_url(page_index)).body
+ hash = JSON.parse(body.scan(/\{.*\}/).first)
+
+ rank_offset = result_offset_of(page_index)
+
+ if (hash.kind_of?(Hash) && hash['results'])
+ hash['results'].each_with_index do |result,index|
+ rank = rank_offset + (index + 1)
+ title = Hpricot(result['title']).inner_text
+ url = result['unescapedUrl']
+ summary = Hpricot(result['content']).inner_text
+ cached_url = result['cacheUrl']
+
+ new_page << Result.new(rank,title,url,summary,cached_url)
+ end
+ end
+ end
+ end
+
+ end
+ end
+ end
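
A short usage sketch of the new AJAXQuery (the query string is illustrative; the :query option is handled by the Query superclass, pagination comes from the HasPages mixin included above, and Result is assumed to expose readers for the fields passed to Result.new):

    require 'gscraper/search'

    # Unspecified options fall back to the DEFAULT_* constants above
    # (language 'en', version '1.0', the default sig and key).
    q = GScraper::Search::AJAXQuery.new(:query => 'ruby')

    q.search_url.to_s   # => "http://www.google.com/uds/GwebSearch?...&q=ruby..."

    # Each page holds at most RESULTS_PER_PAGE (8) results.
    q.page(1).each do |result|
      puts "#{result.rank}. #{result.title} - #{result.url}"
    end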
@@ -1,42 +1,31 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
  require 'gscraper/search/result'
+ require 'gscraper/page'

  module GScraper
  module Search
- class Page < Array
-
- #
- # Creates a new Page object with the given _results_.
- #
- def initialize(results=[])
- super(results)
- end
-
- #
- # Returns a mapped Array of the results within the Page using the
- # given _block_. If the _block_ is not given, the page will be
- # returned.
- #
- # page.map # => Page
- #
- # page.map { |result| result.url } # => [...]
- #
- def map(&block)
- return self unless block
-
- mapped = []
-
- each { |result| mapped << block.call(result) }
- return mapped
- end
-
- #
- # Selects the results within the Page which match the given _block_.
- #
- # page.select { |result| result.title =~ /ruby/i }
- #
- def select(&block)
- Page.new(super(&block))
- end
+ class Page < GScraper::Page

  #
  # Selects the results using the specified _block_.
@@ -160,7 +149,7 @@ module GScraper
  # page.cached_urls # => [...]
  #
  def cached_urls
- map { |result| result.cached_url }
+ map { |result| result.cached_url }.compact
  end

  #
@@ -170,7 +159,7 @@ module GScraper
  # page.cached_pages # => [...]
  #
  def cached_pages
- map { |result| result.cached_page }
+ map { |result| result.cached_page }.compact
  end

  #
@@ -180,17 +169,7 @@ module GScraper
  # page.similar_urls # => [...]
  #
  def similar_urls
- map { |result| result.similar_url }
- end
-
- #
- # Returns an Array containing the similar Queries of the results
- # within the Page.
- #
- # page.similar_queries # => [...]
- #
- def similar_queries
- map { |result| result.similar_query }
+ map { |result| result.similar_url }.compact
  end

  #
@@ -263,20 +242,6 @@ module GScraper
  similar_urls.each(&block)
  end

- #
- # Iterates over each result's similar Query within the Page, passing
- # each to the given _block_.
- #
- # each_similar_query do |q|
- # q.first_page do |page|
- # puts page.urls.join("\n")
- # end
- # end
- #
- def each_similar_query(&block)
- similar_queries.each(&block)
- end
-
  #
  # Returns the ranks of the results that match the specified _block_.
  #
@@ -345,16 +310,6 @@ module GScraper
  results_with(&block).similar_urls
  end

- #
- # Returns the similar Queries of the results that match the
- # specified _block_.
- #
- # page.similar_queries_of { |result| result.title =~ /hackety/ }
- #
- def similar_queries_of(&block)
- results_with(&block).similar_queries
- end
-
  end
  end
  end
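
A small sketch of the effect of the new .compact calls: results without a cached or similar URL now drop out of these arrays instead of appearing as nil entries (the sample results are made up; the argument order for Result.new follows the one used elsewhere in this diff):

    results = [
      GScraper::Search::Result.new(1, 'Ruby', 'http://www.ruby-lang.org/', '...',
                                   'http://cache.example/ruby', nil),
      GScraper::Search::Result.new(2, 'RSpec', 'http://rspec.info/', '...', nil, nil)
    ]
    page = GScraper::Search::Page.new(results)

    page.cached_urls    # => ["http://cache.example/ruby"]  (no nil for the second result)
    page.similar_urls   # => []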
@@ -1,10 +1,33 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
  require 'gscraper/search/result'
  require 'gscraper/search/page'
  require 'gscraper/sponsored_ad'
  require 'gscraper/sponsored_links'
  require 'gscraper/extensions/uri'
+ require 'gscraper/has_pages'
  require 'gscraper/licenses'
- require 'gscraper/web_agent'
+ require 'gscraper/gscraper'

  require 'hpricot'

@@ -12,20 +35,6 @@ module GScraper
  module Search
  class Query

- include WebAgent
-
- # Search host
- SEARCH_HOST = 'www.google.com'
-
- # Search URL
- SEARCH_URL = "http://#{SEARCH_HOST}/search"
-
- # Default results per-page
- RESULTS_PER_PAGE = 10
-
- # Results per-page
- attr_accessor :results_per_page
-
  # Search query
  attr_accessor :query

@@ -71,67 +80,14 @@ module GScraper
  # Search for results with-out the words
  attr_accessor :without_words

- # Search for results written in the language
- attr_accessor :language
-
- # Search for results from the region
- attr_accessor :region
-
- # Search for results in the format
- attr_accessor :in_format
-
- # Search for results not in the format
- attr_accessor :not_in_format
-
- # Search for results within the past day
- attr_accessor :within_past_day
-
- # Search for results within the past week
- attr_accessor :within_past_week
-
- # Search for results within the past months
- attr_accessor :within_past_months
-
- # Search for results within the past year
- attr_accessor :within_past_year
-
  # Search for results containing numbers between the range
  attr_accessor :numeric_range

- # Search for results where the query ocurrs within the area
- attr_accessor :occurrs_within
-
- # Search for results inside the domain
- attr_accessor :inside_domain
-
- # Search for results outside the domain
- attr_accessor :outside_domain
-
- # Search for results which have the rights
- attr_accessor :rights
-
- # Filter the search results
- attr_accessor :filtered
-
- # Search for results similar to the page
- attr_accessor :similar_to
-
- # Search for results linking to the page
- attr_accessor :links_to
-
  #
  # Creates a new Query object from the given search options. If a
- # block is given, it will be passed the newly created query object.
- #
- # Query.new(:query => 'ruby', :with_words => 'sow rspec')
- #
- # Query.new(:exact_phrase => 'fluent interfaces') do |q|
- # q.within_past_week = true
- # end
+ # block is given, it will be passed the newly created Query object.
  #
  def initialize(options={},&block)
- @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
-
  @query = options[:query]

  @link = options[:link]
@@ -151,175 +107,34 @@ module GScraper
  @with_words = options[:with_words]
  @without_words = options[:without_words]

- @language = options[:language]
- @region = options[:region]
- @in_format = options[:in_format]
- @not_in_format = options[:not_in_format]
-
- if options[:within_past_day]
- @within_past_day = options[:within_past_day]
- @within_past_week = false
- @within_past_months = false
- @within_past_year = false
- elsif options[:within_past_week]
- @within_past_day = false
- @within_past_week = options[:within_past_week]
- @within_past_months = false
- @within_past_year = false
- elsif options[:within_past_months]
- @within_past_day = false
- @within_past_week = false
- @within_past_months = options[:within_past_months]
- @within_past_year = false
- elsif options[:within_past_year]
- @within_past_day = false
- @within_past_week = false
- @within_past_months = false
- @within_past_year = options[:within_past_year]
- else
- @within_past_day = false
- @within_past_week = false
- @within_past_months = false
- @within_past_year = false
- end
-
  @numeric_range = options[:numeric_range]
- @occurrs_within = options[:occurrs_within]
- @inside_domain = options[:inside_domain]
- @outside_domain = options[:outside_domain]
- @rights = options[:rights]
- @filtered = options[:filtered]
-
- @similar_to = options[:similar_to]
- @links_to = options[:links_to]

  block.call(self) if block
  end

  #
- # Creates a new Query object from the specified URL. If a block is
- # given, it will be passed the newly created Query object.
- #
- # Query.from_url('http://www.google.com/search?q=ruby+zen)
+ # Returns the query expression.
  #
- # Query.from_url('http://www.google.com/search?q=ruby') do |q|
- # q.within_last_month = true
- # q.occurrs_within = :title
- # end
- #
- def self.from_url(url,options={},&block)
- url = URI.parse(url)
-
- options[:results_per_page] = url.query_params['num']
-
- options[:query] = url.query_params['as_q']
- options[:exact_phrase] = url.query_params['as_epq']
- options[:with_words] = url.query_params['as_oq']
- options[:without_words] = url.query_params['as_eq']
-
- options[:language] = url.query_params['lr']
- options[:region] = url.query_params['cr']
-
- case url.query_params['as_ft']
- when 'i'
- options[:in_format] = url.query_params['as_filetype']
- when 'e'
- options[:not_in_format] = url.query_params['as_filetype']
- end
-
- case url.query_params['as_qdr']
- when 'd'
- options[:within_past_day] = true
- when 'w'
- options[:within_past_week] = true
- when 'm'
- options[:within_past_months] = 1
- when 'm2'
- options[:within_past_months] = 2
- when 'm3'
- options[:within_past_months] = 3
- when 'm6'
- options[:within_past_months] = 6
- when 'y'
- options[:within_past_year] = true
- end
-
- if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
- options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
- end
-
- case url.query_params['as_occt']
- when 'title'
- options[:occurrs_within] = :title
- when 'body'
- options[:occurrs_within] = :body
- when 'url'
- options[:occurrs_within] = :url
- when 'links'
- options[:occurrs_within] = :links
- end
-
- case url.query_params['as_dt']
- when 'i'
- options[:inside_domain] = url.query_params['as_sitesearch']
- when 'e'
- options[:outside_domain] = url.query_params['as_sitesearch']
- end
-
- case url.query_params['as_rights']
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
- options[:rights] = Licenses::CC_BY_NC_ND
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
- options[:rights] = Licenses::CC_BY_SA
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
- options[:rights] = Licenses::CC_BY_NC
- when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
- options[:rights] = Licenses::CC_BY
- end
-
- if url.query_params[:safe]=='active'
- options[:filtered] = true
- end
-
- if url.query_params['as_rq']
- options[:similar_to] = url.query_params['as_rq']
- elsif url.query_params['as_lq']
- options[:links_to] = url.query_params['as_lq']
- end
-
- return self.new(options,&block)
- end
-
- #
- # Returns the URL that represents the query.
- #
- def search_url
- url = URI(SEARCH_URL)
- query_expr = []
-
- set_param = lambda { |param,value|
- url.query_params[param.to_s] = value if value
- }
+ def expression
+ expr = []

  append_modifier = lambda { |name|
  modifier = instance_variable_get("@#{name}")

- query_expr << "#{name}:#{modifier}" if modifier
+ expr << "#{name}:#{modifier}" if modifier
  }

- join_ops = lambda { |name|
+ append_options = lambda { |name|
  ops = instance_variable_get("@#{name}")

  if ops.kind_of?(Array)
- query_expr << "#{name}:#{ops.join(' ')}"
+ expr << "#{name}:#{ops.join(' ')}"
  elsif ops
- query_expr << "#{name}:#{ops}"
+ expr << "#{name}:#{ops}"
  end
  }

- set_param.call('num',@results_per_page)
-
- query_expr << @query if @query
+ expr << @query if @query

  append_modifier.call(:link)
  append_modifier.call(:related)
@@ -327,256 +142,30 @@ module GScraper
  append_modifier.call(:site)
  append_modifier.call(:filetype)

- join_ops.call(:allintitle)
+ append_options.call(:allintitle)
  append_modifier.call(:intitle)
- join_ops.call(:allinurl)
+ append_options.call(:allinurl)
  append_modifier.call(:inurl)
- join_ops.call(:allintext)
+ append_options.call(:allintext)
  append_modifier.call(:intext)

- unless query_expr.empty?
- url.query_params['as_q'] = query_expr.join(' ')
- end
-
- set_param.call('as_epq',@exact_phrase)
- set_param.call('as_oq',@with_words)
- set_param.call('as_eq',@without_words)
-
- set_param.call('lr',@language)
- set_param.call('cr',@region)
-
- if @in_format
- url.query_params['as_ft'] = 'i'
- url.query_params['as_filtetype'] = @in_format
- elsif @not_in_format
- url.query_params['as_ft'] = 'e'
- url.query_params['as_filtetype'] = @not_in_format
- end
-
- if @within_past_day
- url.query_params['as_qdr'] = 'd'
- elsif @within_past_week
- url.query_params['as_qdr'] = 'w'
- elsif @within_past_months
- case @within_past_months
- when 1
- url.query_params['as_qdr'] = 'm'
- when 2
- url.query_params['as_qdr'] = 'm2'
- when 3
- url.query_params['as_qdr'] = 'm3'
- when 6
- url.query_params['as_qdr'] = 'm6'
- end
- elsif @within_past_year
- url.query_params['as_qdr'] = 'y'
- end
-
- if @numeric_range
- url.query_params['as_nlo'] = @numeric_range.begin
- url.query_params['as_nhi'] = @numeric_range.end
- end
-
- case @occurrs_within
- when :title, 'title'
- url.query_params['as_occt'] = 'title'
- when :body, 'body'
- url.query_params['as_occt'] = 'body'
- when :url, 'url'
- url.query_params['as_occt'] = 'url'
- when :links, 'links'
- url.query_params['as_occt'] = 'links'
+ if @exact_phrase
+ expr << "\"#{@exact_phrase}\""
  end

- if @inside_domain
- url.query_params['as_dt'] = 'i'
- url.query_params['as_sitesearch'] = @inside_domain
- elsif @outside_domain
- url.query_params['as_dt'] = 'e'
- url.query_params['as_sitesearch'] = @outside_domain
+ if @with_words.kind_of?(Array)
+ expr << @with_words.join(' OR ')
  end
-
- case @rights
- when Licenses::CC_BY_NC_ND
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
- when Licenses::CC_BY_SA
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
- when Licenses::CC_BY_ND
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
- when Licenses::CC_BY
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
+
+ if @without_words.kind_of?(Array)
+ expr << @without_words.map { |word| "-#{word}" }.join(' ')
  end

- url.query_params['safe'] = true if @filtered
-
- if @similar_to
- url.query_params['as_rq'] = @similar_to
- elsif @links_to
- url.query_params['as_lq'] = @links_to
+ if @numeric_range.kind_of?(Range)
+ expr << "#{@numeric_range.begin}..#{@numeric_range.end}"
  end

- return url
- end
-
- #
- # Returns the URL that represents the query at the specific
- # _page_index_.
- #
- def page_url(page_index)
- url = search_url
-
- url.query_params['start'] = page_result_offset(page_index)
- url.query_params['sa'] = 'N'
-
- return url
- end
-
- #
- # Returns a Page object containing Result objects at the specified
- # _page_index_. If a _block_ is given, it will be passed the newly
- # created Page.
- #
- def page(page_index,&block)
- doc = get_page(page_url(page_index))
-
- new_page = Page.new
- results = doc.search('//div.g')[0...@results_per_page.to_i]
-
- results.each_with_index do |result,index|
- rank = page_result_offset(page_index) + (index + 1)
- link = result.at('//a.l')
- title = link.inner_text
- url = link.get_attribute('href')
- summary_text = ''
- cached_url = nil
- similar_url = nil
-
- if (content = (result.at('//td.j//font|//td.j/div.sml')))
- content.children.each do |elem|
- break if (!(elem.text?) && elem.name=='br')
-
- summary_text << elem.inner_text
- end
-
- if (cached_link = result.at('nobr/a:first'))
- cached_url = cached_link.get_attribute('href')
- end
-
- if (similar_link = result.at('nobr/a:last'))
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
- end
- end
-
- new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
- end
-
- block.call(new_page) if block
- return new_page
- end
-
- #
- # Returns the Results on the first page. If a _block_ is given it
- # will be passed the newly created Page.
- #
- def first_page(&block)
- page(1,&block)
- end
-
- #
- # Returns the Result at the specified _index_.
- #
- def result_at(index)
- page(result_page_index(index))[page_result_index(index)]
- end
-
- #
- # Returns the first Result on the first_page.
- #
- def top_result
- result_at(1)
- end
-
- #
- # Iterates over the results at the specified _page_index_, passing
- # each to the given _block_.
- #
- # query.each_on_page(2) do |result|
- # puts result.title
- # end
- #
- def each_on_page(page_index,&block)
- page(page_index).each(&block)
- end
-
- #
- # Iterates over the results on the first page, passing each to the
- # given _block_.
- #
- # query.each_on_first_page do |result|
- # puts result.url
- # end
- #
- def each_on_first_page(&block)
- each_on_page(1,&block)
- end
-
- #
- # Returns a SponsoredLinks object containing SponsoredAd objects of
- # the query. If a _block_ is given, it will be passed the newly
- # created Page.
- #
- def sponsored_links(&block)
- doc = get_page(search_url)
- new_links = SponsoredLinks.new
-
- # top and side ads
- doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
- title = link.inner_text
- url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
-
- new_links << SponsoredAd.new(title,url)
- end
-
- block.call(new_links) if block
- return new_links
- end
-
- #
- # Returns the first sponsored link on the first page of results.
- #
- def top_sponsored_link
- top_sponsored_links.first
- end
-
- #
- # Iterates over the sponsored links on the first page of
- # results passing each to the specified _block_.
- #
- def each_sponsored_link(&block)
- sponsored_links.each(&block)
- end
-
- protected
-
- #
- # Returns the rank offset for the specified _page_index_.
- #
- def page_result_offset(page_index)
- (page_index.to_i - 1) * @results_per_page.to_i
- end
-
- #
- # Returns the in-Page index of the _result_index_.
- #
- def page_result_index(result_index)
- (result_index.to_i - 1) % @results_per_page.to_i
- end
-
- #
- # Returns the page index for the specified _result_index_
- #
- def result_page_index(result_index)
- ((result_index.to_i - 1) / @results_per_page.to_i) + 1
+ return expr.join(' ')
  end

  end
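
For reference, a sketch of what the new Query#expression returns (the option values are illustrative; the URL building removed here now lives in the new WebQuery and AJAXQuery subclasses, which embed this expression in their search URLs):

    q = GScraper::Search::Query.new(
      :query         => 'ruby',
      :exact_phrase  => 'fluent interfaces',
      :without_words => ['java', 'php'],
      :numeric_range => (2000..2008)
    )

    q.expression
    # => ruby "fluent interfaces" -java -php 2000..2008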