gscraper 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
@@ -1,3 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  module GScraper
2
24
  module Licenses
3
25
  # Any desired license
@@ -0,0 +1,64 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ module GScraper
24
+ class Page < Array
25
+
26
+ #
27
+ # Creates a new Page object with the given _elements_. If a _block_
28
+ # is given, it will be passed the newly created Page object.
29
+ #
30
+ def initialize(elements=[],&block)
31
+ super(elements)
32
+
33
+ block.call(self) if block
34
+ end
35
+
36
+ #
37
+ # Returns a mapped Array of the elements within the Page using the
38
+ # given _block_. If the _block_ is not given, the page will be
39
+ # returned.
40
+ #
41
+ # page.map # => Page
42
+ #
43
+ # page.map { |element| element.field } # => [...]
44
+ #
45
+ def map(&block)
46
+ return self unless block
47
+
48
+ mapped = []
49
+
50
+ each { |element| mapped << block.call(element) }
51
+ return mapped
52
+ end
53
+
54
+ #
55
+ # Selects the elements within the Page which match the given _block_.
56
+ #
57
+ # page.select { |element| element.field =~ /ruby/i }
58
+ #
59
+ def select(&block)
60
+ self.class.new(super(&block))
61
+ end
62
+
63
+ end
64
+ end
@@ -1 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ require 'gscraper/search/web_query'
24
+ require 'gscraper/search/ajax_query'
1
25
  require 'gscraper/search/search'
@@ -0,0 +1,176 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
23
+ require 'gscraper/search/result'
24
+ require 'gscraper/search/page'
25
+ require 'gscraper/search/query'
26
+ require 'gscraper/extensions/uri'
27
+ require 'gscraper/has_pages'
28
+ require 'gscraper/gscraper'
29
+
30
+ require 'json'
31
+
32
+ module GScraper
33
+ module Search
34
+ class AJAXQuery < Query
35
+
36
+ include HasPages
37
+
38
+ # Maximum results per-page
39
+ RESULTS_PER_PAGE = 8
40
+
41
+ # AJAX API host
42
+ API_HOST = 'www.google.com'
43
+
44
+ # AJAX API URL
45
+ API_URL = "http://#{API_HOST}/uds/GwebSearch?callback=google.search.WebSearch.RawCompletion&context=0&lstkp=0&rsz=large"
46
+
47
+ # Default language
48
+ DEFAULT_LANGUAGE = 'en'
49
+
50
+ # Default signature
51
+ DEFAULT_SIG = '582c1116317355adf613a6a843f19ece'
52
+
53
+ # Default key
54
+ DEFAULT_KEY = 'notsupplied'
55
+
56
+ # Default version
57
+ DEFAULT_VERSION = '1.0'
58
+
59
+ # The search language
60
+ attr_accessor :language
61
+
62
+ # The search signature
63
+ attr_accessor :sig
64
+
65
+ # The search key
66
+ attr_accessor :key
67
+
68
+ # The API version
69
+ attr_accessor :version
70
+
71
+ #
72
+ # Creates a new AJAXQuery with the given _options_. If a _block_ is
73
+ # given it will be passed the newly created AJAXQuery object.
74
+ #
75
+ # _options_ may contain the following keys:
76
+ # <tt>:language</tt>:: The search language. Defaults to <tt>:en</tt>.
77
+ # <tt>:sig</tt>:: The search signature. Defaults to
78
+ # +582c1116317355adf613a6a843f19ece+.
79
+ # <tt>:key</tt>:: The search key. Defaults to <tt>:notsupplied</tt>.
80
+ # <tt>:version</tt>:: The desired API version. Defaults to
81
+ # <tt>1.0</tt>.
82
+ #
83
+ def initialize(options={},&block)
84
+ @agent = GScraper.web_agent(options)
85
+
86
+ @language = (options[:language] || DEFAULT_LANGUAGE)
87
+
88
+ @sig = (options[:sig] || DEFAULT_SIG)
89
+ @key = (options[:key] || DEFAULT_KEY)
90
+ @version = (options[:version] || DEFAULT_VERSION)
91
+
92
+ super(options,&block)
93
+ end
94
+
95
+ #
96
+ # Creates a new AJAXQuery object from the specified URL. If a block is
97
+ # given, it will be passed the newly created AJAXQuery object.
98
+ #
99
+ def self.from_url(url,options={},&block)
100
+ url = URI(url.to_s)
101
+
102
+ options[:language] = url.query_params['hl']
103
+ options[:query] = url.query_params['q']
104
+
105
+ options[:sig] = url.query_params['sig']
106
+ options[:key] = url.query_params['key']
107
+ options[:version] = url.query_params['v']
108
+
109
+ return self.new(options,&block)
110
+ end
111
+
112
+ #
113
+ # Returns +RESULTS_PER_PAGE+.
114
+ #
115
+ def results_per_page
116
+ RESULTS_PER_PAGE
117
+ end
118
+
119
+ #
120
+ # Returns the URL that represents the query.
121
+ #
122
+ def search_url
123
+ search_url = URI(API_URL)
124
+
125
+ search_url.query_params['hl'] = @language
126
+ search_url.query_params['gss'] = '.com'
127
+ search_url.query_params['q'] = expression
128
+ search_url.query_params['sig'] = @sig
129
+ search_url.query_params['key'] = @key
130
+ search_url.query_params['v'] = @version
131
+
132
+ return search_url
133
+ end
134
+
135
+ #
136
+ # Returns the URL that represents the query of a specific
137
+ # _page_index_.
138
+ #
139
+ def page_url(page_index)
140
+ url = search_url
141
+
142
+ if page_index > 1
143
+ url.query_params['start'] = result_offset_of(page_index)
144
+ end
145
+
146
+ return url
147
+ end
148
+
149
+ #
150
+ # Returns a Page object containing Result objects at the specified
151
+ # _page_index_.
152
+ #
153
+ def page(page_index)
154
+ Page.new do |new_page|
155
+ body = @agent.get(page_url(page_index)).body
156
+ hash = JSON.parse(body.scan(/\{.*\}/).first)
157
+
158
+ rank_offset = result_offset_of(page_index)
159
+
160
+ if (hash.kind_of?(Hash) && hash['results'])
161
+ hash['results'].each_with_index do |result,index|
162
+ rank = rank_offset + (index + 1)
163
+ title = Hpricot(result['title']).inner_text
164
+ url = result['unescapedUrl']
165
+ summary = Hpricot(result['content']).inner_text
166
+ cached_url = result['cacheUrl']
167
+
168
+ new_page << Result.new(rank,title,url,summary,cached_url)
169
+ end
170
+ end
171
+ end
172
+ end
173
+
174
+ end
175
+ end
176
+ end
@@ -1,42 +1,31 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/search/result'
24
+ require 'gscraper/page'
2
25
 
3
26
  module GScraper
4
27
  module Search
5
- class Page < Array
6
-
7
- #
8
- # Creates a new Page object with the given _results_.
9
- #
10
- def initialize(results=[])
11
- super(results)
12
- end
13
-
14
- #
15
- # Returns a mapped Array of the results within the Page using the
16
- # given _block_. If the _block_ is not given, the page will be
17
- # returned.
18
- #
19
- # page.map # => Page
20
- #
21
- # page.map { |result| result.url } # => [...]
22
- #
23
- def map(&block)
24
- return self unless block
25
-
26
- mapped = []
27
-
28
- each { |result| mapped << block.call(result) }
29
- return mapped
30
- end
31
-
32
- #
33
- # Selects the results within the Page which match the given _block_.
34
- #
35
- # page.select { |result| result.title =~ /ruby/i }
36
- #
37
- def select(&block)
38
- Page.new(super(&block))
39
- end
28
+ class Page < GScraper::Page
40
29
 
41
30
  #
42
31
  # Selects the results using the specified _block_.
@@ -160,7 +149,7 @@ module GScraper
160
149
  # page.cached_urls # => [...]
161
150
  #
162
151
  def cached_urls
163
- map { |result| result.cached_url }
152
+ map { |result| result.cached_url }.compact
164
153
  end
165
154
 
166
155
  #
@@ -170,7 +159,7 @@ module GScraper
170
159
  # page.cached_pages # => [...]
171
160
  #
172
161
  def cached_pages
173
- map { |result| result.cached_page }
162
+ map { |result| result.cached_page }.compact
174
163
  end
175
164
 
176
165
  #
@@ -180,17 +169,7 @@ module GScraper
180
169
  # page.similar_urls # => [...]
181
170
  #
182
171
  def similar_urls
183
- map { |result| result.similar_url }
184
- end
185
-
186
- #
187
- # Returns an Array containing the similar Queries of the results
188
- # within the Page.
189
- #
190
- # page.similar_queries # => [...]
191
- #
192
- def similar_queries
193
- map { |result| result.similar_query }
172
+ map { |result| result.similar_url }.compact
194
173
  end
195
174
 
196
175
  #
@@ -263,20 +242,6 @@ module GScraper
263
242
  similar_urls.each(&block)
264
243
  end
265
244
 
266
- #
267
- # Iterates over each result's similar Query within the Page, passing
268
- # each to the given _block_.
269
- #
270
- # each_similar_query do |q|
271
- # q.first_page do |page|
272
- # puts page.urls.join("\n")
273
- # end
274
- # end
275
- #
276
- def each_similar_query(&block)
277
- similar_queries.each(&block)
278
- end
279
-
280
245
  #
281
246
  # Returns the ranks of the results that match the specified _block_.
282
247
  #
@@ -345,16 +310,6 @@ module GScraper
345
310
  results_with(&block).similar_urls
346
311
  end
347
312
 
348
- #
349
- # Returns the similar Queries of the results that match the
350
- # specified _block_.
351
- #
352
- # page.similar_queries_of { |result| result.title =~ /hackety/ }
353
- #
354
- def similar_queries_of(&block)
355
- results_with(&block).similar_queries
356
- end
357
-
358
313
  end
359
314
  end
360
315
  end
@@ -1,10 +1,33 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/search/result'
2
24
  require 'gscraper/search/page'
3
25
  require 'gscraper/sponsored_ad'
4
26
  require 'gscraper/sponsored_links'
5
27
  require 'gscraper/extensions/uri'
28
+ require 'gscraper/has_pages'
6
29
  require 'gscraper/licenses'
7
- require 'gscraper/web_agent'
30
+ require 'gscraper/gscraper'
8
31
 
9
32
  require 'hpricot'
10
33
 
@@ -12,20 +35,6 @@ module GScraper
12
35
  module Search
13
36
  class Query
14
37
 
15
- include WebAgent
16
-
17
- # Search host
18
- SEARCH_HOST = 'www.google.com'
19
-
20
- # Search URL
21
- SEARCH_URL = "http://#{SEARCH_HOST}/search"
22
-
23
- # Default results per-page
24
- RESULTS_PER_PAGE = 10
25
-
26
- # Results per-page
27
- attr_accessor :results_per_page
28
-
29
38
  # Search query
30
39
  attr_accessor :query
31
40
 
@@ -71,67 +80,14 @@ module GScraper
71
80
  # Search for results with-out the words
72
81
  attr_accessor :without_words
73
82
 
74
- # Search for results written in the language
75
- attr_accessor :language
76
-
77
- # Search for results from the region
78
- attr_accessor :region
79
-
80
- # Search for results in the format
81
- attr_accessor :in_format
82
-
83
- # Search for results not in the format
84
- attr_accessor :not_in_format
85
-
86
- # Search for results within the past day
87
- attr_accessor :within_past_day
88
-
89
- # Search for results within the past week
90
- attr_accessor :within_past_week
91
-
92
- # Search for results within the past months
93
- attr_accessor :within_past_months
94
-
95
- # Search for results within the past year
96
- attr_accessor :within_past_year
97
-
98
83
  # Search for results containing numbers between the range
99
84
  attr_accessor :numeric_range
100
85
 
101
- # Search for results where the query ocurrs within the area
102
- attr_accessor :occurrs_within
103
-
104
- # Search for results inside the domain
105
- attr_accessor :inside_domain
106
-
107
- # Search for results outside the domain
108
- attr_accessor :outside_domain
109
-
110
- # Search for results which have the rights
111
- attr_accessor :rights
112
-
113
- # Filter the search results
114
- attr_accessor :filtered
115
-
116
- # Search for results similar to the page
117
- attr_accessor :similar_to
118
-
119
- # Search for results linking to the page
120
- attr_accessor :links_to
121
-
122
86
  #
123
87
  # Creates a new Query object from the given search options. If a
124
- # block is given, it will be passed the newly created query object.
125
- #
126
- # Query.new(:query => 'ruby', :with_words => 'sow rspec')
127
- #
128
- # Query.new(:exact_phrase => 'fluent interfaces') do |q|
129
- # q.within_past_week = true
130
- # end
88
+ # block is given, it will be passed the newly created Query object.
131
89
  #
132
90
  def initialize(options={},&block)
133
- @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
134
-
135
91
  @query = options[:query]
136
92
 
137
93
  @link = options[:link]
@@ -151,175 +107,34 @@ module GScraper
151
107
  @with_words = options[:with_words]
152
108
  @without_words = options[:without_words]
153
109
 
154
- @language = options[:language]
155
- @region = options[:region]
156
- @in_format = options[:in_format]
157
- @not_in_format = options[:not_in_format]
158
-
159
- if options[:within_past_day]
160
- @within_past_day = options[:within_past_day]
161
- @within_past_week = false
162
- @within_past_months = false
163
- @within_past_year = false
164
- elsif options[:within_past_week]
165
- @within_past_day = false
166
- @within_past_week = options[:within_past_week]
167
- @within_past_months = false
168
- @within_past_year = false
169
- elsif options[:within_past_months]
170
- @within_past_day = false
171
- @within_past_week = false
172
- @within_past_months = options[:within_past_months]
173
- @within_past_year = false
174
- elsif options[:within_past_year]
175
- @within_past_day = false
176
- @within_past_week = false
177
- @within_past_months = false
178
- @within_past_year = options[:within_past_year]
179
- else
180
- @within_past_day = false
181
- @within_past_week = false
182
- @within_past_months = false
183
- @within_past_year = false
184
- end
185
-
186
110
  @numeric_range = options[:numeric_range]
187
- @occurrs_within = options[:occurrs_within]
188
- @inside_domain = options[:inside_domain]
189
- @outside_domain = options[:outside_domain]
190
- @rights = options[:rights]
191
- @filtered = options[:filtered]
192
-
193
- @similar_to = options[:similar_to]
194
- @links_to = options[:links_to]
195
111
 
196
112
  block.call(self) if block
197
113
  end
198
114
 
199
115
  #
200
- # Creates a new Query object from the specified URL. If a block is
201
- # given, it will be passed the newly created Query object.
202
- #
203
- # Query.from_url('http://www.google.com/search?q=ruby+zen)
116
+ # Returns the query expression.
204
117
  #
205
- # Query.from_url('http://www.google.com/search?q=ruby') do |q|
206
- # q.within_last_month = true
207
- # q.occurrs_within = :title
208
- # end
209
- #
210
- def self.from_url(url,options={},&block)
211
- url = URI.parse(url)
212
-
213
- options[:results_per_page] = url.query_params['num']
214
-
215
- options[:query] = url.query_params['as_q']
216
- options[:exact_phrase] = url.query_params['as_epq']
217
- options[:with_words] = url.query_params['as_oq']
218
- options[:without_words] = url.query_params['as_eq']
219
-
220
- options[:language] = url.query_params['lr']
221
- options[:region] = url.query_params['cr']
222
-
223
- case url.query_params['as_ft']
224
- when 'i'
225
- options[:in_format] = url.query_params['as_filetype']
226
- when 'e'
227
- options[:not_in_format] = url.query_params['as_filetype']
228
- end
229
-
230
- case url.query_params['as_qdr']
231
- when 'd'
232
- options[:within_past_day] = true
233
- when 'w'
234
- options[:within_past_week] = true
235
- when 'm'
236
- options[:within_past_months] = 1
237
- when 'm2'
238
- options[:within_past_months] = 2
239
- when 'm3'
240
- options[:within_past_months] = 3
241
- when 'm6'
242
- options[:within_past_months] = 6
243
- when 'y'
244
- options[:within_past_year] = true
245
- end
246
-
247
- if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
248
- options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
249
- end
250
-
251
- case url.query_params['as_occt']
252
- when 'title'
253
- options[:occurrs_within] = :title
254
- when 'body'
255
- options[:occurrs_within] = :body
256
- when 'url'
257
- options[:occurrs_within] = :url
258
- when 'links'
259
- options[:occurrs_within] = :links
260
- end
261
-
262
- case url.query_params['as_dt']
263
- when 'i'
264
- options[:inside_domain] = url.query_params['as_sitesearch']
265
- when 'e'
266
- options[:outside_domain] = url.query_params['as_sitesearch']
267
- end
268
-
269
- case url.query_params['as_rights']
270
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
271
- options[:rights] = Licenses::CC_BY_NC_ND
272
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
273
- options[:rights] = Licenses::CC_BY_SA
274
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
275
- options[:rights] = Licenses::CC_BY_NC
276
- when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
277
- options[:rights] = Licenses::CC_BY
278
- end
279
-
280
- if url.query_params[:safe]=='active'
281
- options[:filtered] = true
282
- end
283
-
284
- if url.query_params['as_rq']
285
- options[:similar_to] = url.query_params['as_rq']
286
- elsif url.query_params['as_lq']
287
- options[:links_to] = url.query_params['as_lq']
288
- end
289
-
290
- return self.new(options,&block)
291
- end
292
-
293
- #
294
- # Returns the URL that represents the query.
295
- #
296
- def search_url
297
- url = URI(SEARCH_URL)
298
- query_expr = []
299
-
300
- set_param = lambda { |param,value|
301
- url.query_params[param.to_s] = value if value
302
- }
118
+ def expression
119
+ expr = []
303
120
 
304
121
  append_modifier = lambda { |name|
305
122
  modifier = instance_variable_get("@#{name}")
306
123
 
307
- query_expr << "#{name}:#{modifier}" if modifier
124
+ expr << "#{name}:#{modifier}" if modifier
308
125
  }
309
126
 
310
- join_ops = lambda { |name|
127
+ append_options = lambda { |name|
311
128
  ops = instance_variable_get("@#{name}")
312
129
 
313
130
  if ops.kind_of?(Array)
314
- query_expr << "#{name}:#{ops.join(' ')}"
131
+ expr << "#{name}:#{ops.join(' ')}"
315
132
  elsif ops
316
- query_expr << "#{name}:#{ops}"
133
+ expr << "#{name}:#{ops}"
317
134
  end
318
135
  }
319
136
 
320
- set_param.call('num',@results_per_page)
321
-
322
- query_expr << @query if @query
137
+ expr << @query if @query
323
138
 
324
139
  append_modifier.call(:link)
325
140
  append_modifier.call(:related)
@@ -327,256 +142,30 @@ module GScraper
327
142
  append_modifier.call(:site)
328
143
  append_modifier.call(:filetype)
329
144
 
330
- join_ops.call(:allintitle)
145
+ append_options.call(:allintitle)
331
146
  append_modifier.call(:intitle)
332
- join_ops.call(:allinurl)
147
+ append_options.call(:allinurl)
333
148
  append_modifier.call(:inurl)
334
- join_ops.call(:allintext)
149
+ append_options.call(:allintext)
335
150
  append_modifier.call(:intext)
336
151
 
337
- unless query_expr.empty?
338
- url.query_params['as_q'] = query_expr.join(' ')
339
- end
340
-
341
- set_param.call('as_epq',@exact_phrase)
342
- set_param.call('as_oq',@with_words)
343
- set_param.call('as_eq',@without_words)
344
-
345
- set_param.call('lr',@language)
346
- set_param.call('cr',@region)
347
-
348
- if @in_format
349
- url.query_params['as_ft'] = 'i'
350
- url.query_params['as_filtetype'] = @in_format
351
- elsif @not_in_format
352
- url.query_params['as_ft'] = 'e'
353
- url.query_params['as_filtetype'] = @not_in_format
354
- end
355
-
356
- if @within_past_day
357
- url.query_params['as_qdr'] = 'd'
358
- elsif @within_past_week
359
- url.query_params['as_qdr'] = 'w'
360
- elsif @within_past_months
361
- case @within_past_months
362
- when 1
363
- url.query_params['as_qdr'] = 'm'
364
- when 2
365
- url.query_params['as_qdr'] = 'm2'
366
- when 3
367
- url.query_params['as_qdr'] = 'm3'
368
- when 6
369
- url.query_params['as_qdr'] = 'm6'
370
- end
371
- elsif @within_past_year
372
- url.query_params['as_qdr'] = 'y'
373
- end
374
-
375
- if @numeric_range
376
- url.query_params['as_nlo'] = @numeric_range.begin
377
- url.query_params['as_nhi'] = @numeric_range.end
378
- end
379
-
380
- case @occurrs_within
381
- when :title, 'title'
382
- url.query_params['as_occt'] = 'title'
383
- when :body, 'body'
384
- url.query_params['as_occt'] = 'body'
385
- when :url, 'url'
386
- url.query_params['as_occt'] = 'url'
387
- when :links, 'links'
388
- url.query_params['as_occt'] = 'links'
152
+ if @exact_phrase
153
+ expr << "\"#{@exact_phrase}\""
389
154
  end
390
155
 
391
- if @inside_domain
392
- url.query_params['as_dt'] = 'i'
393
- url.query_params['as_sitesearch'] = @inside_domain
394
- elsif @outside_domain
395
- url.query_params['as_dt'] = 'e'
396
- url.query_params['as_sitesearch'] = @outside_domain
156
+ if @with_words.kind_of?(Array)
157
+ expr << @with_words.join(' OR ')
397
158
  end
398
-
399
- case @rights
400
- when Licenses::CC_BY_NC_ND
401
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
402
- when Licenses::CC_BY_SA
403
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
404
- when Licenses::CC_BY_ND
405
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
406
- when Licenses::CC_BY
407
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
159
+
160
+ if @without_words.kind_of?(Array)
161
+ expr << @without_words.map { |word| "-#{word}" }.join(' ')
408
162
  end
409
163
 
410
- url.query_params['safe'] = true if @filtered
411
-
412
- if @similar_to
413
- url.query_params['as_rq'] = @similar_to
414
- elsif @links_to
415
- url.query_params['as_lq'] = @links_to
164
+ if @numeric_range.kind_of?(Range)
165
+ expr << "#{@numeric_range.begin}..#{@numeric_range.end}"
416
166
  end
417
167
 
418
- return url
419
- end
420
-
421
- #
422
- # Returns the URL that represents the query at the specific
423
- # _page_index_.
424
- #
425
- def page_url(page_index)
426
- url = search_url
427
-
428
- url.query_params['start'] = page_result_offset(page_index)
429
- url.query_params['sa'] = 'N'
430
-
431
- return url
432
- end
433
-
434
- #
435
- # Returns a Page object containing Result objects at the specified
436
- # _page_index_. If a _block_ is given, it will be passed the newly
437
- # created Page.
438
- #
439
- def page(page_index,&block)
440
- doc = get_page(page_url(page_index))
441
-
442
- new_page = Page.new
443
- results = doc.search('//div.g')[0...@results_per_page.to_i]
444
-
445
- results.each_with_index do |result,index|
446
- rank = page_result_offset(page_index) + (index + 1)
447
- link = result.at('//a.l')
448
- title = link.inner_text
449
- url = link.get_attribute('href')
450
- summary_text = ''
451
- cached_url = nil
452
- similar_url = nil
453
-
454
- if (content = (result.at('//td.j//font|//td.j/div.sml')))
455
- content.children.each do |elem|
456
- break if (!(elem.text?) && elem.name=='br')
457
-
458
- summary_text << elem.inner_text
459
- end
460
-
461
- if (cached_link = result.at('nobr/a:first'))
462
- cached_url = cached_link.get_attribute('href')
463
- end
464
-
465
- if (similar_link = result.at('nobr/a:last'))
466
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
467
- end
468
- end
469
-
470
- new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
471
- end
472
-
473
- block.call(new_page) if block
474
- return new_page
475
- end
476
-
477
- #
478
- # Returns the Results on the first page. If a _block_ is given it
479
- # will be passed the newly created Page.
480
- #
481
- def first_page(&block)
482
- page(1,&block)
483
- end
484
-
485
- #
486
- # Returns the Result at the specified _index_.
487
- #
488
- def result_at(index)
489
- page(result_page_index(index))[page_result_index(index)]
490
- end
491
-
492
- #
493
- # Returns the first Result on the first_page.
494
- #
495
- def top_result
496
- result_at(1)
497
- end
498
-
499
- #
500
- # Iterates over the results at the specified _page_index_, passing
501
- # each to the given _block_.
502
- #
503
- # query.each_on_page(2) do |result|
504
- # puts result.title
505
- # end
506
- #
507
- def each_on_page(page_index,&block)
508
- page(page_index).each(&block)
509
- end
510
-
511
- #
512
- # Iterates over the results on the first page, passing each to the
513
- # given _block_.
514
- #
515
- # query.each_on_first_page do |result|
516
- # puts result.url
517
- # end
518
- #
519
- def each_on_first_page(&block)
520
- each_on_page(1,&block)
521
- end
522
-
523
- #
524
- # Returns a SponsoredLinks object containing SponsoredAd objects of
525
- # the query. If a _block_ is given, it will be passed the newly
526
- # created Page.
527
- #
528
- def sponsored_links(&block)
529
- doc = get_page(search_url)
530
- new_links = SponsoredLinks.new
531
-
532
- # top and side ads
533
- doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
534
- title = link.inner_text
535
- url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
536
-
537
- new_links << SponsoredAd.new(title,url)
538
- end
539
-
540
- block.call(new_links) if block
541
- return new_links
542
- end
543
-
544
- #
545
- # Returns the first sponsored link on the first page of results.
546
- #
547
- def top_sponsored_link
548
- top_sponsored_links.first
549
- end
550
-
551
- #
552
- # Iterates over the sponsored links on the first page of
553
- # results passing each to the specified _block_.
554
- #
555
- def each_sponsored_link(&block)
556
- sponsored_links.each(&block)
557
- end
558
-
559
- protected
560
-
561
- #
562
- # Returns the rank offset for the specified _page_index_.
563
- #
564
- def page_result_offset(page_index)
565
- (page_index.to_i - 1) * @results_per_page.to_i
566
- end
567
-
568
- #
569
- # Returns the in-Page index of the _result_index_.
570
- #
571
- def page_result_index(result_index)
572
- (result_index.to_i - 1) % @results_per_page.to_i
573
- end
574
-
575
- #
576
- # Returns the page index for the specified _result_index_
577
- #
578
- def result_page_index(result_index)
579
- ((result_index.to_i - 1) / @results_per_page.to_i) + 1
168
+ return expr.join(' ')
580
169
  end
581
170
 
582
171
  end