gscraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt ADDED
@@ -0,0 +1,5 @@
1
+ == 0.1.0 / 2007-12-20
2
+
3
+ * Initial release.
4
+ * Supports the Google Search service.
5
+
data/LICENSE.txt ADDED
@@ -0,0 +1,23 @@
1
+
2
+
3
+ The MIT License
4
+
5
+ Copyright (c) 2007 Hal Brodigan
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,17 @@
1
+ History.txt
2
+ LICENSE.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ lib/gscraper.rb
7
+ lib/gscraper/gscraper.rb
8
+ lib/gscraper/extensions/uri/http.rb
9
+ lib/gscraper/extensions/uri.rb
10
+ lib/gscraper/extensions.rb
11
+ lib/gscraper/licenses.rb
12
+ lib/gscraper/search/result.rb
13
+ lib/gscraper/search/query.rb
14
+ lib/gscraper/search/search.rb
15
+ lib/gscraper/search.rb
16
+ test/test_gscraper.rb
17
+ test/search/query_from_url.rb
data/README.txt ADDED
@@ -0,0 +1,46 @@
1
+ GScraper
2
+ by Postmodern Modulus III
3
+ http://rubyforge.net/projects/gscraper/
4
+
5
+ == DESCRIPTION:
6
+
7
+ GScraper is a web-scraping interface to various Google Services.
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * Supports the Google Search service.
12
+ * Provides HTTP access with custom User-Agent strings.
13
+
14
+ == REQUIREMENTS:
15
+
16
+ * Hpricot
17
+ * Mechanize
18
+
19
+ == INSTALL:
20
+
21
+ sudo gem install gscraper
22
+
23
+ == LICENSE:
24
+
25
+ The MIT License
26
+
27
+ Copyright (c) 2007 Hal Brodigan
28
+
29
+ Permission is hereby granted, free of charge, to any person obtaining
30
+ a copy of this software and associated documentation files (the
31
+ 'Software'), to deal in the Software without restriction, including
32
+ without limitation the rights to use, copy, modify, merge, publish,
33
+ distribute, sublicense, and/or sell copies of the Software, and to
34
+ permit persons to whom the Software is furnished to do so, subject to
35
+ the following conditions:
36
+
37
+ The above copyright notice and this permission notice shall be
38
+ included in all copies or substantial portions of the Software.
39
+
40
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
41
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
43
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
44
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
45
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
46
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './lib/gscraper/version.rb'
6
+
7
+ Hoe.new('gscraper', GScraper::VERSION) do |p|
8
+ p.rubyforge_name = 'gscraper'
9
+ p.author = 'Postmodern Modulus III'
10
+ p.email = 'postmodern.mod3@gmail.com'
11
+ p.summary = 'A ruby web-scraping interface to various Google Services'
12
+ p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
13
+ p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
14
+ p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
15
+ end
16
+
17
+ # vim: syntax=Ruby
data/lib/gscraper.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'gscraper/search'
2
+ require 'gscraper/version'
@@ -0,0 +1 @@
1
+ require 'gscraper/extensions/uri'
@@ -0,0 +1 @@
1
+ require 'gscraper/extensions/uri/http'
@@ -0,0 +1,71 @@
1
+ module URI
2
+ class HTTP
3
+
4
+ # Query parameters
5
+ attr_reader :query_params
6
+
7
+ #
8
+ # Creates a new URI::HTTP object and initializes query_params as a
9
+ # new Hash.
10
+ #
11
+ def initialize(*args)
12
+ super(*args)
13
+
14
+ @query_params = {}
15
+ parse_query_params
16
+ end
17
+
18
+ #
19
+ # Sets the query data and updates query_params.
20
+ #
21
+ def query=(query_str)
22
+ new_query = super(query_str)
23
+ parse_query_params
24
+ return new_query
25
+ end
26
+
27
+ protected
28
+
29
+ #
30
+ # Parses the query parameters from the query data, populating
31
+ # query_params with the parsed parameters.
32
+ #
33
+ def parse_query_params
34
+ @query_params.clear
35
+
36
+ if @query
37
+ @query.split('&').each do |param|
38
+ name, value = param.split('=')
39
+
40
+ if value
41
+ @query_params[name] = URI.decode(value)
42
+ else
43
+ @query_params[name] = nil
44
+ end
45
+ end
46
+ end
47
+ end
48
+
49
+ private
50
+
51
+ # :nodoc
52
+ def path_query
53
+ str = @path
54
+
55
+ unless @query_params.empty?
56
+ str += '?' + @query_params.to_a.map { |name,value|
57
+ if value==true
58
+ "#{name}=active"
59
+ elsif value
60
+ "#{name}=#{URI.encode(value.to_s)}"
61
+ else
62
+ "#{name}="
63
+ end
64
+ }.join('&')
65
+ end
66
+
67
+ return str
68
+ end
69
+
70
+ end
71
+ end
@@ -0,0 +1,62 @@
1
+ require 'mechanize'
2
+ require 'open-uri'
3
+
4
+ module GScraper
5
+ #
6
+ # Returns the GScraper user-agent
7
+ #
8
+ def GScraper.user_agent
9
+ @user_agent
10
+ end
11
+
12
+ #
13
+ # Sets the GScraper user-agent to the specified _agent_.
14
+ #
15
+ def GScraper.user_agent=(agent)
16
+ @user_agent = agent
17
+ end
18
+
19
+ #
20
+ # Opens the _uri_ with the given _opts_. The contents of the _uri_ will
21
+ # be returned.
22
+ #
23
+ # GScraper.open('http://www.hackety.org/')
24
+ # GScraper.open('http://tenderlovemaking.com/',
25
+ # :user_agent_alias => 'Linux Mozilla')
26
+ # GScraper.open('http://www.wired.com/', :user_agent => 'the future')
27
+ #
28
+ def GScraper.open(uri,opts={})
29
+ headers = {}
30
+
31
+ if opts[:user_agent_alias]
32
+ headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[opts[:user_agent_alias]]
33
+ elsif opts[:user_agent]
34
+ headers['User-Agent'] = opts[:user_agent]
35
+ elsif GScraper.user_agent
36
+ headers['User-Agent'] = GScraper.user_agent
37
+ end
38
+
39
+ return Kernel.open(uri,headers)
40
+ end
41
+
42
+ #
43
+ # Creates a new Mechanize agent with the given _opts_.
44
+ #
45
+ # GScraper.http_agent
46
+ # GScraper.http_agent(:user_agent_alias => 'Linux Mozilla')
47
+ # GScraper.http_agent(:user_agent => 'wooden pants')
48
+ #
49
+ def GScraper.http_agent(opts={})
50
+ agent = WWW::Mechanize.new
51
+
52
+ if opts[:user_agent_alias]
53
+ agent.user_agent_alias = opts[:user_agent_alias]
54
+ elsif opts[:user_agent]
55
+ agent.user_agent = opts[:user_agent]
56
+ elsif GScraper.user_agent
57
+ agent.user_agent = GScraper.user_agent
58
+ end
59
+
60
+ return agent
61
+ end
62
+ end
@@ -0,0 +1,56 @@
1
+ module GScraper
2
+ module Licenses
3
+ ANY = nil
4
+
5
+ ALADDIN = :aladdin
6
+
7
+ ARTISTIC = :artistic
8
+
9
+ APACHE = :apache
10
+
11
+ APPLE = :apple
12
+
13
+ BSD = :bsd
14
+
15
+ COMMON_PUBLIC = :cpl
16
+
17
+ CC_BY = :cc_by
18
+
19
+ CC_BY_SA = :cc_by_sa
20
+
21
+ CC_BY_ND = :cc_by_nd
22
+
23
+ CC_BY_NC = :cc_by_nc
24
+
25
+ CC_BY_ND_SA = :cc_by_nd_sa
26
+
27
+ CC_BY_NC_SA = :cc_by_nc_sa
28
+
29
+ CC_BY_NC_ND = :cc_by_nc_nd
30
+
31
+ GPL = :gpl
32
+
33
+ LGPL = :lgpl
34
+
35
+ HISTORICAL = :disclaimer
36
+
37
+ IBM_PUBLIC = :ibm
38
+
39
+ LUCENT_PUBLIC = :lucent
40
+
41
+ MIT = :mit
42
+
43
+ MOZILLA_PUBLIC = :mozilla
44
+
45
+ NASA_OSA = :nasa
46
+
47
+ PYTHON = :python
48
+
49
+ Q_PUBLIC = :qpl
50
+
51
+ SLEEPYCAT = :sleepycat
52
+
53
+ ZOPE_PUBLIC = :zope
54
+
55
+ end
56
+ end
@@ -0,0 +1 @@
1
+ require 'gscraper/search/search'
@@ -0,0 +1,394 @@
1
+ require 'gscraper/search/result'
2
+ require 'gscraper/extensions/uri'
3
+ require 'gscraper/licenses'
4
+ require 'gscraper/gscraper'
5
+
6
+ require 'hpricot'
7
+
8
+ module GScraper
9
+ module Search
10
+ class Query
11
+
12
+ SEARCH_URL = 'http://www.google.com/search'
13
+
14
+ RESULTS_PER_PAGE = 10
15
+
16
+ # Results per-page
17
+ attr_accessor :results_per_page
18
+
19
+ # Search query
20
+ attr_accessor :query
21
+
22
+ # Search for results containing the exact phrase
23
+ attr_accessor :exact_phrase
24
+
25
+ # Search for results with the words
26
+ attr_accessor :with_words
27
+
28
+ # Search for results with-out the words
29
+ attr_accessor :without_words
30
+
31
+ # Search for results written in the language
32
+ attr_accessor :language
33
+
34
+ # Search for results from the region
35
+ attr_accessor :region
36
+
37
+ # Search for results in the format
38
+ attr_accessor :in_format
39
+
40
+ # Search for results not in the format
41
+ attr_accessor :not_in_format
42
+
43
+ # Search for results within the past day
44
+ attr_accessor :within_past_day
45
+
46
+ # Search for results within the past week
47
+ attr_accessor :within_past_week
48
+
49
+ # Search for results within the past months
50
+ attr_accessor :within_past_months
51
+
52
+ # Search for results within the past year
53
+ attr_accessor :within_past_year
54
+
55
+ # Search for results containing numbers between the range
56
+ attr_accessor :numeric_range
57
+
58
+ # Search for results where the query occurs within the area
59
+ attr_accessor :occurrs_within
60
+
61
+ # Search for results inside the domain
62
+ attr_accessor :inside_domain
63
+
64
+ # Search for results outside the domain
65
+ attr_accessor :outside_domain
66
+
67
+ # Search for results which have the rights
68
+ attr_accessor :rights
69
+
70
+ # Filter the search results
71
+ attr_accessor :filtered
72
+
73
+ # Search for results similar to the page
74
+ attr_accessor :similar_to
75
+
76
+ # Search for results linking to the page
77
+ attr_accessor :links_to
78
+
79
+ #
80
+ # Creates a new Query object from the given search options. If a
81
+ # block is given, it will be passed the newly created query object.
82
+ #
83
+ # Query.new(:query => 'ruby', :with_words => 'rspec rails')
84
+ #
85
+ # Query.new(:exact_phrase => 'fluent interfaces') do |q|
86
+ # q.within_past_week = true
87
+ # end
88
+ #
89
+ def initialize(opts={},&block)
90
+ super()
91
+
92
+ @results_per_page = opts[:results_per_page] || RESULTS_PER_PAGE
93
+
94
+ @query = opts[:query]
95
+ @exact_phrase = opts[:exact_phrase]
96
+ @with_words = opts[:with_words]
97
+ @without_words = opts[:without_words]
98
+
99
+ @language = opts[:language]
100
+ @region = opts[:region]
101
+ @in_format = opts[:in_format]
102
+ @not_in_format = opts[:not_in_format]
103
+
104
+ if opts[:within_past_day]
105
+ @within_past_day = opts[:within_past_day]
106
+ elsif opts[:within_past_week]
107
+ @within_past_week = opts[:within_past_week]
108
+ elsif opts[:within_past_months]
109
+ @within_past_months = opts[:within_past_months]
110
+ elsif opts[:within_past_year]
111
+ @within_past_year = opts[:within_past_year]
112
+ end
113
+
114
+ @numeric_range = opts[:numeric_range]
115
+ @occurrs_within = opts[:occurrs_within]
116
+ @inside_domain = opts[:inside_domain]
117
+ @outside_domain = opts[:outside_domain]
118
+ @rights = opts[:rights]
119
+ @filtered = opts[:filtered]
120
+
121
+ @similar_to = opts[:similar_to]
122
+ @links_to = opts[:links_to]
123
+
124
+ block.call(self) if block
125
+ end
126
+
127
+ #
128
+ # Creates a new Query object from the specified URL. If a block is
129
+ # given, it will be passed the newly created Query object.
130
+ #
131
+ # Query.from_url('http://www.google.com/search?q=ruby+zen')
132
+ #
133
+ # Query.from_url('http://www.google.com/search?q=ruby') do |q|
134
+ # q.within_last_month = true
135
+ # q.occurrs_within = :title
136
+ # end
137
+ #
138
+ def self.from_url(url,&block)
139
+ url = URI.parse(url)
140
+ opts = {}
141
+
142
+ opts[:results_per_page] = url.query_params['num']
143
+
144
+ opts[:query] = url.query_params['as_q']
145
+ opts[:exact_phrase] = url.query_params['as_epq']
146
+ opts[:with_words] = url.query_params['as_oq']
147
+ opts[:without_words] = url.query_params['as_eq']
148
+
149
+ opts[:language] = url.query_params['lr']
150
+ opts[:region] = url.query_params['cr']
151
+
152
+ case url.query_params['as_ft']
153
+ when 'i'
154
+ opts[:in_format] = url.query_params['as_filetype']
155
+ when 'e'
156
+ opts[:not_in_format] = url.query_params['as_filetype']
157
+ end
158
+
159
+ case url.query_params['as_qdr']
160
+ when 'd'
161
+ opts[:within_past_day] = true
162
+ when 'w'
163
+ opts[:within_past_week] = true
164
+ when 'm'
165
+ opts[:within_past_months] = 1
166
+ when 'm2'
167
+ opts[:within_past_months] = 2
168
+ when 'm3'
169
+ opts[:within_past_months] = 3
170
+ when 'm6'
171
+ opts[:within_past_months] = 6
172
+ when 'y'
173
+ opts[:within_past_year] = true
174
+ end
175
+
176
+ if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
177
+ opts[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
178
+ end
179
+
180
+ case url.query_params['as_occt']
181
+ when 'title'
182
+ opts[:occurrs_within] = :title
183
+ when 'body'
184
+ opts[:occurrs_within] = :body
185
+ when 'url'
186
+ opts[:occurrs_within] = :url
187
+ when 'links'
188
+ opts[:occurrs_within] = :links
189
+ end
190
+
191
+ case url.query_params['as_dt']
192
+ when 'i'
193
+ opts[:inside_domain] = url.query_params['as_sitesearch']
194
+ when 'e'
195
+ opts[:outside_domain] = url.query_params['as_sitesearch']
196
+ end
197
+
198
+ case url.query_params['as_rights']
199
+ when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
200
+ opts[:rights] = Licenses::CC_BY_NC_ND
201
+ when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
202
+ opts[:rights] = Licenses::CC_BY_SA
203
+ when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
204
+ opts[:rights] = Licenses::CC_BY_ND
205
+ when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
206
+ opts[:rights] = Licenses::CC_BY
207
+ end
208
+
209
+ if url.query_params['safe']=='active'
210
+ opts[:filtered] = true
211
+ end
212
+
213
+ if url.query_params['as_rq']
214
+ opts[:similar_to] = url.query_params['as_rq']
215
+ elsif url.query_params['as_lq']
216
+ opts[:links_to] = url.query_params['as_lq']
217
+ end
218
+
219
+ return self.new(opts,&block)
220
+ end
221
+
222
+ #
223
+ # Returns the URL that represents the query.
224
+ #
225
+ def search_url
226
+ url = URI.parse(SEARCH_URL)
227
+
228
+ if @results_per_page
229
+ url.query_params['num'] = @results_per_page
230
+ end
231
+
232
+ url.query_params['as_q'] = @query if @query
233
+ url.query_params['as_epq'] = @exact_phrase if @exact_phrase
234
+ url.query_params['as_oq'] = @with_words if @with_words
235
+ url.query_params['as_eq'] = @without_words if @without_words
236
+
237
+ url.query_params['lr'] = @language if @language
238
+ url.query_params['cr'] = @region if @region
239
+
240
+ if @in_format
241
+ url.query_params['as_ft'] = 'i'
242
+ url.query_params['as_filtetype'] = @in_format
243
+ elsif @not_in_format
244
+ url.query_params['as_ft'] = 'e'
245
+ url.query_params['as_filtetype'] = @not_in_format
246
+ end
247
+
248
+ if @within_past_day
249
+ url.query_params['as_qdr'] = 'd'
250
+ elsif @within_past_week
251
+ url.query_params['as_qdr'] = 'w'
252
+ elsif @within_past_months
253
+ case @within_past_months
254
+ when 1
255
+ url.query_params['as_qdr'] = 'm'
256
+ when 2
257
+ url.query_params['as_qdr'] = 'm2'
258
+ when 3
259
+ url.query_params['as_qdr'] = 'm3'
260
+ when 6
261
+ url.query_params['as_qdr'] = 'm6'
262
+ end
263
+ elsif @within_past_year
264
+ url.query_params['as_qdr'] = 'y'
265
+ end
266
+
267
+ if @numeric_range
268
+ url.query_params['as_nlo'] = @numeric_range.begin
269
+ url.query_params['as_nhi'] = @numeric_range.end
270
+ end
271
+
272
+ case @occurrs_within
273
+ when :title, 'title'
274
+ url.query_params['as_occt'] = 'title'
275
+ when :body, 'body'
276
+ url.query_params['as_occt'] = 'body'
277
+ when :url, 'url'
278
+ url.query_params['as_occt'] = 'url'
279
+ when :links, 'links'
280
+ url.query_params['as_occt'] = 'links'
281
+ end
282
+
283
+ if @inside_domain
284
+ url.query_params['as_dt'] = 'i'
285
+ url.query_params['as_sitesearch'] = @inside_domain
286
+ elsif @outside_domain
287
+ url.query_params['as_dt'] = 'e'
288
+ url.query_params['as_sitesearch'] = @outside_domain
289
+ end
290
+
291
+ case @rights
292
+ when Licenses::CC_BY_NC_ND
293
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
294
+ when Licenses::CC_BY_SA
295
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
296
+ when Licenses::CC_BY_ND
297
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
298
+ when Licenses::CC_BY
299
+ url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
300
+ end
301
+
302
+ url.query_params['safe'] = true if @filtered
303
+
304
+ if @similar_to
305
+ url.query_params['as_rq'] = @similar_to
306
+ elsif @links_to
307
+ url.query_params['as_lq'] = @links_to
308
+ end
309
+
310
+ return url
311
+ end
312
+
313
+ #
314
+ # Returns the URL that represents the query at the specific
315
+ # _page_index_.
316
+ #
317
+ def page_url(page_index)
318
+ url = search_url
319
+
320
+ url.query_params['start'] = page_index_offset(page_index)
321
+ url.query_params['sa'] = 'N'
322
+
323
+ return url
324
+ end
325
+
326
+ #
327
+ # Returns an array of Result objects at the specified _page_index_.
328
+ # If _opts_ are given, they will be used in accessing the SEARCH_URL.
329
+ #
330
+ def page(page_index,opts={})
331
+ results = []
332
+ doc = Hpricot(GScraper.open(page_url(page_index),opts))
333
+
334
+ doc.search('//div.g').each_with_index do |result,index|
335
+ rank = page_index_offset(page_index) + (index + 1)
336
+ title = result.search('//h2.r').first.inner_text
337
+ url = result.search('//h2.r/a').first.get_attribute('href')
338
+ # TODO: exclude URL and Links from summary text
339
+ summary = result.search('//td.j').first.inner_text
340
+
341
+ # TODO: scrape Cached and Similar links
342
+
343
+ results << Result.new(rank,title,url,summary)
344
+ end
345
+
346
+ return results
347
+ end
348
+
349
+ #
350
+ # Returns the results on the first page. If _opts_ are given, they
351
+ # will be used in accessing the SEARCH_URL.
352
+ #
353
+ def first_page(opts={})
354
+ page(1,opts)
355
+ end
356
+
357
+ #
358
+ # Iterates over the results at the specified _page_index_, passing
359
+ # each to the given _block_. If _opts_ are given they will be used
360
+ # in accessing the SEARCH_URL.
361
+ #
362
+ # query.each_on_page(2) do |result|
363
+ # puts result.title
364
+ # end
365
+ #
366
+ def each_on_page(page_index,opts={},&block)
367
+ page(page_index,opts).each(&block)
368
+ end
369
+
370
+ #
371
+ # Iterates over the results on the first page, passing
372
+ # each to the given _block_. If _opts_ are given, they will be used
373
+ # in accessing the SEARCH_URL.
374
+ #
375
+ # query.each_on_first_page do |result|
376
+ # puts result.url
377
+ # end
378
+ #
379
+ def each_on_first_page(opts={},&block)
380
+ each_on_page(1,opts,&block)
381
+ end
382
+
383
+ protected
384
+
385
+ #
386
+ # Returns the rank offset for the specified _page_index_.
387
+ #
388
+ def page_index_offset(page_index)
389
+ (page_index.to_i - 1) * @results_per_page.to_i
390
+ end
391
+
392
+ end
393
+ end
394
+ end
@@ -0,0 +1,37 @@
1
+ module GScraper
2
+ module Search
3
+ class Result
4
+
5
+ # Rank of the result page
6
+ attr_reader :rank
7
+
8
+ # Title of the result page
9
+ attr_reader :title
10
+
11
+ # URL of the result page
12
+ attr_reader :url
13
+
14
+ # Summary from the result page
15
+ attr_reader :summary
16
+
17
+ #
18
+ # Creates a new Result object with the given _rank_, _title_
19
+ # _summary_, _url_ and _size_.
20
+ #
21
+ def initialize(rank,title,url,summary)
22
+ @rank = rank
23
+ @title = title
24
+ @url = url
25
+ @summary = summary
26
+ end
27
+
28
+ #
29
+ # Returns a string containing the result's title.
30
+ #
31
+ def to_s
32
+ @title.to_s
33
+ end
34
+
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,33 @@
1
+ require 'gscraper/search/query'
2
+
3
+ module GScraper
4
+ module Search
5
+ #
6
+ # Returns a new Query object with the given _opts_. See Query.new.
7
+ #
8
+ # Search.query(:query => 'ruby', :with_words => 'rspec rails')
9
+ #
10
+ # Search.query(:exact_phrase => 'fluent interfaces') do |q|
11
+ # q.within_past_week = true
12
+ # end
13
+ #
14
+ def Search.query(opts={},&block)
15
+ Query.new(opts,&block)
16
+ end
17
+
18
+ #
19
+ # Returns the Query object that represents the specified _url_.
20
+ # See Query.from_url.
21
+ #
22
+ # Search.query_from_url('http://www.google.com/search?q=ruby+zen')
23
+ #
24
+ # Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
25
+ # q.within_last_month = true
26
+ # q.occurrs_within = :title
27
+ # end
28
+ #
29
+ def Search.query_from_url(url,&block)
30
+ Query.from_url(url,&block)
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,50 @@
1
+ require 'test/unit'
2
+ require 'gscraper/search/query'
3
+
4
+ class QueryFromURL < Test::Unit::TestCase
5
+
6
+ include GScraper
7
+
8
+ QUERY_URL = 'http://www.google.com/search?as_q=test&hl=en&num=20&btnG=Google+Search&as_epq=what+if&as_oq=dog&as_eq=haha&lr=&cr=&as_ft=i&as_filetype=&as_qdr=w&as_nlo=&as_nhi=&as_occt=body&as_dt=i&as_sitesearch=&as_rights=&safe=images'
9
+
10
+ def setup
11
+ @query = Search::Query.from_url(QUERY_URL)
12
+ end
13
+
14
+ def teardown
15
+ @query = nil
16
+ end
17
+
18
+ def test_query
19
+ assert_equal @query.query, 'test'
20
+ end
21
+
22
+ def test_exact_phrase
23
+ assert_equal @query.exact_phrase, 'what+if'
24
+ end
25
+
26
+ def test_with_words
27
+ assert_equal @query.with_words, 'dog'
28
+ end
29
+
30
+ def test_without_words
31
+ assert_equal @query.without_words, 'haha'
32
+ end
33
+
34
+ def test_within_past_week
35
+ assert_equal @query.within_past_week, true
36
+ end
37
+
38
+ def test_occurrs_within
39
+ assert_equal @query.occurrs_within, :body
40
+ end
41
+
42
+ def test_similar_to
43
+ assert_nil @query.similar_to
44
+ end
45
+
46
+ def test_links_to
47
+ assert_nil @query.links_to
48
+ end
49
+
50
+ end
@@ -0,0 +1,4 @@
1
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),'..','lib')))
2
+
3
+ require 'test/unit'
4
+ require 'search/query_from_url'
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: gscraper
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2007-12-21 00:00:00 -08:00
8
+ summary: A ruby web-scraping interface to various Google Services
9
+ require_paths:
10
+ - lib
11
+ email: postmodern.mod3@gmail.com
12
+ homepage: " by Postmodern Modulus III"
13
+ rubyforge_project: gscraper
14
+ description: "== FEATURES/PROBLEMS: * Supports the Google Search service. * Provides HTTP access with custom User-Agent strings. == REQUIREMENTS: * Hpricot * Mechanize == INSTALL:"
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Postmodern Modulus III
31
+ files:
32
+ - History.txt
33
+ - LICENSE.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ - Rakefile
37
+ - lib/gscraper.rb
38
+ - lib/gscraper/gscraper.rb
39
+ - lib/gscraper/extensions/uri/http.rb
40
+ - lib/gscraper/extensions/uri.rb
41
+ - lib/gscraper/extensions.rb
42
+ - lib/gscraper/licenses.rb
43
+ - lib/gscraper/search/result.rb
44
+ - lib/gscraper/search/query.rb
45
+ - lib/gscraper/search/search.rb
46
+ - lib/gscraper/search.rb
47
+ - test/test_gscraper.rb
48
+ - test/search/query_from_url.rb
49
+ test_files:
50
+ - test/test_gscraper.rb
51
+ rdoc_options:
52
+ - --main
53
+ - README.txt
54
+ extra_rdoc_files:
55
+ - History.txt
56
+ - LICENSE.txt
57
+ - Manifest.txt
58
+ - README.txt
59
+ executables: []
60
+
61
+ extensions: []
62
+
63
+ requirements: []
64
+
65
+ dependencies:
66
+ - !ruby/object:Gem::Dependency
67
+ name: hoe
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Version::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: 1.3.0
74
+ version: