gscraper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,5 @@
1
+ == 0.1.0 / 2007-12-20
2
+
3
+ * Initial release.
4
+ * Supports the Google Search service.
5
+
data/LICENSE.txt ADDED
@@ -0,0 +1,23 @@
1
+
2
+
3
+ The MIT License
4
+
5
+ Copyright (c) 2007 Hal Brodigan
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,17 @@
1
+ History.txt
2
+ LICENSE.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ lib/gscraper.rb
7
+ lib/gscraper/gscraper.rb
8
+ lib/gscraper/extensions/uri/http.rb
9
+ lib/gscraper/extensions/uri.rb
10
+ lib/gscraper/extensions.rb
11
+ lib/gscraper/licenses.rb
12
+ lib/gscraper/search/result.rb
13
+ lib/gscraper/search/query.rb
14
+ lib/gscraper/search/search.rb
15
+ lib/gscraper/search.rb
16
+ test/test_gscraper.rb
17
+ test/search/query_from_url.rb
data/README.txt ADDED
@@ -0,0 +1,46 @@
1
+ GScraper
2
+ by Postmodern Modulus III
3
+ http://rubyforge.net/projects/gscraper/
4
+
5
+ == DESCRIPTION:
6
+
7
+ GScraper is a web-scraping interface to various Google Services.
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * Supports the Google Search service.
12
+ * Provides HTTP access with custom User-Agent strings.
13
+
14
+ == REQUIREMENTS:
15
+
16
+ * Hpricot
17
+ * Mechanize
18
+
19
+ == INSTALL:
20
+
21
+ sudo gem install gscraper
22
+
23
+ == LICENSE:
24
+
25
+ The MIT License
26
+
27
+ Copyright (c) 2007 Hal Brodigan
28
+
29
+ Permission is hereby granted, free of charge, to any person obtaining
30
+ a copy of this software and associated documentation files (the
31
+ 'Software'), to deal in the Software without restriction, including
32
+ without limitation the rights to use, copy, modify, merge, publish,
33
+ distribute, sublicense, and/or sell copies of the Software, and to
34
+ permit persons to whom the Software is furnished to do so, subject to
35
+ the following conditions:
36
+
37
+ The above copyright notice and this permission notice shall be
38
+ included in all copies or substantial portions of the Software.
39
+
40
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
41
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
43
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
44
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
45
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
46
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
# -*- ruby -*-

require 'rubygems'
require 'hoe'
require './lib/gscraper/version.rb'

# Hoe-based gem build/release tasks for gscraper.
Hoe.new('gscraper', GScraper::VERSION) do |p|
  p.rubyforge_name = 'gscraper'
  p.author = 'Postmodern Modulus III'
  p.email = 'postmodern.mod3@gmail.com'
  p.summary = 'A ruby web-scraping interface to various Google Services'
  p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
  # BUG FIX: `[1..-1]` returned BOTH the author line and the URL line,
  # which garbled the gem's homepage field (see the generated gemspec
  # metadata). Use only the last line of the header paragraph — the
  # project URL — stripped of its leading indentation.
  p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/).last.strip
  p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
end

# vim: syntax=Ruby
data/lib/gscraper.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'gscraper/search'
2
+ require 'gscraper/version'
@@ -0,0 +1 @@
1
+ require 'gscraper/extensions/uri'
@@ -0,0 +1 @@
1
+ require 'gscraper/extensions/uri/http'
@@ -0,0 +1,71 @@
1
+ module URI
2
+ class HTTP
3
+
4
+ # Query parameters
5
+ attr_reader :query_params
6
+
7
+ #
8
+ # Creates a new URI::HTTP object and initializes query_params as a
9
+ # new Hash.
10
+ #
11
+ def initialize(*args)
12
+ super(*args)
13
+
14
+ @query_params = {}
15
+ parse_query_params
16
+ end
17
+
18
+ #
19
+ # Sets the query data and updates query_params.
20
+ #
21
+ def query=(query_str)
22
+ new_query = super(query_str)
23
+ parse_query_params
24
+ return new_query
25
+ end
26
+
27
+ protected
28
+
29
+ #
30
+ # Parses the query parameters from the query data, populating
31
+ # query_params with the parsed parameters.
32
+ #
33
+ def parse_query_params
34
+ @query_params.clear
35
+
36
+ if @query
37
+ @query.split('&').each do |param|
38
+ name, value = param.split('=')
39
+
40
+ if value
41
+ @query_params[name] = URI.decode(value)
42
+ else
43
+ @query_params[name] = nil
44
+ end
45
+ end
46
+ end
47
+ end
48
+
49
+ private
50
+
51
+ # :nodoc
52
+ def path_query
53
+ str = @path
54
+
55
+ unless @query_params.empty?
56
+ str += '?' + @query_params.to_a.map { |name,value|
57
+ if value==true
58
+ "#{name}=active"
59
+ elsif value
60
+ "#{name}=#{URI.encode(value.to_s)}"
61
+ else
62
+ "#{name}="
63
+ end
64
+ }.join('&')
65
+ end
66
+
67
+ return str
68
+ end
69
+
70
+ end
71
+ end
@@ -0,0 +1,62 @@
1
require 'mechanize'
require 'open-uri'

module GScraper
  #
  # Returns the global GScraper User-Agent string, or nil when unset.
  #
  def self.user_agent
    @user_agent
  end

  #
  # Sets the global GScraper User-Agent string to the specified _agent_.
  #
  def self.user_agent=(agent)
    @user_agent = agent
  end

  #
  # Opens the _uri_ with the given _opts_ and returns its contents.
  # The User-Agent header is taken from :user_agent_alias (looked up in
  # Mechanize's alias table), then :user_agent, then the global
  # GScraper.user_agent.
  #
  #   GScraper.open('http://www.hackety.org/')
  #   GScraper.open('http://tenderlovemaking.com/',
  #                 :user_agent_alias => 'Linux Mozilla')
  #   GScraper.open('http://www.wired.com/', :user_agent => 'the future')
  #
  # NOTE(review): this delegates to Kernel.open — never pass untrusted
  # strings here, a leading '|' would spawn a subprocess.
  #
  def self.open(uri,opts={})
    headers = {}
    chosen_alias = opts[:user_agent_alias]

    if chosen_alias
      headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[chosen_alias]
    else
      agent_string = opts[:user_agent] || GScraper.user_agent
      headers['User-Agent'] = agent_string if agent_string
    end

    Kernel.open(uri,headers)
  end

  #
  # Creates a new Mechanize agent configured with the given _opts_,
  # using the same User-Agent precedence as GScraper.open.
  #
  #   GScraper.http_agent
  #   GScraper.http_agent(:user_agent_alias => 'Linux Mozilla')
  #   GScraper.http_agent(:user_agent => 'wooden pants')
  #
  def self.http_agent(opts={})
    agent = WWW::Mechanize.new
    chosen_alias = opts[:user_agent_alias]

    if chosen_alias
      agent.user_agent_alias = chosen_alias
    else
      agent_string = opts[:user_agent] || GScraper.user_agent
      agent.user_agent = agent_string if agent_string
    end

    agent
  end
end
@@ -0,0 +1,56 @@
1
module GScraper
  #
  # Symbolic names for the license filters accepted by Google's
  # advanced-search 'rights' option.
  #
  module Licenses
    # Any license
    ANY = nil

    ALADDIN = :aladdin

    ARTISTIC = :artistic

    APACHE = :apache

    APPLE = :apple

    BSD = :bsd

    COMMON_PUBLIC = :cpl

    # Creative Commons: Attribution
    CC_BY = :cc_by

    # Creative Commons: Attribution-ShareAlike
    CC_BY_SA = :cc_by_sa

    # Creative Commons: Attribution-NoDerivs
    CC_BY_ND = :cc_by_nd

    # Creative Commons: Attribution-NonCommercial
    # BUG FIX: was copy-pasted as :cc_by_nc_sa, colliding with
    # CC_BY_NC_SA below.
    CC_BY_NC = :cc_by_nc

    # NOTE(review): value duplicates CC_BY_ND and "ND-SA" is not a real
    # Creative Commons combination — looks like a copy-paste remnant;
    # confirm before relying on this constant.
    CC_BY_ND_SA = :cc_by_nd

    # Creative Commons: Attribution-NonCommercial-ShareAlike
    CC_BY_NC_SA = :cc_by_nc_sa

    # Creative Commons: Attribution-NonCommercial-NoDerivs
    CC_BY_NC_ND = :cc_by_nc_nd

    GPL = :gpl

    LGPL = :lgpl

    # Historical Permission Notice and Disclaimer
    HISTORICAL = :disclaimer

    IBM_PUBLIC = :ibm

    LUCENT_PUBLIC = :lucent

    MIT = :mit

    # Mozilla Public License
    MOZILLA_PUBLIC = :mozilla

    # Backwards-compatible alias for the original misspelled constant.
    MOZILLA_PUBLI = MOZILLA_PUBLIC

    NASA_OSA = :nasa

    PYTHON = :python

    Q_PUBLIC = :qpl

    SLEEPYCAT = :sleepycat

    ZOPE_PUBLIC = :zope

  end
end
@@ -0,0 +1 @@
1
+ require 'gscraper/search/search'
@@ -0,0 +1,394 @@
1
+ require 'gscraper/search/result'
2
+ require 'gscraper/extensions/uri'
3
+ require 'gscraper/licenses'
4
+ require 'gscraper/gscraper'
5
+
6
+ require 'hpricot'
7
+
8
module GScraper
  module Search
    #
    # Represents a query against the Google Search service, mapping the
    # advanced-search options to and from Google's URL query parameters.
    #
    class Query

      # URL of the Google Search service
      SEARCH_URL = 'http://www.google.com/search'

      # Default number of results per-page
      RESULTS_PER_PAGE = 10

      # Results per-page
      attr_accessor :results_per_page

      # Search query
      attr_accessor :query

      # Search for results containing the exact phrase
      attr_accessor :exact_phrase

      # Search for results with the words
      attr_accessor :with_words

      # Search for results with-out the words
      attr_accessor :without_words

      # Search for results written in the language
      attr_accessor :language

      # Search for results from the region
      attr_accessor :region

      # Search for results in the format
      attr_accessor :in_format

      # Search for results not in the format
      attr_accessor :not_in_format

      # Search for results within the past day
      attr_accessor :within_past_day

      # Search for results within the past week
      attr_accessor :within_past_week

      # Search for results within the past months
      attr_accessor :within_past_months

      # Search for results within the past year
      attr_accessor :within_past_year

      # Search for results containing numbers between the range
      attr_accessor :numeric_range

      # Search for results where the query occurs within the area.
      # (The accessor keeps the historical misspelling "occurrs" for
      # backwards compatibility with existing callers.)
      attr_accessor :occurrs_within

      # Search for results inside the domain
      attr_accessor :inside_domain

      # Search for results outside the domain
      attr_accessor :outside_domain

      # Search for results which have the rights (see Licenses)
      attr_accessor :rights

      # Filter the search results (Google SafeSearch)
      attr_accessor :filtered

      # Search for results similar to the page
      attr_accessor :similar_to

      # Search for results linking to the page
      attr_accessor :links_to

      #
      # Creates a new Query object from the given search options. If a
      # block is given, it will be passed the newly created query object.
      #
      #   Query.new(:query => 'ruby', :with_words => 'rspec rails')
      #
      #   Query.new(:exact_phrase => 'fluent interfaces') do |q|
      #     q.within_past_week = true
      #   end
      #
      def initialize(opts={},&block)
        super()

        @results_per_page = opts[:results_per_page] || RESULTS_PER_PAGE

        @query = opts[:query]
        @exact_phrase = opts[:exact_phrase]
        @with_words = opts[:with_words]
        @without_words = opts[:without_words]

        @language = opts[:language]
        @region = opts[:region]
        @in_format = opts[:in_format]
        @not_in_format = opts[:not_in_format]

        # only one "within past ..." restriction may be active at a time
        if opts[:within_past_day]
          @within_past_day = opts[:within_past_day]
        elsif opts[:within_past_week]
          @within_past_week = opts[:within_past_week]
        elsif opts[:within_past_months]
          @within_past_months = opts[:within_past_months]
        elsif opts[:within_past_year]
          @within_past_year = opts[:within_past_year]
        end

        @numeric_range = opts[:numeric_range]
        @occurrs_within = opts[:occurrs_within]
        @inside_domain = opts[:inside_domain]
        @outside_domain = opts[:outside_domain]
        @rights = opts[:rights]
        @filtered = opts[:filtered]

        @similar_to = opts[:similar_to]
        @links_to = opts[:links_to]

        block.call(self) if block
      end

      #
      # Creates a new Query object from the specified URL. If a block is
      # given, it will be passed the newly created Query object.
      #
      #   Query.from_url('http://www.google.com/search?q=ruby+zen')
      #
      #   Query.from_url('http://www.google.com/search?q=ruby') do |q|
      #     q.within_last_month = true
      #     q.occurrs_within = :title
      #   end
      #
      def self.from_url(url,&block)
        url = URI.parse(url)
        opts = {}

        opts[:results_per_page] = url.query_params['num']

        opts[:query] = url.query_params['as_q']
        opts[:exact_phrase] = url.query_params['as_epq']
        opts[:with_words] = url.query_params['as_oq']
        opts[:without_words] = url.query_params['as_eq']

        opts[:language] = url.query_params['lr']
        opts[:region] = url.query_params['cr']

        case url.query_params['as_ft']
        when 'i'
          opts[:in_format] = url.query_params['as_filetype']
        when 'e'
          opts[:not_in_format] = url.query_params['as_filetype']
        end

        case url.query_params['as_qdr']
        when 'd'
          opts[:within_past_day] = true
        when 'w'
          opts[:within_past_week] = true
        when 'm'
          opts[:within_past_months] = 1
        when 'm2'
          opts[:within_past_months] = 2
        when 'm3'
          opts[:within_past_months] = 3
        when 'm6'
          opts[:within_past_months] = 6
        when 'y'
          opts[:within_past_year] = true
        end

        if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
          opts[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
        end

        case url.query_params['as_occt']
        when 'title'
          opts[:occurrs_within] = :title
        when 'body'
          opts[:occurrs_within] = :body
        when 'url'
          opts[:occurrs_within] = :url
        when 'links'
          opts[:occurrs_within] = :links
        end

        case url.query_params['as_dt']
        when 'i'
          opts[:inside_domain] = url.query_params['as_sitesearch']
        when 'e'
          opts[:outside_domain] = url.query_params['as_sitesearch']
        end

        case url.query_params['as_rights']
        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
          opts[:rights] = Licenses::CC_BY_NC_ND
        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
          opts[:rights] = Licenses::CC_BY_SA
        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
          opts[:rights] = Licenses::CC_BY_ND
        when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
          opts[:rights] = Licenses::CC_BY
        end

        # BUG FIX: query_params is keyed by String, not Symbol —
        # url.query_params[:safe] was always nil, so :filtered was
        # never restored from a URL.
        if url.query_params['safe'] == 'active'
          opts[:filtered] = true
        end

        if url.query_params['as_rq']
          opts[:similar_to] = url.query_params['as_rq']
        elsif url.query_params['as_lq']
          opts[:links_to] = url.query_params['as_lq']
        end

        return self.new(opts,&block)
      end

      #
      # Returns the URI that represents the query.
      #
      def search_url
        url = URI.parse(SEARCH_URL)

        if @results_per_page
          url.query_params['num'] = @results_per_page
        end

        url.query_params['as_q'] = @query if @query
        url.query_params['as_epq'] = @exact_phrase if @exact_phrase
        url.query_params['as_oq'] = @with_words if @with_words
        url.query_params['as_eq'] = @without_words if @without_words

        url.query_params['lr'] = @language if @language
        url.query_params['cr'] = @region if @region

        # BUG FIX: the parameter name was misspelled 'as_filtetype',
        # which Google ignores and from_url cannot round-trip.
        if @in_format
          url.query_params['as_ft'] = 'i'
          url.query_params['as_filetype'] = @in_format
        elsif @not_in_format
          url.query_params['as_ft'] = 'e'
          url.query_params['as_filetype'] = @not_in_format
        end

        if @within_past_day
          url.query_params['as_qdr'] = 'd'
        elsif @within_past_week
          url.query_params['as_qdr'] = 'w'
        elsif @within_past_months
          case @within_past_months
          when 1
            url.query_params['as_qdr'] = 'm'
          when 2
            url.query_params['as_qdr'] = 'm2'
          when 3
            url.query_params['as_qdr'] = 'm3'
          when 6
            url.query_params['as_qdr'] = 'm6'
          end
        elsif @within_past_year
          url.query_params['as_qdr'] = 'y'
        end

        if @numeric_range
          url.query_params['as_nlo'] = @numeric_range.begin
          url.query_params['as_nhi'] = @numeric_range.end
        end

        case @occurrs_within
        when :title, 'title'
          url.query_params['as_occt'] = 'title'
        when :body, 'body'
          url.query_params['as_occt'] = 'body'
        when :url, 'url'
          url.query_params['as_occt'] = 'url'
        when :links, 'links'
          url.query_params['as_occt'] = 'links'
        end

        if @inside_domain
          url.query_params['as_dt'] = 'i'
          url.query_params['as_sitesearch'] = @inside_domain
        elsif @outside_domain
          url.query_params['as_dt'] = 'e'
          url.query_params['as_sitesearch'] = @outside_domain
        end

        case @rights
        when Licenses::CC_BY_NC_ND
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
        when Licenses::CC_BY_SA
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
        when Licenses::CC_BY_ND
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
        when Licenses::CC_BY
          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
        end

        # `true` serializes as 'safe=active' (see URI::HTTP#path_query)
        url.query_params['safe'] = true if @filtered

        if @similar_to
          url.query_params['as_rq'] = @similar_to
        elsif @links_to
          url.query_params['as_lq'] = @links_to
        end

        return url
      end

      #
      # Returns the URI that represents the query at the specific
      # _page_index_.
      #
      def page_url(page_index)
        url = search_url

        url.query_params['start'] = page_index_offset(page_index)
        url.query_params['sa'] = 'N'

        return url
      end

      #
      # Returns an array of Result objects at the specified _page_index_.
      # If _opts_ are given, they will be used in accessing the SEARCH_URL.
      #
      def page(page_index,opts={})
        results = []
        doc = Hpricot(GScraper.open(page_url(page_index),opts))

        doc.search('//div.g').each_with_index do |result,index|
          rank = page_index_offset(page_index) + (index + 1)
          title = result.search('//h2.r').first.inner_text
          url = result.search('//h2.r/a').first.get_attribute('href')
          # TODO: exclude URL and Links from summary text
          summary = result.search('//td.j').first.inner_text

          # TODO: scrape Cached and Similar links

          results << Result.new(rank,title,url,summary)
        end

        return results
      end

      #
      # Returns the results on the first page. If _opts_ are given, they
      # will be used in accessing the SEARCH_URL.
      #
      def first_page(opts={})
        page(1,opts)
      end

      #
      # Iterates over the results at the specified _page_index_, passing
      # each to the given _block_. If _opts_ are given they will be used
      # in accessing the SEARCH_URL.
      #
      #   query.each_on_page(2) do |result|
      #     puts result.title
      #   end
      #
      def each_on_page(page_index,opts={},&block)
        page(page_index,opts).each(&block)
      end

      #
      # Iterates over the results on the first page, passing
      # each to the given _block_. If _opts_ are given, they will be used
      # in accessing the SEARCH_URL.
      #
      #   query.each_on_first_page do |result|
      #     puts result.url
      #   end
      #
      def each_on_first_page(opts={},&block)
        each_on_page(1,opts,&block)
      end

      protected

      #
      # Returns the rank offset for the specified _page_index_.
      #
      def page_index_offset(page_index)
        # BUG FIX: read @results_per_page — the misspelled
        # @result_per_page was always nil, so the offset was always 0
        # and every page request fetched page 1.
        (page_index.to_i - 1) * @results_per_page.to_i
      end

    end
  end
end
@@ -0,0 +1,37 @@
1
module GScraper
  module Search
    #
    # A single search result scraped from a results page.
    #
    class Result

      # Rank of the result page
      attr_reader :rank

      # Title of the result page
      attr_reader :title

      # URL of the result page
      attr_reader :url

      # Summary from the result page
      attr_reader :summary

      #
      # Creates a new Result object with the given _rank_, _title_,
      # _url_ and _summary_.
      #
      def initialize(rank,title,url,summary)
        @rank, @title, @url, @summary = rank, title, url, summary
      end

      #
      # Returns a string containing the result's title.
      #
      def to_s
        @title.to_s
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'gscraper/search/query'
2
+
3
module GScraper
  module Search
    #
    # Convenience constructor: returns a new Query built from the given
    # _opts_. See Query.new.
    #
    #   Search.query(:query => 'ruby', :with_words => 'rspec rails')
    #
    #   Search.query(:exact_phrase => 'fluent interfaces') do |q|
    #     q.within_past_week = true
    #   end
    #
    def self.query(opts={},&block)
      Query.new(opts,&block)
    end

    #
    # Returns the Query that represents the specified search _url_.
    # See Query.from_url.
    #
    #   Search.query_from_url('http://www.google.com/search?q=ruby+zen')
    #
    #   Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
    #     q.within_last_month = true
    #     q.occurrs_within = :title
    #   end
    #
    def self.query_from_url(url,&block)
      Query.from_url(url,&block)
    end
  end
end
@@ -0,0 +1,50 @@
1
require 'test/unit'
require 'gscraper/search/query'

#
# Verifies that Search::Query.from_url parses the advanced-search
# parameters out of a real Google search URL.
#
class QueryFromURL < Test::Unit::TestCase

  include GScraper

  # Advanced-search URL exercising as_q, as_epq, as_oq, as_eq,
  # as_qdr=w (past week) and as_occt=body.
  QUERY_URL = 'http://www.google.com/search?as_q=test&hl=en&num=20&btnG=Google+Search&as_epq=what+if&as_oq=dog&as_eq=haha&lr=&cr=&as_ft=i&as_filetype=&as_qdr=w&as_nlo=&as_nhi=&as_occt=body&as_dt=i&as_sitesearch=&as_rights=&safe=images'

  # Parse the fixture URL fresh for every test.
  def setup
    @query = Search::Query.from_url(QUERY_URL)
  end

  def teardown
    @query = nil
  end

  def test_query
    assert_equal @query.query, 'test'
  end

  def test_exact_phrase
    # NOTE(review): the literal '+' is expected here because the
    # query-string decoder only unescapes %XX sequences, not '+' —
    # confirm this is the intended behavior.
    assert_equal @query.exact_phrase, 'what+if'
  end

  def test_with_words
    assert_equal @query.with_words, 'dog'
  end

  def test_without_words
    assert_equal @query.without_words, 'haha'
  end

  def test_within_past_week
    assert_equal @query.within_past_week, true
  end

  def test_occurrs_within
    assert_equal @query.occurrs_within, :body
  end

  # as_rq is absent from the fixture URL, so similar_to must be nil.
  def test_similar_to
    assert_nil @query.similar_to
  end

  # as_lq is absent from the fixture URL, so links_to must be nil.
  def test_links_to
    assert_nil @query.links_to
  end

end
@@ -0,0 +1,4 @@
1
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),'..','lib')))
2
+
3
+ require 'test/unit'
4
+ require 'search/query_from_url'
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: gscraper
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2007-12-21 00:00:00 -08:00
8
+ summary: A ruby web-scraping interface to various Google Services
9
+ require_paths:
10
+ - lib
11
+ email: postmodern.mod3@gmail.com
12
+ homepage: http://rubyforge.net/projects/gscraper/
13
+ rubyforge_project: gscraper
14
+ description: "== FEATURES/PROBLEMS: * Supports the Google Search service. * Provides HTTP access with custom User-Agent strings. == REQUIREMENTS: * Hpricot * Mechanize == INSTALL:"
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Postmodern Modulus III
31
+ files:
32
+ - History.txt
33
+ - LICENSE.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ - Rakefile
37
+ - lib/gscraper.rb
38
+ - lib/gscraper/gscraper.rb
39
+ - lib/gscraper/extensions/uri/http.rb
40
+ - lib/gscraper/extensions/uri.rb
41
+ - lib/gscraper/extensions.rb
42
+ - lib/gscraper/licenses.rb
43
+ - lib/gscraper/search/result.rb
44
+ - lib/gscraper/search/query.rb
45
+ - lib/gscraper/search/search.rb
46
+ - lib/gscraper/search.rb
47
+ - test/test_gscraper.rb
48
+ - test/search/query_from_url.rb
49
+ test_files:
50
+ - test/test_gscraper.rb
51
+ rdoc_options:
52
+ - --main
53
+ - README.txt
54
+ extra_rdoc_files:
55
+ - History.txt
56
+ - LICENSE.txt
57
+ - Manifest.txt
58
+ - README.txt
59
+ executables: []
60
+
61
+ extensions: []
62
+
63
+ requirements: []
64
+
65
+ dependencies:
66
+ - !ruby/object:Gem::Dependency
67
+ name: hoe
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Version::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: 1.3.0
74
+ version: