gscraper 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ == 0.1.3 / 2007-12-22
2
+
3
+ * Added the Page class, which contains many convenience methods for
4
+ searching through the results within a Page.
5
+
1
6
  == 0.1.2 / 2007-12-22
2
7
 
3
8
  * Fixed a bug related to extracting the correct content-rights from search
data/Manifest.txt CHANGED
@@ -11,6 +11,7 @@ lib/gscraper/extensions/uri.rb
11
11
  lib/gscraper/extensions.rb
12
12
  lib/gscraper/licenses.rb
13
13
  lib/gscraper/search/result.rb
14
+ lib/gscraper/search/page.rb
14
15
  lib/gscraper/search/query.rb
15
16
  lib/gscraper/search/search.rb
16
17
  lib/gscraper/search.rb
@@ -0,0 +1,224 @@
1
+ require 'gscraper/search/result'
2
+
3
+ module GScraper
4
+ module Search
5
+ class Page < Array
6
+
7
+ #
8
+ # Creates a new Page object with the given _results_.
9
+ #
10
+ def initialize(results=[])
11
+ super(results)
12
+ end
13
+
14
+ #
15
+ # Returns a mapped Array of the results within the Page using the
16
+ # given _block_. If the _block_ is not given, the page will be
17
+ # returned.
18
+ #
19
+ # page.map # => Page
20
+ #
21
+ # page.map { |result| result.url } # => [...]
22
+ #
23
+ def map(&block)
24
+ return self unless block
25
+
26
+ mapped = []
27
+
28
+ each { |result| mapped << block.call(result) }
29
+ return mapped
30
+ end
31
+
32
+ #
33
+ # Selects the results within the Page which match the given _block_.
34
+ #
35
+ # page.select { |result| result.title =~ /ruby/i }
36
+ #
37
+ def select(&block)
38
+ Page.new(super(&block))
39
+ end
40
+
41
+ #
42
+ # Selects the results using the specified _block_.
43
+ #
44
+ # page.results_with { |result| result.title =~ /blog/ }
45
+ #
46
+ def results_with(&block)
47
+ select(&block)
48
+ end
49
+
50
+ #
51
+ # Selects the results with the matching _title_. The _title_ may be
52
+ # either a String or a Regexp. If _block_ is given, each matching
53
+ # result will be passed to the _block_.
54
+ #
55
+ # page.results_with_title('hackety org') #=> Page
56
+ #
57
+ # page.results_with_title(/awesome/) do |result|
58
+ # puts result.url
59
+ # end
60
+ #
61
+ def results_with_title(title,&block)
62
+ if title.kind_of?(Regexp)
63
+ results = results_with { |result| result.title =~ title }
64
+ else
65
+ results = results_with { |result| result.title == title }
66
+ end
67
+
68
+ results.each(&block) if block
69
+ return results
70
+ end
71
+
72
+ #
73
+ # Selects the results with the matching _url_. The _url_ may be
74
+ # either a String or a Regexp. If _block_ is given, each matching
75
+ # result will be passed to the _block_.
76
+ #
77
+ # page.results_with_url(/\.com/) # => Page
78
+ #
79
+ # page.results_with_url(/^https:\/\//) do |result|
80
+ # puts result.title
81
+ # end
82
+ #
83
+ def results_with_url(url,&block)
84
+ if url.kind_of?(Regexp)
85
+ results = results_with { |result| result.url =~ url }
86
+ else
87
+ results = results_with { |result| result.url == url }
88
+ end
89
+
90
+ results.each(&block) if block
91
+ return results
92
+ end
93
+
94
+ #
95
+ # Selects the results with the matching _summary_. The _summary_ may
96
+ # be either a String or a Regexp. If _block_ is given, each matching
97
+ # result will be passed to the _block_.
98
+ #
99
+ # page.results_with_summary(/cheese cake/) # => Page
100
+ #
101
+ # page.results_with_summary(/Scientifically/) do |result|
102
+ # puts result.url
103
+ # end
104
+ #
105
+ def results_with_summary(summary,&block)
106
+ if summary.kind_of?(Regexp)
107
+ results = results_with { |result| result.summary =~ summary }
108
+ else
109
+ results = results_with { |result| result.summary == summary }
110
+ end
111
+
112
+ results.each(&block) if block
113
+ return results
114
+ end
115
+
116
+ #
117
+ # Returns an Array containing the ranks of the results within the
118
+ # Page. If _block_ is given, each rank will be passed to the _block_.
119
+ #
120
+ # page.ranks # => [...]
121
+ #
122
+ # page.ranks do |rank|
123
+ # puts rank
124
+ # end
125
+ #
126
+ def ranks(&block)
127
+ mapped = map { |result| result.rank }
128
+
129
+ mapped.each(&block) if block
130
+ return mapped
131
+ end
132
+
133
+ #
134
+ # Returns an Array containing the titles of the results within the
135
+ # Page. If _block_ is given, each title will be passed to the _block_.
136
+ #
137
+ # page.titles # => [...]
138
+ #
139
+ # page.titles do |title|
140
+ # puts title
141
+ # end
142
+ #
143
+ def titles(&block)
144
+ mapped = map { |result| result.title }
145
+
146
+ mapped.each(&block) if block
147
+ return mapped
148
+ end
149
+
150
+ #
151
+ # Returns an Array containing the URLs of the results within the
152
+ # Page. If _block_ is given, each URL will be passed to the _block_.
153
+ #
154
+ # page.urls # => [...]
155
+ #
156
+ # page.urls do |url|
157
+ # puts url
158
+ # end
159
+ #
160
+ def urls(&block)
161
+ mapped = map { |result| result.url }
162
+
163
+ mapped.each(&block) if block
164
+ return mapped
165
+ end
166
+
167
+ #
168
+ # Returns an Array containing the summaries of the results within the
169
+ # Page. If _block_ is given, each summary will be passed to the
170
+ # _block_.
171
+ #
172
+ # page.summaries # => [...]
173
+ #
174
+ # page.summaries do |summary|
175
+ # puts summary
176
+ # end
177
+ #
178
+ def summaries(&block)
179
+ mapped = map { |result| result.summaries }
180
+
181
+ mapped.each(&block) if block
182
+ return mapped
183
+ end
184
+
185
+ #
186
+ # Returns the ranks of the results that match the specified _block_.
187
+ #
188
+ # page.ranks_of { |result| result.title =~ /awesome/ }
189
+ #
190
+ def ranks_of(&block)
191
+ results_with(&block).ranks
192
+ end
193
+
194
+ #
195
+ # Returns the titles of the results that match the specified _block_.
196
+ #
197
+ # page.titles_of { |result| result.url.include?('www') }
198
+ #
199
+ def titles_of(&block)
200
+ results_with(&block).titles
201
+ end
202
+
203
+ #
204
+ # Returns the urls of the results that match the specified _block_.
205
+ #
206
+ # page.urls_of { |result| result.summary =~ /awesome pants/ }
207
+ #
208
+ def urls_of(&block)
209
+ results_with(&block).urls
210
+ end
211
+
212
+ #
213
+ # Returns the summaries of the results that match the specified
214
+ # _block_.
215
+ #
216
+ # page.summaries_of { |result| result.title =~ /what if/ }
217
+ #
218
+ def summaries_of(&block)
219
+ results_with(&block).summaries
220
+ end
221
+
222
+ end
223
+ end
224
+ end
@@ -1,4 +1,5 @@
1
1
  require 'gscraper/search/result'
2
+ require 'gscraper/search/page'
2
3
  require 'gscraper/extensions/uri'
3
4
  require 'gscraper/licenses'
4
5
  require 'gscraper/gscraper'
@@ -80,7 +81,7 @@ module GScraper
80
81
  # Creates a new Query object from the given search options. If a
81
82
  # block is given, it will be passed the newly created query object.
82
83
  #
83
- # Query.new(:query => 'ruby', :with_words => 'rspec rails')
84
+ # Query.new(:query => 'ruby', :with_words => 'sow rspec')
84
85
  #
85
86
  # Query.new(:exact_phrase => 'fluent interfaces') do |q|
86
87
  # q.within_past_week = true
@@ -324,12 +325,13 @@ module GScraper
324
325
  end
325
326
 
326
327
  #
327
- # Returns an array of Result objects at the specified _page_index_.
328
- # If _opts_ are given, they will be used in accessing the SEARCH_URL.
328
+ # Returns a Page object containing Result objects at the specified
329
+ # _page_index_. If _opts_ are given, they will be used in accessing
330
+ # the SEARCH_URL.
329
331
  #
330
332
  def page(page_index,opts={})
331
- results = []
332
333
  doc = Hpricot(GScraper.open(page_url(page_index),opts))
334
+ new_page = Page.new
333
335
 
334
336
  doc.search('//div.g').each_with_index do |result,index|
335
337
  rank = page_index_offset(page_index) + (index + 1)
@@ -340,10 +342,10 @@ module GScraper
340
342
 
341
343
  # TODO: scrape Cached and Similar links
342
344
 
343
- results << Result.new(rank,title,url,summary)
345
+ new_page << Result.new(rank,title,url,summary)
344
346
  end
345
347
 
346
- return results
348
+ return new_page
347
349
  end
348
350
 
349
351
  #
@@ -5,7 +5,7 @@ module GScraper
5
5
  #
6
6
  # Returns a new Query object with the given _opts_. See Query.new.
7
7
  #
8
- # Search.query(:query => 'ruby', :with_words => 'rspec rails')
8
+ # Search.query(:query => 'ruby', :with_words => 'sow rspec')
9
9
  #
10
10
  # Search.query(:exact_phrase => 'fluent interfaces') do |q|
11
11
  # q.within_past_week = true
@@ -1,3 +1,3 @@
1
1
  module GScraper
2
- VERSION = '0.1.2'
2
+ VERSION = '0.1.3'
3
3
  end
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: gscraper
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.2
6
+ version: 0.1.3
7
7
  date: 2007-12-21 00:00:00 -08:00
8
8
  summary: A ruby web-scraping interface to various Google Services
9
9
  require_paths:
@@ -42,6 +42,7 @@ files:
42
42
  - lib/gscraper/extensions.rb
43
43
  - lib/gscraper/licenses.rb
44
44
  - lib/gscraper/search/result.rb
45
+ - lib/gscraper/search/page.rb
45
46
  - lib/gscraper/search/query.rb
46
47
  - lib/gscraper/search/search.rb
47
48
  - lib/gscraper/search.rb