gscraper 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ == 0.1.3 / 2007-12-22
2
+
3
+ * Added the Page class, which contains many convenience methods for
4
+ searching through the results within a Page.
5
+
1
6
  == 0.1.2 / 2007-12-22
2
7
 
3
8
  * Fixed a bug related to extracting the correct content-rights from search
data/Manifest.txt CHANGED
@@ -11,6 +11,7 @@ lib/gscraper/extensions/uri.rb
11
11
  lib/gscraper/extensions.rb
12
12
  lib/gscraper/licenses.rb
13
13
  lib/gscraper/search/result.rb
14
+ lib/gscraper/search/page.rb
14
15
  lib/gscraper/search/query.rb
15
16
  lib/gscraper/search/search.rb
16
17
  lib/gscraper/search.rb
@@ -0,0 +1,224 @@
1
+ require 'gscraper/search/result'
2
+
3
+ module GScraper
4
+ module Search
5
+ class Page < Array
6
+
7
+ #
8
+ # Creates a new Page object with the given _results_.
9
+ #
10
+ def initialize(results=[])
11
+ super(results)
12
+ end
13
+
14
+ #
15
+ # Returns a mapped Array of the results within the Page using the
16
+ # given _block_. If the _block_ is not given, the page will be
17
+ # returned.
18
+ #
19
+ # page.map # => Page
20
+ #
21
+ # page.map { |result| result.url } # => [...]
22
+ #
23
+ def map(&block)
24
+ return self unless block
25
+
26
+ mapped = []
27
+
28
+ each { |result| mapped << block.call(result) }
29
+ return mapped
30
+ end
31
+
32
+ #
33
+ # Selects the results within the Page which match the given _block_.
34
+ #
35
+ # page.select { |result| result.title =~ /ruby/i }
36
+ #
37
+ def select(&block)
38
+ Page.new(super(&block))
39
+ end
40
+
41
+ #
42
+ # Selects the results using the specified _block_.
43
+ #
44
+ # page.results_with { |result| result.title =~ /blog/ }
45
+ #
46
+ def results_with(&block)
47
+ select(&block)
48
+ end
49
+
50
+ #
51
+ # Selects the results with the matching _title_. The _title_ may be
52
+ # either a String or a Regexp. If _block_ is given, each matching
53
+ # result will be passed to the _block_.
54
+ #
55
+ # page.results_with_title('hackety org') #=> Page
56
+ #
57
+ # page.results_with_title(/awesome/) do |result|
58
+ # puts result.url
59
+ # end
60
+ #
61
+ def results_with_title(title,&block)
62
+ if title.kind_of?(Regexp)
63
+ results = results_with { |result| result.title =~ title }
64
+ else
65
+ results = results_with { |result| result.title == title }
66
+ end
67
+
68
+ results.each(&block) if block
69
+ return results
70
+ end
71
+
72
+ #
73
+ # Selects the results with the matching _url_. The _url_ may be
74
+ # either a String or a Regexp. If _block_ is given, each matching
75
+ # result will be passed to the _block_.
76
+ #
77
+ # page.results_with_url(/\.com/) # => Page
78
+ #
79
+ # page.results_with_url(/^https:\/\//) do |result|
80
+ # puts result.title
81
+ # end
82
+ #
83
+ def results_with_url(url,&block)
84
+ if url.kind_of?(Regexp)
85
+ results = results_with { |result| result.url =~ url }
86
+ else
87
+ results = results_with { |result| result.url == url }
88
+ end
89
+
90
+ results.each(&block) if block
91
+ return results
92
+ end
93
+
94
+ #
95
+ # Selects the results with the matching _summary_. The _summary_ may
96
+ # be either a String or a Regexp. If _block_ is given, each matching
97
+ # result will be passed to the _block_.
98
+ #
99
+ # page.results_with_summary(/cheese cake/) # => Page
100
+ #
101
+ # page.results_with_summary(/Scientifically/) do |result|
102
+ # puts result.url
103
+ # end
104
+ #
105
+ def results_with_summary(summary,&block)
106
+ if summary.kind_of?(Regexp)
107
+ results = results_with { |result| result.summary =~ summary }
108
+ else
109
+ results = results_with { |result| result.summary == summary }
110
+ end
111
+
112
+ results.each(&block) if block
113
+ return results
114
+ end
115
+
116
+ #
117
+ # Returns an Array containing the ranks of the results within the
118
+ # Page. If _block_ is given, each rank will be passed to the _block_.
119
+ #
120
+ # page.ranks # => [...]
121
+ #
122
+ # page.ranks do |rank|
123
+ # puts rank
124
+ # end
125
+ #
126
+ def ranks(&block)
127
+ mapped = map { |result| result.rank }
128
+
129
+ mapped.each(&block) if block
130
+ return mapped
131
+ end
132
+
133
+ #
134
+ # Returns an Array containing the titles of the results within the
135
+ # Page. If _block_ is given, each title will be passed to the _block_.
136
+ #
137
+ # page.titles # => [...]
138
+ #
139
+ # page.titles do |title|
140
+ # puts title
141
+ # end
142
+ #
143
+ def titles(&block)
144
+ mapped = map { |result| result.title }
145
+
146
+ mapped.each(&block) if block
147
+ return mapped
148
+ end
149
+
150
+ #
151
+ # Returns an Array containing the URLs of the results within the
152
+ # Page. If _block_ is given, each URL will be passed to the _block_.
153
+ #
154
+ # page.urls # => [...]
155
+ #
156
+ # page.urls do |url|
157
+ # puts url
158
+ # end
159
+ #
160
+ def urls(&block)
161
+ mapped = map { |result| result.url }
162
+
163
+ mapped.each(&block) if block
164
+ return mapped
165
+ end
166
+
167
+ #
168
+ # Returns an Array containing the summaries of the results within the
169
+ # Page. If _block_ is given, each summary will be passed to the
170
+ # _block_.
171
+ #
172
+ # page.summaries # => [...]
173
+ #
174
+ # page.summaries do |summary|
175
+ # puts summary
176
+ # end
177
+ #
178
+ def summaries(&block)
179
+ mapped = map { |result| result.summaries }
180
+
181
+ mapped.each(&block) if block
182
+ return mapped
183
+ end
184
+
185
+ #
186
+ # Returns the ranks of the results that match the specified _block_.
187
+ #
188
+ # page.ranks_of { |result| result.title =~ /awesome/ }
189
+ #
190
+ def ranks_of(&block)
191
+ results_with(&block).ranks
192
+ end
193
+
194
+ #
195
+ # Returns the titles of the results that match the specified _block_.
196
+ #
197
+ # page.titles_of { |result| result.url.include?('www') }
198
+ #
199
+ def titles_of(&block)
200
+ results_with(&block).titles
201
+ end
202
+
203
+ #
204
+ # Returns the urls of the results that match the specified _block_.
205
+ #
206
+ # page.urls_of { |result| result.summary =~ /awesome pants/ }
207
+ #
208
+ def urls_of(&block)
209
+ results_with(&block).urls
210
+ end
211
+
212
+ #
213
+ # Returns the summaries of the results that match the specified
214
+ # _block_.
215
+ #
216
+ # page.summaries_of { |result| result.title =~ /what if/ }
217
+ #
218
+ def summaries_of(&block)
219
+ results_with(&block).summaries
220
+ end
221
+
222
+ end
223
+ end
224
+ end
@@ -1,4 +1,5 @@
1
1
  require 'gscraper/search/result'
2
+ require 'gscraper/search/page'
2
3
  require 'gscraper/extensions/uri'
3
4
  require 'gscraper/licenses'
4
5
  require 'gscraper/gscraper'
@@ -80,7 +81,7 @@ module GScraper
80
81
  # Creates a new Query object from the given search options. If a
81
82
  # block is given, it will be passed the newly created query object.
82
83
  #
83
- # Query.new(:query => 'ruby', :with_words => 'rspec rails')
84
+ # Query.new(:query => 'ruby', :with_words => 'sow rspec')
84
85
  #
85
86
  # Query.new(:exact_phrase => 'fluent interfaces') do |q|
86
87
  # q.within_past_week = true
@@ -324,12 +325,13 @@ module GScraper
324
325
  end
325
326
 
326
327
  #
327
- # Returns an array of Result objects at the specified _page_index_.
328
- # If _opts_ are given, they will be used in accessing the SEARCH_URL.
328
+ # Returns a Page object containing Result objects at the specified
329
+ # _page_index_. If _opts_ are given, they will be used in accessing
330
+ # the SEARCH_URL.
329
331
  #
330
332
  def page(page_index,opts={})
331
- results = []
332
333
  doc = Hpricot(GScraper.open(page_url(page_index),opts))
334
+ new_page = Page.new
333
335
 
334
336
  doc.search('//div.g').each_with_index do |result,index|
335
337
  rank = page_index_offset(page_index) + (index + 1)
@@ -340,10 +342,10 @@ module GScraper
340
342
 
341
343
  # TODO: scrape Cached and Similar links
342
344
 
343
- results << Result.new(rank,title,url,summary)
345
+ new_page << Result.new(rank,title,url,summary)
344
346
  end
345
347
 
346
- return results
348
+ return new_page
347
349
  end
348
350
 
349
351
  #
@@ -5,7 +5,7 @@ module GScraper
5
5
  #
6
6
  # Returns a new Query object with the given _opts_. See Query.new.
7
7
  #
8
- # Search.query(:query => 'ruby', :with_words => 'rspec rails')
8
+ # Search.query(:query => 'ruby', :with_words => 'sow rspec')
9
9
  #
10
10
  # Search.query(:exact_phrase => 'fluent interfaces') do |q|
11
11
  # q.within_past_week = true
@@ -1,3 +1,3 @@
1
1
  module GScraper
2
- VERSION = '0.1.2'
2
+ VERSION = '0.1.3'
3
3
  end
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: gscraper
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.2
6
+ version: 0.1.3
7
7
  date: 2007-12-21 00:00:00 -08:00
8
8
  summary: A ruby web-scraping interface to various Google Services
9
9
  require_paths:
@@ -42,6 +42,7 @@ files:
42
42
  - lib/gscraper/extensions.rb
43
43
  - lib/gscraper/licenses.rb
44
44
  - lib/gscraper/search/result.rb
45
+ - lib/gscraper/search/page.rb
45
46
  - lib/gscraper/search/query.rb
46
47
  - lib/gscraper/search/search.rb
47
48
  - lib/gscraper/search.rb