gscraper 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Manifest.txt +1 -0
- data/lib/gscraper/search/page.rb +224 -0
- data/lib/gscraper/search/query.rb +8 -6
- data/lib/gscraper/search/search.rb +1 -1
- data/lib/gscraper/version.rb +1 -1
- metadata +2 -1
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -0,0 +1,224 @@
|
|
1
|
+
require 'gscraper/search/result'
|
2
|
+
|
3
|
+
module GScraper
  module Search
    #
    # An Array of search results for a single page of a query, with
    # helper methods for filtering the results and for extracting their
    # ranks, titles, URLs and summaries. Each element is expected to
    # respond to +rank+, +title+, +url+ and +summary+.
    #
    class Page < Array

      #
      # Creates a new Page object with the given _results_.
      #
      def initialize(results=[])
        super(results)
      end

      #
      # Returns a mapped Array of the results within the Page using the
      # given _block_. If the _block_ is not given, the page will be
      # returned.
      #
      #   page.map # => Page
      #
      #   page.map { |result| result.url } # => [...]
      #
      def map(&block)
        return self unless block

        mapped = []

        each { |result| mapped << block.call(result) }
        return mapped
      end

      #
      # Selects the results within the Page which match the given _block_.
      # The selected results are returned as a new Page.
      #
      #   page.select { |result| result.title =~ /ruby/i }
      #
      def select(&block)
        # wrap the selection so that the Page helpers can be chained on it
        Page.new(super(&block))
      end

      #
      # Selects the results using the specified _block_. Returns a new
      # Page containing the selected results.
      #
      #   page.results_with { |result| result.title =~ /blog/ }
      #
      def results_with(&block)
        select(&block)
      end

      #
      # Selects the results with the matching _title_. The _title_ may be
      # either a String or a Regexp. If _block_ is given, each matching
      # result will be passed to the _block_. Returns a Page of the
      # matching results.
      #
      #   page.results_with_title('hackety org') #=> Page
      #
      #   page.results_with_title(/awesome/) do |result|
      #     puts result.url
      #   end
      #
      def results_with_title(title,&block)
        if title.kind_of?(Regexp)
          results = results_with { |result| result.title =~ title }
        else
          results = results_with { |result| result.title == title }
        end

        results.each(&block) if block
        return results
      end

      #
      # Selects the results with the matching _url_. The _url_ may be
      # either a String or a Regexp. If _block_ is given, each matching
      # result will be passed to the _block_. Returns a Page of the
      # matching results.
      #
      #   page.results_with_url(/\.com/) # => Page
      #
      #   page.results_with_url(/^https:\/\//) do |result|
      #     puts result.title
      #   end
      #
      def results_with_url(url,&block)
        if url.kind_of?(Regexp)
          results = results_with { |result| result.url =~ url }
        else
          results = results_with { |result| result.url == url }
        end

        results.each(&block) if block
        return results
      end

      #
      # Selects the results with the matching _summary_. The _summary_ may
      # be either a String or a Regexp. If _block_ is given, each matching
      # result will be passed to the _block_. Returns a Page of the
      # matching results.
      #
      #   page.results_with_summary(/cheese cake/) # => Page
      #
      #   page.results_with_summary(/Scientifically/) do |result|
      #     puts result.url
      #   end
      #
      def results_with_summary(summary,&block)
        if summary.kind_of?(Regexp)
          results = results_with { |result| result.summary =~ summary }
        else
          results = results_with { |result| result.summary == summary }
        end

        results.each(&block) if block
        return results
      end

      #
      # Returns an Array containing the ranks of the results within the
      # Page. If _block_ is given, each rank will be passed to the _block_.
      #
      #   page.ranks # => [...]
      #
      #   page.ranks do |rank|
      #     puts rank
      #   end
      #
      def ranks(&block)
        mapped = map { |result| result.rank }

        mapped.each(&block) if block
        return mapped
      end

      #
      # Returns an Array containing the titles of the results within the
      # Page. If _block_ is given, each title will be passed to the _block_.
      #
      #   page.titles # => [...]
      #
      #   page.titles do |title|
      #     puts title
      #   end
      #
      def titles(&block)
        mapped = map { |result| result.title }

        mapped.each(&block) if block
        return mapped
      end

      #
      # Returns an Array containing the URLs of the results within the
      # Page. If _block_ is given, each URL will be passed to the _block_.
      #
      #   page.urls # => [...]
      #
      #   page.urls do |url|
      #     puts url
      #   end
      #
      def urls(&block)
        mapped = map { |result| result.url }

        mapped.each(&block) if block
        return mapped
      end

      #
      # Returns an Array containing the summaries of the results within the
      # Page. If _block_ is given, each summary will be passed to the
      # _block_.
      #
      #   page.summaries # => [...]
      #
      #   page.summaries do |summary|
      #     puts summary
      #   end
      #
      def summaries(&block)
        # each result exposes a single +summary+ (the same reader used by
        # results_with_summary), so map over that reader
        mapped = map { |result| result.summary }

        mapped.each(&block) if block
        return mapped
      end

      #
      # Returns the ranks of the results that match the specified _block_.
      #
      #   page.ranks_of { |result| result.title =~ /awesome/ }
      #
      def ranks_of(&block)
        results_with(&block).ranks
      end

      #
      # Returns the titles of the results that match the specified _block_.
      #
      #   page.titles_of { |result| result.url.include?('www') }
      #
      def titles_of(&block)
        results_with(&block).titles
      end

      #
      # Returns the urls of the results that match the specified _block_.
      #
      #   page.urls_of { |result| result.summary =~ /awesome pants/ }
      #
      def urls_of(&block)
        results_with(&block).urls
      end

      #
      # Returns the summaries of the results that match the specified
      # _block_.
      #
      #   page.summaries_of { |result| result.title =~ /what if/ }
      #
      def summaries_of(&block)
        results_with(&block).summaries
      end

    end
  end
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'gscraper/search/result'
|
2
|
+
require 'gscraper/search/page'
|
2
3
|
require 'gscraper/extensions/uri'
|
3
4
|
require 'gscraper/licenses'
|
4
5
|
require 'gscraper/gscraper'
|
@@ -80,7 +81,7 @@ module GScraper
|
|
80
81
|
# Creates a new Query object from the given search options. If a
|
81
82
|
# block is given, it will be passed the newly created query object.
|
82
83
|
#
|
83
|
-
# Query.new(:query => 'ruby', :with_words => 'rspec
|
84
|
+
# Query.new(:query => 'ruby', :with_words => 'sow rspec')
|
84
85
|
#
|
85
86
|
# Query.new(:exact_phrase => 'fluent interfaces') do |q|
|
86
87
|
# q.within_past_week = true
|
@@ -324,12 +325,13 @@ module GScraper
|
|
324
325
|
end
|
325
326
|
|
326
327
|
#
|
327
|
-
# Returns
|
328
|
-
# If _opts_ are given, they will be used in accessing
|
328
|
+
# Returns a Page object containing Result objects at the specified
|
329
|
+
# _page_index_. If _opts_ are given, they will be used in accessing
|
330
|
+
# the SEARCH_URL.
|
329
331
|
#
|
330
332
|
def page(page_index,opts={})
|
331
|
-
results = []
|
332
333
|
doc = Hpricot(GScraper.open(page_url(page_index),opts))
|
334
|
+
new_page = Page.new
|
333
335
|
|
334
336
|
doc.search('//div.g').each_with_index do |result,index|
|
335
337
|
rank = page_index_offset(page_index) + (index + 1)
|
@@ -340,10 +342,10 @@ module GScraper
|
|
340
342
|
|
341
343
|
# TODO: scrape Cached and Similar links
|
342
344
|
|
343
|
-
|
345
|
+
new_page << Result.new(rank,title,url,summary)
|
344
346
|
end
|
345
347
|
|
346
|
-
return
|
348
|
+
return new_page
|
347
349
|
end
|
348
350
|
|
349
351
|
#
|
@@ -5,7 +5,7 @@ module GScraper
|
|
5
5
|
#
|
6
6
|
# Returns a new Query object with the given _opts_. See Query.new.
|
7
7
|
#
|
8
|
-
# Search.query(:query => 'ruby', :with_words => 'rspec
|
8
|
+
# Search.query(:query => 'ruby', :with_words => 'sow rspec')
|
9
9
|
#
|
10
10
|
# Search.query(:exact_phrase => 'fluent interfaces') do |q|
|
11
11
|
# q.within_past_week = true
|
data/lib/gscraper/version.rb
CHANGED
metadata
CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: gscraper
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.2
|
6
|
+
version: 0.1.3
|
7
7
|
date: 2007-12-21 00:00:00 -08:00
|
8
8
|
summary: A ruby web-scraping interface to various Google Services
|
9
9
|
require_paths:
|
@@ -42,6 +42,7 @@ files:
|
|
42
42
|
- lib/gscraper/extensions.rb
|
43
43
|
- lib/gscraper/licenses.rb
|
44
44
|
- lib/gscraper/search/result.rb
|
45
|
+
- lib/gscraper/search/page.rb
|
45
46
|
- lib/gscraper/search/query.rb
|
46
47
|
- lib/gscraper/search/search.rb
|
47
48
|
- lib/gscraper/search.rb
|