libcraigscrape 0.7.0 → 0.8.0
This diff shows the changes between package versions as they were released to their respective public registries. It is provided for informational purposes only.
- data/CHANGELOG +19 -0
- data/README +27 -11
- data/Rakefile +44 -2
- data/bin/craig_report_schema.yml +30 -21
- data/bin/craigwatch +232 -67
- data/bin/report_mailer/craigslist_report.html.erb +12 -9
- data/bin/report_mailer/craigslist_report.plain.erb +4 -1
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +158 -650
- data/lib/listings.rb +144 -0
- data/lib/posting.rb +293 -0
- data/lib/scraper.rb +203 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/test_craigslist_geolisting.rb +476 -380
- metadata +28 -2
data/lib/libcraigscrape.rb
CHANGED
@@ -2,696 +2,204 @@
 #
 # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
 #
-require 'net/http'
-require 'zlib'
 
-
-
-
-
+# A base class encapsulating the various libcraigscrape objects, and providing most of the
+# craigslist interaction methods. Currently, we're supporting the old Class methods
+# in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
+# create an instance of the Craigslist object, and use its Public Instance methods.
+# See the README for easy to follow examples.
+class CraigScrape; end
+
+require 'listings'
+require 'posting'
+require 'geo_listings'
 
-# A base class encapsulating the libcraigscrape objects, and providing some utility methods.
 class CraigScrape
   cattr_accessor :time_now
+  cattr_accessor :site_to_url_prefix
+
+  #--
+  # NOTE:
+  # The only reason I took this out is b/c I might want to test with a file://
+  # prefix at some point
+  #++
+  self.site_to_url_prefix = 'http://'
 
-
-  #
-  #
-
-
+
+  # Takes a variable number of site/path specifiers (strings) as an argument.
+  # This list gets flattened and passed to CraigScrape::GeoListings.find_sites .
+  # See that method's rdoc for a complete set of rules on what arguments are allowed here.
+  def initialize(*args)
+    @sites_specs = args.flatten
   end
 
-  #
-  #
-  def
-
-
-
-
-
-
-
-
-
-
-
-
-
+  # Returns which sites are included in any operations performed by this object. This is directly
+  # ascertained from the initial constructor's spec-list
+  def sites
+    @sites ||= GeoListings.find_sites @sites_specs
+    @sites
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Passes the <b>first page listing</b> of each of these urls to the provided block.
+  def each_listing(*fragments)
+    listing_urls_for(fragments).each{|url| yield Listings.new(url) }
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
+  def each_page_in_each_listing(*fragments)
+    each_listing(*fragments) do |listing|
+      while listing
+        yield listing
+        listing = listing.next_page
       end
     end
-
-    ret
-  end
-
-  # Scrapes a single Post Url, and returns a Posting object representing its contents.
-  # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
-  # Consider this method 'marked for deprecation'
-  def self.scrape_full_post(post_url)
-    CraigScrape::Posting.new post_url
   end
-
-  #
-  #
-
-
-
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Returns the <b>first page listing</b> of each of these urls to the provided block.
+  def listings(*fragments)
+    listing_urls_for(fragments).collect{|url| Listings.new url }
   end
 
-  #
-  #
-  # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
-  # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
   #
-  #
-
-
+  # Passes all posts from each of these urls to the provided block, in the order they're parsed
+  # (for each listing, newest posts are returned first).
+  def each_post(*fragments)
+    each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
  end
-
-  #
-
-
-
-
-
-    ret =
-
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Returns all posts from each of these urls, in the order they're parsed
+  # (newest posts first).
+  def posts(*fragments)
+    ret = []
+    each_page_in_each_listing(*fragments){ |l| ret += l.posts }
     ret
   end
-
-  # Scraper is a general-pupose base class for all libcraigscrape Objects. Scraper facilitates all http-related
-  # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
-  # methods. It also contains the http-related cattr_accessors:
-  #
-  # *logger* - a Logger object to debug http notices too. Defaults to nil
-  #
-  # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
-  #
-  # *sleep_between_fetch_retries* - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 15.
-  class Scraper
-    cattr_accessor :logger
-    cattr_accessor :sleep_between_fetch_retries
-    cattr_accessor :retries_on_fetch_fail
-
-    URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
-    HTML_TAG  = /<\/?[^>]*>/
-
-    # Returns the full url that corresponds to this resource
-    attr_reader :url
-
-    # Set some defaults:
-    self.retries_on_fetch_fail = 4
-    self.sleep_between_fetch_retries = 15
 
-
-
-
-
-
-
-
-
-
-
-
-
-    # Scraper Objects can be created from either a full URL (string), or a Hash.
-    # Currently, this initializer isn't intended to be called from libcraigslist API users, though
-    # if you know what you're doing - feel free to try this out.
-    #
-    # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
-    #
-    # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
-    # This is useful to create an object without actually making an html request, this is used to set-up an
-    # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
-    # you're going to be setting this object up for eager-loadnig, be sure to pass in a :url key in your hash,
-    # Otherwise this will fail to eager load.
-    def initialize(init_via = nil)
-      if init_via.nil?
-        # Do nothing - possibly not a great idea, but we'll allow it
-      elsif init_via.kind_of? String
-        @url = init_via
-      elsif init_via.kind_of? Hash
-        init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
-      else
-        raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s}" % [self.class.to_s, init_via.class.inspect])
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Returns all posts from each of these urls, which are newer than the provider 'newer_then' date.
+  # (Returns 'newest' posts first).
+  def posts_since(newer_then, *fragments)
+    ret = []
+    fragments.each do |frag|
+      each_post(frag) do |p|
+        break if p.post_date <= newer_then
+        ret << p
      end
    end
-
-    # Indicates whether the resource has yet been retrieved from its associated url.
-    # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
-    # but hasn't yet been fetched.
-    def downloaded?; !@html.nil?; end
-
-    # A URI object corresponding to this Scraped URL
-    def uri
-      @uri ||= URI.parse @url if @url
-      @uri
-    end
 
-
-
-    # Returns text with all html tags removed.
-    def strip_html(str)
-      str.gsub HTML_TAG, "" if str
-    end
-
-    # Easy way to fail noisily:
-    def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
-
-    # Returns text with all html entities converted to respective ascii character.
-    def he_decode(text); self.class.he_decode text; end
-
-    # Returns text with all html entities converted to respective ascii character.
-    def self.he_decode(text); HTMLEntities.new.decode text; end
-
-    # Derives a full url, using the current object's url and the provided href
-    def url_from_href(href) #:nodoc:
-      scheme, host, path = $1, $2, $3 if URL_PARTS.match href
-
-      scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
-
-      host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
-
-      path = (
-        (/\/$/.match(uri.path)) ?
-          '%s%s' % [uri.path,path] :
-          '%s/%s' % [File.dirname(uri.path),path]
-      ) unless /^\//.match path
-
-      '%s://%s%s' % [scheme, host, path]
-    end
-
-    def fetch_uri(uri)
-
-      logger.info "Requesting: %s" % @url if logger
-
-      case uri.scheme
-        when 'file'
-          File.read uri.path
-        when /^http[s]?/
-          fetch_attempts = 0
-
-          begin
-            # This handles the redirects for us
-            resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
-
-            if resp.response.code == "200"
-              # Check for gzip, and decode:
-              data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
-              data
-            elsif resp.response['Location']
-              redirect_to = resp.response['Location']
-
-              fetch_uri URI.parse(url_from_href(redirect_to))
-            else
-              # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
-              error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
-
-              logger.info error_description if logger
-
-              raise FetchError, error_description
-            end
-          rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
-            logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
-            logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
-            fetch_attempts += 1
-
-            if fetch_attempts <= self.retries_on_fetch_fail
-              sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
-              logger.info 'Retrying fetch ....' if logger
-              retry
-            else
-              raise err
-            end
-          end
-        else
-          raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
-      end
-    end
-
-    def html
-      @html ||= Hpricot.parse fetch_uri(uri) if uri
-      @html
-    end
+    ret
   end
+
+  class << self # Class methods
 
-
-
-
-
-
-    POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
-    LOCATION = /Location\:[ ]+(.+)/
-    HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
-    POSTING_ID = /PostingID\:[ ]+([\d]+)/
-    REPLY_TO = /(.+)/
-    PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
-    USERBODY_PARTS = /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
-    IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
-
-    # This is really just for testing, in production use, uri.path is a better solution
-    attr_reader :href #:nodoc:
-
-    # Create a new Post via a url (String), or supplied parameters (Hash)
-    def initialize(*args)
-      super(*args)
-
-      # Validate that required fields are present, at least - if we've downloaded it from a url
-      parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
-        contents,posting_id,post_time,header,title,full_section
-      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
-    end
-
-
-    # String, The contents of the item's html body heading
-    def header
-      unless @header
-        h2 = html.at 'h2' if html
-        @header = he_decode h2.inner_html if h2
-      end
-
-      @header
-    end
-
-    # String, the item's title
-    def title
-      unless @title
-        title_tag = html.at 'title' if html
-        @title = he_decode title_tag.inner_html if title_tag
-        @title = nil if @title and @title.length == 0
-      end
-
-      @title
-    end
-
-    # Array, hierarchial representation of the posts section
-    def full_section
-      unless @full_section
-        @full_section = []
-
-        (html/"div[@class='bchead']//a").each do |a|
-          @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
-        end if html
-      end
-
-      @full_section
-    end
+    #--
+    # NOTE: These Class methods are all marked for deprecation as of
+    # version 0.8.0, and should not be used with any new project code
+    #++
 
-    #
-
-
-
-
-
-
-
-      @reply_to
-    end
-
-    # Time, reflects the full timestamp of the posting
-    def post_time
-      unless @post_time
-        cursor = html.at 'hr' if html
-        cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
-        @post_time = Time.parse $1 if $1
-      end
-
-      @post_time
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using CraigScrape::Listings.new
+    #
+    # Scrapes a single listing url and returns a Listings object representing the contents.
+    # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
+    def scrape_listing(listing_url)
+      CraigScrape::Listings.new listing_url
    end
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-        @contents = user_body if html
-        @contents = he_decode @contents.strip if @contents
-      end
-
-      @contents
-    end
-
-    # String, the location of the item, as best could be parsed
-    def location
-      if @location.nil? and craigslist_body and html
-        # Location (when explicitly defined):
-        cursor = craigslist_body.at 'ul' unless @location
-
-        # Apa section includes other things in the li's (cats/dogs ok fields)
-        cursor.children.each do |li|
-          if LOCATION.match li.inner_html
-            @location = he_decode($1) and break
-            break
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using the CraigScrape::each_post method.
+    #
+    # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
+    # until there's no more 'next page' links available to click on
+    def scrape_until(listing_url, &post_condition)
+      ret = []
+
+      listings = CraigScrape::Listings.new listing_url
+      catch "ScrapeBreak" do
+        while listings do
+          listings.posts.each do |post|
+            throw "ScrapeBreak" if post_condition.call(post)
+            ret << post
          end
-
-
-        # Real estate listings can work a little different for location:
-        unless @location
-          cursor = craigslist_body.at 'small'
-          cursor = cursor.previous_node until cursor.nil? or cursor.text?
-
-          @location = he_decode(cursor.to_s.strip) if cursor
-        end
-
-        # So, *sometimes* the location just ends up being in the header, I don't know why:
-        @location = $1 if @location.nil? and HEADER_LOCATION.match header
-      end
-
-      @location
-    end
-
-    # Array, urls of the post's images that are *not* hosted on craigslist
-    def images
-      # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
-      @images = (
-        contents ?
-        contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
-        []
-      ) unless @images
-
-      @images
-    end
-
-    # Array, urls of the post's craigslist-hosted images
-    def pics
-      unless @pics
-        @pics = []
-
-        if html and craigslist_body
-          # Now let's find the craigslist hosted images:
-          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
-          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+
+          listings = listings.next_page
        end
      end
-
-
-    end
-
-    # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
-    def flagged_for_removal?
-      @flagged_for_removal = (
-        system_post? and header_as_plain == "This posting has been flagged for removal"
-      ) if @flagged_for_removal.nil?
-
-      @flagged_for_removal
-    end
-
-    # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
-    def deleted_by_author?
-      @deleted_by_author = (
-        system_post? and header_as_plain == "This posting has been deleted by its author."
-      ) if @deleted_by_author.nil?
-
-      @deleted_by_author
-    end
-
-
-    # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
-    # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
-    def post_date
-      @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
-
-      @post_date
-    end
-
-    # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
-    # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
-    # Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
-    # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
-    # in a full page load from the post's url.
-    def label
-      unless @label or system_post?
-        @label = header
-
-        @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
-      end
-
-      @label
-    end
-
-    # Array, which image types are listed for the post.
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def img_types
-      unless @img_types
-        @img_types = []
-
-        @img_types << :img if images.length > 0
-        @img_types << :pic if pics.length > 0
-      end
-
-      @img_types
-    end
-
-    # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
-    # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
-    def section
-      unless @section
-        @section = full_section.last if full_section
-      end
-
-      @section
-    end
-
-    # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_img?
-      img_types.include? :img
-    end
-
-    # true if post summary has 'pic(s)'. 'pics' are different then imgs, in that craigslist is hosting the resource on craigslist's servers
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_pic?
-      img_types.include? :pic
-    end
-
-    # true if post summary has either the img or pic label
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_pic_or_img?
-      img_types.length > 0
-    end
-
-    # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
-    # and can be safely used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
-    def price
-      $1.tr('$','').to_f if label and PRICE.match label
-    end
-
-    # Returns the post contents with all html tags removed
-    def contents_as_plain
-      strip_html contents
+
+      ret
    end
 
-    #
-    #
-
-
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using CraigScrape::Posting.new
+    #
+    # Scrapes a single Post Url, and returns a Posting object representing its contents.
+    # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
+    def scrape_full_post(post_url)
+      CraigScrape::Posting.new post_url
    end
 
-    #
-    #
-
-
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using the CraigScrape::each_post method.
+    #
+    # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
+    # or no more 'next page' links are avialable to be clicked on. Returns an array of PostSummary objects.
+    def scrape_posts(listing_url, count)
+      count_so_far = 0
+      self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
    end
 
-
-
-    #
-    #
-    #
-
-
-
-
-
-
-    def craigslist_body
-      Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using the CraigScrape::posts_since method.
+    #
+    # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are avialable to be clicked on.
+    # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+    # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
+    # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
+    #
+    # <b>Note:</b> The results will not include post summaries having the newer_then date themselves.
+    def scrape_posts_since(listing_url, newer_then)
+      self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
    end
-
  end
-
-  # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
-  class Listings < Scraper
-    LABEL = /^(.+?)[ ]*\-$/
-    LOCATION = /^[ ]*\((.*?)\)$/
-    IMG_TYPE = /^[ ]*(.+)[ ]*$/
-    HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
-    SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
-    NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
-
-    # Array, PostSummary objects found in the listing
-    def posts
-      unless @posts
-        current_date = nil
-        @posts = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-          when 'p'
-            post_summary = self.class.parse_summary el, current_date
-
-            # Validate that required fields are present:
-            parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
-
-            post_summary[:url] = url_from_href post_summary[:href]
-
-            @posts << CraigScrape::Posting.new(post_summary)
-          when 'h4'
-            # Let's make sense of the h4 tag, and then read all the p tags below it
-            if HEADER_DATE.match he_decode(el.inner_html)
-              # Generally, the H4 tags contain valid dates. When they do - this is easy:
-              current_date = CraigScrape.most_recently_expired_time $1, $2
-            elsif html.at('h4:last-of-type') == el
-              # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
-              # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
-              # we need to pull up the full post in order to accurate tell the date.
-              # Setting this to nil will achieve the eager-load.
-              current_date = nil
-            end
-        end
-      end
-
-      @posts
-    end
-
-    # String, URL Path href-fragment of the next page link
-    def next_page_href
-      unless @next_page_href
-        cursor = html.at 'p:last-of-type'
-
-        cursor = cursor.at 'a' if cursor
-
-        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
-        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
-
-        # Search listings put their next page in a link towards the top
-        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
-
-        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
-        # even though we can see that theres another page listed in the page-number links block at the top
-        # and bottom of the listing page
-        unless next_link
-          cursor = html % 'div.sh:first-of-type > b:last-of-type'
-
-          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
-          # We're looking good.
-          next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
-        end
-
-
-        # We have an anchor tag - so - let's assign the href:
-        @next_page_href = next_link[:href] if next_link
-
-
-      @next_page_href
-    end
+  private
+
+  # This takes a fragments paramter, and turns it into actual urls
+  def listing_urls_for(listing_fragments)
+    listing_fragments.collect{ |lf|
+      # This removes any /'s from he beginning of the fragment
+      lf = $1 if /^\/(.*)/.match lf
+      # This adds a '/' to the end of a path, so long as its not a query we're dealing with...
+      lf += '/' unless lf.index '?'
+      sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
+    }.flatten
+  end
 
-
-
-
-    end
+  # Returns the most recentlt expired time for the provided month and day
+  def self.most_recently_expired_time(month, day) #:nodoc:
+    now = (time_now) ? time_now : Time.now
 
-    #
-
-
-      ret = {}
-
-      title_anchor, section_anchor  = p_element.search 'a'
-      location_tag = p_element.at 'font'
-      has_pic_tag = p_element.at 'span'
-
-      href = nil
-
-      location = he_decode p_element.at('font').inner_html if location_tag
-      ret[:location] = $1 if location and LOCATION.match location
-
-      ret[:img_types] = []
-      if has_pic_tag
-        img_type = he_decode has_pic_tag.inner_html
-        img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
-
-        ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
-      end
-
-      ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
-
-      ret[:post_date] = date
-      if SUMMARY_DATE.match he_decode(p_element.children[0])
-        ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
-      end
-
-      if title_anchor
-        label = he_decode title_anchor.inner_html
-        ret[:label] = $1 if LABEL.match label
+    # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
+    ret = Time.local now.year, month, day
+    ret = Time.local now.year-1, month, day if ret > now
 
-
-      end
-
-      ret
-    end
-  end
-
-  # GeoListings represents a parsed Craigslist geo lisiting page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
-  # These list all the craigslist sites in a given region.
-  class GeoListings < Scraper
-    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
-    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
-
-    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
-    # In addition though, here we'll accept an array like %w(us fl) which gets converted to
-    # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
-    def initialize(init_via = nil)
-      super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
-
-      # Validate that required fields are present, at least - if we've downloaded it from a url
-      parse_error! unless location
-    end
-
-    # Returns the GeoLocation's full name
-    def location
-      unless @name
-        cursor = html % 'h3 > b > a:first-of-type'
-        cursor = cursor.next_node if cursor
-        @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
-      end
-
-      @name
-    end
-
-    # Returns a hash of site name to urls in the current listing
-    def sites
-      unless @sites
-        @sites = {}
-        (html / 'div#list > a').each do |el_a|
-          site_name = he_decode strip_html(el_a.inner_html)
-          @sites[site_name] = el_a[:href]
-        end
-      end
-
-      @sites
-    end
+    ret
  end
 
 end