libcraigscrape 1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/lib/libcraigscrape.rb
CHANGED
@@ -3,38 +3,34 @@
 # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
 #
 require 'rubygems'
-
-
-gem 'nokogiri', '~> 1.4.4'
-gem 'htmlentities', '~> 4.0.0'
-
-
-require 'net/http'
-require 'zlib'
-require 'nokogiri'
+require 'time'
+require 'uri'
 require 'htmlentities'
-require 'active_support'
-
+require 'active_support/core_ext/class/attribute_accessors'
+require 'active_support/core_ext/time/calculations'
+require 'htmlentities'
+require 'nokogiri'
+require 'typhoeus'
+require 'money'
 
-# A base class encapsulating the various libcraigscrape objects, and providing most of the
-# craigslist interaction methods. Currently, we're supporting the old Class methods
+# A base class encapsulating the various libcraigscrape objects, and providing most of the
+# craigslist interaction methods. Currently, we're supporting the old Class methods
 # in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
 # create an instance of the Craigslist object, and use its Public Instance methods.
 # See the README for easy to follow examples.
 
 class CraigScrape
-  cattr_accessor :time_now
   cattr_accessor :site_to_url_prefix
-
+
   #--
   # NOTE:
-  # The only reason I took this out is b/c I might want to test with a file://
+  # The only reason I took this out is b/c I might want to test with a file://
   # prefix at some point
   #++
   self.site_to_url_prefix = 'http://'
 
-
-  # Takes a variable number of site/path specifiers (strings) as an argument.
+
+  # Takes a variable number of site/path specifiers (strings) as an argument.
   # This list gets flattened and passed to CraigScrape::GeoListings.find_sites .
   # See that method's rdoc for a complete set of rules on what arguments are allowed here.
   def initialize(*args)
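Two things stand out in this hunk. First, many of the paired -/+ lines here (and in the hunks below) appear to differ only in trailing whitespace. Second, the 1.0 pattern of pinning gems from inside the library (gem 'nokogiri', '~> 1.4.4') is gone; per the file list above, dependencies now live in a new data/Gemfile (+12 lines). That Gemfile isn't reproduced in this section, so the following is a hypothetical sketch only, with gem names inferred from the new require list and the source line and version constraints assumed:

    # Hypothetical sketch - the actual data/Gemfile is not shown in this diff.
    source 'https://rubygems.org'

    gem 'nokogiri'      # HTML parsing (pinned to '~> 1.4.4' inside the lib in 1.0)
    gem 'htmlentities'  # entity decoding (pinned to '~> 4.0.0' in 1.0)
    gem 'activesupport' # cattr_accessor, Object#try, Time calculations
    gem 'typhoeus'      # new in 1.1: replaces the Net::HTTP + Zlib fetching
    gem 'money'         # new in 1.1: listing prices come back as Money objects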
@@ -44,50 +40,50 @@ class CraigScrape
   # Returns which sites are included in any operations performed by this object. This is directly
   # ascertained from the initial constructor's spec-list
   def sites
-    @sites ||= GeoListings.find_sites @sites_specs
+    @sites ||= GeoListings.find_sites @sites_specs
     @sites
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Passes the <b>first page listing</b> of each of these urls to the provided block.
   def each_listing(*fragments)
     listing_urls_for(fragments).each{|url| yield Listings.new(url) }
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
   def each_page_in_each_listing(*fragments)
     each_listing(*fragments) do |listing|
       while listing
         yield listing
-        listing = listing.next_page
+        listing = listing.next_page
       end
     end
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Returns the <b>first page listing</b> of each of these urls to the provided block.
   def listings(*fragments)
     listing_urls_for(fragments).collect{|url| Listings.new url }
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Passes all posts from each of these urls to the provided block, in the order they're parsed
   # (for each listing, newest posts are returned first).
   def each_post(*fragments)
     each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Returns all posts from each of these urls, in the order they're parsed
   # (newest posts first).
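These enumerators compose: each_post drives each_page_in_each_listing, which drives each_listing, which expands the constructor's site specs against the given path fragments. A usage sketch based on the rdoc above - the site spec 'us/fl/miami' and the 'sss' fragment are illustrative values, not taken from this diff:

    require 'libcraigscrape'

    craigslist = CraigScrape.new 'us/fl/miami'  # resolved via GeoListings.find_sites

    # First page only: one Listings object per constructed url.
    craigslist.each_listing('sss') { |listing| puts listing.posts.length }

    # Every page of every listing, newest posts first within each listing.
    craigslist.each_post('sss') { |post| puts post.label }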
@@ -96,24 +92,32 @@ class CraigScrape
     each_page_in_each_listing(*fragments){ |l| ret += l.posts }
     ret
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
-  # Returns all posts from each of these urls, which are newer than the provider 'newer_then' date.
+  # Returns all posts from each of these urls, which are newer than (or equal to) the provider 'newer_then' date.
   # (Returns 'newest' posts first).
+  #
+  # NOTE: New to version 1.1, if newer_then is a date, we compare to the post_date
+  # if newer_then is a Time, we compare to post_time. Be aware that post_time
+  # requires the entire post be loaded, and not just the summary - which will
+  # take longer to download.
   def posts_since(newer_then, *fragments)
+    accessor = (newer_then.kind_of? Date) ? :post_date : :post_time
     ret = []
     fragments.each do |frag|
       each_post(frag) do |p|
-
+        # We have to try the comparison, since post_time could conceivably be nil
+        # for the case of a system_post?
+        break if p.send(accessor).try(:<=, newer_then)
         ret << p
       end
     end
 
-    ret
+    ret
   end
-
+
   class << self # Class methods
 
     #--
|
|
122
126
|
#++
|
123
127
|
|
124
128
|
# <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
|
125
|
-
# Instead, consider using CraigScrape::Listings.new
|
129
|
+
# Instead, consider using CraigScrape::Listings.new
|
126
130
|
#
|
127
|
-
# Scrapes a single listing url and returns a Listings object representing the contents.
|
131
|
+
# Scrapes a single listing url and returns a Listings object representing the contents.
|
128
132
|
# Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
|
129
|
-
def scrape_listing(listing_url)
|
133
|
+
def scrape_listing(listing_url)
|
130
134
|
CraigScrape::Listings.new listing_url
|
131
135
|
end
|
132
136
|
|
@@ -137,24 +141,24 @@ class CraigScrape
     # until there's no more 'next page' links available to click on
     def scrape_until(listing_url, &post_condition)
       ret = []
-
+
       listings = CraigScrape::Listings.new listing_url
       catch "ScrapeBreak" do
-        while listings do
+        while listings do
           listings.posts.each do |post|
             throw "ScrapeBreak" if post_condition.call(post)
             ret << post
           end
-
+
           listings = listings.next_page
         end
       end
-
+
       ret
     end
 
     # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
-    # Instead, consider using CraigScrape::Posting.new
+    # Instead, consider using CraigScrape::Posting.new
     #
     # Scrapes a single Post Url, and returns a Posting object representing its contents.
     # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
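The catch/throw pair lets the post_condition block stop pagination mid-page, not just between pages. Usage of this (legacy) class method, with an illustrative listing url:

    cutoff = Date.today - 7

    # Collects posts until the block first returns true, then stops paging.
    posts = CraigScrape.scrape_until('http://miami.craigslist.org/sss/') do |post|
      post.post_date < cutoff
    end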
@@ -176,7 +180,7 @@ class CraigScrape
     # Instead, consider using the CraigScrape::posts_since method.
     #
    # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are avialable to be clicked on.
-    # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+    # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
     # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
     # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
     #
@@ -185,9 +189,9 @@ class CraigScrape
       self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
     end
   end
-
+
   private
-
+
   # This takes a fragments paramter, and turns it into actual urls
   def listing_urls_for(listing_fragments)
     listing_fragments.collect{ |lf|
@@ -198,20 +202,9 @@ class CraigScrape
       sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
     }.flatten
   end
-
-  # Returns the most recentlt expired time for the provided month and day
-  def self.most_recently_expired_time(month, day) #:nodoc:
-    now = (time_now) ? time_now : Time.now
-
-    # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
-    ret = Time.local now.year, month, day
-    ret = Time.local now.year-1, month, day if ret > now
-
-    ret
-  end
 
 end
 
 require 'listings'
 require 'posting'
-require 'geo_listings'
+require 'geo_listings'
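Two things to note in this final hunk. Given the '%s%s/%s' format above, each site/fragment pair expands predictably (values illustrative):

    '%s%s/%s' % ['http://', 'miami.craigslist.org', 'sss']
    #=> "http://miami.craigslist.org/sss"

And most_recently_expired_time - along with the time_now hook removed in the first hunk - is gone entirely: 1.1 derives summary dates via Date.parse in listings.rb (see below) instead of guessing the year from a month/day datestamp.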
data/lib/listings.rb
CHANGED
@@ -13,7 +13,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
   IMG_TYPE = /^[ ]*(.+)[ ]*$/
   HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
   SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
-  NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+  NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
+
+  XPATH_POST_DATE = "*[@class='itemdate']"
+  XPATH_POST_IMGPIC = "*[@class='itempx']/*[@class='p']"
+  XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
+  # There's a couple places that the price hangs out. We search in this order
+  XPATHS_POST_PRICE = ["*[@class='itempp']", "*[@class='itemph']"]
 
   # Array, PostSummary objects found in the listing
   def posts
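The new constants target craigslist's post-12/3 markup (itemdate, itempx, itempp/itemph, and the 'ban' page-nav block). A minimal sketch of the search-in-order price lookup, run against hand-built markup - the sample HTML is illustrative, not taken from this diff:

    require 'nokogiri'

    # Illustrative post-12/3 summary row: price may sit in 'itempp' or 'itemph'.
    row = Nokogiri::HTML.fragment(
      "<p><span class='itempp'>$450</span><span class='itemph'></span></p>"
    ).at('p')

    xpaths = ["*[@class='itempp']", "*[@class='itemph']"]  # XPATHS_POST_PRICE
    price_path = xpaths.find do |path|
      content = row.at_xpath(path) && row.at_xpath(path).content
      content && !content.empty?
    end

    puts row.at_xpath(price_path).content  #=> $450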
@@ -35,7 +41,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
     post_tags.each do |el|
       case el.name
         when 'p'
-          post_summary =
+          post_summary = parse_summary el, current_date
 
           # Validate that required fields are present:
           parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
@@ -47,7 +53,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
           # Let's make sense of the h4 tag, and then read all the p tags below it
           if HEADER_DATE.match he_decode(el.inner_html)
             # Generally, the H4 tags contain valid dates. When they do - this is easy:
-            current_date =
+            current_date = Date.parse [$1, $2].join('/')
           elsif html.at('h4:last-of-type') == el
             # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
             # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
@@ -65,29 +71,37 @@ class CraigScrape::Listings < CraigScrape::Scraper
   # String, URL Path href-fragment of the next page link
   def next_page_href
     unless @next_page_href
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      #
-
+
+      if html.at_xpath(XPATH_PAGENAV_LINKS)
+        # Post 12/3
+        next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
+        @next_page_href = next_link[:href] if next_link
+      else
+        # Old style
+        cursor = html.at 'p:last-of-type'
+
+        cursor = cursor.at 'a' if cursor
+
+        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+
+        # Search listings put their next page in a link towards the top
+        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
+
+        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+        # even though we can see that theres another page listed in the page-number links block at the top
+        # and bottom of the listing page
+        unless next_link
+          cursor = html % 'div.sh:first-of-type > b:last-of-type'
+
+          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
+          # We're looking good.
+          next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+        end
+
+        # We have an anchor tag - so - let's assign the href:
+        @next_page_href = next_link[:href] if next_link
       end
-
-      # We have an anchor tag - so - let's assign the href:
-      @next_page_href = next_link[:href] if next_link
     end
 
     @next_page_href
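The widened NEXT_PAGE_LINK from the first hunk is what makes both branches above work - it accepts the old category-listing text as well as the post-12/3 'Next >>' label:

    NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/

    NEXT_PAGE_LINK.match 'next 100 postings'  #=> MatchData (old-style pages)
    NEXT_PAGE_LINK.match 'Next >>'            #=> MatchData (post 12/3 pages)
    NEXT_PAGE_LINK.match 'see all postings'   #=> nil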
@@ -100,12 +114,14 @@ class CraigScrape::Listings < CraigScrape::Scraper
 
   # Returns a Listings object of the next_page_url on the current listings object
   def next_page
-    CraigScrape::Listings.new next_page_url if next_page_url
+    CraigScrape::Listings.new URI.encode(next_page_url) if next_page_url
   end
-
+
+  private
+
   # Takes a paragraph element and returns a mostly-parsed Posting
   # We separate this from the rest of the parsing both for readability and ease of testing
-  def
+  def parse_summary(p_element, date = nil) #:nodoc:
     ret = {}
 
     title_anchor = nil
@@ -126,26 +142,45 @@ class CraigScrape::Listings < CraigScrape::Scraper
     end
 
     location_tag = p_element.at 'font'
-    has_pic_tag = p_element.at 'span'
 
     href = nil
 
     location = he_decode p_element.at('font').inner_html if location_tag
     ret[:location] = $1 if location and LOCATION.match location
 
-
-
-
-
+    price_path = XPATHS_POST_PRICE.find{|path|
+      content = p_element.at_xpath(path).try(:content)
+      (!content.nil? and !content.empty?)
+    }
+    ret[:price] = Money.new($1.to_i * 100, 'USD') if price_path and
+      /\$([\d]+)/.match(p_element.at_xpath(price_path).content)
 
-
+    ret[:img_types] = []
+    if p_element.at_xpath XPATH_POST_IMGPIC
+      # Post 12/3
+      ret[:img_types] = p_element.at_xpath(XPATH_POST_IMGPIC).content.scan(/\w+/).collect(&:to_sym)
+    else
+      # Old style:
+      has_pic_tag = p_element.at 'span'
+      if has_pic_tag
+        img_type = he_decode has_pic_tag.inner_html
+        img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+
+        ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+      end
     end
 
-    ret[:section] = he_decode(section_anchor.inner_html)
-
+    ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
+
     ret[:post_date] = date
-    if
-
+    if p_element.at_xpath(XPATH_POST_DATE)
+      # Post 12/3
+      if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
+        ret[:post_date] = Date.parse [$1, $2].join('/')
+      end
+    elsif SUMMARY_DATE.match he_decode(p_element.children[0])
+      # Old style
+      ret[:post_date] = Date.parse [$1, $2].join('/')
    end
 
     if title_anchor
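Taken together, a summary hash out of parse_summary for a post-12/3 row now carries a Money price and symbol-ized image types alongside the existing fields. An illustrative shape - every value below is invented for the example; only the keys and types follow from the code above:

    {
      :label     => "Oceanfront condo",
      :href      => "/reb/3456789012.html",
      :location  => "miami beach",
      :section   => "real estate - by broker",
      :price     => Money.new(45000, 'USD'),  # $450.00, stored as cents
      :img_types => [:pic, :img],
      :post_date => Date.parse('Dec/3')       # as produced by the code above
    }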
@@ -157,4 +192,5 @@ class CraigScrape::Listings < CraigScrape::Scraper
 
     ret
   end
-
+
+end