olek-libcraigscrape 1.0.3 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -6
- data/COPYING.LESSER +1 -1
- data/README +10 -10
- data/Rakefile +5 -54
- data/bin/craig_report_schema.yml +3 -3
- data/bin/craigwatch +32 -44
- data/bin/report_mailer/report.html.erb +17 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +6 -6
- data/lib/geo_listings.rb +24 -24
- data/lib/libcraigscrape.rb +6 -11
- data/lib/listings.rb +62 -45
- data/lib/posting.rb +153 -106
- data/lib/scraper.rb +37 -94
- data/test/libcraigscrape_test_helpers.rb +10 -10
- data/test/test_craigslist_geolisting.rb +53 -53
- data/test/test_craigslist_listing.rb +26 -26
- data/test/test_craigslist_posting.rb +39 -38
- metadata +38 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/lib/libcraigscrape.rb
CHANGED
@@ -3,18 +3,13 @@
|
|
3
3
|
# All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
4
4
|
#
|
5
5
|
require 'rubygems'
|
6
|
-
|
7
|
-
|
8
|
-
gem 'nokogiri', '>= 1.4.4'
|
9
|
-
gem 'htmlentities', '>= 4.0.0'
|
10
|
-
|
11
|
-
|
12
|
-
require 'net/http'
|
13
|
-
require 'zlib'
|
14
|
-
require 'nokogiri'
|
6
|
+
require 'time'
|
7
|
+
require 'uri'
|
15
8
|
require 'htmlentities'
|
16
|
-
require 'active_support'
|
17
|
-
|
9
|
+
require 'active_support/core_ext/class/attribute_accessors'
|
10
|
+
require 'htmlentities'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'typhoeus'
|
18
13
|
|
19
14
|
# A base class encapsulating the various libcraigscrape objects, and providing most of the
|
20
15
|
# craigslist interaction methods. Currently, we're supporting the old Class methods
|
data/lib/listings.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# = About listings.rb
|
2
2
|
#
|
3
3
|
# This file contains the parsing code, and logic relating to post-listing pages. You
|
4
|
-
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
4
|
+
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
5
5
|
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
6
6
|
#
|
7
7
|
require 'scraper'
|
@@ -13,7 +13,10 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
13
13
|
IMG_TYPE = /^[ ]*(.+)[ ]*$/
|
14
14
|
HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
|
15
15
|
SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
|
16
|
-
NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
|
16
|
+
NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
|
17
|
+
|
18
|
+
XPATH_POST_DATE = "*[@class='itemdate']"
|
19
|
+
XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
|
17
20
|
|
18
21
|
# Array, PostSummary objects found in the listing
|
19
22
|
def posts
|
@@ -22,12 +25,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
22
25
|
@posts = []
|
23
26
|
|
24
27
|
# All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
|
25
|
-
post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
|
28
|
+
post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
|
26
29
|
|
27
30
|
# The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
|
28
31
|
post_tags.pop if (
|
29
|
-
post_tags.length > 0 and
|
30
|
-
post_tags.last.at('a') and
|
32
|
+
post_tags.length > 0 and
|
33
|
+
post_tags.last.at('a') and
|
31
34
|
NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
|
32
35
|
)
|
33
36
|
|
@@ -39,7 +42,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
39
42
|
|
40
43
|
# Validate that required fields are present:
|
41
44
|
parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
|
42
|
-
|
45
|
+
|
43
46
|
post_summary[:url] = url_from_href post_summary[:href]
|
44
47
|
|
45
48
|
@posts << CraigScrape::Posting.new(post_summary)
|
@@ -50,13 +53,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
50
53
|
current_date = CraigScrape.most_recently_expired_time $1, $2
|
51
54
|
elsif html.at('h4:last-of-type') == el
|
52
55
|
# There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
|
53
|
-
# They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
|
56
|
+
# They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
|
54
57
|
# we need to pull up the full post in order to accurately tell the date.
|
55
58
|
# Setting this to nil will achieve the eager-load.
|
56
59
|
current_date = nil
|
57
60
|
end
|
58
|
-
end
|
59
|
-
end
|
61
|
+
end
|
62
|
+
end
|
60
63
|
end
|
61
64
|
|
62
65
|
@posts
|
@@ -65,44 +68,52 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
65
68
|
# String, URL Path href-fragment of the next page link
|
66
69
|
def next_page_href
|
67
70
|
unless @next_page_href
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
#
|
86
|
-
|
71
|
+
|
72
|
+
if html.at_xpath(XPATH_PAGENAV_LINKS)
|
73
|
+
# Post 12/3
|
74
|
+
next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
|
75
|
+
@next_page_href = next_link[:href]
|
76
|
+
else
|
77
|
+
# Old style
|
78
|
+
cursor = html.at 'p:last-of-type'
|
79
|
+
|
80
|
+
cursor = cursor.at 'a' if cursor
|
81
|
+
|
82
|
+
# Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
|
83
|
+
next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
|
84
|
+
|
85
|
+
# Search listings put their next page in a link towards the top
|
86
|
+
next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
|
87
|
+
|
88
|
+
# Some search pages have a bug, whereby a 'next page' link isn't displayed,
|
89
|
+
# even though we can see that there's another page listed in the page-number links block at the top
|
90
|
+
# and bottom of the listing page
|
91
|
+
unless next_link
|
92
|
+
cursor = html % 'div.sh:first-of-type > b:last-of-type'
|
93
|
+
|
94
|
+
# If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
|
95
|
+
# We're looking good.
|
96
|
+
next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
|
97
|
+
end
|
98
|
+
|
99
|
+
# We have an anchor tag - so - let's assign the href:
|
100
|
+
@next_page_href = next_link[:href] if next_link
|
87
101
|
end
|
88
|
-
|
89
|
-
# We have an anchor tag - so - let's assign the href:
|
90
|
-
@next_page_href = next_link[:href] if next_link
|
91
102
|
end
|
92
|
-
|
103
|
+
|
93
104
|
@next_page_href
|
94
105
|
end
|
95
|
-
|
106
|
+
|
96
107
|
# String, Full URL Path of the 'next page' link
|
97
108
|
def next_page_url
|
98
109
|
(next_page_href) ? url_from_href(next_page_href) : nil
|
99
110
|
end
|
100
|
-
|
111
|
+
|
101
112
|
# Returns a Listings object of the next_page_url on the current listings object
|
102
113
|
def next_page
|
103
114
|
CraigScrape::Listings.new next_page_url if next_page_url
|
104
115
|
end
|
105
|
-
|
116
|
+
|
106
117
|
# Takes a paragraph element and returns a mostly-parsed Posting
|
107
118
|
# We separate this from the rest of the parsing both for readability and ease of testing
|
108
119
|
def self.parse_summary(p_element, date = nil) #:nodoc:
|
@@ -111,8 +122,8 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
111
122
|
title_anchor = nil
|
112
123
|
section_anchor = nil
|
113
124
|
|
114
|
-
# This loop got a little more complicated after Craigslist started inserting weird <span>s in
|
115
|
-
# its list summary postings (See test_new_listing_span051710)
|
125
|
+
# This loop got a little more complicated after Craigslist started inserting weird <span>s in
|
126
|
+
# its list summary postings (See test_new_listing_span051710)
|
116
127
|
p_element.search('a').each do |a_el|
|
117
128
|
# We want the first a-tag that doesn't have spans in it to be the title anchor
|
118
129
|
if title_anchor.nil?
|
@@ -124,12 +135,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
124
135
|
break
|
125
136
|
end
|
126
137
|
end
|
127
|
-
|
138
|
+
|
128
139
|
location_tag = p_element.at 'font'
|
129
140
|
has_pic_tag = p_element.at 'span'
|
130
|
-
|
141
|
+
|
131
142
|
href = nil
|
132
|
-
|
143
|
+
|
133
144
|
location = he_decode p_element.at('font').inner_html if location_tag
|
134
145
|
ret[:location] = $1 if location and LOCATION.match location
|
135
146
|
|
@@ -141,20 +152,26 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
141
152
|
ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
|
142
153
|
end
|
143
154
|
|
144
|
-
ret[:section] = he_decode(section_anchor.inner_html)
|
145
|
-
|
155
|
+
ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
|
156
|
+
|
146
157
|
ret[:post_date] = date
|
147
|
-
if
|
158
|
+
if p_element.at_xpath(XPATH_POST_DATE)
|
159
|
+
# Post 12/3
|
160
|
+
if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
|
161
|
+
ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
|
162
|
+
end
|
163
|
+
elsif SUMMARY_DATE.match he_decode(p_element.children[0])
|
164
|
+
# Old style
|
148
165
|
ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
|
149
166
|
end
|
150
167
|
|
151
168
|
if title_anchor
|
152
169
|
label = he_decode title_anchor.inner_html
|
153
170
|
ret[:label] = $1 if LABEL.match label
|
154
|
-
|
171
|
+
|
155
172
|
ret[:href] = title_anchor[:href]
|
156
173
|
end
|
157
|
-
|
174
|
+
|
158
175
|
ret
|
159
176
|
end
|
160
177
|
end
|
data/lib/posting.rb
CHANGED
@@ -1,27 +1,37 @@
|
|
1
1
|
# = About posting.rb
|
2
2
|
#
|
3
3
|
# This file contains the parsing code, and logic relating to craigslist postings. You
|
4
|
-
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
4
|
+
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
5
5
|
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'scraper'
|
9
9
|
|
10
10
|
# Posting represents a fully downloaded, and parsed, Craigslist post.
|
11
|
-
# This class is generally returned by the listing scrape methods, and
|
12
|
-
# contains the post summaries for a specific search url, or a general listing category
|
11
|
+
# This class is generally returned by the listing scrape methods, and
|
12
|
+
# contains the post summaries for a specific search url, or a general listing category
|
13
13
|
class CraigScrape::Posting < CraigScrape::Scraper
|
14
|
-
|
14
|
+
|
15
15
|
POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
|
16
16
|
LOCATION = /Location\:[ ]+(.+)/
|
17
17
|
HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
|
18
|
-
POSTING_ID = /PostingID\:[ ]
|
18
|
+
POSTING_ID = /PostingID\:[ ]*([\d]+)/
|
19
19
|
REPLY_TO = /(.+)/
|
20
20
|
PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
|
21
|
+
# NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
|
22
|
+
# (As of 12/03's parse changes)
|
21
23
|
USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
|
22
24
|
HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
|
23
25
|
IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
|
24
26
|
|
27
|
+
# This is used to determine if there's a parse error
|
28
|
+
REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
|
29
|
+
|
30
|
+
XPATH_USERBODY = "//*[@id='userbody']"
|
31
|
+
XPATH_BLURBS = "//ul[@class='blurbs']"
|
32
|
+
XPATH_PICS = "//*[@class='tn']/a/@href"
|
33
|
+
XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
|
34
|
+
|
25
35
|
# This is really just for testing, in production use, uri.path is a better solution
|
26
36
|
attr_reader :href #:nodoc:
|
27
37
|
|
@@ -30,14 +40,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
30
40
|
super(*args)
|
31
41
|
|
32
42
|
# Validate that required fields are present, at least - if we've downloaded it from a url
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
43
|
+
if args.first.kind_of? String and is_active_post?
|
44
|
+
unparsed_fields = REQUIRED_FIELDS.find_all{|f|
|
45
|
+
val = send(f)
|
46
|
+
val.nil? or (val.respond_to? :length and val.length == 0)
|
47
|
+
}
|
48
|
+
parse_error! unparsed_fields unless unparsed_fields.empty?
|
49
|
+
end
|
50
|
+
|
41
51
|
end
|
42
52
|
|
43
53
|
|
@@ -47,10 +57,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
47
57
|
h2 = html_head.at 'h2' if html_head
|
48
58
|
@header = he_decode h2.inner_html if h2
|
49
59
|
end
|
50
|
-
|
60
|
+
|
51
61
|
@header
|
52
62
|
end
|
53
|
-
|
63
|
+
|
54
64
|
# String, the item's title
|
55
65
|
def title
|
56
66
|
unless @title
|
@@ -58,7 +68,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
58
68
|
@title = he_decode title_tag.inner_html if title_tag
|
59
69
|
@title = nil if @title and @title.length == 0
|
60
70
|
end
|
61
|
-
|
71
|
+
|
62
72
|
@title
|
63
73
|
end
|
64
74
|
|
@@ -66,8 +76,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
66
76
|
def full_section
|
67
77
|
unless @full_section
|
68
78
|
@full_section = []
|
69
|
-
|
70
|
-
(html_head/"
|
79
|
+
|
80
|
+
(html_head / "*[@class='bchead']//a").each do |a|
|
71
81
|
@full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
|
72
82
|
end if html_head
|
73
83
|
end
|
@@ -78,84 +88,103 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
78
88
|
# String, represents the post's reply-to address, if listed
|
79
89
|
def reply_to
|
80
90
|
unless @reply_to
|
81
|
-
|
82
|
-
|
83
|
-
|
91
|
+
if html.at_xpath(XPATH_REPLY_TO)
|
92
|
+
@reply_to = html.at_xpath(XPATH_REPLY_TO).content
|
93
|
+
else
|
94
|
+
cursor = html_head.at 'hr' if html_head
|
95
|
+
cursor = cursor.next until cursor.nil? or cursor.name == 'a'
|
96
|
+
@reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
|
97
|
+
end
|
84
98
|
end
|
85
|
-
|
99
|
+
|
86
100
|
@reply_to
|
87
101
|
end
|
88
|
-
|
89
|
-
# Time, reflects the full timestamp of the posting
|
102
|
+
|
103
|
+
# Time, reflects the full timestamp of the posting
|
90
104
|
def post_time
|
91
105
|
unless @post_time
|
92
106
|
cursor = html_head.at 'hr' if html_head
|
93
107
|
cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
|
94
108
|
@post_time = Time.parse $1 if $1
|
95
109
|
end
|
96
|
-
|
110
|
+
|
97
111
|
@post_time
|
98
112
|
end
|
99
113
|
|
100
114
|
# Integer, Craigslist's unique posting id
|
101
115
|
def posting_id
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
116
|
+
if @posting_id
|
117
|
+
|
118
|
+
elsif USERBODY_PARTS.match html_source
|
119
|
+
# Old style:
|
120
|
+
html_footer = $4
|
121
|
+
cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
|
122
|
+
cursor = cursor.next until cursor.nil? or
|
123
|
+
@posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
|
124
|
+
else
|
125
|
+
# Post 12/3
|
126
|
+
@posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
|
106
127
|
end
|
107
|
-
|
128
|
+
|
108
129
|
@posting_id
|
109
130
|
end
|
110
|
-
|
131
|
+
|
111
132
|
# String, The full-html contents of the post
|
112
133
|
def contents
|
113
134
|
unless @contents
|
114
135
|
@contents = user_body if html_source
|
115
|
-
@contents = he_decode
|
136
|
+
@contents = he_decode(@contents).strip if @contents
|
116
137
|
end
|
117
|
-
|
138
|
+
|
118
139
|
@contents
|
119
140
|
end
|
120
|
-
|
141
|
+
|
121
142
|
# String, the location of the item, as best could be parsed
|
122
143
|
def location
|
123
|
-
if @location.nil? and
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
144
|
+
if @location.nil? and html
|
145
|
+
|
146
|
+
if html.at_xpath(XPATH_BLURBS)
|
147
|
+
# This is the post-12/3/12 style:
|
148
|
+
@location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
|
149
|
+
LOCATION.match c.content}
|
150
|
+
elsif craigslist_body
|
151
|
+
# Location (when explicitly defined):
|
152
|
+
cursor = craigslist_body.at 'ul' unless @location
|
153
|
+
|
154
|
+
# This is the legacy style:
|
155
|
+
# Note: Apa section includes other things in the li's (cats/dogs ok fields)
|
156
|
+
cursor.children.each do |li|
|
157
|
+
if LOCATION.match li.inner_html
|
158
|
+
@location = he_decode($1) and break
|
159
|
+
break
|
160
|
+
end
|
161
|
+
end if cursor
|
162
|
+
|
163
|
+
# Real estate listings can work a little different for location:
|
164
|
+
unless @location
|
165
|
+
cursor = craigslist_body.at 'small'
|
166
|
+
cursor = cursor.previous until cursor.nil? or cursor.text?
|
167
|
+
|
168
|
+
@location = he_decode(cursor.to_s.strip) if cursor
|
132
169
|
end
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
unless @location
|
137
|
-
cursor = craigslist_body.at 'small'
|
138
|
-
cursor = cursor.previous until cursor.nil? or cursor.text?
|
139
|
-
|
140
|
-
@location = he_decode(cursor.to_s.strip) if cursor
|
170
|
+
|
171
|
+
# So, *sometimes* the location just ends up being in the header, I don't know why:
|
172
|
+
@location = $1 if @location.nil? and HEADER_LOCATION.match header
|
141
173
|
end
|
142
|
-
|
143
|
-
# So, *sometimes* the location just ends up being in the header, I don't know why:
|
144
|
-
@location = $1 if @location.nil? and HEADER_LOCATION.match header
|
145
174
|
end
|
146
|
-
|
175
|
+
|
147
176
|
@location
|
148
177
|
end
|
149
178
|
|
150
179
|
# Array, urls of the post's images that are *not* hosted on craigslist
|
151
180
|
def images
|
152
181
|
# Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
|
153
|
-
@images = (
|
154
|
-
contents ?
|
182
|
+
@images = (
|
183
|
+
contents ?
|
155
184
|
contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
|
156
|
-
[]
|
185
|
+
[]
|
157
186
|
) unless @images
|
158
|
-
|
187
|
+
|
159
188
|
@images
|
160
189
|
end
|
161
190
|
|
@@ -163,15 +192,20 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
163
192
|
def pics
|
164
193
|
unless @pics
|
165
194
|
@pics = []
|
166
|
-
|
167
|
-
if html
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
195
|
+
|
196
|
+
if html
|
197
|
+
if html.at_xpath(XPATH_PICS)
|
198
|
+
@pics = html.xpath(XPATH_PICS).collect(&:value)
|
199
|
+
elsif craigslist_body
|
200
|
+
# This is the pre-12/3/12 style:
|
201
|
+
# Now let's find the craigslist hosted images:
|
202
|
+
img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
|
203
|
+
|
204
|
+
@pics = (img_table / 'img').collect{|i| i[:src]} if img_table
|
205
|
+
end
|
172
206
|
end
|
173
207
|
end
|
174
|
-
|
208
|
+
|
175
209
|
@pics
|
176
210
|
end
|
177
211
|
|
@@ -180,38 +214,37 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
180
214
|
@flagged_for_removal = (
|
181
215
|
system_post? and header_as_plain == "This posting has been flagged for removal"
|
182
216
|
) if @flagged_for_removal.nil?
|
183
|
-
|
217
|
+
|
184
218
|
@flagged_for_removal
|
185
219
|
end
|
186
|
-
|
220
|
+
|
187
221
|
# Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
|
188
222
|
def deleted_by_author?
|
189
223
|
@deleted_by_author = (
|
190
224
|
system_post? and header_as_plain == "This posting has been deleted by its author."
|
191
225
|
) if @deleted_by_author.nil?
|
192
|
-
|
226
|
+
|
193
227
|
@deleted_by_author
|
194
228
|
end
|
195
|
-
|
229
|
+
|
196
230
|
# Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
|
197
231
|
def posting_has_expired?
|
198
232
|
@posting_has_expired = (
|
199
233
|
system_post? and header_as_plain == "This posting has expired."
|
200
234
|
) if @posting_has_expired.nil?
|
201
|
-
|
235
|
+
|
202
236
|
@posting_has_expired
|
203
237
|
end
|
204
|
-
|
205
|
-
|
238
|
+
|
206
239
|
# Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
|
207
240
|
# used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
|
208
241
|
def post_date
|
209
242
|
@post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
|
210
|
-
|
243
|
+
|
211
244
|
@post_date
|
212
245
|
end
|
213
|
-
|
214
|
-
# Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
|
246
|
+
|
247
|
+
# Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
|
215
248
|
# The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
|
216
249
|
# Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
|
217
250
|
# This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
|
@@ -219,37 +252,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
219
252
|
def label
|
220
253
|
unless @label or system_post?
|
221
254
|
@label = header
|
222
|
-
|
255
|
+
|
223
256
|
@label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
|
224
257
|
end
|
225
|
-
|
258
|
+
|
226
259
|
@label
|
227
260
|
end
|
228
261
|
|
229
262
|
# Array, which image types are listed for the post.
|
230
263
|
# This is always able to be pulled from the listing post-summary, and should never cause an additional page load
|
231
264
|
def img_types
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
@img_types << :img if images.length > 0
|
236
|
-
@img_types << :pic if pics.length > 0
|
237
|
-
end
|
238
|
-
|
239
|
-
@img_types
|
265
|
+
@img_types || [ (images.length > 0) ? :img : nil,
|
266
|
+
(pics.length > 0) ? :pic : nil ].compact
|
240
267
|
end
|
241
|
-
|
242
|
-
# Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
|
268
|
+
|
269
|
+
# Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
|
243
270
|
# this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
|
244
271
|
def section
|
245
272
|
unless @section
|
246
|
-
@section = full_section.last if full_section
|
273
|
+
@section = full_section.last if full_section
|
247
274
|
end
|
248
|
-
|
275
|
+
|
249
276
|
@section
|
250
277
|
end
|
251
278
|
|
252
|
-
# true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
|
279
|
+
# true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
|
253
280
|
# This is always able to be pulled from the listing post-summary, and should never cause an additional page load
|
254
281
|
def has_img?
|
255
282
|
img_types.include? :img
|
@@ -272,50 +299,70 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
272
299
|
def price
|
273
300
|
$1.tr('$','').to_f if label and PRICE.match label
|
274
301
|
end
|
275
|
-
|
302
|
+
|
276
303
|
# Returns the post contents with all html tags removed
|
277
304
|
def contents_as_plain
|
278
305
|
strip_html contents
|
279
306
|
end
|
280
307
|
|
281
|
-
# Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
|
308
|
+
# Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
|
282
309
|
# 'system_post' we may get tags in here
|
283
310
|
def header_as_plain
|
284
311
|
strip_html header
|
285
312
|
end
|
286
313
|
|
287
|
-
# Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
|
314
|
+
# Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
|
288
315
|
# This returns true or false if that case applies
|
289
316
|
def system_post?
|
290
317
|
[contents,posting_id,post_time,title].all?{|f| f.nil?}
|
291
318
|
end
|
292
319
|
|
320
|
+
# This is mostly used to determine if the post should be checked for
|
321
|
+
# parse errors. Might be useful for someone else though
|
322
|
+
def is_active_post?
|
323
|
+
[flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
|
324
|
+
end
|
325
|
+
|
293
326
|
private
|
294
327
|
|
295
|
-
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
328
|
+
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
296
329
|
# return everything above the user_body
|
297
330
|
def html_head
|
298
331
|
@html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
|
299
332
|
# We return html itself if HTML_HEADER doesn't match, which would be case for a 404 page or something
|
300
333
|
@html_head ||= html
|
301
|
-
|
334
|
+
|
302
335
|
@html_head
|
303
336
|
end
|
304
337
|
|
305
|
-
# Since we started having so many problems with Hpricot flipping out on whack content bodies,
|
306
|
-
# I added this to return everything south of the user_body
|
307
|
-
def html_footer
|
308
|
-
$4 if USERBODY_PARTS.match html_source
|
309
|
-
end
|
310
|
-
|
311
338
|
# OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
|
312
|
-
# This bad html trips up
|
339
|
+
# This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
|
313
340
|
# We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
|
314
341
|
def user_body
|
315
|
-
|
342
|
+
if USERBODY_PARTS.match html_source
|
343
|
+
# This is the pre-12/3/12 style:
|
344
|
+
$2
|
345
|
+
elsif html.at_xpath(XPATH_USERBODY)
|
346
|
+
# There's a bunch of junk in here that we don't want, so this loop removes
|
347
|
+
# everything after (and including) the last script tag, from the result
|
348
|
+
user_body = html.xpath(XPATH_USERBODY)
|
349
|
+
hit_delimeter = false
|
350
|
+
# Since some posts don't actually have the script tag:
|
351
|
+
delimeter = user_body.at_xpath('script') ? :script : :comment
|
352
|
+
user_body.first.children.to_a.reverse.reject{ |p|
|
353
|
+
if hit_delimeter
|
354
|
+
false
|
355
|
+
elsif ( (delimeter == :script and p.name == 'script') or
|
356
|
+
(delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
|
357
|
+
hit_delimeter = true
|
358
|
+
else
|
359
|
+
true
|
360
|
+
end
|
361
|
+
}.reverse.collect(&:to_s).join
|
362
|
+
end
|
316
363
|
end
|
317
|
-
|
318
|
-
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
364
|
+
|
365
|
+
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
319
366
|
# So - we'll return it as a Nokogiri object.
|
320
367
|
def craigslist_body
|
321
368
|
Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
|