olek-libcraigscrape 1.0.3 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -6
- data/COPYING.LESSER +1 -1
- data/README +10 -10
- data/Rakefile +5 -54
- data/bin/craig_report_schema.yml +3 -3
- data/bin/craigwatch +32 -44
- data/bin/report_mailer/report.html.erb +17 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +6 -6
- data/lib/geo_listings.rb +24 -24
- data/lib/libcraigscrape.rb +6 -11
- data/lib/listings.rb +62 -45
- data/lib/posting.rb +153 -106
- data/lib/scraper.rb +37 -94
- data/test/libcraigscrape_test_helpers.rb +10 -10
- data/test/test_craigslist_geolisting.rb +53 -53
- data/test/test_craigslist_listing.rb +26 -26
- data/test/test_craigslist_posting.rb +39 -38
- metadata +38 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/lib/libcraigscrape.rb
CHANGED
@@ -3,18 +3,13 @@
|
|
3
3
|
# All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
4
4
|
#
|
5
5
|
require 'rubygems'
|
6
|
-
|
7
|
-
|
8
|
-
gem 'nokogiri', '>= 1.4.4'
|
9
|
-
gem 'htmlentities', '>= 4.0.0'
|
10
|
-
|
11
|
-
|
12
|
-
require 'net/http'
|
13
|
-
require 'zlib'
|
14
|
-
require 'nokogiri'
|
6
|
+
require 'time'
|
7
|
+
require 'uri'
|
15
8
|
require 'htmlentities'
|
16
|
-
require 'active_support'
|
17
|
-
|
9
|
+
require 'active_support/core_ext/class/attribute_accessors'
|
10
|
+
require 'htmlentities'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'typhoeus'
|
18
13
|
|
19
14
|
# A base class encapsulating the various libcraigscrape objects, and providing most of the
|
20
15
|
# craigslist interaction methods. Currently, we're supporting the old Class methods
|
data/lib/listings.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# = About listings.rb
|
2
2
|
#
|
3
3
|
# This file contains the parsing code, and logic relating to post-listing pages. You
|
4
|
-
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
4
|
+
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
5
5
|
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
6
6
|
#
|
7
7
|
require 'scraper'
|
@@ -13,7 +13,10 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
13
13
|
IMG_TYPE = /^[ ]*(.+)[ ]*$/
|
14
14
|
HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
|
15
15
|
SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
|
16
|
-
NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
|
16
|
+
NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
|
17
|
+
|
18
|
+
XPATH_POST_DATE = "*[@class='itemdate']"
|
19
|
+
XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
|
17
20
|
|
18
21
|
# Array, PostSummary objects found in the listing
|
19
22
|
def posts
|
@@ -22,12 +25,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
22
25
|
@posts = []
|
23
26
|
|
24
27
|
# All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
|
25
|
-
post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
|
28
|
+
post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
|
26
29
|
|
27
30
|
# The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
|
28
31
|
post_tags.pop if (
|
29
|
-
post_tags.length > 0 and
|
30
|
-
post_tags.last.at('a') and
|
32
|
+
post_tags.length > 0 and
|
33
|
+
post_tags.last.at('a') and
|
31
34
|
NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
|
32
35
|
)
|
33
36
|
|
@@ -39,7 +42,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
39
42
|
|
40
43
|
# Validate that required fields are present:
|
41
44
|
parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
|
42
|
-
|
45
|
+
|
43
46
|
post_summary[:url] = url_from_href post_summary[:href]
|
44
47
|
|
45
48
|
@posts << CraigScrape::Posting.new(post_summary)
|
@@ -50,13 +53,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
50
53
|
current_date = CraigScrape.most_recently_expired_time $1, $2
|
51
54
|
elsif html.at('h4:last-of-type') == el
|
52
55
|
# There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
|
53
|
-
# They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
|
56
|
+
# They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
|
54
57
|
# we need to pull up the full post in order to accurately tell the date.
|
55
58
|
# Setting this to nil will achieve the eager-load.
|
56
59
|
current_date = nil
|
57
60
|
end
|
58
|
-
end
|
59
|
-
end
|
61
|
+
end
|
62
|
+
end
|
60
63
|
end
|
61
64
|
|
62
65
|
@posts
|
@@ -65,44 +68,52 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
65
68
|
# String, URL Path href-fragment of the next page link
|
66
69
|
def next_page_href
|
67
70
|
unless @next_page_href
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
#
|
86
|
-
|
71
|
+
|
72
|
+
if html.at_xpath(XPATH_PAGENAV_LINKS)
|
73
|
+
# Post 12/3
|
74
|
+
next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
|
75
|
+
@next_page_href = next_link[:href]
|
76
|
+
else
|
77
|
+
# Old style
|
78
|
+
cursor = html.at 'p:last-of-type'
|
79
|
+
|
80
|
+
cursor = cursor.at 'a' if cursor
|
81
|
+
|
82
|
+
# Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
|
83
|
+
next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
|
84
|
+
|
85
|
+
# Search listings put their next page in a link towards the top
|
86
|
+
next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
|
87
|
+
|
88
|
+
# Some search pages have a bug, whereby a 'next page' link isn't displayed,
|
89
|
+
# even though we can see that there's another page listed in the page-number links block at the top
|
90
|
+
# and bottom of the listing page
|
91
|
+
unless next_link
|
92
|
+
cursor = html % 'div.sh:first-of-type > b:last-of-type'
|
93
|
+
|
94
|
+
# If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
|
95
|
+
# We're looking good.
|
96
|
+
next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
|
97
|
+
end
|
98
|
+
|
99
|
+
# We have an anchor tag - so - let's assign the href:
|
100
|
+
@next_page_href = next_link[:href] if next_link
|
87
101
|
end
|
88
|
-
|
89
|
-
# We have an anchor tag - so - let's assign the href:
|
90
|
-
@next_page_href = next_link[:href] if next_link
|
91
102
|
end
|
92
|
-
|
103
|
+
|
93
104
|
@next_page_href
|
94
105
|
end
|
95
|
-
|
106
|
+
|
96
107
|
# String, Full URL Path of the 'next page' link
|
97
108
|
def next_page_url
|
98
109
|
(next_page_href) ? url_from_href(next_page_href) : nil
|
99
110
|
end
|
100
|
-
|
111
|
+
|
101
112
|
# Returns a Listings object of the next_page_url on the current listings object
|
102
113
|
def next_page
|
103
114
|
CraigScrape::Listings.new next_page_url if next_page_url
|
104
115
|
end
|
105
|
-
|
116
|
+
|
106
117
|
# Takes a paragraph element and returns a mostly-parsed Posting
|
107
118
|
# We separate this from the rest of the parsing both for readability and ease of testing
|
108
119
|
def self.parse_summary(p_element, date = nil) #:nodoc:
|
@@ -111,8 +122,8 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
111
122
|
title_anchor = nil
|
112
123
|
section_anchor = nil
|
113
124
|
|
114
|
-
# This loop got a little more complicated after Craigslist started inserting weird <span>s in
|
115
|
-
# its list summary postings (See test_new_listing_span051710)
|
125
|
+
# This loop got a little more complicated after Craigslist started inserting weird <span>s in
|
126
|
+
# its list summary postings (See test_new_listing_span051710)
|
116
127
|
p_element.search('a').each do |a_el|
|
117
128
|
# We want the first a-tag that doesn't have spans in it to be the title anchor
|
118
129
|
if title_anchor.nil?
|
@@ -124,12 +135,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
124
135
|
break
|
125
136
|
end
|
126
137
|
end
|
127
|
-
|
138
|
+
|
128
139
|
location_tag = p_element.at 'font'
|
129
140
|
has_pic_tag = p_element.at 'span'
|
130
|
-
|
141
|
+
|
131
142
|
href = nil
|
132
|
-
|
143
|
+
|
133
144
|
location = he_decode p_element.at('font').inner_html if location_tag
|
134
145
|
ret[:location] = $1 if location and LOCATION.match location
|
135
146
|
|
@@ -141,20 +152,26 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
141
152
|
ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
|
142
153
|
end
|
143
154
|
|
144
|
-
ret[:section] = he_decode(section_anchor.inner_html)
|
145
|
-
|
155
|
+
ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
|
156
|
+
|
146
157
|
ret[:post_date] = date
|
147
|
-
if
|
158
|
+
if p_element.at_xpath(XPATH_POST_DATE)
|
159
|
+
# Post 12/3
|
160
|
+
if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
|
161
|
+
ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
|
162
|
+
end
|
163
|
+
elsif SUMMARY_DATE.match he_decode(p_element.children[0])
|
164
|
+
# Old style
|
148
165
|
ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
|
149
166
|
end
|
150
167
|
|
151
168
|
if title_anchor
|
152
169
|
label = he_decode title_anchor.inner_html
|
153
170
|
ret[:label] = $1 if LABEL.match label
|
154
|
-
|
171
|
+
|
155
172
|
ret[:href] = title_anchor[:href]
|
156
173
|
end
|
157
|
-
|
174
|
+
|
158
175
|
ret
|
159
176
|
end
|
160
177
|
end
|
data/lib/posting.rb
CHANGED
@@ -1,27 +1,37 @@
|
|
1
1
|
# = About posting.rb
|
2
2
|
#
|
3
3
|
# This file contains the parsing code, and logic relating to craigslist postings. You
|
4
|
-
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
4
|
+
# should never need to include this file directly, as all of libcraigscrape's objects and methods
|
5
5
|
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'scraper'
|
9
9
|
|
10
10
|
# Posting represents a fully downloaded, and parsed, Craigslist post.
|
11
|
-
# This class is generally returned by the listing scrape methods, and
|
12
|
-
# contains the post summaries for a specific search url, or a general listing category
|
11
|
+
# This class is generally returned by the listing scrape methods, and
|
12
|
+
# contains the post summaries for a specific search url, or a general listing category
|
13
13
|
class CraigScrape::Posting < CraigScrape::Scraper
|
14
|
-
|
14
|
+
|
15
15
|
POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
|
16
16
|
LOCATION = /Location\:[ ]+(.+)/
|
17
17
|
HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
|
18
|
-
POSTING_ID = /PostingID\:[ ]
|
18
|
+
POSTING_ID = /PostingID\:[ ]*([\d]+)/
|
19
19
|
REPLY_TO = /(.+)/
|
20
20
|
PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
|
21
|
+
# NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
|
22
|
+
# (As of 12/03's parse changes)
|
21
23
|
USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
|
22
24
|
HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
|
23
25
|
IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
|
24
26
|
|
27
|
+
# This is used to determine if there's a parse error
|
28
|
+
REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
|
29
|
+
|
30
|
+
XPATH_USERBODY = "//*[@id='userbody']"
|
31
|
+
XPATH_BLURBS = "//ul[@class='blurbs']"
|
32
|
+
XPATH_PICS = "//*[@class='tn']/a/@href"
|
33
|
+
XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
|
34
|
+
|
25
35
|
# This is really just for testing, in production use, uri.path is a better solution
|
26
36
|
attr_reader :href #:nodoc:
|
27
37
|
|
@@ -30,14 +40,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
30
40
|
super(*args)
|
31
41
|
|
32
42
|
# Validate that required fields are present, at least - if we've downloaded it from a url
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
43
|
+
if args.first.kind_of? String and is_active_post?
|
44
|
+
unparsed_fields = REQUIRED_FIELDS.find_all{|f|
|
45
|
+
val = send(f)
|
46
|
+
val.nil? or (val.respond_to? :length and val.length == 0)
|
47
|
+
}
|
48
|
+
parse_error! unparsed_fields unless unparsed_fields.empty?
|
49
|
+
end
|
50
|
+
|
41
51
|
end
|
42
52
|
|
43
53
|
|
@@ -47,10 +57,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
47
57
|
h2 = html_head.at 'h2' if html_head
|
48
58
|
@header = he_decode h2.inner_html if h2
|
49
59
|
end
|
50
|
-
|
60
|
+
|
51
61
|
@header
|
52
62
|
end
|
53
|
-
|
63
|
+
|
54
64
|
# String, the item's title
|
55
65
|
def title
|
56
66
|
unless @title
|
@@ -58,7 +68,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
58
68
|
@title = he_decode title_tag.inner_html if title_tag
|
59
69
|
@title = nil if @title and @title.length == 0
|
60
70
|
end
|
61
|
-
|
71
|
+
|
62
72
|
@title
|
63
73
|
end
|
64
74
|
|
@@ -66,8 +76,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
66
76
|
def full_section
|
67
77
|
unless @full_section
|
68
78
|
@full_section = []
|
69
|
-
|
70
|
-
(html_head/"
|
79
|
+
|
80
|
+
(html_head / "*[@class='bchead']//a").each do |a|
|
71
81
|
@full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
|
72
82
|
end if html_head
|
73
83
|
end
|
@@ -78,84 +88,103 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
78
88
|
# String, represents the post's reply-to address, if listed
|
79
89
|
def reply_to
|
80
90
|
unless @reply_to
|
81
|
-
|
82
|
-
|
83
|
-
|
91
|
+
if html.at_xpath(XPATH_REPLY_TO)
|
92
|
+
@reply_to = html.at_xpath(XPATH_REPLY_TO).content
|
93
|
+
else
|
94
|
+
cursor = html_head.at 'hr' if html_head
|
95
|
+
cursor = cursor.next until cursor.nil? or cursor.name == 'a'
|
96
|
+
@reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
|
97
|
+
end
|
84
98
|
end
|
85
|
-
|
99
|
+
|
86
100
|
@reply_to
|
87
101
|
end
|
88
|
-
|
89
|
-
# Time, reflects the full timestamp of the posting
|
102
|
+
|
103
|
+
# Time, reflects the full timestamp of the posting
|
90
104
|
def post_time
|
91
105
|
unless @post_time
|
92
106
|
cursor = html_head.at 'hr' if html_head
|
93
107
|
cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
|
94
108
|
@post_time = Time.parse $1 if $1
|
95
109
|
end
|
96
|
-
|
110
|
+
|
97
111
|
@post_time
|
98
112
|
end
|
99
113
|
|
100
114
|
# Integer, Craigslist's unique posting id
|
101
115
|
def posting_id
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
116
|
+
if @posting_id
|
117
|
+
|
118
|
+
elsif USERBODY_PARTS.match html_source
|
119
|
+
# Old style:
|
120
|
+
html_footer = $4
|
121
|
+
cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
|
122
|
+
cursor = cursor.next until cursor.nil? or
|
123
|
+
@posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
|
124
|
+
else
|
125
|
+
# Post 12/3
|
126
|
+
@posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
|
106
127
|
end
|
107
|
-
|
128
|
+
|
108
129
|
@posting_id
|
109
130
|
end
|
110
|
-
|
131
|
+
|
111
132
|
# String, The full-html contents of the post
|
112
133
|
def contents
|
113
134
|
unless @contents
|
114
135
|
@contents = user_body if html_source
|
115
|
-
@contents = he_decode
|
136
|
+
@contents = he_decode(@contents).strip if @contents
|
116
137
|
end
|
117
|
-
|
138
|
+
|
118
139
|
@contents
|
119
140
|
end
|
120
|
-
|
141
|
+
|
121
142
|
# String, the location of the item, as best could be parsed
|
122
143
|
def location
|
123
|
-
if @location.nil? and
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
144
|
+
if @location.nil? and html
|
145
|
+
|
146
|
+
if html.at_xpath(XPATH_BLURBS)
|
147
|
+
# This is the post-12/3/12 style:
|
148
|
+
@location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
|
149
|
+
LOCATION.match c.content}
|
150
|
+
elsif craigslist_body
|
151
|
+
# Location (when explicitly defined):
|
152
|
+
cursor = craigslist_body.at 'ul' unless @location
|
153
|
+
|
154
|
+
# This is the legacy style:
|
155
|
+
# Note: Apa section includes other things in the li's (cats/dogs ok fields)
|
156
|
+
cursor.children.each do |li|
|
157
|
+
if LOCATION.match li.inner_html
|
158
|
+
@location = he_decode($1) and break
|
159
|
+
break
|
160
|
+
end
|
161
|
+
end if cursor
|
162
|
+
|
163
|
+
# Real estate listings can work a little different for location:
|
164
|
+
unless @location
|
165
|
+
cursor = craigslist_body.at 'small'
|
166
|
+
cursor = cursor.previous until cursor.nil? or cursor.text?
|
167
|
+
|
168
|
+
@location = he_decode(cursor.to_s.strip) if cursor
|
132
169
|
end
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
unless @location
|
137
|
-
cursor = craigslist_body.at 'small'
|
138
|
-
cursor = cursor.previous until cursor.nil? or cursor.text?
|
139
|
-
|
140
|
-
@location = he_decode(cursor.to_s.strip) if cursor
|
170
|
+
|
171
|
+
# So, *sometimes* the location just ends up being in the header, I don't know why:
|
172
|
+
@location = $1 if @location.nil? and HEADER_LOCATION.match header
|
141
173
|
end
|
142
|
-
|
143
|
-
# So, *sometimes* the location just ends up being in the header, I don't know why:
|
144
|
-
@location = $1 if @location.nil? and HEADER_LOCATION.match header
|
145
174
|
end
|
146
|
-
|
175
|
+
|
147
176
|
@location
|
148
177
|
end
|
149
178
|
|
150
179
|
# Array, urls of the post's images that are *not* hosted on craigslist
|
151
180
|
def images
|
152
181
|
# Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
|
153
|
-
@images = (
|
154
|
-
contents ?
|
182
|
+
@images = (
|
183
|
+
contents ?
|
155
184
|
contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
|
156
|
-
[]
|
185
|
+
[]
|
157
186
|
) unless @images
|
158
|
-
|
187
|
+
|
159
188
|
@images
|
160
189
|
end
|
161
190
|
|
@@ -163,15 +192,20 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
163
192
|
def pics
|
164
193
|
unless @pics
|
165
194
|
@pics = []
|
166
|
-
|
167
|
-
if html
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
195
|
+
|
196
|
+
if html
|
197
|
+
if html.at_xpath(XPATH_PICS)
|
198
|
+
@pics = html.xpath(XPATH_PICS).collect(&:value)
|
199
|
+
elsif craigslist_body
|
200
|
+
# This is the pre-12/3/12 style:
|
201
|
+
# Now let's find the craigslist hosted images:
|
202
|
+
img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
|
203
|
+
|
204
|
+
@pics = (img_table / 'img').collect{|i| i[:src]} if img_table
|
205
|
+
end
|
172
206
|
end
|
173
207
|
end
|
174
|
-
|
208
|
+
|
175
209
|
@pics
|
176
210
|
end
|
177
211
|
|
@@ -180,38 +214,37 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
180
214
|
@flagged_for_removal = (
|
181
215
|
system_post? and header_as_plain == "This posting has been flagged for removal"
|
182
216
|
) if @flagged_for_removal.nil?
|
183
|
-
|
217
|
+
|
184
218
|
@flagged_for_removal
|
185
219
|
end
|
186
|
-
|
220
|
+
|
187
221
|
# Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
|
188
222
|
def deleted_by_author?
|
189
223
|
@deleted_by_author = (
|
190
224
|
system_post? and header_as_plain == "This posting has been deleted by its author."
|
191
225
|
) if @deleted_by_author.nil?
|
192
|
-
|
226
|
+
|
193
227
|
@deleted_by_author
|
194
228
|
end
|
195
|
-
|
229
|
+
|
196
230
|
# Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
|
197
231
|
def posting_has_expired?
|
198
232
|
@posting_has_expired = (
|
199
233
|
system_post? and header_as_plain == "This posting has expired."
|
200
234
|
) if @posting_has_expired.nil?
|
201
|
-
|
235
|
+
|
202
236
|
@posting_has_expired
|
203
237
|
end
|
204
|
-
|
205
|
-
|
238
|
+
|
206
239
|
# Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
|
207
240
|
# used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
|
208
241
|
def post_date
|
209
242
|
@post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
|
210
|
-
|
243
|
+
|
211
244
|
@post_date
|
212
245
|
end
|
213
|
-
|
214
|
-
# Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
|
246
|
+
|
247
|
+
# Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
|
215
248
|
# The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
|
216
249
|
# Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
|
217
250
|
# This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
|
@@ -219,37 +252,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
219
252
|
def label
|
220
253
|
unless @label or system_post?
|
221
254
|
@label = header
|
222
|
-
|
255
|
+
|
223
256
|
@label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
|
224
257
|
end
|
225
|
-
|
258
|
+
|
226
259
|
@label
|
227
260
|
end
|
228
261
|
|
229
262
|
# Array, which image types are listed for the post.
|
230
263
|
# This is always able to be pulled from the listing post-summary, and should never cause an additional page load
|
231
264
|
def img_types
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
@img_types << :img if images.length > 0
|
236
|
-
@img_types << :pic if pics.length > 0
|
237
|
-
end
|
238
|
-
|
239
|
-
@img_types
|
265
|
+
@img_types || [ (images.length > 0) ? :img : nil,
|
266
|
+
(pics.length > 0) ? :pic : nil ].compact
|
240
267
|
end
|
241
|
-
|
242
|
-
# Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
|
268
|
+
|
269
|
+
# Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
|
243
270
|
# this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
|
244
271
|
def section
|
245
272
|
unless @section
|
246
|
-
@section = full_section.last if full_section
|
273
|
+
@section = full_section.last if full_section
|
247
274
|
end
|
248
|
-
|
275
|
+
|
249
276
|
@section
|
250
277
|
end
|
251
278
|
|
252
|
-
# true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
|
279
|
+
# true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
|
253
280
|
# This is always able to be pulled from the listing post-summary, and should never cause an additional page load
|
254
281
|
def has_img?
|
255
282
|
img_types.include? :img
|
@@ -272,50 +299,70 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
272
299
|
def price
|
273
300
|
$1.tr('$','').to_f if label and PRICE.match label
|
274
301
|
end
|
275
|
-
|
302
|
+
|
276
303
|
# Returns the post contents with all html tags removed
|
277
304
|
def contents_as_plain
|
278
305
|
strip_html contents
|
279
306
|
end
|
280
307
|
|
281
|
-
# Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
|
308
|
+
# Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
|
282
309
|
# 'system_post' we may get tags in here
|
283
310
|
def header_as_plain
|
284
311
|
strip_html header
|
285
312
|
end
|
286
313
|
|
287
|
-
# Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
|
314
|
+
# Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
|
288
315
|
# This returns true or false if that case applies
|
289
316
|
def system_post?
|
290
317
|
[contents,posting_id,post_time,title].all?{|f| f.nil?}
|
291
318
|
end
|
292
319
|
|
320
|
+
# This is mostly used to determine if the post should be checked for
|
321
|
+
# parse errors. Might be useful for someone else though
|
322
|
+
def is_active_post?
|
323
|
+
[flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
|
324
|
+
end
|
325
|
+
|
293
326
|
private
|
294
327
|
|
295
|
-
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
328
|
+
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
296
329
|
# return everything above the user_body
|
297
330
|
def html_head
|
298
331
|
@html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
|
299
332
|
# We return html itself if HTML_HEADER doesn't match, which would be case for a 404 page or something
|
300
333
|
@html_head ||= html
|
301
|
-
|
334
|
+
|
302
335
|
@html_head
|
303
336
|
end
|
304
337
|
|
305
|
-
# Since we started having so many problems with Hpricot flipping out on whack content bodies,
|
306
|
-
# I added this to return everything south of the user_body
|
307
|
-
def html_footer
|
308
|
-
$4 if USERBODY_PARTS.match html_source
|
309
|
-
end
|
310
|
-
|
311
338
|
# OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
|
312
|
-
# This bad html trips up
|
339
|
+
# This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
|
313
340
|
# We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
|
314
341
|
def user_body
|
315
|
-
|
342
|
+
if USERBODY_PARTS.match html_source
|
343
|
+
# This is the pre-12/3/12 style:
|
344
|
+
$2
|
345
|
+
elsif html.at_xpath(XPATH_USERBODY)
|
346
|
+
# There's a bunch of junk in here that we don't want, so this loop removes
|
347
|
+
# everything after (and including) the last script tag, from the result
|
348
|
+
user_body = html.xpath(XPATH_USERBODY)
|
349
|
+
hit_delimeter = false
|
350
|
+
# Since some posts don't actually have the script tag:
|
351
|
+
delimeter = user_body.at_xpath('script') ? :script : :comment
|
352
|
+
user_body.first.children.to_a.reverse.reject{ |p|
|
353
|
+
if hit_delimeter
|
354
|
+
false
|
355
|
+
elsif ( (delimeter == :script and p.name == 'script') or
|
356
|
+
(delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
|
357
|
+
hit_delimeter = true
|
358
|
+
else
|
359
|
+
true
|
360
|
+
end
|
361
|
+
}.reverse.collect(&:to_s).join
|
362
|
+
end
|
316
363
|
end
|
317
|
-
|
318
|
-
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
364
|
+
|
365
|
+
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
319
366
|
# So - we'll return it as a Nokogiri object.
|
320
367
|
def craigslist_body
|
321
368
|
Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
|