olek-libcraigscrape 1.0.3 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,18 +3,13 @@
3
3
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
4
4
  #
5
5
  require 'rubygems'
6
-
7
- gem 'activesupport', '~> 2.3'
8
- gem 'nokogiri', '>= 1.4.4'
9
- gem 'htmlentities', '>= 4.0.0'
10
-
11
-
12
- require 'net/http'
13
- require 'zlib'
14
- require 'nokogiri'
6
+ require 'time'
7
+ require 'uri'
15
8
  require 'htmlentities'
16
- require 'active_support'
17
-
9
+ require 'active_support/core_ext/class/attribute_accessors'
10
+ require 'htmlentities'
11
+ require 'nokogiri'
12
+ require 'typhoeus'
18
13
 
19
14
  # A base class encapsulating the various libcraigscrape objects, and providing most of the
20
15
  # craigslist interaction methods. Currently, we're supporting the old Class methods
data/lib/listings.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # = About listings.rb
2
2
  #
3
3
  # This file contains the parsing code, and logic relating to post-listing pages. You
4
- # should never need to include this file directly, as all of libcraigscrape's objects and methods
4
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
5
5
  # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
6
6
  #
7
7
  require 'scraper'
@@ -13,7 +13,10 @@ class CraigScrape::Listings < CraigScrape::Scraper
13
13
  IMG_TYPE = /^[ ]*(.+)[ ]*$/
14
14
  HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
15
15
  SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
16
- NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
16
+ NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
17
+
18
+ XPATH_POST_DATE = "*[@class='itemdate']"
19
+ XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
17
20
 
18
21
  # Array, PostSummary objects found in the listing
19
22
  def posts
@@ -22,12 +25,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
22
25
  @posts = []
23
26
 
24
27
  # All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
25
- post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
28
+ post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
26
29
 
27
30
  # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
28
31
  post_tags.pop if (
29
- post_tags.length > 0 and
30
- post_tags.last.at('a') and
32
+ post_tags.length > 0 and
33
+ post_tags.last.at('a') and
31
34
  NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
32
35
  )
33
36
 
@@ -39,7 +42,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
39
42
 
40
43
  # Validate that required fields are present:
41
44
  parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
42
-
45
+
43
46
  post_summary[:url] = url_from_href post_summary[:href]
44
47
 
45
48
  @posts << CraigScrape::Posting.new(post_summary)
@@ -50,13 +53,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
50
53
  current_date = CraigScrape.most_recently_expired_time $1, $2
51
54
  elsif html.at('h4:last-of-type') == el
52
55
  # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
53
- # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
56
+ # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
54
57
  # we need to pull up the full post in order to accurately tell the date.
55
58
  # Setting this to nil will achieve the eager-load.
56
59
  current_date = nil
57
60
  end
58
- end
59
- end
61
+ end
62
+ end
60
63
  end
61
64
 
62
65
  @posts
@@ -65,44 +68,52 @@ class CraigScrape::Listings < CraigScrape::Scraper
65
68
  # String, URL Path href-fragment of the next page link
66
69
  def next_page_href
67
70
  unless @next_page_href
68
- cursor = html.at 'p:last-of-type'
69
-
70
- cursor = cursor.at 'a' if cursor
71
-
72
- # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
73
- next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
74
-
75
- # Search listings put their next page in a link towards the top
76
- next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
77
-
78
- # Some search pages have a bug, whereby a 'next page' link isn't displayed,
79
- # even though we can see that there's another page listed in the page-number links block at the top
80
- # and bottom of the listing page
81
- unless next_link
82
- cursor = html % 'div.sh:first-of-type > b:last-of-type'
83
-
84
- # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
85
- # We're looking good.
86
- next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
71
+
72
+ if html.at_xpath(XPATH_PAGENAV_LINKS)
73
+ # Post 12/3
74
+ next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
75
+ @next_page_href = next_link[:href]
76
+ else
77
+ # Old style
78
+ cursor = html.at 'p:last-of-type'
79
+
80
+ cursor = cursor.at 'a' if cursor
81
+
82
+ # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
83
+ next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
84
+
85
+ # Search listings put their next page in a link towards the top
86
+ next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
87
+
88
+ # Some search pages have a bug, whereby a 'next page' link isn't displayed,
89
+ # even though we can see that there's another page listed in the page-number links block at the top
90
+ # and bottom of the listing page
91
+ unless next_link
92
+ cursor = html % 'div.sh:first-of-type > b:last-of-type'
93
+
94
+ # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
95
+ # We're looking good.
96
+ next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
97
+ end
98
+
99
+ # We have an anchor tag - so - let's assign the href:
100
+ @next_page_href = next_link[:href] if next_link
87
101
  end
88
-
89
- # We have an anchor tag - so - let's assign the href:
90
- @next_page_href = next_link[:href] if next_link
91
102
  end
92
-
103
+
93
104
  @next_page_href
94
105
  end
95
-
106
+
96
107
  # String, Full URL Path of the 'next page' link
97
108
  def next_page_url
98
109
  (next_page_href) ? url_from_href(next_page_href) : nil
99
110
  end
100
-
111
+
101
112
  # Returns a Listings object of the next_page_url on the current listings object
102
113
  def next_page
103
114
  CraigScrape::Listings.new next_page_url if next_page_url
104
115
  end
105
-
116
+
106
117
  # Takes a paragraph element and returns a mostly-parsed Posting
107
118
  # We separate this from the rest of the parsing both for readability and ease of testing
108
119
  def self.parse_summary(p_element, date = nil) #:nodoc:
@@ -111,8 +122,8 @@ class CraigScrape::Listings < CraigScrape::Scraper
111
122
  title_anchor = nil
112
123
  section_anchor = nil
113
124
 
114
- # This loop got a little more complicated after Craigslist start inserting weird <spans>'s in
115
- # its list summary postings (See test_new_listing_span051710)
125
+ # This loop got a little more complicated after Craigslist start inserting weird <spans>'s in
126
+ # its list summary postings (See test_new_listing_span051710)
116
127
  p_element.search('a').each do |a_el|
117
128
  # We want the first a-tag that doesn't have spans in it to be the title anchor
118
129
  if title_anchor.nil?
@@ -124,12 +135,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
124
135
  break
125
136
  end
126
137
  end
127
-
138
+
128
139
  location_tag = p_element.at 'font'
129
140
  has_pic_tag = p_element.at 'span'
130
-
141
+
131
142
  href = nil
132
-
143
+
133
144
  location = he_decode p_element.at('font').inner_html if location_tag
134
145
  ret[:location] = $1 if location and LOCATION.match location
135
146
 
@@ -141,20 +152,26 @@ class CraigScrape::Listings < CraigScrape::Scraper
141
152
  ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
142
153
  end
143
154
 
144
- ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
145
-
155
+ ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
156
+
146
157
  ret[:post_date] = date
147
- if SUMMARY_DATE.match he_decode(p_element.children[0])
158
+ if p_element.at_xpath(XPATH_POST_DATE)
159
+ # Post 12/3
160
+ if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
161
+ ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
162
+ end
163
+ elsif SUMMARY_DATE.match he_decode(p_element.children[0])
164
+ # Old style
148
165
  ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
149
166
  end
150
167
 
151
168
  if title_anchor
152
169
  label = he_decode title_anchor.inner_html
153
170
  ret[:label] = $1 if LABEL.match label
154
-
171
+
155
172
  ret[:href] = title_anchor[:href]
156
173
  end
157
-
174
+
158
175
  ret
159
176
  end
160
177
  end
data/lib/posting.rb CHANGED
@@ -1,27 +1,37 @@
1
1
  # = About posting.rb
2
2
  #
3
3
  # This file contains the parsing code, and logic relating to craigslist postings. You
4
- # should never need to include this file directly, as all of libcraigscrape's objects and methods
4
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
5
5
  # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
6
6
  #
7
7
 
8
8
  require 'scraper'
9
9
 
10
10
  # Posting represents a fully downloaded, and parsed, Craigslist post.
11
- # This class is generally returned by the listing scrape methods, and
12
- # contains the post summaries for a specific search url, or a general listing category
11
+ # This class is generally returned by the listing scrape methods, and
12
+ # contains the post summaries for a specific search url, or a general listing category
13
13
  class CraigScrape::Posting < CraigScrape::Scraper
14
-
14
+
15
15
  POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
16
16
  LOCATION = /Location\:[ ]+(.+)/
17
17
  HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
18
- POSTING_ID = /PostingID\:[ ]+([\d]+)/
18
+ POSTING_ID = /PostingID\:[ ]*([\d]+)/
19
19
  REPLY_TO = /(.+)/
20
20
  PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
21
+ # NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
22
+ # (As of 12/03's parse changes)
21
23
  USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
22
24
  HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
23
25
  IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
24
26
 
27
+ # This is used to determine if there's a parse error
28
+ REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
29
+
30
+ XPATH_USERBODY = "//*[@id='userbody']"
31
+ XPATH_BLURBS = "//ul[@class='blurbs']"
32
+ XPATH_PICS = "//*[@class='tn']/a/@href"
33
+ XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
34
+
25
35
  # This is really just for testing, in production use, uri.path is a better solution
26
36
  attr_reader :href #:nodoc:
27
37
 
@@ -30,14 +40,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
30
40
  super(*args)
31
41
 
32
42
  # Validate that required fields are present, at least - if we've downloaded it from a url
33
- parse_error! if (
34
- args.first.kind_of? String and
35
- !flagged_for_removal? and
36
- !posting_has_expired? and
37
- !deleted_by_author? and [
38
- contents,posting_id,post_time,header,title,full_section
39
- ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
40
- )
43
+ if args.first.kind_of? String and is_active_post?
44
+ unparsed_fields = REQUIRED_FIELDS.find_all{|f|
45
+ val = send(f)
46
+ val.nil? or (val.respond_to? :length and val.length == 0)
47
+ }
48
+ parse_error! unparsed_fields unless unparsed_fields.empty?
49
+ end
50
+
41
51
  end
42
52
 
43
53
 
@@ -47,10 +57,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
47
57
  h2 = html_head.at 'h2' if html_head
48
58
  @header = he_decode h2.inner_html if h2
49
59
  end
50
-
60
+
51
61
  @header
52
62
  end
53
-
63
+
54
64
  # String, the item's title
55
65
  def title
56
66
  unless @title
@@ -58,7 +68,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
58
68
  @title = he_decode title_tag.inner_html if title_tag
59
69
  @title = nil if @title and @title.length == 0
60
70
  end
61
-
71
+
62
72
  @title
63
73
  end
64
74
 
@@ -66,8 +76,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
66
76
  def full_section
67
77
  unless @full_section
68
78
  @full_section = []
69
-
70
- (html_head/"div[@class='bchead']//a").each do |a|
79
+
80
+ (html_head / "*[@class='bchead']//a").each do |a|
71
81
  @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
72
82
  end if html_head
73
83
  end
@@ -78,84 +88,103 @@ class CraigScrape::Posting < CraigScrape::Scraper
78
88
  # String, represents the post's reply-to address, if listed
79
89
  def reply_to
80
90
  unless @reply_to
81
- cursor = html_head.at 'hr' if html_head
82
- cursor = cursor.next until cursor.nil? or cursor.name == 'a'
83
- @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
91
+ if html.at_xpath(XPATH_REPLY_TO)
92
+ @reply_to = html.at_xpath(XPATH_REPLY_TO).content
93
+ else
94
+ cursor = html_head.at 'hr' if html_head
95
+ cursor = cursor.next until cursor.nil? or cursor.name == 'a'
96
+ @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
97
+ end
84
98
  end
85
-
99
+
86
100
  @reply_to
87
101
  end
88
-
89
- # Time, reflects the full timestamp of the posting
102
+
103
+ # Time, reflects the full timestamp of the posting
90
104
  def post_time
91
105
  unless @post_time
92
106
  cursor = html_head.at 'hr' if html_head
93
107
  cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
94
108
  @post_time = Time.parse $1 if $1
95
109
  end
96
-
110
+
97
111
  @post_time
98
112
  end
99
113
 
100
114
  # Integer, Craigslist's unique posting id
101
115
  def posting_id
102
- unless @posting_id
103
- cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
104
- cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
105
- @posting_id = $1.to_i if $1
116
+ if @posting_id
117
+
118
+ elsif USERBODY_PARTS.match html_source
119
+ # Old style:
120
+ html_footer = $4
121
+ cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
122
+ cursor = cursor.next until cursor.nil? or
123
+ @posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
124
+ else
125
+ # Post 12/3
126
+ @posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
106
127
  end
107
-
128
+
108
129
  @posting_id
109
130
  end
110
-
131
+
111
132
  # String, The full-html contents of the post
112
133
  def contents
113
134
  unless @contents
114
135
  @contents = user_body if html_source
115
- @contents = he_decode @contents.strip if @contents
136
+ @contents = he_decode(@contents).strip if @contents
116
137
  end
117
-
138
+
118
139
  @contents
119
140
  end
120
-
141
+
121
142
  # String, the location of the item, as best could be parsed
122
143
  def location
123
- if @location.nil? and craigslist_body and html
124
- # Location (when explicitly defined):
125
- cursor = craigslist_body.at 'ul' unless @location
126
-
127
- # Apa section includes other things in the li's (cats/dogs ok fields)
128
- cursor.children.each do |li|
129
- if LOCATION.match li.inner_html
130
- @location = he_decode($1) and break
131
- break
144
+ if @location.nil? and html
145
+
146
+ if html.at_xpath(XPATH_BLURBS)
147
+ # This is the post-12/3/12 style:
148
+ @location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
149
+ LOCATION.match c.content}
150
+ elsif craigslist_body
151
+ # Location (when explicitly defined):
152
+ cursor = craigslist_body.at 'ul' unless @location
153
+
154
+ # This is the legacy style:
155
+ # Note: Apa section includes other things in the li's (cats/dogs ok fields)
156
+ cursor.children.each do |li|
157
+ if LOCATION.match li.inner_html
158
+ @location = he_decode($1) and break
159
+ break
160
+ end
161
+ end if cursor
162
+
163
+ # Real estate listings can work a little different for location:
164
+ unless @location
165
+ cursor = craigslist_body.at 'small'
166
+ cursor = cursor.previous until cursor.nil? or cursor.text?
167
+
168
+ @location = he_decode(cursor.to_s.strip) if cursor
132
169
  end
133
- end if cursor
134
-
135
- # Real estate listings can work a little different for location:
136
- unless @location
137
- cursor = craigslist_body.at 'small'
138
- cursor = cursor.previous until cursor.nil? or cursor.text?
139
-
140
- @location = he_decode(cursor.to_s.strip) if cursor
170
+
171
+ # So, *sometimes* the location just ends up being in the header, I don't know why:
172
+ @location = $1 if @location.nil? and HEADER_LOCATION.match header
141
173
  end
142
-
143
- # So, *sometimes* the location just ends up being in the header, I don't know why:
144
- @location = $1 if @location.nil? and HEADER_LOCATION.match header
145
174
  end
146
-
175
+
147
176
  @location
148
177
  end
149
178
 
150
179
  # Array, urls of the post's images that are *not* hosted on craigslist
151
180
  def images
152
181
  # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
153
- @images = (
154
- contents ?
182
+ @images = (
183
+ contents ?
155
184
  contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
156
- []
185
+ []
157
186
  ) unless @images
158
-
187
+
159
188
  @images
160
189
  end
161
190
 
@@ -163,15 +192,20 @@ class CraigScrape::Posting < CraigScrape::Scraper
163
192
  def pics
164
193
  unless @pics
165
194
  @pics = []
166
-
167
- if html and craigslist_body
168
- # Now let's find the craigslist hosted images:
169
- img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
170
-
171
- @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
195
+
196
+ if html
197
+ if html.at_xpath(XPATH_PICS)
198
+ @pics = html.xpath(XPATH_PICS).collect(&:value)
199
+ elsif craigslist_body
200
+ # This is the pre-12/3/12 style:
201
+ # Now let's find the craigslist hosted images:
202
+ img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
203
+
204
+ @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
205
+ end
172
206
  end
173
207
  end
174
-
208
+
175
209
  @pics
176
210
  end
177
211
 
@@ -180,38 +214,37 @@ class CraigScrape::Posting < CraigScrape::Scraper
180
214
  @flagged_for_removal = (
181
215
  system_post? and header_as_plain == "This posting has been flagged for removal"
182
216
  ) if @flagged_for_removal.nil?
183
-
217
+
184
218
  @flagged_for_removal
185
219
  end
186
-
220
+
187
221
  # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
188
222
  def deleted_by_author?
189
223
  @deleted_by_author = (
190
224
  system_post? and header_as_plain == "This posting has been deleted by its author."
191
225
  ) if @deleted_by_author.nil?
192
-
226
+
193
227
  @deleted_by_author
194
228
  end
195
-
229
+
196
230
  # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
197
231
  def posting_has_expired?
198
232
  @posting_has_expired = (
199
233
  system_post? and header_as_plain == "This posting has expired."
200
234
  ) if @posting_has_expired.nil?
201
-
235
+
202
236
  @posting_has_expired
203
237
  end
204
-
205
-
238
+
206
239
  # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
207
240
  # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
208
241
  def post_date
209
242
  @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
210
-
243
+
211
244
  @post_date
212
245
  end
213
-
214
- # Returns The post label. The label would appear at first glance to be identical to the header - but it's not.
246
+
247
+ # Returns The post label. The label would appear at first glance to be identical to the header - but it's not.
215
248
  # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
216
249
  # Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
217
250
  # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
@@ -219,37 +252,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
219
252
  def label
220
253
  unless @label or system_post?
221
254
  @label = header
222
-
255
+
223
256
  @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
224
257
  end
225
-
258
+
226
259
  @label
227
260
  end
228
261
 
229
262
  # Array, which image types are listed for the post.
230
263
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
231
264
  def img_types
232
- unless @img_types
233
- @img_types = []
234
-
235
- @img_types << :img if images.length > 0
236
- @img_types << :pic if pics.length > 0
237
- end
238
-
239
- @img_types
265
+ @img_types || [ (images.length > 0) ? :img : nil,
266
+ (pics.length > 0) ? :pic : nil ].compact
240
267
  end
241
-
242
- # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
268
+
269
+ # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
243
270
  # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
244
271
  def section
245
272
  unless @section
246
- @section = full_section.last if full_section
273
+ @section = full_section.last if full_section
247
274
  end
248
-
275
+
249
276
  @section
250
277
  end
251
278
 
252
- # true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
279
+ # true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
253
280
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
254
281
  def has_img?
255
282
  img_types.include? :img
@@ -272,50 +299,70 @@ class CraigScrape::Posting < CraigScrape::Scraper
272
299
  def price
273
300
  $1.tr('$','').to_f if label and PRICE.match label
274
301
  end
275
-
302
+
276
303
  # Returns the post contents with all html tags removed
277
304
  def contents_as_plain
278
305
  strip_html contents
279
306
  end
280
307
 
281
- # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
308
+ # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
282
309
  # 'system_post' we may get tags in here
283
310
  def header_as_plain
284
311
  strip_html header
285
312
  end
286
313
 
287
- # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
314
+ # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
288
315
  # This returns true or false if that case applies
289
316
  def system_post?
290
317
  [contents,posting_id,post_time,title].all?{|f| f.nil?}
291
318
  end
292
319
 
320
+ # This is mostly used to determine if the post should be checked for
321
+ # parse errors. Might be useful for someone else though
322
+ def is_active_post?
323
+ [flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
324
+ end
325
+
293
326
  private
294
327
 
295
- # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
328
+ # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
296
329
  # return everything above the user_body
297
330
  def html_head
298
331
  @html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
299
332
  # We return html itself if HTML_HEADER doesn't match, which would be case for a 404 page or something
300
333
  @html_head ||= html
301
-
334
+
302
335
  @html_head
303
336
  end
304
337
 
305
- # Since we started having so many problems with Hpricot flipping out on whack content bodies,
306
- # I added this to return everything south of the user_body
307
- def html_footer
308
- $4 if USERBODY_PARTS.match html_source
309
- end
310
-
311
338
  # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
312
- # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
339
+ # This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
313
340
  # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
314
341
  def user_body
315
- $2 if USERBODY_PARTS.match html_source
342
+ if USERBODY_PARTS.match html_source
343
+ # This is the pre-12/3/12 style:
344
+ $2
345
+ elsif html.at_xpath(XPATH_USERBODY)
346
+ # There's a bunch of junk in here that we don't want, so this loop removes
347
+ # everything after (and including) the last script tag, from the result
348
+ user_body = html.xpath(XPATH_USERBODY)
349
+ hit_delimeter = false
350
+ # Since some posts don't actually have the script tag:
351
+ delimeter = user_body.at_xpath('script') ? :script : :comment
352
+ user_body.first.children.to_a.reverse.reject{ |p|
353
+ if hit_delimeter
354
+ false
355
+ elsif ( (delimeter == :script and p.name == 'script') or
356
+ (delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
357
+ hit_delimeter = true
358
+ else
359
+ true
360
+ end
361
+ }.reverse.collect(&:to_s).join
362
+ end
316
363
  end
317
-
318
- # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
364
+
365
+ # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
319
366
  # So - we'll return it as a Nokogiri object.
320
367
  def craigslist_body
321
368
  Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source