olek-libcraigscrape 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,18 +3,13 @@
3
3
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
4
4
  #
5
5
  require 'rubygems'
6
-
7
- gem 'activesupport', '~> 2.3'
8
- gem 'nokogiri', '>= 1.4.4'
9
- gem 'htmlentities', '>= 4.0.0'
10
-
11
-
12
- require 'net/http'
13
- require 'zlib'
14
- require 'nokogiri'
6
+ require 'time'
7
+ require 'uri'
15
8
  require 'htmlentities'
16
- require 'active_support'
17
-
9
+ require 'active_support/core_ext/class/attribute_accessors'
10
+ require 'htmlentities'
11
+ require 'nokogiri'
12
+ require 'typhoeus'
18
13
 
19
14
  # A base class encapsulating the various libcraigscrape objects, and providing most of the
20
15
  # craigslist interaction methods. Currently, we're supporting the old Class methods
data/lib/listings.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # = About listings.rb
2
2
  #
3
3
  # This file contains the parsing code, and logic relating to post-listing pages. You
4
- # should never need to include this file directly, as all of libcraigscrape's objects and methods
4
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
5
5
  # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
6
6
  #
7
7
  require 'scraper'
@@ -13,7 +13,10 @@ class CraigScrape::Listings < CraigScrape::Scraper
13
13
  IMG_TYPE = /^[ ]*(.+)[ ]*$/
14
14
  HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
15
15
  SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
16
- NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
16
+ NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
17
+
18
+ XPATH_POST_DATE = "*[@class='itemdate']"
19
+ XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
17
20
 
18
21
  # Array, PostSummary objects found in the listing
19
22
  def posts
@@ -22,12 +25,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
22
25
  @posts = []
23
26
 
24
27
  # All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
25
- post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
28
+ post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
26
29
 
27
30
  # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
28
31
  post_tags.pop if (
29
- post_tags.length > 0 and
30
- post_tags.last.at('a') and
32
+ post_tags.length > 0 and
33
+ post_tags.last.at('a') and
31
34
  NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
32
35
  )
33
36
 
@@ -39,7 +42,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
39
42
 
40
43
  # Validate that required fields are present:
41
44
  parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
42
-
45
+
43
46
  post_summary[:url] = url_from_href post_summary[:href]
44
47
 
45
48
  @posts << CraigScrape::Posting.new(post_summary)
@@ -50,13 +53,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
50
53
  current_date = CraigScrape.most_recently_expired_time $1, $2
51
54
  elsif html.at('h4:last-of-type') == el
52
55
  # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
53
- # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
56
+ # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
54
57
  # we need to pull up the full post in order to accurately tell the date.
55
58
  # Setting this to nil will achieve the eager-load.
56
59
  current_date = nil
57
60
  end
58
- end
59
- end
61
+ end
62
+ end
60
63
  end
61
64
 
62
65
  @posts
@@ -65,44 +68,52 @@ class CraigScrape::Listings < CraigScrape::Scraper
65
68
  # String, URL Path href-fragment of the next page link
66
69
  def next_page_href
67
70
  unless @next_page_href
68
- cursor = html.at 'p:last-of-type'
69
-
70
- cursor = cursor.at 'a' if cursor
71
-
72
- # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
73
- next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
74
-
75
- # Search listings put their next page in a link towards the top
76
- next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
77
-
78
- # Some search pages have a bug, whereby a 'next page' link isn't displayed,
79
- # even though we can see that theres another page listed in the page-number links block at the top
80
- # and bottom of the listing page
81
- unless next_link
82
- cursor = html % 'div.sh:first-of-type > b:last-of-type'
83
-
84
- # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
85
- # We're looking good.
86
- next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
71
+
72
+ if html.at_xpath(XPATH_PAGENAV_LINKS)
73
+ # Post 12/3
74
+ next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
75
+ @next_page_href = next_link[:href]
76
+ else
77
+ # Old style
78
+ cursor = html.at 'p:last-of-type'
79
+
80
+ cursor = cursor.at 'a' if cursor
81
+
82
+ # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
83
+ next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
84
+
85
+ # Search listings put their next page in a link towards the top
86
+ next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
87
+
88
+ # Some search pages have a bug, whereby a 'next page' link isn't displayed,
89
+ # even though we can see that theres another page listed in the page-number links block at the top
90
+ # and bottom of the listing page
91
+ unless next_link
92
+ cursor = html % 'div.sh:first-of-type > b:last-of-type'
93
+
94
+ # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
95
+ # We're looking good.
96
+ next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
97
+ end
98
+
99
+ # We have an anchor tag - so - let's assign the href:
100
+ @next_page_href = next_link[:href] if next_link
87
101
  end
88
-
89
- # We have an anchor tag - so - let's assign the href:
90
- @next_page_href = next_link[:href] if next_link
91
102
  end
92
-
103
+
93
104
  @next_page_href
94
105
  end
95
-
106
+
96
107
  # String, Full URL Path of the 'next page' link
97
108
  def next_page_url
98
109
  (next_page_href) ? url_from_href(next_page_href) : nil
99
110
  end
100
-
111
+
101
112
  # Returns a Listings object of the next_page_url on the current listings object
102
113
  def next_page
103
114
  CraigScrape::Listings.new next_page_url if next_page_url
104
115
  end
105
-
116
+
106
117
  # Takes a paragraph element and returns a mostly-parsed Posting
107
118
  # We separate this from the rest of the parsing both for readability and ease of testing
108
119
  def self.parse_summary(p_element, date = nil) #:nodoc:
@@ -111,8 +122,8 @@ class CraigScrape::Listings < CraigScrape::Scraper
111
122
  title_anchor = nil
112
123
  section_anchor = nil
113
124
 
114
- # This loop got a little more complicated after Craigslist start inserting weird <spans>'s in
115
- # its list summary postings (See test_new_listing_span051710)
125
+ # This loop got a little more complicated after Craigslist start inserting weird <spans>'s in
126
+ # its list summary postings (See test_new_listing_span051710)
116
127
  p_element.search('a').each do |a_el|
117
128
  # We want the first a-tag that doesn't have spans in it to be the title anchor
118
129
  if title_anchor.nil?
@@ -124,12 +135,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
124
135
  break
125
136
  end
126
137
  end
127
-
138
+
128
139
  location_tag = p_element.at 'font'
129
140
  has_pic_tag = p_element.at 'span'
130
-
141
+
131
142
  href = nil
132
-
143
+
133
144
  location = he_decode p_element.at('font').inner_html if location_tag
134
145
  ret[:location] = $1 if location and LOCATION.match location
135
146
 
@@ -141,20 +152,26 @@ class CraigScrape::Listings < CraigScrape::Scraper
141
152
  ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
142
153
  end
143
154
 
144
- ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
145
-
155
+ ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
156
+
146
157
  ret[:post_date] = date
147
- if SUMMARY_DATE.match he_decode(p_element.children[0])
158
+ if p_element.at_xpath(XPATH_POST_DATE)
159
+ # Post 12/3
160
+ if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
161
+ ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
162
+ end
163
+ elsif SUMMARY_DATE.match he_decode(p_element.children[0])
164
+ # Old style
148
165
  ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
149
166
  end
150
167
 
151
168
  if title_anchor
152
169
  label = he_decode title_anchor.inner_html
153
170
  ret[:label] = $1 if LABEL.match label
154
-
171
+
155
172
  ret[:href] = title_anchor[:href]
156
173
  end
157
-
174
+
158
175
  ret
159
176
  end
160
177
  end
data/lib/posting.rb CHANGED
@@ -1,27 +1,37 @@
1
1
  # = About posting.rb
2
2
  #
3
3
  # This file contains the parsing code, and logic relating to craigslist postings. You
4
- # should never need to include this file directly, as all of libcraigscrape's objects and methods
4
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
5
5
  # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
6
6
  #
7
7
 
8
8
  require 'scraper'
9
9
 
10
10
  # Posting represents a fully downloaded, and parsed, Craigslist post.
11
- # This class is generally returned by the listing scrape methods, and
12
- # contains the post summaries for a specific search url, or a general listing category
11
+ # This class is generally returned by the listing scrape methods, and
12
+ # contains the post summaries for a specific search url, or a general listing category
13
13
  class CraigScrape::Posting < CraigScrape::Scraper
14
-
14
+
15
15
  POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
16
16
  LOCATION = /Location\:[ ]+(.+)/
17
17
  HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
18
- POSTING_ID = /PostingID\:[ ]+([\d]+)/
18
+ POSTING_ID = /PostingID\:[ ]*([\d]+)/
19
19
  REPLY_TO = /(.+)/
20
20
  PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
21
+ # NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
22
+ # (As of 12/03's parse changes)
21
23
  USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
22
24
  HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
23
25
  IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
24
26
 
27
+ # This is used to determine if there's a parse error
28
+ REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
29
+
30
+ XPATH_USERBODY = "//*[@id='userbody']"
31
+ XPATH_BLURBS = "//ul[@class='blurbs']"
32
+ XPATH_PICS = "//*[@class='tn']/a/@href"
33
+ XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
34
+
25
35
  # This is really just for testing, in production use, uri.path is a better solution
26
36
  attr_reader :href #:nodoc:
27
37
 
@@ -30,14 +40,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
30
40
  super(*args)
31
41
 
32
42
  # Validate that required fields are present, at least - if we've downloaded it from a url
33
- parse_error! if (
34
- args.first.kind_of? String and
35
- !flagged_for_removal? and
36
- !posting_has_expired? and
37
- !deleted_by_author? and [
38
- contents,posting_id,post_time,header,title,full_section
39
- ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
40
- )
43
+ if args.first.kind_of? String and is_active_post?
44
+ unparsed_fields = REQUIRED_FIELDS.find_all{|f|
45
+ val = send(f)
46
+ val.nil? or (val.respond_to? :length and val.length == 0)
47
+ }
48
+ parse_error! unparsed_fields unless unparsed_fields.empty?
49
+ end
50
+
41
51
  end
42
52
 
43
53
 
@@ -47,10 +57,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
47
57
  h2 = html_head.at 'h2' if html_head
48
58
  @header = he_decode h2.inner_html if h2
49
59
  end
50
-
60
+
51
61
  @header
52
62
  end
53
-
63
+
54
64
  # String, the item's title
55
65
  def title
56
66
  unless @title
@@ -58,7 +68,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
58
68
  @title = he_decode title_tag.inner_html if title_tag
59
69
  @title = nil if @title and @title.length == 0
60
70
  end
61
-
71
+
62
72
  @title
63
73
  end
64
74
 
@@ -66,8 +76,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
66
76
  def full_section
67
77
  unless @full_section
68
78
  @full_section = []
69
-
70
- (html_head/"div[@class='bchead']//a").each do |a|
79
+
80
+ (html_head / "*[@class='bchead']//a").each do |a|
71
81
  @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
72
82
  end if html_head
73
83
  end
@@ -78,84 +88,103 @@ class CraigScrape::Posting < CraigScrape::Scraper
78
88
  # String, represents the post's reply-to address, if listed
79
89
  def reply_to
80
90
  unless @reply_to
81
- cursor = html_head.at 'hr' if html_head
82
- cursor = cursor.next until cursor.nil? or cursor.name == 'a'
83
- @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
91
+ if html.at_xpath(XPATH_REPLY_TO)
92
+ @reply_to = html.at_xpath(XPATH_REPLY_TO).content
93
+ else
94
+ cursor = html_head.at 'hr' if html_head
95
+ cursor = cursor.next until cursor.nil? or cursor.name == 'a'
96
+ @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
97
+ end
84
98
  end
85
-
99
+
86
100
  @reply_to
87
101
  end
88
-
89
- # Time, reflects the full timestamp of the posting
102
+
103
+ # Time, reflects the full timestamp of the posting
90
104
  def post_time
91
105
  unless @post_time
92
106
  cursor = html_head.at 'hr' if html_head
93
107
  cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
94
108
  @post_time = Time.parse $1 if $1
95
109
  end
96
-
110
+
97
111
  @post_time
98
112
  end
99
113
 
100
114
  # Integer, Craigslist's unique posting id
101
115
  def posting_id
102
- unless @posting_id
103
- cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
104
- cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
105
- @posting_id = $1.to_i if $1
116
+ if @posting_id
117
+
118
+ elsif USERBODY_PARTS.match html_source
119
+ # Old style:
120
+ html_footer = $4
121
+ cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
122
+ cursor = cursor.next until cursor.nil? or
123
+ @posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
124
+ else
125
+ # Post 12/3
126
+ @posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
106
127
  end
107
-
128
+
108
129
  @posting_id
109
130
  end
110
-
131
+
111
132
  # String, The full-html contents of the post
112
133
  def contents
113
134
  unless @contents
114
135
  @contents = user_body if html_source
115
- @contents = he_decode @contents.strip if @contents
136
+ @contents = he_decode(@contents).strip if @contents
116
137
  end
117
-
138
+
118
139
  @contents
119
140
  end
120
-
141
+
121
142
  # String, the location of the item, as best could be parsed
122
143
  def location
123
- if @location.nil? and craigslist_body and html
124
- # Location (when explicitly defined):
125
- cursor = craigslist_body.at 'ul' unless @location
126
-
127
- # Apa section includes other things in the li's (cats/dogs ok fields)
128
- cursor.children.each do |li|
129
- if LOCATION.match li.inner_html
130
- @location = he_decode($1) and break
131
- break
144
+ if @location.nil? and html
145
+
146
+ if html.at_xpath(XPATH_BLURBS)
147
+ # This is the post-12/3/12 style:
148
+ @location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
149
+ LOCATION.match c.content}
150
+ elsif craigslist_body
151
+ # Location (when explicitly defined):
152
+ cursor = craigslist_body.at 'ul' unless @location
153
+
154
+ # This is the legacy style:
155
+ # Note: Apa section includes other things in the li's (cats/dogs ok fields)
156
+ cursor.children.each do |li|
157
+ if LOCATION.match li.inner_html
158
+ @location = he_decode($1) and break
159
+ break
160
+ end
161
+ end if cursor
162
+
163
+ # Real estate listings can work a little different for location:
164
+ unless @location
165
+ cursor = craigslist_body.at 'small'
166
+ cursor = cursor.previous until cursor.nil? or cursor.text?
167
+
168
+ @location = he_decode(cursor.to_s.strip) if cursor
132
169
  end
133
- end if cursor
134
-
135
- # Real estate listings can work a little different for location:
136
- unless @location
137
- cursor = craigslist_body.at 'small'
138
- cursor = cursor.previous until cursor.nil? or cursor.text?
139
-
140
- @location = he_decode(cursor.to_s.strip) if cursor
170
+
171
+ # So, *sometimes* the location just ends up being in the header, I don't know why:
172
+ @location = $1 if @location.nil? and HEADER_LOCATION.match header
141
173
  end
142
-
143
- # So, *sometimes* the location just ends up being in the header, I don't know why:
144
- @location = $1 if @location.nil? and HEADER_LOCATION.match header
145
174
  end
146
-
175
+
147
176
  @location
148
177
  end
149
178
 
150
179
  # Array, urls of the post's images that are *not* hosted on craigslist
151
180
  def images
152
181
  # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
153
- @images = (
154
- contents ?
182
+ @images = (
183
+ contents ?
155
184
  contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
156
- []
185
+ []
157
186
  ) unless @images
158
-
187
+
159
188
  @images
160
189
  end
161
190
 
@@ -163,15 +192,20 @@ class CraigScrape::Posting < CraigScrape::Scraper
163
192
  def pics
164
193
  unless @pics
165
194
  @pics = []
166
-
167
- if html and craigslist_body
168
- # Now let's find the craigslist hosted images:
169
- img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
170
-
171
- @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
195
+
196
+ if html
197
+ if html.at_xpath(XPATH_PICS)
198
+ @pics = html.xpath(XPATH_PICS).collect(&:value)
199
+ elsif craigslist_body
200
+ # This is the pre-12/3/12 style:
201
+ # Now let's find the craigslist hosted images:
202
+ img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
203
+
204
+ @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
205
+ end
172
206
  end
173
207
  end
174
-
208
+
175
209
  @pics
176
210
  end
177
211
 
@@ -180,38 +214,37 @@ class CraigScrape::Posting < CraigScrape::Scraper
180
214
  @flagged_for_removal = (
181
215
  system_post? and header_as_plain == "This posting has been flagged for removal"
182
216
  ) if @flagged_for_removal.nil?
183
-
217
+
184
218
  @flagged_for_removal
185
219
  end
186
-
220
+
187
221
  # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
188
222
  def deleted_by_author?
189
223
  @deleted_by_author = (
190
224
  system_post? and header_as_plain == "This posting has been deleted by its author."
191
225
  ) if @deleted_by_author.nil?
192
-
226
+
193
227
  @deleted_by_author
194
228
  end
195
-
229
+
196
230
  # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
197
231
  def posting_has_expired?
198
232
  @posting_has_expired = (
199
233
  system_post? and header_as_plain == "This posting has expired."
200
234
  ) if @posting_has_expired.nil?
201
-
235
+
202
236
  @posting_has_expired
203
237
  end
204
-
205
-
238
+
206
239
  # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
207
240
  # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
208
241
  def post_date
209
242
  @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
210
-
243
+
211
244
  @post_date
212
245
  end
213
-
214
- # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
246
+
247
+ # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
215
248
  # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
216
249
  # Sometimes there's additional information (i.e. '(map)' on rea listings) included in the header that isn't to be listed in the label
217
250
  # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
@@ -219,37 +252,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
219
252
  def label
220
253
  unless @label or system_post?
221
254
  @label = header
222
-
255
+
223
256
  @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
224
257
  end
225
-
258
+
226
259
  @label
227
260
  end
228
261
 
229
262
  # Array, which image types are listed for the post.
230
263
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
231
264
  def img_types
232
- unless @img_types
233
- @img_types = []
234
-
235
- @img_types << :img if images.length > 0
236
- @img_types << :pic if pics.length > 0
237
- end
238
-
239
- @img_types
265
+ @img_types || [ (images.length > 0) ? :img : nil,
266
+ (pics.length > 0) ? :pic : nil ].compact
240
267
  end
241
-
242
- # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
268
+
269
+ # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
243
270
  # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
244
271
  def section
245
272
  unless @section
246
- @section = full_section.last if full_section
273
+ @section = full_section.last if full_section
247
274
  end
248
-
275
+
249
276
  @section
250
277
  end
251
278
 
252
- # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
279
+ # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
253
280
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
254
281
  def has_img?
255
282
  img_types.include? :img
@@ -272,50 +299,70 @@ class CraigScrape::Posting < CraigScrape::Scraper
272
299
  def price
273
300
  $1.tr('$','').to_f if label and PRICE.match label
274
301
  end
275
-
302
+
276
303
  # Returns the post contents with all html tags removed
277
304
  def contents_as_plain
278
305
  strip_html contents
279
306
  end
280
307
 
281
- # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
308
+ # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
282
309
  # 'system_post' we may get tags in here
283
310
  def header_as_plain
284
311
  strip_html header
285
312
  end
286
313
 
287
- # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
314
+ # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
288
315
  # This returns true or false if that case applies
289
316
  def system_post?
290
317
  [contents,posting_id,post_time,title].all?{|f| f.nil?}
291
318
  end
292
319
 
320
+ # This is mostly used to determine if the post should be checked for
321
+ # parse errors. Might be useful for someone else though
322
+ def is_active_post?
323
+ [flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
324
+ end
325
+
293
326
  private
294
327
 
295
- # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
328
+ # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
296
329
  # return everything above the user_body
297
330
  def html_head
298
331
  @html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
299
332
  # We return html itself if HTML_HEADER doesn't match, which would be the case for a 404 page or something
300
333
  @html_head ||= html
301
-
334
+
302
335
  @html_head
303
336
  end
304
337
 
305
- # Since we started having so many problems with Hpricot flipping out on whack content bodies,
306
- # I added this to return everything south of the user_body
307
- def html_footer
308
- $4 if USERBODY_PARTS.match html_source
309
- end
310
-
311
338
  # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
312
- # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
339
+ # This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
313
340
  # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
314
341
  def user_body
315
- $2 if USERBODY_PARTS.match html_source
342
+ if USERBODY_PARTS.match html_source
343
+ # This is the pre-12/3/12 style:
344
+ $2
345
+ elsif html.at_xpath(XPATH_USERBODY)
346
+ # There's a bunch of junk in here that we don't want, so this loop removes
347
+ # everything after (and including) the last script tag, from the result
348
+ user_body = html.xpath(XPATH_USERBODY)
349
+ hit_delimeter = false
350
+ # Since some posts don't actually have the script tag:
351
+ delimeter = user_body.at_xpath('script') ? :script : :comment
352
+ user_body.first.children.to_a.reverse.reject{ |p|
353
+ if hit_delimeter
354
+ false
355
+ elsif ( (delimeter == :script and p.name == 'script') or
356
+ (delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
357
+ hit_delimeter = true
358
+ else
359
+ true
360
+ end
361
+ }.reverse.collect(&:to_s).join
362
+ end
316
363
  end
317
-
318
- # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
364
+
365
+ # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
319
366
  # So - we'll return it as a Nokogiri object.
320
367
  def craigslist_body
321
368
  Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source