libcraigscrape 1.0 → 1.1.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
data/lib/posting.rb CHANGED
@@ -14,14 +14,25 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
  POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
  LOCATION = /Location\:[ ]+(.+)/
- HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
- POSTING_ID = /PostingID\:[ ]+([\d]+)/
+ HEADER_LOCATION = /\((.+)\)$/
+ POSTING_ID = /PostingID\:[ ]*([\d]+)/
  REPLY_TO = /(.+)/
  PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+
+ # NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
+ # (As of 12/03's parse changes)
  USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
  HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
  IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
 
+ # This is used to determine if there's a parse error
+ REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
+
+ XPATH_USERBODY = "//*[@id='userbody']"
+ XPATH_BLURBS = "//ul[@class='blurbs']"
+ XPATH_PICS = "//*[@class='tn']/a/@href"
+ XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
+
  # This is really just for testing, in production use, uri.path is a better solution
  attr_reader :href #:nodoc:
 
@@ -30,14 +41,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
  super(*args)
 
  # Validate that required fields are present, at least - if we've downloaded it from a url
- parse_error! if (
- args.first.kind_of? String and
- !flagged_for_removal? and
- !posting_has_expired? and
- !deleted_by_author? and [
- contents,posting_id,post_time,header,title,full_section
- ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
- )
+ if args.first.kind_of? String and is_active_post?
+ unparsed_fields = REQUIRED_FIELDS.find_all{|f|
+ val = send(f)
+ val.nil? or (val.respond_to? :length and val.length == 0)
+ }
+ parse_error! unparsed_fields unless unparsed_fields.empty?
+ end
+
  end
 
 
@@ -67,7 +78,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
  unless @full_section
  @full_section = []
 
- (html_head/"div[@class='bchead']//a").each do |a|
+ (html_head / "*[@class='bchead']//a").each do |a|
  @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
  end if html_head
  end
@@ -78,9 +89,13 @@ class CraigScrape::Posting < CraigScrape::Scraper
  # String, represents the post's reply-to address, if listed
  def reply_to
  unless @reply_to
- cursor = html_head.at 'hr' if html_head
- cursor = cursor.next until cursor.nil? or cursor.name == 'a'
- @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+ if html.at_xpath(XPATH_REPLY_TO)
+ @reply_to = html.at_xpath(XPATH_REPLY_TO).content
+ else
+ cursor = html_head.at 'hr' if html_head
+ cursor = cursor.next until cursor.nil? or cursor.name == 'a'
+ @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+ end
  end
 
  @reply_to
@@ -91,7 +106,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
  unless @post_time
  cursor = html_head.at 'hr' if html_head
  cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
- @post_time = Time.parse $1 if $1
+ @post_time = DateTime.parse($1) if $1
  end
 
  @post_time
@@ -99,10 +114,17 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
  # Integer, Craigslist's unique posting id
  def posting_id
- unless @posting_id
- cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
- cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
- @posting_id = $1.to_i if $1
+ if @posting_id
+
+ elsif USERBODY_PARTS.match html_source
+ # Old style:
+ html_footer = $4
+ cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
+ cursor = cursor.next until cursor.nil? or
+ @posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
+ else
+ # Post 12/3
+ @posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
  end
 
  @posting_id
@@ -112,7 +134,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
  def contents
  unless @contents
  @contents = user_body if html_source
- @contents = he_decode @contents.strip if @contents
+ @contents = he_decode(@contents).strip if @contents
  end
 
  @contents
@@ -120,27 +142,40 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
  # String, the location of the item, as best could be parsed
  def location
- if @location.nil? and craigslist_body and html
- # Location (when explicitly defined):
- cursor = craigslist_body.at 'ul' unless @location
-
- # Apa section includes other things in the li's (cats/dogs ok fields)
- cursor.children.each do |li|
- if LOCATION.match li.inner_html
- @location = he_decode($1) and break
- break
- end
- end if cursor
+ if @location.nil? and html
+
+ if html.at_xpath(XPATH_BLURBS)
+ # This is the post-12/3/12 style:
+
+ # Sometimes the Location is in the body :
+ @location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
+ LOCATION.match c.content}
 
- # Real estate listings can work a little different for location:
- unless @location
- cursor = craigslist_body.at 'small'
- cursor = cursor.previous until cursor.nil? or cursor.text?
+ elsif craigslist_body
+ # Location (when explicitly defined):
+ cursor = craigslist_body.at 'ul' unless @location
+
+ # This is the legacy style:
+ # Note: Apa section includes other things in the li's (cats/dogs ok fields)
+ cursor.children.each do |li|
+ if LOCATION.match li.inner_html
+ @location = he_decode($1) and break
+ break
+ end
+ end if cursor
+
+ # Real estate listings can work a little different for location:
+ unless @location
+ cursor = craigslist_body.at 'small'
+ cursor = cursor.previous until cursor.nil? or cursor.text?
+
+ @location = he_decode(cursor.to_s.strip) if cursor
+ end
 
- @location = he_decode(cursor.to_s.strip) if cursor
  end
 
- # So, *sometimes* the location just ends up being in the header, I don't know why:
+ # So, *sometimes* the location just ends up being in the header, I don't know why.
+ # This happens on old-style and new-style posts:
  @location = $1 if @location.nil? and HEADER_LOCATION.match header
  end
 
@@ -164,11 +199,16 @@ class CraigScrape::Posting < CraigScrape::Scraper
  unless @pics
  @pics = []
 
- if html and craigslist_body
- # Now let's find the craigslist hosted images:
- img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
- @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+ if html
+ if html.at_xpath(XPATH_PICS)
+ @pics = html.xpath(XPATH_PICS).collect(&:value)
+ elsif craigslist_body
+ # This is the pre-12/3/12 style:
+ # Now let's find the craigslist hosted images:
+ img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+ @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+ end
  end
  end
 
@@ -202,11 +242,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
  @posting_has_expired
  end
 
-
  # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
  # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
  def post_date
- @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
+ @post_date = post_time.to_date unless @post_date or post_time.nil?
 
  @post_date
  end
@@ -229,14 +268,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
  # Array, which image types are listed for the post.
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
  def img_types
- unless @img_types
- @img_types = []
-
- @img_types << :img if images.length > 0
- @img_types << :pic if pics.length > 0
- end
-
- @img_types
+ @img_types || [ (images.length > 0) ? :img : nil,
+ (pics.length > 0) ? :pic : nil ].compact
  end
 
  # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
@@ -270,7 +303,11 @@ class CraigScrape::Posting < CraigScrape::Scraper
  # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
  # and can be safely used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
  def price
- $1.tr('$','').to_f if label and PRICE.match label
+ unless @price
+ (header and PRICE.match label) ?
+ @price = Money.new($1.tr('$','').to_i*100, 'USD') : nil
+ end
+ @price
  end
 
  # Returns the post contents with all html tags removed
@@ -290,6 +327,12 @@ class CraigScrape::Posting < CraigScrape::Scraper
  [contents,posting_id,post_time,title].all?{|f| f.nil?}
  end
 
+ # This is mostly used to determine if the post should be checked for
+ # parse errors. Might be useful for someone else though
+ def is_active_post?
+ [flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
+ end
+
  private
 
  # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
@@ -302,17 +345,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
  @html_head
  end
 
- # Since we started having so many problems with Hpricot flipping out on whack content bodies,
- # I added this to return everything south of the user_body
- def html_footer
- $4 if USERBODY_PARTS.match html_source
- end
-
  # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
- # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+ # This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
  # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
- def user_body
- $2 if USERBODY_PARTS.match html_source
+ def user_body
+ if USERBODY_PARTS.match html_source
+ # This is the pre-12/3/12 style:
+ $2
+ elsif html.at_xpath(XPATH_USERBODY)
+ # There's a bunch of junk in here that we don't want, so this loop removes
+ # everything after (and including) the last script tag, from the result
+ user_body = html.xpath(XPATH_USERBODY)
+ hit_delimeter = false
+ # Since some posts don't actually have the script tag:
+ delimeter = user_body.at_xpath('script') ? :script : :comment
+ user_body.first.children.to_a.reverse.reject{ |p|
+ if hit_delimeter
+ false
+ elsif ( (delimeter == :script and p.name == 'script') or
+ (delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
+ hit_delimeter = true
+ else
+ true
+ end
+ }.reverse.collect(&:to_s).join
+ end
  end
 
  # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
@@ -321,4 +378,4 @@ class CraigScrape::Posting < CraigScrape::Scraper
  Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
  end
 
- end
+ end
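
Note on the posting.rb changes: each accessor now tries the post-12/3/12 Craigslist markup first (via the new XPATH_* constants) and falls back to the legacy regex/string scraping, and the constructor now reports which required fields failed to parse instead of raising opaquely. Below is a minimal sketch of that validation pattern in isolation; the Demo class and its stubbed accessors are hypothetical, and only REQUIRED_FIELDS and the nil-or-empty test mirror the diff above:

    require 'date'

    class Demo
      # Mirrors Posting::REQUIRED_FIELDS from the diff above
      REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)

      def initialize(attrs)
        @attrs = attrs
      end

      # Hypothetical stand-ins for Posting's parsed accessors:
      REQUIRED_FIELDS.each { |f| define_method(f) { @attrs[f] } }

      # Collect every blank required field, then fail loudly naming all of them
      def validate!
        unparsed = REQUIRED_FIELDS.find_all { |f|
          val = send(f)
          val.nil? or (val.respond_to?(:length) and val.length == 0)
        }
        raise "Required fields missing: #{unparsed.join(', ')}" unless unparsed.empty?
      end
    end

    Demo.new(
      'contents' => 'text', 'posting_id' => 123, 'post_time' => DateTime.now,
      'header' => 'bike - $50 (SOMA)', 'title' => 'bike', 'full_section' => ['sfbay']
    ).validate!                                   # passes
    Demo.new('contents' => '').validate! rescue puts $!.message
    # => Required fields missing: contents, posting_id, post_time, header, title, full_section

Reporting all missing fields at once, rather than failing on the first, makes it much easier to see which of the two page styles a broken post came from.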
data/lib/scraper.rb CHANGED
@@ -15,39 +15,27 @@
  #
  # <b>logger</b> - a Logger object to debug http notices too. Defaults to nil
  #
- # <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
- #
- # <b>sleep_between_fetch_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 30.
- #
- # <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http Response code 404). Defaults to 3.
- #
- # <b>sleep_between_404_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a Resource Not Found error. Defaults to 3.
- #
+
  class CraigScrape::Scraper
  cattr_accessor :logger
- cattr_accessor :sleep_between_fetch_retries
- cattr_accessor :retries_on_fetch_fail
  cattr_accessor :retries_on_404_fail
  cattr_accessor :sleep_between_404_retries
- cattr_accessor :maximum_redirects_per_request
+
+ self.retries_on_404_fail = 3
+ self.sleep_between_404_retries = 3
 
  URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
  HTML_TAG = /<\/?[^>]*>/
  # We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
  HTML_ENCODING = "UTF-8"
+
+ HTTP_HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache",
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"}
 
  # Returns the full url that corresponds to this resource
  attr_reader :url
 
- # Set some defaults:
- self.retries_on_fetch_fail = 8
- self.sleep_between_fetch_retries = 30
-
- self.retries_on_404_fail = 3
- self.sleep_between_404_retries = 3
-
- self.maximum_redirects_per_request = 20
-
  class BadConstructionError < StandardError #:nodoc:
  end
 
@@ -57,9 +45,6 @@ class CraigScrape::Scraper
  class BadUrlError < StandardError #:nodoc:
  end
 
- class MaxRedirectError < StandardError #:nodoc:
- end
-
  class FetchError < StandardError #:nodoc:
  end
 
@@ -100,21 +85,37 @@ class CraigScrape::Scraper
  @uri
  end
 
+ # This method is mostly useful for our specs, but it's included in case anyone
+ # else wants it. It returns all currently-defined instance variables, and is
+ # mostly useful for the specs. Probably this doesn't do what you think, and
+ # should only be used to determine what's been parsed by the object thus-far.
+ # (And does not include parseable attributes which have yet to be determined
+ def attributes
+ Hash[self.instance_variables.collect{|i|
+ [i.to_s.tr('@','').to_sym, instance_variable_get(i) ] }]
+ end
+
  private
 
  # Returns text with all html tags removed.
  def strip_html(str)
- str.gsub HTML_TAG, "" if str
+ he_decode(str).gsub HTML_TAG, "" if str
  end
 
  # Easy way to fail noisily:
- def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+ def parse_error!(fields = nil)
+ raise ParseError, "Error while parsing %s:\n %s%s" % [
+ self.class.to_s, html,
+ (fields) ? ("\nRequired fields missing: %s" % fields.join(', ')) : '']
+ end
 
  # Returns text with all html entities converted to respective ascii character.
  def he_decode(text); self.class.he_decode text; end
 
  # Returns text with all html entities converted to respective ascii character.
- def self.he_decode(text); HTMLEntities.new.decode text; end
+ def self.he_decode(text)
+ HTMLEntities.new.decode text
+ end
 
  # Derives a full url, using the current object's url and the provided href
  def url_from_href(href) #:nodoc:
@@ -133,42 +134,34 @@ class CraigScrape::Scraper
  '%s://%s%s' % [scheme, host, path]
  end
 
- def fetch_uri(uri, redirect_count = 0)
- logger.info "Requesting (%d): %s" % [redirect_count, @url.inspect] if logger
+ def fetch_uri(uri)
+ logger.info "Requesting: %s" % [@url.inspect] if logger
 
- raise MaxRedirectError, "Max redirects (#{redirect_count}) reached for URL: #{@url}" if redirect_count > self.maximum_redirects_per_request-1
-
- case uri.scheme
+ (case uri.scheme
  when 'file'
  # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
- File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
+ File.read( File.directory?(uri.path) ?
+ "#{uri.path}/index.html" : uri.path , :encoding => 'BINARY')
  when /^http[s]?/
- fetch_http uri, redirect_count
+ fetch_http uri
  else
  raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
- end
+ end).force_encoding("ISO-8859-1").encode("UTF-8")
  end
-
- def fetch_http(uri, redirect_count = 0)
+
+ def fetch_http(uri)
  fetch_attempts = 0
  resource_not_found_attempts = 0
 
  begin
- # This handles the redirects for us
- resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
-
- if resp.response.code == "200"
- # Check for gzip, and decode:
- data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
- data
- elsif resp.response['Location']
- redirect_to = resp.response['Location']
-
- fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
+ resp = Typhoeus.get uri.to_s, :followlocation => true,
+ :headers => HTTP_HEADERS
+
+ if resp.response_code == 200
+ resp.response_body
  else
  # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
- raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+ raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response_code ]
  end
  rescue ResourceNotFoundError => err
  logger.info err.message if logger
@@ -182,19 +175,6 @@ class CraigScrape::Scraper
  else
  raise err
  end
- rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
- logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
- logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
- fetch_attempts += 1
-
- if fetch_attempts <= self.retries_on_fetch_fail
- sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
- logger.info 'Retrying fetch ....' if logger
- retry
- else
- raise err
- end
  end
  end
 
@@ -209,4 +189,4 @@ class CraigScrape::Scraper
  @html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
  @html
  end
- end
+ end
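
Note on the scraper.rb changes: the hand-rolled Net::HTTP redirect loop, gzip decoding, and general fetch-retry machinery are gone. Typhoeus follows redirects and handles compression itself, so only the 404 retry loop survives, and every fetched body is re-encoded from ISO-8859-1 to UTF-8. A rough, self-contained sketch of the new fetch flow; the fetch_page name, sample URL, and inlined retry constants are illustrative, while the Typhoeus.get call with :followlocation and :headers is what the diff actually uses:

    require 'typhoeus'

    # Illustrative subset of the HTTP_HEADERS constant above:
    HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache" }

    def fetch_page(url, retries_on_404 = 3, sleep_between_404 = 3)
      attempts = 0
      begin
        # Typhoeus follows redirects itself; no manual Location handling needed
        resp = Typhoeus.get url, :followlocation => true, :headers => HEADERS
        unless resp.response_code == 200
          # Craigslist sometimes 404's spuriously; the rescue below retries
          raise "Unable to fetch #{url} (#{resp.response_code})"
        end
        # Craigslist bodies aren't reliably UTF-8, hence the re-encode:
        resp.response_body.force_encoding("ISO-8859-1").encode("UTF-8")
      rescue RuntimeError => err
        attempts += 1
        raise err if attempts > retries_on_404
        sleep sleep_between_404
        retry
      end
    end

    puts fetch_page('http://sfbay.craigslist.org/').length

Delegating redirects and compression to the HTTP client is what lets this release delete the MaxRedirectError class and the retries_on_fetch_fail / sleep_between_fetch_retries / maximum_redirects_per_request settings entirely.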