libcraigscrape 1.0 → 1.1.0

data/lib/posting.rb CHANGED
@@ -14,14 +14,25 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
   POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
   LOCATION = /Location\:[ ]+(.+)/
-  HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
-  POSTING_ID = /PostingID\:[ ]+([\d]+)/
+  HEADER_LOCATION = /\((.+)\)$/
+  POSTING_ID = /PostingID\:[ ]*([\d]+)/
   REPLY_TO = /(.+)/
   PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+
+  # NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new' style
+  # (as of the 12/03 parse changes)
   USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
   HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
   IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
 
+  # This is used to determine if there's a parse error
+  REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
+
+  XPATH_USERBODY = "//*[@id='userbody']"
+  XPATH_BLURBS = "//ul[@class='blurbs']"
+  XPATH_PICS = "//*[@class='tn']/a/@href"
+  XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
+
   # This is really just for testing; in production use, uri.path is a better solution
   attr_reader :href #:nodoc:
 
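Note: the loosened HEADER_LOCATION pattern is what lets location parsing survive headers that lack the old "- $price" segment. A quick before/after check (the sample header is invented for illustration):

    old_re = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
    new_re = /\((.+)\)$/

    header = "great apartment (mission district)"  # no price segment
    old_re.match(header)     #=> nil - the old pattern required "- $1200"-style text
    new_re.match(header)[1]  #=> "mission district"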
@@ -30,14 +41,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
     super(*args)
 
     # Validate that required fields are present, at least - if we've downloaded it from a url
-    parse_error! if (
-      args.first.kind_of? String and
-      !flagged_for_removal? and
-      !posting_has_expired? and
-      !deleted_by_author? and [
-        contents,posting_id,post_time,header,title,full_section
-      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
-    )
+    if args.first.kind_of? String and is_active_post?
+      unparsed_fields = REQUIRED_FIELDS.find_all{|f|
+        val = send(f)
+        val.nil? or (val.respond_to? :length and val.length == 0)
+      }
+      parse_error! unparsed_fields unless unparsed_fields.empty?
+    end
+
   end
 
 
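Note: the constructor now walks REQUIRED_FIELDS by name via send, instead of testing a hard-coded array of method calls. The same pattern as a standalone sketch (Record and its fields are hypothetical stand-ins for a Posting):

    class Record
      REQUIRED_FIELDS = %w(title body)

      def title; "For sale"; end
      def body;  "";         end  # blank, so it should be flagged

      # Returns the names of required fields that parsed to nil/empty
      def unparsed_fields
        REQUIRED_FIELDS.find_all do |f|
          val = send(f)
          val.nil? or (val.respond_to?(:length) and val.length == 0)
        end
      end
    end

    Record.new.unparsed_fields #=> ["body"]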
@@ -67,7 +78,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
     unless @full_section
       @full_section = []
 
-      (html_head/"div[@class='bchead']//a").each do |a|
+      (html_head / "*[@class='bchead']//a").each do |a|
         @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
       end if html_head
     end
@@ -78,9 +89,13 @@ class CraigScrape::Posting < CraigScrape::Scraper
   # String, represents the post's reply-to address, if listed
   def reply_to
     unless @reply_to
-      cursor = html_head.at 'hr' if html_head
-      cursor = cursor.next until cursor.nil? or cursor.name == 'a'
-      @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      if html.at_xpath(XPATH_REPLY_TO)
+        @reply_to = html.at_xpath(XPATH_REPLY_TO).content
+      else
+        cursor = html_head.at 'hr' if html_head
+        cursor = cursor.next until cursor.nil? or cursor.name == 'a'
+        @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      end
     end
 
     @reply_to
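Note: the new-style branch is a plain Nokogiri xpath lookup. A minimal sketch against a synthetic fragment (the markup below is invented, loosely mimicking the post-12/3 craigslist layout):

    require 'nokogiri'

    XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
    html = Nokogiri::HTML(<<-EOS)
      <div class="dateReplyBar"><small>
        Reply to: <a href="mailto:x@example.com">x@example.com</a>
      </small></div>
    EOS

    html.at_xpath(XPATH_REPLY_TO).content #=> "x@example.com"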
@@ -91,7 +106,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
     unless @post_time
       cursor = html_head.at 'hr' if html_head
       cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
-      @post_time = Time.parse $1 if $1
+      @post_time = DateTime.parse($1) if $1
     end
 
     @post_time
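Note: post_time is now a DateTime rather than a Time. One practical difference is that DateTime.parse preserves the parsed utc offset, instead of converting the instant into the machine's local zone (the timestamp is invented for illustration):

    require 'date'
    require 'time'

    s = '2012-12-03 9:15PM PST'
    Time.parse(s)     # an epoch-based Time, rendered in the local zone
    DateTime.parse(s) #=> 2012-12-03T21:15:00-08:00, offset preserved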
@@ -99,10 +114,17 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
   # Integer, Craigslist's unique posting id
   def posting_id
-    unless @posting_id
-      cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
-      cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
-      @posting_id = $1.to_i if $1
+    if @posting_id
+
+    elsif USERBODY_PARTS.match html_source
+      # Old style:
+      html_footer = $4
+      cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
+      cursor = cursor.next until cursor.nil? or
+      @posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
+    else
+      # Post 12/3
+      @posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
     end
 
     @posting_id
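Note: the post-12/3 branch simply runs POSTING_ID over the serialized postingidtext node. A sketch with invented markup:

    require 'nokogiri'

    POSTING_ID = /PostingID\:[ ]*([\d]+)/
    html = Nokogiri::HTML("<div class='postingidtext'>PostingID: 3456789012</div>")

    POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
    $1.to_i #=> 3456789012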
@@ -112,7 +134,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
   def contents
     unless @contents
       @contents = user_body if html_source
-      @contents = he_decode @contents.strip if @contents
+      @contents = he_decode(@contents).strip if @contents
     end
 
     @contents
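Note: the reordering (decode first, then strip) matters because decoding can itself produce leading or trailing whitespace. For example, with the HTMLEntities decoder the library already uses:

    require 'htmlentities'

    raw = "&#10;  for sale  &#10;"           # &#10; decodes to "\n"
    HTMLEntities.new.decode(raw.strip)  #=> "\n  for sale  \n"  (old order)
    HTMLEntities.new.decode(raw).strip  #=> "for sale"          (new order)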
@@ -120,27 +142,40 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
   # String, the location of the item, as best could be parsed
   def location
-    if @location.nil? and craigslist_body and html
-      # Location (when explicitly defined):
-      cursor = craigslist_body.at 'ul' unless @location
-
-      # Apa section includes other things in the li's (cats/dogs ok fields)
-      cursor.children.each do |li|
-        if LOCATION.match li.inner_html
-          @location = he_decode($1) and break
-          break
-        end
-      end if cursor
+    if @location.nil? and html
+
+      if html.at_xpath(XPATH_BLURBS)
+        # This is the post-12/3/12 style:
+
+        # Sometimes the Location is in the body:
+        @location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
+          LOCATION.match c.content}
 
-      # Real estate listings can work a little differently for location:
-      unless @location
-        cursor = craigslist_body.at 'small'
-        cursor = cursor.previous until cursor.nil? or cursor.text?
+      elsif craigslist_body
+        # Location (when explicitly defined):
+        cursor = craigslist_body.at 'ul' unless @location
+
+        # This is the legacy style:
+        # Note: the apa section includes other things in the li's (cats/dogs ok fields)
+        cursor.children.each do |li|
+          if LOCATION.match li.inner_html
+            @location = he_decode($1) and break
+            break
+          end
+        end if cursor
+
+        # Real estate listings can work a little differently for location:
+        unless @location
+          cursor = craigslist_body.at 'small'
+          cursor = cursor.previous until cursor.nil? or cursor.text?
+
+          @location = he_decode(cursor.to_s.strip) if cursor
+        end
 
-        @location = he_decode(cursor.to_s.strip) if cursor
       end
 
-      # So, *sometimes* the location just ends up being in the header, I don't know why:
+      # So, *sometimes* the location just ends up being in the header, I don't know why.
+      # This happens on old-style and new-style posts:
       @location = $1 if @location.nil? and HEADER_LOCATION.match header
     end
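Note: the new branch scans the children of the post-12/3 <ul class="blurbs"> for a "Location:" line. A sketch with invented markup:

    require 'nokogiri'

    LOCATION = /Location\:[ ]+(.+)/
    XPATH_BLURBS = "//ul[@class='blurbs']"

    html = Nokogiri::HTML("<ul class='blurbs'><li>Location: SOMA</li></ul>")
    location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
      LOCATION.match c.content}
    location #=> "SOMA"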
@@ -164,11 +199,16 @@ class CraigScrape::Posting < CraigScrape::Scraper
     unless @pics
       @pics = []
 
-      if html and craigslist_body
-        # Now let's find the craigslist hosted images:
-        img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
-        @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+      if html
+        if html.at_xpath(XPATH_PICS)
+          @pics = html.xpath(XPATH_PICS).collect(&:value)
+        elsif craigslist_body
+          # This is the pre-12/3/12 style:
+          # Now let's find the craigslist hosted images:
+          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+        end
       end
     end
 
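Note: XPATH_PICS selects href attributes directly (the trailing /@href step), so the matches are Nokogiri attribute nodes and .value yields the urls. A sketch with invented markup:

    require 'nokogiri'

    XPATH_PICS = "//*[@class='tn']/a/@href"
    html = Nokogiri::HTML(
      "<div class='tn'><a href='pic1.jpg'></a><a href='pic2.jpg'></a></div>")

    html.xpath(XPATH_PICS).collect(&:value) #=> ["pic1.jpg", "pic2.jpg"]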
@@ -202,11 +242,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
     @posting_has_expired
   end
 
-
   # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
   # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
   def post_date
-    @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
+    @post_date = post_time.to_date unless @post_date or post_time.nil?
 
     @post_date
   end
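Note: the old line rebuilt a midnight Time by splatting post_time.to_a with zeroed seconds/minutes/hours; since post_time is now a DateTime, DateTime#to_date performs the same truncation directly:

    require 'date'

    post_time = DateTime.parse('2012-12-03 9:15PM PST')  # invented timestamp
    post_time.to_date.to_s #=> "2012-12-03"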
@@ -229,14 +268,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
   # Array, which image types are listed for the post.
   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
   def img_types
-    unless @img_types
-      @img_types = []
-
-      @img_types << :img if images.length > 0
-      @img_types << :pic if pics.length > 0
-    end
-
-    @img_types
+    @img_types || [ (images.length > 0) ? :img : nil,
+      (pics.length > 0) ? :pic : nil ].compact
   end
 
   # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
@@ -270,7 +303,11 @@ class CraigScrape::Posting < CraigScrape::Scraper
   # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
   # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
   def price
-    $1.tr('$','').to_f if label and PRICE.match label
+    unless @price
+      (header and PRICE.match label) ?
+        @price = Money.new($1.tr('$','').to_i*100, 'USD') : nil
+    end
+    @price
   end
 
   # Returns the post contents with all html tags removed
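Note: price now returns a Money object (whole dollars times 100 cents, presumably via the money gem) instead of a Float. A hypothetical caller-side comparison:

    require 'money'

    price = Money.new(120000, 'USD')  # cents, as the new code constructs it
    price.to_f    #=> 1200.0, roughly what the old Float return value was
    price.format  #=> "$1,200.00"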
@@ -290,6 +327,12 @@ class CraigScrape::Posting < CraigScrape::Scraper
     [contents,posting_id,post_time,title].all?{|f| f.nil?}
   end
 
+  # This is mostly used to determine if the post should be checked for
+  # parse errors. It might be useful for someone else, though.
+  def is_active_post?
+    [flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
+  end
+
   private
 
   # I set this apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
@@ -302,17 +345,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
     @html_head
   end
 
-  # Since we started having so many problems with Hpricot flipping out on whack content bodies,
-  # I added this to return everything south of the user_body
-  def html_footer
-    $4 if USERBODY_PARTS.match html_source
-  end
-
   # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
-  # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+  # This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
   # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
-  def user_body
-    $2 if USERBODY_PARTS.match html_source
+  def user_body
+    if USERBODY_PARTS.match html_source
+      # This is the pre-12/3/12 style:
+      $2
+    elsif html.at_xpath(XPATH_USERBODY)
+      # There's a bunch of junk in here that we don't want, so this loop removes
+      # everything after (and including) the last script tag from the result
+      user_body = html.xpath(XPATH_USERBODY)
+      hit_delimeter = false
+      # Since some posts don't actually have the script tag:
+      delimeter = user_body.at_xpath('script') ? :script : :comment
+      user_body.first.children.to_a.reverse.reject{ |p|
+        if hit_delimeter
+          false
+        elsif ( (delimeter == :script and p.name == 'script') or
+            (delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
+          hit_delimeter = true
+        else
+          true
+        end
+      }.reverse.collect(&:to_s).join
+    end
   end
 
   # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
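Note: the delimiter loop is easier to see on a toy array. The same reverse/reject idiom, with plain strings standing in for DOM nodes:

    # Keep everything before the last "SCRIPT" marker, dropping the marker
    # and everything after it - which is what user_body does to child nodes.
    nodes = %w(text1 SCRIPT text2 SCRIPT junk1 junk2)

    hit = false
    nodes.reverse.reject { |n|
      if hit then false                    # already past the marker: keep
      elsif n == "SCRIPT" then hit = true  # the marker itself: drop
      else true                            # trailing junk: drop
      end
    }.reverse #=> ["text1", "SCRIPT", "text2"]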
@@ -321,4 +378,4 @@ class CraigScrape::Posting < CraigScrape::Scraper
     Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
   end
 
- end
+ end
data/lib/scraper.rb CHANGED
@@ -15,39 +15,27 @@
 #
 # <b>logger</b> - a Logger object to debug http notices to. Defaults to nil
 #
-# <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
-#
-# <b>sleep_between_fetch_retries</b> - The number of seconds to sleep between successive attempts in the case of a failed download. Defaults to 30.
-#
-# <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http response code 404). Defaults to 3.
-#
-# <b>sleep_between_404_retries</b> - The number of seconds to sleep between successive attempts in the case of a Resource Not Found error. Defaults to 3.
-#
+
 class CraigScrape::Scraper
   cattr_accessor :logger
-  cattr_accessor :sleep_between_fetch_retries
-  cattr_accessor :retries_on_fetch_fail
   cattr_accessor :retries_on_404_fail
   cattr_accessor :sleep_between_404_retries
-  cattr_accessor :maximum_redirects_per_request
+
+  self.retries_on_404_fail = 3
+  self.sleep_between_404_retries = 3
 
   URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
   HTML_TAG = /<\/?[^>]*>/
   # We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
   HTML_ENCODING = "UTF-8"
+
+  HTTP_HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache",
+    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"}
 
   # Returns the full url that corresponds to this resource
   attr_reader :url
 
-  # Set some defaults:
-  self.retries_on_fetch_fail = 8
-  self.sleep_between_fetch_retries = 30
-
-  self.retries_on_404_fail = 3
-  self.sleep_between_404_retries = 3
-
-  self.maximum_redirects_per_request = 20
-
   class BadConstructionError < StandardError #:nodoc:
   end
 
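Note: with the fetch-retry and redirect knobs gone, only the 404 retry behavior remains tunable. Hypothetical usage:

    # Both are class-level settings, now defaulting to 3:
    CraigScrape::Scraper.retries_on_404_fail = 5
    CraigScrape::Scraper.sleep_between_404_retries = 10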
@@ -57,9 +45,6 @@ class CraigScrape::Scraper
   class BadUrlError < StandardError #:nodoc:
   end
 
-  class MaxRedirectError < StandardError #:nodoc:
-  end
-
   class FetchError < StandardError #:nodoc:
   end
 
@@ -100,21 +85,37 @@ class CraigScrape::Scraper
     @uri
   end
 
+  # This method is mostly useful for our specs, but it's included in case anyone
+  # else wants it. It returns all currently-defined instance variables. Probably
+  # this doesn't do what you think: it should only be used to determine what's
+  # been parsed by the object thus far, and does not include parseable
+  # attributes which have yet to be determined.
+  def attributes
+    Hash[self.instance_variables.collect{|i|
+      [i.to_s.tr('@','').to_sym, instance_variable_get(i) ] }]
+  end
+
   private
 
   # Returns text with all html tags removed.
   def strip_html(str)
-    str.gsub HTML_TAG, "" if str
+    he_decode(str).gsub HTML_TAG, "" if str
   end
 
   # Easy way to fail noisily:
-  def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+  def parse_error!(fields = nil)
+    raise ParseError, "Error while parsing %s:\n %s%s" % [
+      self.class.to_s, html,
+      (fields) ? ("\nRequired fields missing: %s" % fields.join(', ')) : '']
+  end
 
   # Returns text with all html entities converted to the respective ascii character.
   def he_decode(text); self.class.he_decode text; end
   # Returns text with all html entities converted to the respective ascii character.
-  def self.he_decode(text); HTMLEntities.new.decode text; end
+  def self.he_decode(text)
+    HTMLEntities.new.decode text
+  end
 
   # Derives a full url, using the current object's url and the provided href
   def url_from_href(href) #:nodoc:
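Note: attributes is just Hash[] over instance_variables. The same reflection idiom, standalone:

    class Example
      def initialize; @a = 1; @b = "two"; end

      def attributes
        Hash[instance_variables.collect{|i|
          [i.to_s.tr('@','').to_sym, instance_variable_get(i)] }]
      end
    end

    Example.new.attributes #=> {:a=>1, :b=>"two"}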
@@ -133,42 +134,34 @@ class CraigScrape::Scraper
     '%s://%s%s' % [scheme, host, path]
   end
 
-  def fetch_uri(uri, redirect_count = 0)
-    logger.info "Requesting (%d): %s" % [redirect_count, @url.inspect] if logger
+  def fetch_uri(uri)
+    logger.info "Requesting: %s" % [@url.inspect] if logger
 
-    raise MaxRedirectError, "Max redirects (#{redirect_count}) reached for URL: #{@url}" if redirect_count > self.maximum_redirects_per_request-1
-
-    case uri.scheme
+    (case uri.scheme
       when 'file'
         # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
-        File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
+        File.read( File.directory?(uri.path) ?
+          "#{uri.path}/index.html" : uri.path , :encoding => 'BINARY')
       when /^http[s]?/
-        fetch_http uri, redirect_count
+        fetch_http uri
       else
         raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
-    end
+    end).force_encoding("ISO-8859-1").encode("UTF-8")
   end
-
-  def fetch_http(uri, redirect_count = 0)
+
+  def fetch_http(uri)
     fetch_attempts = 0
     resource_not_found_attempts = 0
 
     begin
-      # This handles the redirects for us
-      resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
-
-      if resp.response.code == "200"
-        # Check for gzip, and decode:
-        data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
-        data
-      elsif resp.response['Location']
-        redirect_to = resp.response['Location']
-
-        fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
+      resp = Typhoeus.get uri.to_s, :followlocation => true,
+        :headers => HTTP_HEADERS
+
+      if resp.response_code == 200
+        resp.response_body
       else
         # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
-        raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+        raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response_code ]
       end
     rescue ResourceNotFoundError => err
       logger.info err.message if logger
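Note: the Net::HTTP loop is replaced by Typhoeus, with the hand-rolled redirect handling moving into libcurl via :followlocation (and the gzip-decode branch dropped, since HTTP_HEADERS no longer advertises gzip support). A minimal standalone request (the url is illustrative):

    require 'typhoeus'

    resp = Typhoeus.get "http://example.com/", :followlocation => true,
      :headers => { "User-Agent" => "Mozilla/5.0" }

    resp.response_code #=> 200 on success
    resp.response_body # raw body; any redirects were already followed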
@@ -182,19 +175,6 @@ class CraigScrape::Scraper
       else
         raise err
       end
-    rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
-      logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
-      logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
-      fetch_attempts += 1
-
-      if fetch_attempts <= self.retries_on_fetch_fail
-        sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
-        logger.info 'Retrying fetch ....' if logger
-        retry
-      else
-        raise err
-      end
     end
   end
 
@@ -209,4 +189,4 @@ class CraigScrape::Scraper
     @html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
     @html
   end
- end
+ end