libcraigscrape 0.6 → 0.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,5 +1,13 @@
1
1
  == Change Log
2
2
 
3
+ === Release 0.6.5 (Jun 8, 2009)
4
+ - Added PostFull::deleted_by_author? , added test case for said condition
5
+ - Fixed a bug that caused the library to die in weird ways if there wasn't a title tag on a parsed page
6
+ - Apparently Craigslist started gzip-encoding *some* listings. Added gzip decoding support
7
+ - Found a bug when parsing the location field on some full_posts in the apa sections
8
+ - Added support for file:// URIs in the scrape_* functions, and revised the tests to use these URIs
9
+ - Fixed a bug that caused errors to be raised with legitimately empty listing pages
10
+
3
11
  === Release 0.6.0 (May 21, 2009)
4
12
  - Added PostFull::flagged_for_removal?
5
13
  - Fixed a couple small parse bugs found in production
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ include FileUtils
11
11
  RbConfig = Config unless defined? RbConfig
12
12
 
13
13
  NAME = "libcraigscrape"
14
- VERS = ENV['VERSION'] || "0.6"
14
+ VERS = ENV['VERSION'] || "0.6.5"
15
15
  PKG = "#{NAME}-#{VERS}"
16
16
 
17
17
  RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
@@ -57,8 +57,11 @@ Rake::RDocTask.new do |rdoc|
57
57
  end
58
58
 
59
59
  Rake::GemPackageTask.new(SPEC) do |p|
60
- p.need_tar = true
61
- p.gem_spec = SPEC
60
+ p.need_tar = false
61
+ p.need_tar_gz = true
62
+ p.need_tar_bz2 = true
63
+ p.need_zip = true
64
+ p.gem_spec = SPEC
62
65
  end
63
66
 
64
67
  task "lib" do
@@ -74,35 +77,3 @@ task :uninstall => [:clean] do
74
77
  sh %{sudo gem uninstall #{NAME}}
75
78
  end
76
79
 
77
- task :pkg_archives do
78
- base_dir = File.dirname __FILE__
79
- package_name = '%s-%s' % [NAME,VERS]
80
- packages_base = "#{base_dir}/pkg"
81
- packaging_dir = '%s/%s' % [ packages_base,package_name ]
82
-
83
- begin
84
- # First we create a proper package-X.X directory:
85
- PKG_FILES.each do |p_f|
86
- base_file = '%s/%s' % [base_dir, p_f]
87
- packaged_file = '%s/%s' % [packaging_dir, p_f]
88
- packaged_file_dirname = File.dirname packaged_file
89
-
90
- # We really don't care to do anything about these - we'll recreate it when/if its needed
91
- next if File.directory? base_file
92
-
93
- FileUtils.mkdir_p packaged_file_dirname unless File.directory? packaged_file_dirname
94
-
95
- FileUtils.cp base_file, packaged_file unless File.exists? packaged_file
96
- end
97
-
98
- # Remove any old archives we'd be replacing:
99
- %w(zip tar.bz2).each{ |ext| FileUtils.rm "#{packaging_dir}.#{ext}" if File.exist? "#{packaging_dir}.#{ext}" }
100
-
101
- # Now let's create some archives:
102
- sh %{cd #{packages_base} && tar -cjvf #{package_name}.tar.bz2 #{package_name}}
103
- sh %{cd #{packages_base} && zip -r #{package_name}.zip #{package_name}}
104
- ensure
105
- # Delete that temp directory we created at the start here
106
- FileUtils.rmtree packaging_dir
107
- end
108
- end
@@ -2,10 +2,11 @@
2
2
  #
3
3
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
4
4
  #
5
+ require 'net/http'
6
+ require 'zlib'
5
7
 
6
8
  require 'rubygems'
7
9
  require 'hpricot'
8
- require 'net/http'
9
10
  require 'htmlentities'
10
11
  require 'activesupport'
11
12
 
@@ -31,10 +32,14 @@ class CraigScrape
31
32
  end
32
33
 
33
34
  module ParseObjectHelper #:nodoc:
35
+ private
34
36
  def he_decode(text)
35
37
  HTMLEntities.new.decode text
36
38
  end
37
39
  end
40
+
41
+ class BadUrlError < StandardError #:nodoc:
42
+ end
38
43
 
39
44
  class ParseError < StandardError #:nodoc:
40
45
  end
@@ -92,7 +97,7 @@ class CraigScrape
92
97
 
93
98
  title = page.at('title')
94
99
  @title = he_decode title.inner_html if title
95
- @title = nil if @title.length ==0
100
+ @title = nil if @title and @title.length ==0
96
101
 
97
102
  @full_section = []
98
103
  (page/"div[@class='bchead']//a").each do |a|
@@ -125,18 +130,56 @@ class CraigScrape
125
130
  # This will make it easier for the next person to work with if they want to parse out the information we're discarding...
126
131
  parse_craig_body Hpricot.parse(craigbody_as_s) if craigbody_as_s
127
132
 
133
+ # We'll first set these edge cases to false, unless the block below decides otherwise
134
+ @flagged_for_removal = false
135
+ @deleted_by_author = false
136
+
137
+ # Time to check for errors and edge cases
138
+ if [@contents,@posting_id,@post_time,@title].all?{|f| f.nil?}
139
+ case @header.gsub(HTML_TAG, "")
140
+ when "This posting has been flagged for removal"
141
+ @flagged_for_removal = true
142
+ when "This posting has been deleted by its author."
143
+ @deleted_by_author = true
144
+ end
145
+ end
146
+
128
147
  # Validate that required fields are present:
129
- raise ParseError, "Unable to parse PostFull: %s" % page.to_html if !flagged_for_removal? and [
148
+ raise ParseError, "Unable to parse PostFull: %s" % page.to_html if !flagged_for_removal? and !deleted_by_author? and [
130
149
  @contents,@posting_id,@post_time,@header,@title,@full_section
131
150
  ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
132
151
  end
152
+
153
+ # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
154
+ def flagged_for_removal?; @flagged_for_removal; end
155
+
156
+ # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
157
+ def deleted_by_author?; @deleted_by_author; end
158
+
159
+ # Returns the price (as float) of the item, as best ascertained by the post header
160
+ def price
161
+ $1.to_f if @title and @header and PRICE.match(@header.gsub(/#{@title}/, ''))
162
+ end
163
+
164
+ # Returns the post contents with all html tags removed
165
+ def contents_as_plain
166
+ @contents.gsub HTML_TAG, "" if @contents
167
+ end
168
+
169
+ private
133
170
 
134
171
  # I left this here as a stub, since someone may want to parse more than what I'm currently scraping from this part of the page
135
172
  def parse_craig_body(craigbody_els) #:nodoc:
136
173
  # Location (when explicitly defined):
137
- cursor = craigbody_els.at 'ul'
138
- cursor = cursor.at 'li' if cursor
139
- @location = he_decode $1 if cursor and LOCATION.match cursor.inner_html
174
+ cursor = craigbody_els.at 'ul' unless @location
175
+
176
+ # Apa section includes other things in the li's (cats/dogs ok fields)
177
+ cursor.children.each do |li|
178
+ if LOCATION.match li.inner_html
179
+ @location = he_decode($1) and break
180
+ break
181
+ end
182
+ end if cursor
140
183
 
141
184
  # Real estate listings can work a little different for location:
142
185
  unless @location
@@ -151,24 +194,6 @@ class CraigScrape
151
194
 
152
195
  @images = (img_table / 'img').collect{|i| i[:src]} if img_table
153
196
  end
154
-
155
- # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
156
- def flagged_for_removal?
157
- (
158
- [@contents,@posting_id,@post_time,@title].all?{|f| f.nil?} and
159
- @header.gsub(HTML_TAG, "") == "This posting has been flagged for removal"
160
- )
161
- end
162
-
163
- # Returns the price (as float) of the item, as best ascertained by the post header
164
- def price
165
- $1.to_f if @title and @header and PRICE.match(@header.gsub(/#{@title}/, ''))
166
- end
167
-
168
- # Returns the post contents with all html tags removed
169
- def contents_as_plain
170
- @contents.gsub HTML_TAG, "" if @contents
171
- end
172
197
  end
173
198
 
174
199
  # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
@@ -207,7 +232,7 @@ class CraigScrape
207
232
  @next_page_href = next_link[:href] if next_link
208
233
 
209
234
  # Validate that required fields are present:
210
- raise ParseError, "Unable to parse Listings: %s" % page.to_html unless @posts.length > 0
235
+ raise ParseError, "Unable to parse Listings: %s" % page.to_html if tags_worth_parsing.length > 0 and @posts.length == 0
211
236
  end
212
237
 
213
238
  end
@@ -304,7 +329,7 @@ class CraigScrape
304
329
 
305
330
  # Requests and returns the PostFull object that corresponds with this summary's full_url
306
331
  def full_post
307
- @full_post = CraigScrape.scrape_full_post(full_url) if @full_post.nil? and full_url
332
+ @full_post ||= CraigScrape.scrape_full_post full_url if full_url
308
333
 
309
334
  @full_post
310
335
  end
@@ -366,42 +391,51 @@ class CraigScrape
366
391
  def self.scrape_posts_since(listing_url, newer_then)
367
392
  self.scrape_until(listing_url) {|post| post.date <= newer_then}
368
393
  end
369
-
394
+
370
395
  def self.fetch_url(uri) #:nodoc:
371
- fetch_attempts = 0
372
-
373
- begin
374
- # This handles the redirects for us
375
- uri_dest = ( uri.class == String ) ? URI.parse(uri) : uri
376
-
377
- logger.info "Requesting: %s" % uri_dest.to_s if logger
378
-
379
- resp, data = Net::HTTP.new( uri_dest.host, uri_dest.port).get uri_dest.request_uri, nil
380
-
381
- if resp.response.code == "200"
382
- data
383
- elsif resp.response['Location']
384
- redirect_to = resp.response['Location']
385
- self.fetch_url(redirect_to)
386
- else
387
- # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
388
- error_description = 'Unable to fetch "%s" (%s)' % [ uri_dest.to_s, resp.response.code ]
396
+ uri_dest = ( uri.class == String ) ? URI.parse(uri) : uri
397
+
398
+ logger.info "Requesting: %s" % uri_dest.to_s if logger
389
399
 
390
- logger.info error_description if logger
400
+ case uri_dest.scheme
401
+ when 'file'
402
+ File.read uri_dest.path
403
+ when /^http[s]?/
404
+ fetch_attempts = 0
391
405
 
392
- raise FetchError, error_description
393
- end
394
- rescue FetchError => err
395
- fetch_attempts += 1
406
+ begin
407
+ # This handles the redirects for us
408
+ resp, data = Net::HTTP.new( uri_dest.host, uri_dest.port).get uri_dest.request_uri, nil
396
409
 
397
- if retries_on_fetch_fail <= CraigScrape.retries_on_fetch_fail
398
- sleep CraigScrape.sleep_between_fetch_retries if CraigScrape.sleep_between_fetch_retries
399
- retry
410
+ if resp.response.code == "200"
411
+ # Check for gzip, and decode:
412
+ data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
413
+
414
+ data
415
+ elsif resp.response['Location']
416
+ redirect_to = resp.response['Location']
417
+ self.fetch_url(redirect_to)
418
+ else
419
+ # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
420
+ error_description = 'Unable to fetch "%s" (%s)' % [ uri_dest.to_s, resp.response.code ]
421
+
422
+ logger.info error_description if logger
423
+
424
+ raise FetchError, error_description
425
+ end
426
+ rescue FetchError => err
427
+ fetch_attempts += 1
428
+
429
+ if retries_on_fetch_fail <= CraigScrape.retries_on_fetch_fail
430
+ sleep CraigScrape.sleep_between_fetch_retries if CraigScrape.sleep_between_fetch_retries
431
+ retry
432
+ else
433
+ raise err
434
+ end
435
+ end
400
436
  else
401
- raise err
402
- end
437
+ raise BadUrlError, "Unknown URI scheme for the url: #{uri_dest.to_s}"
403
438
  end
404
-
405
439
  end
406
440
 
407
441
  def self.uri_from_href(base_uri, href) #:nodoc:
@@ -0,0 +1,128 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html><head>
3
+ <title>treasure coast arts/crafts for sale classifieds - craigslist</title>
4
+
5
+ <meta name="description" content="craigslist arts/crafts for sale classifieds for treasure coast ">
6
+ <meta name="keywords" content="treasure coast arts/crafts for sale craigslist, classifieds, want ads ">
7
+
8
+
9
+
10
+ <link rel=alternate type="application/rss+xml" href="index.rss" title="RSS feed for craigslist | arts/crafts for sale in treasure coast ">
11
+ <link rel="stylesheet" title="craigslist" href="http://www.craigslist.org/styles/craigslist.css" type="text/css" media="all">
12
+ </head>
13
+
14
+ <body class="toc">
15
+
16
+ <a name="top"></a>
17
+
18
+ <div class="bchead"><span id="ef">
19
+
20
+ [ <a href="http://www.craigslist.org/about/help/">help</a> ]
21
+ [ <a href="https://post.craigslist.org/psl/S">post</a> ]</span>
22
+
23
+ <a href="/"> treasure coast craigslist</a> &gt; <a href="/art/">arts/crafts for sale</a></div>
24
+
25
+ <blockquote>
26
+ <form action="/search/art" method="get" onsubmit="ckCAbb();">
27
+
28
+ <script type="text/javascript"><!--
29
+ function ckCAbb() {
30
+ t = document.getElementById("cAbb");
31
+ if (t.value == "art") { t.disabled = true; }
32
+ }
33
+ -->
34
+ </script>
35
+
36
+ <table width="95%" cellpadding="2" style="white-space: nowrap; background:#eee; border:1px solid gray;" summary="">
37
+ <tr>
38
+ <td align="right" width="1">search for:</td>
39
+ <td width="30%"><input id="query" name="query" size="30" value=""> in:
40
+ <select id="cAbb" name="catAbbreviation">
41
+ <option value="ccc">all community<option value="eee">all event<option value="sss">all for sale / wanted<option disabled value="">--<option value="art" selected> art &amp; crafts
42
+ <option value="pts"> auto parts
43
+ <option value="bab"> baby &amp; kid stuff
44
+ <option value="bar"> barter
45
+ <option value="bik"> bicycles
46
+ <option value="boa"> boats
47
+ <option value="bks"> books
48
+ <option value="bfs"> business
49
+ <option value="cta"> cars &amp; trucks - all
50
+ <option value="ctd"> cars &amp; trucks - by dealer
51
+ <option value="cto"> cars &amp; trucks - by owner
52
+ <option value="emd"> cds / dvds / vhs
53
+ <option value="clo"> clothing
54
+ <option value="clt"> collectibles
55
+ <option value="sys"> computers &amp; tech
56
+ <option value="ele"> electronics
57
+ <option value="grd"> farm &amp; garden
58
+ <option value="zip"> free stuff
59
+ <option value="fua"> furniture - all
60
+ <option value="fud"> furniture - by dealer
61
+ <option value="fuo"> furniture - by owner
62
+ <option value="tag"> games &amp; toys
63
+ <option value="gms"> garage sales
64
+ <option value="for"> general
65
+ <option value="hsh"> household
66
+ <option value="wan"> items wanted
67
+ <option value="jwl"> jewelry
68
+ <option value="mat"> materials
69
+ <option value="mcy"> motorcycles/scooters
70
+ <option value="msg"> musical instruments
71
+ <option value="pho"> photo/video
72
+ <option value="rvs"> recreational vehicles
73
+ <option value="spo"> sporting goods
74
+ <option value="tix"> tickets
75
+ <option value="tls"> tools
76
+ <option disabled value="">--<option value="ggg">all gigs<option value="hhh">all housing<option value="jjj">all jobs<option value="ppp">all personals<option value="res">all resume<option value="bbb">all services offered</select>
77
+ <input type="submit" value="Search">
78
+ </td><td>
79
+ <label><input type="checkbox" name="srchType" value="T"
80
+ title="check this box to search only posting titles"> only search titles</label>
81
+ </td>
82
+ </tr>
83
+
84
+ <tr>
85
+ <td align="right" width="1">price:</td>
86
+ <td><input name="minAsk" size="6" value="min" onfocus="value=''">&nbsp;<input name="maxAsk" size="6" value="max" onfocus="value=''">&nbsp;</td>
87
+ <td align="left"><label><input type="checkbox" name="hasPic" value="1"> has image</label></td>
88
+ </tr></table></form></blockquote><span id="showPics"></span><span id="hidePics"></span>
89
+
90
+ <blockquote>
91
+ <table width="95%" summary="">
92
+ <tr>
93
+ <td valign="top">[ Mon, 08 Jun 17:37:29 ]</td>
94
+ <td valign="top" id="messages"><span class="hl"> [ <a href="http://www.recalls.gov/">avoid recalled items</a> ] </span> <span class="hl"> [ <a href="/about/prohibited.items">partial list of prohibited items</a> ] </span> <span class="hl"> [<a href="/cgi-bin/success.stories.cgi">success story?</a>]</span> <span class="hl"> [ <b><a href="/about/scams">AVOIDING SCAMS &amp; FRAUD</a></b> ] </span> <span class="hl"> [ <b><a href="/about/safety">PERSONAL SAFETY TIPS</a></b> ] </span> </td>
95
+ </tr>
96
+ </table>
97
+
98
+
99
+
100
+
101
+
102
+ <div id="footer">
103
+ <hr>
104
+ <span id="copy">
105
+ Copyright &copy; 2009 craigslist, inc.<br>
106
+ <a href="#top">Back to top of page</a>
107
+ </span>
108
+ <span class="rss">
109
+ <a class="l" href="http://treasure.craigslist.org/art/index.rss">RSS</a>
110
+ <a href="http://www.craigslist.org/about/rss">(?)</a><br>
111
+ <a class="y" href="http://add.my.yahoo.com/rss?url=http://treasure.craigslist.org/art/index.rss">add to My Yahoo!</a>
112
+ </span>
113
+ </div>
114
+ <br><br>
115
+
116
+ <div id="floater">&nbsp;</div>
117
+
118
+ </blockquote>
119
+ <script type="text/javascript" src="http://www.craigslist.org/js/jquery.js"></script><script type="text/javascript" src="http://www.craigslist.org/js/tocs.js"></script>
120
+ <script type="text/javascript">
121
+ <!--
122
+ initImgs();
123
+ -->
124
+ </script>
125
+
126
+
127
+ </body>
128
+ </html>
@@ -0,0 +1,92 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>2bth for no deposit req</title>
5
+ <meta name="robots" content="NOARCHIVE,NOFOLLOW">
6
+ <link rel="stylesheet" title="craigslist" href="http://www.craigslist.org/styles/craigslist.css" type="text/css" media="all">
7
+ </head>
8
+
9
+ <body onload="initFlag(1207457727)" class="posting">
10
+
11
+ <div class="bchead">
12
+ <a id="ef" href="/email.friend?postingID=1207457727">email this posting to a friend</a>
13
+ <a href="http://miami.craigslist.org">south florida craigslist</a>
14
+ &gt; <a href="/brw/">broward county</a> &gt; <a href="/brw/apa/">apts/housing for rent</a>
15
+ </div>
16
+
17
+ <div id="flags">
18
+ <div id="flagMsg">
19
+ please <a href="http://www.craigslist.org/about/help/flags_and_community_moderation">flag</a> with care:
20
+ </div>
21
+ <div id="flagChooser">
22
+ <br>
23
+ <a class="fl" id="flag16" href="/flag/?flagCode=16&amp;postingID=1207457727"
24
+ title="Wrong category, wrong site, discusses another post, or otherwise misplaced">
25
+ miscategorized</a>
26
+ <br>
27
+
28
+ <a class="fl" id="flag28" href="/flag/?flagCode=28&amp;postingID=1207457727"
29
+ title="Violates craigslist Terms Of Use or other posted guidelines">
30
+ prohibited</a>
31
+ <br>
32
+
33
+ <a class="fl" id="flag15" href="/flag/?flagCode=15&amp;postingID=1207457727"
34
+ title="Posted too frequently, in multiple cities/categories, or is too commercial">
35
+ spam/overpost</a>
36
+ <br>
37
+
38
+ <a class="fl" id="flag9" href="/flag/?flagCode=9&amp;postingID=1207457727"
39
+ title="Should be considered for inclusion in the Best-Of-Craigslist">
40
+ best of craigslist</a>
41
+ <br>
42
+ </div>
43
+ </div>
44
+
45
+ <div id="tsb">
46
+ <a href="http://www.craigslist.org/about/FHA.html">Stating a discriminatory preference in a housing post is illegal - please flag discriminatory posts as prohibited</a></div> <div id="tsb"> <em>Avoid scams and fraud by dealing locally!</em> Beware any arrangement involving Western Union, Moneygram, wire transfer, or a landlord/owner who is out of the country or cannot meet you in person. <a href="http://www.craigslist.org/about/scams.html">More info</a></div>
47
+
48
+
49
+ <h2>$1350 / 3br - 2bth for no deposit req (Coral Springs)</h2>
50
+ <hr>
51
+ Reply to: <a href="mailto:hous-ccpap-1207457727@craigslist.org?subject=%241350%20%2F%203br%20-%202bth%20for%20no%20deposit%20req%20(Coral%20Springs)">hous-ccpap-1207457727@craigslist.org</a> <sup>[<a href="http://www.craigslist.org/about/help/replying_to_posts" target="_blank">Errors when replying to ads?</a>]</sup><br>
52
+ Date: 2009-06-05, 6:56PM EDT<br>
53
+ <br>
54
+ <br>
55
+ <div id="userbody">
56
+ <p><br>Call!! asking for a new owner.<br> no deposit required rent to own properties. <br> <br> Defaulting payment records are not a problem, <br> we will help you protect the previous owners credit history! 2&#48;&#50;-56&#55;-637&#49; <br><br></p>
57
+
58
+
59
+ <br><br><ul>
60
+ <li>cats are OK - purrr
61
+ <li>dogs are OK - wooof
62
+ <li> Location: Coral Springs
63
+ <li>it's NOT ok to contact this poster with services or other commercial interests</ul>
64
+
65
+ <table summary="craigslist hosted images">
66
+ <tr>
67
+ <td align="center"><img src="http://images.craigslist.org/3k43pe3o8ZZZZZZZZZ9655022102a3ea51624.jpg" alt="image 1207457727-0"></td>
68
+ <td align="center"><img src="http://images.craigslist.org/3n13m53p6ZZZZZZZZZ96596515e51237a179c.jpg" alt="image 1207457727-1"></td>
69
+ </tr>
70
+ <tr>
71
+ <td align="center"><img src="http://images.craigslist.org/3od3p33leZZZZZZZZZ9656d614da8e3a51dd9.jpg" alt="image 1207457727-2"></td>
72
+ <td align="center"><img src="http://images.craigslist.org/3pb3oa3leZZZZZZZZZ965eb60e4d2344019fb.jpg" alt="image 1207457727-3"></td>
73
+ </tr>
74
+ </table>
75
+
76
+ </div>
77
+ PostingID: 1207457727<br>
78
+
79
+ <br>
80
+
81
+ <hr>
82
+ <ul class="clfooter">
83
+ <li>Copyright &copy; 2009 craigslist, inc.</li>
84
+ <li><a href="http://www.craigslist.org/about/terms.of.use.html">terms of use</a></li>
85
+ <li><a href="http://www.craigslist.org/about/privacy_policy">privacy policy</a></li>
86
+ <li><a href="/forums/?forumID=8">feedback forum</a></li>
87
+ </ul>
88
+ <script type="text/javascript" src="http://www.craigslist.org/js/jquery.js"></script>
89
+ <script type="text/javascript" src="http://www.craigslist.org/js/postings.js"></script>
90
+ </body>
91
+ </html>
92
+
@@ -0,0 +1,37 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title></title>
5
+ <meta name="robots" content="NOARCHIVE,NOFOLLOW" />
6
+ <link href="http://www.craigslist.org/styles/craigslist.css" title="craigslist" rel="stylesheet" media="all" type="text/css" />
7
+ </head>
8
+
9
+ <body class="posting" onload="initFlag(1187861811)">
10
+
11
+ <div class="bchead">
12
+
13
+ <a href="http://miami.craigslist.org">south florida craigslist</a>
14
+ &gt; <a href="/brw/">broward county</a> &gt; <a href="/brw/cto/">cars &amp; trucks - by owner</a>
15
+ </div>
16
+
17
+
18
+
19
+ <hr />
20
+ <br />
21
+ <br />
22
+ <h2>This posting has been deleted by its author.</h2>
23
+ <h5>(The title on the listings page will be removed in just a few minutes.)</h5>
24
+
25
+ <br /><br />
26
+
27
+ <hr />
28
+ <ul class="clfooter">
29
+ <li>Copyright &copy; 2009 craigslist, inc.</li>
30
+ <li><a href="http://www.craigslist.org/about/terms.of.use.html">terms of use</a></li>
31
+ <li><a href="http://www.craigslist.org/about/privacy_policy">privacy policy</a></li>
32
+ <li><a href="/forums/?forumID=8">feedback forum</a></li>
33
+ </ul>
34
+ <script src="http://www.craigslist.org/js/jquery.js" type="text/javascript"></script>
35
+ <script src="http://www.craigslist.org/js/postings.js" type="text/javascript"></script>
36
+ </body>
37
+ </html>
@@ -119,8 +119,8 @@ EOD
119
119
  assert_equal 25.0, six.price
120
120
  end
121
121
 
122
- def test_listings_parse
123
- category = CraigScrape::Listings.new read_as_hpricot('listing_samples/category_output.html')
122
+ def test_listings_parse
123
+ category = CraigScrape.scrape_listing relative_uri_for('listing_samples/category_output.html')
124
124
  assert_equal 'index100.html', category.next_page_href
125
125
  assert_equal 100, category.posts.length
126
126
  category.posts[0..80].each do |l|
@@ -128,19 +128,19 @@ EOD
128
128
  assert_equal 18, l.date.day
129
129
  end
130
130
 
131
- category2 = CraigScrape::Listings.new read_as_hpricot('listing_samples/category_output_2.html')
131
+ category2 = CraigScrape.scrape_listing relative_uri_for('listing_samples/category_output_2.html')
132
132
  assert_equal 'index900.html', category2.next_page_href
133
133
  assert_equal 100, category2.posts.length
134
134
 
135
- long_search = CraigScrape::Listings.new read_as_hpricot('listing_samples/long_search_output.html')
135
+ long_search = CraigScrape.scrape_listing relative_uri_for('listing_samples/long_search_output.html')
136
136
  assert_equal '/search/rea?query=house&minAsk=min&maxAsk=max&bedrooms=&s=800', long_search.next_page_href
137
137
  assert_equal 100, long_search.posts.length
138
138
 
139
- short_search = CraigScrape::Listings.new read_as_hpricot('listing_samples/short_search_output.html')
139
+ short_search = CraigScrape.scrape_listing relative_uri_for('listing_samples/short_search_output.html')
140
140
  assert_equal nil, short_search.next_page_href
141
141
  assert_equal 93, short_search.posts.length
142
142
 
143
- mia_fua_index8900_052109 = CraigScrape::Listings.new read_as_hpricot('listing_samples/mia_fua_index8900.5.21.09.html')
143
+ mia_fua_index8900_052109 = CraigScrape.scrape_listing relative_uri_for('listing_samples/mia_fua_index8900.5.21.09.html')
144
144
  assert_equal 'index9000.html', mia_fua_index8900_052109.next_page_href
145
145
  assert_equal 100, mia_fua_index8900_052109.posts.length
146
146
  mia_fua_index8900_052109.posts[0..13].each do |l|
@@ -151,10 +151,14 @@ EOD
151
151
  assert_equal 5, l.date.month
152
152
  assert_equal 14, l.date.day
153
153
  end
154
+
155
+ empty_listings = CraigScrape.scrape_listing relative_uri_for('listing_samples/empty_listings.html')
156
+ assert_equal nil, empty_listings.next_page_href
157
+ assert_equal [], empty_listings.posts
154
158
  end
155
159
 
156
160
  def test_posting_parse
157
- posting0 = CraigScrape::PostFull.new read_as_hpricot('post_samples/posting0.html')
161
+ posting0 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting0.html')
158
162
  assert_equal "Has storage for videos/dvds. About 2 ft high by 21/2 ft widw. Almond/light beige color", posting0.contents
159
163
  assert_equal ["south florida craigslist", "miami / dade", "furniture - by owner"], posting0.full_section
160
164
  assert_equal "tv cart on wheels - $35 (NMB)", posting0.header
@@ -167,7 +171,7 @@ EOD
167
171
  assert_equal "Has storage for videos/dvds. About 2 ft high by 21/2 ft widw. Almond/light beige color",posting0.contents_as_plain
168
172
  assert_equal 35.0, posting0.price
169
173
 
170
- posting1 = CraigScrape::PostFull.new read_as_hpricot('post_samples/posting1.html')
174
+ posting1 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting1.html')
171
175
  assert_equal "Residential income property\227Investors this property is for you! This duplex has a 2bedroom/1bath unit on each side. It features updated kitchens and baths (new tubs, toilet, sink, vanities), ceramic tile flooring throughout, separate water and electric meters and on site laundry facilities. It is also closed to the Galleria, beaches and downtown Fort Lauderdale! \r<br />\n\r<br />\nJe parle le Fran\347ais\r<br />\n\r<br />\nThis property is being offered by Blaunch Perrier, Broker Associate, Atlantic Properties International. Blaunch can be reached at 954-593-0077. For additional property information you may also visit www.garylanham.com\r<br />\n\r<br />", posting1.contents
172
176
  assert_equal ["south florida craigslist", "broward county", "real estate - by broker"], posting1.full_section
173
177
  assert_equal "$189900 / 4br - Investment Property--Duplex in Fort Lauderdale", posting1.header
@@ -180,7 +184,7 @@ EOD
180
184
  assert_equal "Residential income property\227Investors this property is for you! This duplex has a 2bedroom/1bath unit on each side. It features updated kitchens and baths (new tubs, toilet, sink, vanities), ceramic tile flooring throughout, separate water and electric meters and on site laundry facilities. It is also closed to the Galleria, beaches and downtown Fort Lauderdale! \r\n\r\nJe parle le Fran\347ais\r\n\r\nThis property is being offered by Blaunch Perrier, Broker Associate, Atlantic Properties International. Blaunch can be reached at 954-593-0077. For additional property information you may also visit www.garylanham.com\r\n\r", posting1.contents_as_plain
181
185
  assert_equal 189900.0, posting1.price
182
186
 
183
- posting2 = CraigScrape::PostFull.new read_as_hpricot('post_samples/posting2.html')
187
+ posting2 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting2.html')
184
188
  assert_equal 15775, posting2.contents.length # This is easy, and probably fine enough
185
189
  assert_equal ["south florida craigslist", "broward county", "cars & trucks - by dealer"], posting2.full_section
186
190
  assert_equal "PRESENTING A ELECTRON BLUE METALLIC 2002 CHEVROLET CORVETTE Z06 6 SPEE - $23975 (Fort Lauderdale)", posting2.header
@@ -193,7 +197,7 @@ EOD
193
197
  assert_equal "\302\240 Sheehan Buick Pontiac GMC \302\240 Pompano Beach, FL(754) 224-3257 \302\240PRESENTING A ELECTRON BLUE METALLIC 2002 CHEVROLET CORVETTE Z06 6 SPEED FLORIDA DRIVEN SMOKIN' SPORTS CAR!2002 Chevrolet Corvette Z06 Florida Driven AutoCheck Certified 5.7L V8 6sp2 Door Coupe.\302\240Price: \302\240 $23,975Exterior:Electron Blue MetallicInterior:BlackStock#:P5110AVIN:1G1YY12S625129021FREE AutoCheck Vehicle ReportMileage:63,560Transmission:6 Speed ManualEngine:V8 5.7L OHVWarranty:Limited WarrantyTitle:Clear\302\273\302\240View All 58 Photos\302\273\302\240View Full Vehicle Details\302\273\302\240Ask the Seller a Question\302\273\302\240E-mail this to a Friend\302\240 DescriptionPRESENTING A ELECTRON BLUE METALLIC 2002 CHEVROLET CORVETTE Z06 6 SPEED FLORIDA DRIVEN SMOKIN' SPORTS CAR!\r\n\r\nLOADED WITH BLACK LEATHER BUCKET SEATS, POWER DRIVERS SEAT, DUAL ZONE CLIMATE CONTROL, 4 WHEEL ABS BRAKES, POWER STEERING AND BRAKES, REAR LIMITED SLIP DIFFERENTIAL, STABILITY CONTROL, CRUISE CONTROL, TLT STEERING WHEEL, POWER WINDOWS AND LOCKS, AUTOMATIC ON/OFF HEADLAMPS, FOG LIGHTS, DUAL AIR BAG SAFETY, AM/FM STEREO CD PLAYER, INTERMITTENT WINDSHIELD WIPERS AND SO MUCH MORE - THIS CAR IS TOTALLY HOT WITH GREAT LOW MILES!\r\n\r\nPlease call us to make your deal now at 1-888-453-5244. Please visit our Website at www.sheehanautoplex.com ***View 50+ Pictures of this vehicle - a complete description including standard features and all added options & a FREE AUTO CHECK REPORT at www.sheehanautoplex.com. ***Financing for Everyone - Good credit - bad credit - divorce - charge off's - NO PROBLEM. 
To complete a secure credit application, please visit our website at www.sheehanautoplex.com ***The largest Dealer in the State of Florida - We export all over the world - For details please visit www.sheehanautoplex.com ***Sheehan Autoplex takes great pride in our outstanding customer service and has been recognized by the following associations - BBB (Better Business Bureau) - NIADA - and the FIADA. Call us to get your best deal. CALL NOW. 1-888-453-5244\302\240 Contact Sheehan Buick Pontiac GMCPhone:(754) 224-3257Fax:(954) 781-9050Phone:(754) 224-3257E-mail:sales@proauto.comBusiness HoursWeekdays:9:00 AM to 9:00 PMSat:9:00 AM to 6:00 PMSun:",posting2.contents_as_plain
194
198
  assert_equal 23975.0, posting2.price
195
199
 
196
- posting3 = CraigScrape::PostFull.new read_as_hpricot('post_samples/posting3.html')
200
+ posting3 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting3.html')
197
201
  assert_equal "1992 Twin Turbo 300ZX. This car is pearl white outside and Camel leather interior with suede accents. Motor was re-done from the ground up two years ago. 23,000 on new motor rebuild! New Leather seats and center arm rest done also two years ago. Has Alpine Am/Fm Cd with Ipod cable, Viper pager alarm New! JL Audio Amp & JLAudio sub box custom made. Mtx mids& highs component speakers sparate tweeter. Car runs strong & straight. Just detailed the interior. Exterior should be painted. This car once painted will sell for over $10,000. \r<br />\nCome get a great deal now! offers and trades will be considered. 786-303-6550 Manny", posting3.contents
198
202
  assert_equal ["south florida craigslist", "miami / dade", "cars & trucks - by owner"], posting3.full_section
199
203
  assert_equal "300ZX Nissan Twin Turbo 1992 - $5800 (N.Miami/ Hialeah)", posting3.header
@@ -207,7 +211,7 @@ EOD
207
211
  assert_equal 5800.0, posting3.price
208
212
 
209
213
  # This one ended up being quite a curveball since the user uploaded HTML was such junk:
210
- posting4 = CraigScrape::PostFull.new read_as_hpricot('post_samples/posting4.html')
214
+ posting4 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting4.html')
211
215
  assert_equal 20640, posting4.contents.length
212
216
  assert_equal ["south florida craigslist", "broward county", "real estate - by broker"], posting4.full_section
213
217
  assert_equal "$225000 / 3br - Palm Aire Golf Corner Unit!", posting4.header
@@ -220,7 +224,7 @@ EOD
220
224
  assert_equal 6399,posting4.contents_as_plain.length
221
225
  assert_equal 225000.0, posting4.price
222
226
 
223
- posting5 = CraigScrape::PostFull.new read_as_hpricot('post_samples/posting5.html')
227
+ posting5 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting5.html')
224
228
  assert_equal true, posting5.flagged_for_removal?
225
229
  assert_equal nil, posting5.contents
226
230
  assert_equal ["south florida craigslist", "palm beach co", "apts/housing for rent"], posting5.full_section
@@ -233,6 +237,35 @@ EOD
233
237
  assert_equal [], posting5.images
234
238
  assert_equal nil, posting5.contents_as_plain
235
239
  assert_equal nil, posting5.price
240
+
241
+ posting_deleted = CraigScrape.scrape_full_post relative_uri_for('post_samples/this_post_has_been_deleted_by_its_author.html')
242
+ assert_equal true, posting_deleted.deleted_by_author?
243
+ assert_equal nil, posting_deleted.contents
244
+ assert_equal ["south florida craigslist", "broward county", "cars & trucks - by owner"], posting_deleted.full_section
245
+ assert_equal "This posting has been deleted by its author.", posting_deleted.header
246
+ assert_equal nil, posting_deleted.title
247
+ assert_equal nil, posting_deleted.location
248
+ assert_equal nil, posting_deleted.posting_id
249
+ assert_equal nil, posting_deleted.reply_to
250
+ assert_equal nil, posting_deleted.post_time
251
+ assert_equal [], posting_deleted.images
252
+ assert_equal nil, posting_deleted.contents_as_plain
253
+ assert_equal nil, posting_deleted.price
254
+
255
+ posting6 = CraigScrape.scrape_full_post relative_uri_for('post_samples/1207457727.html')
256
+ assert_equal "<p><br />Call!! asking for a new owner.<br /> no deposit required rent to own properties. <br /> <br /> Defaulting payment records are not a problem, <br /> we will help you protect the previous owners credit history! 202-567-6371 <br /><br /></p>",posting6.contents
257
+ assert_equal "Call!! asking for a new owner. no deposit required rent to own properties. Defaulting payment records are not a problem, we will help you protect the previous owners credit history! 202-567-6371 ",posting6.contents_as_plain
258
+ assert_equal false,posting6.deleted_by_author?
259
+ assert_equal false,posting6.flagged_for_removal?
260
+ assert_equal ["south florida craigslist", "broward county", "apts/housing for rent"],posting6.full_section
261
+ assert_equal "$1350 / 3br - 2bth for no deposit req (Coral Springs)",posting6.header
262
+ assert_equal ["http://images.craigslist.org/3k43pe3o8ZZZZZZZZZ9655022102a3ea51624.jpg", "http://images.craigslist.org/3n13m53p6ZZZZZZZZZ96596515e51237a179c.jpg", "http://images.craigslist.org/3od3p33leZZZZZZZZZ9656d614da8e3a51dd9.jpg", "http://images.craigslist.org/3pb3oa3leZZZZZZZZZ965eb60e4d2344019fb.jpg"],posting6.images
263
+ assert_equal 'Coral Springs',posting6.location
264
+ assert_equal [0, 56, 18, 5, 6, 2009, 5, 156, true, "EDT"],posting6.post_time.to_a
265
+ assert_equal 1207457727,posting6.posting_id
266
+ assert_equal 1350.0,posting6.price
267
+ assert_equal "hous-ccpap-1207457727@craigslist.org",posting6.reply_to
268
+ assert_equal "2bth for no deposit req",posting6.title
236
269
  end
237
270
 
238
271
  private
@@ -242,4 +275,26 @@ EOD
242
275
  File.open('%s/%s' % [File.dirname(__FILE__), test_file]).read
243
276
  )
244
277
  end
278
+
279
+ def relative_uri_for(filename)
280
+ 'file://%s/%s' % [File.dirname(File.expand_path(__FILE__)), filename]
281
+ end
282
+
283
+ def pp_assertions(obj, obj_name)
284
+ probable_accessors = (obj.methods-obj.class.superclass.methods)
285
+
286
+ puts
287
+ probable_accessors.sort.each do |m|
288
+ val = obj.send(m.to_sym)
289
+
290
+ # There's a good number of transformations worth doing here, I'll just start like this for now:
291
+ if val.kind_of? Time
292
+ # I've decided this is the easiest way to understand and test a time
293
+ val = val.to_a
294
+ m = "#{m}.to_a"
295
+ end
296
+
297
+ puts "assert_equal %s, %s.%s" % [val.inspect,obj_name,m]
298
+ end
299
+ end
245
300
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libcraigscrape
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.6"
4
+ version: 0.6.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris DeRose, DeRose Technologies, Inc.
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-05-21 00:00:00 -04:00
12
+ date: 2009-06-08 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -68,6 +68,7 @@ files:
68
68
  - test/listing_samples
69
69
  - test/listing_samples/category_output.html
70
70
  - test/listing_samples/short_search_output.html
71
+ - test/listing_samples/empty_listings.html
71
72
  - test/listing_samples/mia_fua_index8900.5.21.09.html
72
73
  - test/listing_samples/category_output_2.html
73
74
  - test/listing_samples/long_search_output.html
@@ -78,6 +79,8 @@ files:
78
79
  - test/post_samples/posting0.html
79
80
  - test/post_samples/posting5.html
80
81
  - test/post_samples/posting3.html
82
+ - test/post_samples/this_post_has_been_deleted_by_its_author.html
83
+ - test/post_samples/1207457727.html
81
84
  - test/post_samples/posting2.html
82
85
  - test/google.html
83
86
  - lib/libcraigscrape.rb