libcraigscrape 0.6.5 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. data/CHANGELOG +17 -0
  2. data/Rakefile +1 -1
  3. data/bin/craigwatch +10 -10
  4. data/bin/report_mailer/craigslist_report.html.erb +2 -2
  5. data/bin/report_mailer/craigslist_report.plain.erb +2 -2
  6. data/lib/libcraigscrape.rb +585 -342
  7. data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
  8. data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
  9. data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
  10. data/test/geolisting_samples/geo_listing_us070209.html +355 -0
  11. data/test/libcraigscrape_test_helpers.rb +31 -0
  12. data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
  13. data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
  14. data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
  15. data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
  16. data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
  17. data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
  18. data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
  19. data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
  20. data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
  21. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
  22. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
  23. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
  24. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
  25. data/test/post_samples/brw_reb_1224008903.html +101 -0
  26. data/test/post_samples/sfbay_art_1223614914.html +94 -0
  27. data/test/test_craigslist_geolisting.rb +425 -0
  28. data/test/test_craigslist_listing.rb +179 -260
  29. data/test/test_craigslist_posting.rb +306 -0
  30. metadata +29 -2
@@ -1,300 +1,219 @@
1
1
  #!/usr/bin/ruby
2
2
 
3
- require File.dirname(__FILE__)+'/../lib/libcraigscrape'
4
3
  require 'test/unit'
4
+ require File.dirname(__FILE__)+'/../lib/libcraigscrape'
5
+ require File.dirname(__FILE__)+'/libcraigscrape_test_helpers'
5
6
 
6
7
  class CraigslistListingTest < Test::Unit::TestCase
7
-
8
+ include LibcraigscrapeTestHelpers
9
+
8
10
  def test_pukes
9
- google = read_as_hpricot('google.html')
10
-
11
- assert_raise(CraigScrape::ParseError){ CraigScrape::PostSummary.new google }
12
- assert_raise(CraigScrape::ParseError){ CraigScrape::Listings.new google }
13
- assert_raise(CraigScrape::ParseError){ CraigScrape::PostFull.new google }
14
- end
15
-
16
- def test_listing_parse
17
- search_html_one = <<EOD
18
- <p> Apr 18 - <a href="/brw/reb/1128608404.html">Losing your house? You'll need this New Loan Mod Video -</a><font size="-1"> (W. Woodland)</font> <span class="p"> img</span> &lt;&lt;<i><a href="/reb/">real&nbsp;estate - by broker</a></i></p>
19
- EOD
20
- search_html_two = <<EOD
21
- <p> Jan 4 - <a href="/mdc/reb/1128609783.html">$348000 / 1br - Large 1/1 plus office on 49th Floor. 5-Star NEW Condo. Great Views -</a><font size="-1"> (Miami)</font> <span class="p"> pic&nbsp;img</span> &lt;&lt;<i><a href="/reb/">real&nbsp;estate - by broker</a></i></p>
22
- EOD
23
- search_html_three = <<EOD
24
- <p> Dec 31 - <a href="/mdc/reb/1128520894.html">$22,000 HOME -ADULT COMMUNITY BOYNTON BEACH -</a> <span class="p"> pic</span> &lt;&lt;<i><a href="/reb/">real&nbsp;estate - by broker</a></i></p>
25
- EOD
26
- search_html_four = <<EOD
27
- <p> Jul 22 - <a href="/mdc/reb/1128474725.html">$325000 / 3br - GOOD DEAL GREAT HOUSE AND LOCATION -</a><font size="-1"> (CORAL GABLES)</font> &lt;&lt;<i><a href="/reb/">real&nbsp;estate - by broker</a></i></p>
28
- EOD
29
- search_html_five = <<EOD
30
- <p> Apr 9 - <a href="/pbc/boa/1115308178.html">40' SILVERTON CONVERTIBLE DIESEL - $105000 -</a><font size="-1"> (HOBE SOUND)</font> <span class="p"> pic</span></p>
31
- EOD
32
- category_listing_one = <<EOD
33
- <p><a href="/pbc/reb/1128661387.html">$2995000 / 5br - Downtown Boca New Home To Be Built -</a><font size="-1"> (Boca Raton)</font> <span class="p"> pic</span> &lt;&lt;<i><a href="/reb/">real&nbsp;estate - by broker</a></i></p>
34
- EOD
35
- category_listing_two = <<EOD
36
- <p><a href="/mdc/jwl/1128691192.html">925 Sterling Silver Dragonfly Charm Bracelet - $25 -</a> <span class="p"> img</span></p>
37
- EOD
38
-
39
- one = CraigScrape::PostSummary.new Hpricot.parse(search_html_one).at('p')
40
- assert_equal true, one.has_img?
41
- assert_equal false, one.has_pic?
42
- assert_equal true, one.has_pic_or_img?
43
- assert_equal '/brw/reb/1128608404.html', one.href
44
- assert_equal "Losing your house? You'll need this New Loan Mod Video", one.label
45
- assert_equal "real\302\240estate - by broker", one.section
46
- assert_equal "W. Woodland", one.location
47
- assert_equal 4, one.date.month
48
- assert_equal 18, one.date.day
49
- assert_equal nil, one.price
50
-
51
- two = CraigScrape::PostSummary.new Hpricot.parse(search_html_two).at('p')
52
- assert_equal true, two.has_img?
53
- assert_equal true, two.has_pic?
54
- assert_equal true, two.has_pic_or_img?
55
- assert_equal '/mdc/reb/1128609783.html', two.href
56
- assert_equal "$348000 / 1br - Large 1/1 plus office on 49th Floor. 5-Star NEW Condo. Great Views", two.label
57
- assert_equal "real\302\240estate - by broker", two.section
58
- assert_equal "Miami", two.location
59
- assert_equal 1, two.date.month
60
- assert_equal 4, two.date.day
61
- assert_equal 348000.0, two.price
62
-
63
- three = CraigScrape::PostSummary.new Hpricot.parse(search_html_three).at('p')
64
- assert_equal false, three.has_img?
65
- assert_equal true, three.has_pic?
66
- assert_equal true, three.has_pic_or_img?
67
- assert_equal '/mdc/reb/1128520894.html', three.href
68
- assert_equal "$22,000 HOME -ADULT COMMUNITY BOYNTON BEACH", three.label
69
- assert_equal "real\302\240estate - by broker", three.section
70
- assert_equal nil, three.location
71
- assert_equal 12, three.date.month
72
- assert_equal 31, three.date.day
73
- assert_equal 22.0, three.price
74
-
75
- four = CraigScrape::PostSummary.new Hpricot.parse(search_html_four).at('p')
76
- assert_equal false, four.has_img?
77
- assert_equal false, four.has_pic?
78
- assert_equal false, four.has_pic_or_img?
79
- assert_equal '/mdc/reb/1128474725.html', four.href
80
- assert_equal "$325000 / 3br - GOOD DEAL GREAT HOUSE AND LOCATION", four.label
81
- assert_equal "real\302\240estate - by broker", four.section
82
- assert_equal "CORAL GABLES", four.location
83
- assert_equal 7, four.date.month
84
- assert_equal 22, four.date.day
85
- assert_equal 325000.0, four.price
86
-
87
- five = CraigScrape::PostSummary.new Hpricot.parse(search_html_five).at('p')
88
- assert_equal false, five.has_img?
89
- assert_equal true, five.has_pic?
90
- assert_equal true, five.has_pic_or_img?
91
- assert_equal '/pbc/boa/1115308178.html', five.href
92
- assert_equal "40' SILVERTON CONVERTIBLE DIESEL - $105000", five.label
93
- assert_equal nil, five.section
94
- assert_equal "HOBE SOUND", five.location
95
- assert_equal 4, five.date.month
96
- assert_equal 9, five.date.day
97
- assert_equal 105000.0, five.price
98
-
99
- five = CraigScrape::PostSummary.new Hpricot.parse(category_listing_one).at('p')
100
- assert_equal false, five.has_img?
101
- assert_equal true, five.has_pic?
102
- assert_equal true, five.has_pic_or_img?
103
- assert_equal '/pbc/reb/1128661387.html', five.href
104
- assert_equal "$2995000 / 5br - Downtown Boca New Home To Be Built", five.label
105
- assert_equal "real\302\240estate - by broker", five.section
106
- assert_equal "Boca Raton", five.location
107
- assert_equal nil, five.date
108
- assert_equal 2995000.0, five.price
109
-
110
- six = CraigScrape::PostSummary.new Hpricot.parse(category_listing_two).at('p')
111
- assert_equal true, six.has_img?
112
- assert_equal false, six.has_pic?
113
- assert_equal true, six.has_pic_or_img?
114
- assert_equal '/mdc/jwl/1128691192.html', six.href
115
- assert_equal "925 Sterling Silver Dragonfly Charm Bracelet - $25", six.label
116
- assert_equal nil, six.section
117
- assert_equal nil, six.location
118
- assert_equal nil, six.date
119
- assert_equal 25.0, six.price
11
+ assert_raise(CraigScrape::Scraper::ParseError) do
12
+ CraigScrape::Listings.new( relative_uri_for('google.html') ).posts
13
+ end
120
14
  end
121
15
 
122
16
  def test_listings_parse
123
- category = CraigScrape.scrape_listing relative_uri_for('listing_samples/category_output.html')
17
+ category = CraigScrape::Listings.new relative_uri_for('listing_samples/category_output.html')
124
18
  assert_equal 'index100.html', category.next_page_href
125
19
  assert_equal 100, category.posts.length
20
+
126
21
  category.posts[0..80].each do |l|
127
- assert_equal 4, l.date.month
128
- assert_equal 18, l.date.day
22
+ assert_equal 4, l.post_date.month
23
+ assert_equal 18, l.post_date.day
129
24
  end
130
25
 
131
- category2 = CraigScrape.scrape_listing relative_uri_for('listing_samples/category_output_2.html')
26
+ category2 = CraigScrape::Listings.new relative_uri_for('listing_samples/category_output_2.html')
132
27
  assert_equal 'index900.html', category2.next_page_href
133
28
  assert_equal 100, category2.posts.length
134
29
 
135
- long_search = CraigScrape.scrape_listing relative_uri_for('listing_samples/long_search_output.html')
30
+ long_search = CraigScrape::Listings.new relative_uri_for('listing_samples/long_search_output.html')
136
31
  assert_equal '/search/rea?query=house&minAsk=min&maxAsk=max&bedrooms=&s=800', long_search.next_page_href
137
32
  assert_equal 100, long_search.posts.length
138
33
 
139
- short_search = CraigScrape.scrape_listing relative_uri_for('listing_samples/short_search_output.html')
34
+ short_search = CraigScrape::Listings.new relative_uri_for('listing_samples/short_search_output.html')
140
35
  assert_equal nil, short_search.next_page_href
141
36
  assert_equal 93, short_search.posts.length
142
37
 
143
- mia_fua_index8900_052109 = CraigScrape.scrape_listing relative_uri_for('listing_samples/mia_fua_index8900.5.21.09.html')
38
+ mia_fua_index8900_052109 = CraigScrape::Listings.new relative_uri_for('listing_samples/mia_fua_index8900.5.21.09.html')
144
39
  assert_equal 'index9000.html', mia_fua_index8900_052109.next_page_href
145
40
  assert_equal 100, mia_fua_index8900_052109.posts.length
41
+ # NOTE: This tests a subtle condition where there's a blank h4 tag, and we shouldn't need to eager-load, since a solid inference can be made on the date, because it's not the last h4 on the page
42
+ # This actually happens quite a bit...
146
43
  mia_fua_index8900_052109.posts[0..13].each do |l|
147
- assert_equal 5, l.date.month
148
- assert_equal 15, l.date.day
44
+ assert_equal 5, l.post_date.month
45
+ assert_equal 15, l.post_date.day
149
46
  end
150
47
  mia_fua_index8900_052109.posts[14..99].each do |l|
151
- assert_equal 5, l.date.month
152
- assert_equal 14, l.date.day
48
+ assert_equal 5, l.post_date.month
49
+ assert_equal 14, l.post_date.day
153
50
  end
154
51
 
155
- empty_listings = CraigScrape.scrape_listing relative_uri_for('listing_samples/empty_listings.html')
52
+ empty_listings = CraigScrape::Listings.new relative_uri_for('listing_samples/empty_listings.html')
156
53
  assert_equal nil, empty_listings.next_page_href
157
54
  assert_equal [], empty_listings.posts
158
55
  end
159
-
160
- def test_posting_parse
161
- posting0 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting0.html')
162
- assert_equal "Has storage for videos/dvds. About 2 ft high by 21/2 ft widw. Almond/light beige color", posting0.contents
163
- assert_equal ["south florida craigslist", "miami / dade", "furniture - by owner"], posting0.full_section
164
- assert_equal "tv cart on wheels - $35 (NMB)", posting0.header
165
- assert_equal "tv cart on wheels", posting0.title
166
- assert_equal "NMB", posting0.location
167
- assert_equal 1131363612, posting0.posting_id
168
- assert_equal "sale-ktf9w-1131363612@craigslist.org", posting0.reply_to
169
- assert_equal [0, 21, 13, 20, 4, 2009, 1, 110, true, "EDT"], posting0.post_time.to_a
170
- assert_equal [], posting0.images
171
- assert_equal "Has storage for videos/dvds. About 2 ft high by 21/2 ft widw. Almond/light beige color",posting0.contents_as_plain
172
- assert_equal 35.0, posting0.price
173
-
174
- posting1 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting1.html')
175
- assert_equal "Residential income property\227Investors this property is for you! This duplex has a 2bedroom/1bath unit on each side. It features updated kitchens and baths (new tubs, toilet, sink, vanities), ceramic tile flooring throughout, separate water and electric meters and on site laundry facilities. It is also closed to the Galleria, beaches and downtown Fort Lauderdale! \r<br />\n\r<br />\nJe parle le Fran\347ais\r<br />\n\r<br />\nThis property is being offered by Blaunch Perrier, Broker Associate, Atlantic Properties International. Blaunch can be reached at 954-593-0077. For additional property information you may also visit www.garylanham.com\r<br />\n\r<br />", posting1.contents
176
- assert_equal ["south florida craigslist", "broward county", "real estate - by broker"], posting1.full_section
177
- assert_equal "$189900 / 4br - Investment Property--Duplex in Fort Lauderdale", posting1.header
178
- assert_equal "Investment Property--Duplex in Fort Lauderdale", posting1.title
179
- assert_equal '1000 NE 14th Pl', posting1.location
180
- assert_equal 1131242195, posting1.posting_id
181
- assert_equal "hous-5nzhq-1131242195@craigslist.org", posting1.reply_to
182
- assert_equal [0, 33, 13, 20, 4, 2009, 1, 110, true, "EDT"], posting1.post_time.to_a
183
- assert_equal %w(http://images.craigslist.org/3n83o33l5ZZZZZZZZZ94k913ac1582d4b1fa4.jpg http://images.craigslist.org/3n93p63obZZZZZZZZZ94k19d5e32eb3b610c2.jpg http://images.craigslist.org/3n93m03l6ZZZZZZZZZ94k6e9785e37a1b1f3f.jpg http://images.craigslist.org/3ma3oc3l4ZZZZZZZZZ94kbfecbcd2fb2e19cc.jpg), posting1.images
184
- assert_equal "Residential income property\227Investors this property is for you! This duplex has a 2bedroom/1bath unit on each side. It features updated kitchens and baths (new tubs, toilet, sink, vanities), ceramic tile flooring throughout, separate water and electric meters and on site laundry facilities. It is also closed to the Galleria, beaches and downtown Fort Lauderdale! \r\n\r\nJe parle le Fran\347ais\r\n\r\nThis property is being offered by Blaunch Perrier, Broker Associate, Atlantic Properties International. Blaunch can be reached at 954-593-0077. For additional property information you may also visit www.garylanham.com\r\n\r", posting1.contents_as_plain
185
- assert_equal 189900.0, posting1.price
186
-
187
- posting2 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting2.html')
188
- assert_equal 15775, posting2.contents.length # This is easy, and probably fine enough
189
- assert_equal ["south florida craigslist", "broward county", "cars & trucks - by dealer"], posting2.full_section
190
- assert_equal "PRESENTING A ELECTRON BLUE METALLIC 2002 CHEVROLET CORVETTE Z06 6 SPEE - $23975 (Fort Lauderdale)", posting2.header
191
- assert_equal "PRESENTING A ELECTRON BLUE METALLIC 2002 CHEVROLET CORVETTE Z06 6 SPEE", posting2.title
192
- assert_equal 'Fort Lauderdale', posting2.location
193
- assert_equal 1127037648, posting2.posting_id
194
- assert_equal nil, posting2.reply_to
195
- assert_equal [0, 16, 14, 17, 4, 2009, 5, 107, true, "EDT"], posting2.post_time.to_a
196
- assert_equal [], posting2.images
197
- assert_equal "\302\240 Sheehan Buick Pontiac GMC \302\240 Pompano Beach, FL(754) 224-3257 \302\240PRESENTING A ELECTRON BLUE METALLIC 2002 CHEVROLET CORVETTE Z06 6 SPEED FLORIDA DRIVEN SMOKIN' SPORTS CAR!2002 Chevrolet Corvette Z06 Florida Driven AutoCheck Certified 5.7L V8 6sp2 Door Coupe.\302\240Price: \302\240 $23,975Exterior:Electron Blue MetallicInterior:BlackStock#:P5110AVIN:1G1YY12S625129021FREE AutoCheck Vehicle ReportMileage:63,560Transmission:6 Speed ManualEngine:V8 5.7L OHVWarranty:Limited WarrantyTitle:Clear\302\273\302\240View All 58 Photos\302\273\302\240View Full Vehicle Details\302\273\302\240Ask the Seller a Question\302\273\302\240E-mail this to a Friend\302\240 DescriptionPRESENTING A ELECTRON BLUE METALLIC 2002 CHEVROLET CORVETTE Z06 6 SPEED FLORIDA DRIVEN SMOKIN' SPORTS CAR!\r\n\r\nLOADED WITH BLACK LEATHER BUCKET SEATS, POWER DRIVERS SEAT, DUAL ZONE CLIMATE CONTROL, 4 WHEEL ABS BRAKES, POWER STEERING AND BRAKES, REAR LIMITED SLIP DIFFERENTIAL, STABILITY CONTROL, CRUISE CONTROL, TLT STEERING WHEEL, POWER WINDOWS AND LOCKS, AUTOMATIC ON/OFF HEADLAMPS, FOG LIGHTS, DUAL AIR BAG SAFETY, AM/FM STEREO CD PLAYER, INTERMITTENT WINDSHIELD WIPERS AND SO MUCH MORE - THIS CAR IS TOTALLY HOT WITH GREAT LOW MILES!\r\n\r\nPlease call us to make your deal now at 1-888-453-5244. Please visit our Website at www.sheehanautoplex.com ***View 50+ Pictures of this vehicle - a complete description including standard features and all added options & a FREE AUTO CHECK REPORT at www.sheehanautoplex.com. ***Financing for Everyone - Good credit - bad credit - divorce - charge off's - NO PROBLEM. 
To complete a secure credit application, please visit our website at www.sheehanautoplex.com ***The largest Dealer in the State of Florida - We export all over the world - For details please visit www.sheehanautoplex.com ***Sheehan Autoplex takes great pride in our outstanding customer service and has been recognized by the following associations - BBB (Better Business Bureau) - NIADA - and the FIADA. Call us to get your best deal. CALL NOW. 1-888-453-5244\302\240 Contact Sheehan Buick Pontiac GMCPhone:(754) 224-3257Fax:(954) 781-9050Phone:(754) 224-3257E-mail:sales@proauto.comBusiness HoursWeekdays:9:00 AM to 9:00 PMSat:9:00 AM to 6:00 PMSun:",posting2.contents_as_plain
198
- assert_equal 23975.0, posting2.price
199
-
200
- posting3 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting3.html')
201
- assert_equal "1992 Twin Turbo 300ZX. This car is pearl white outside and Camel leather interior with suede accents. Motor was re-done from the ground up two years ago. 23,000 on new motor rebuild! New Leather seats and center arm rest done also two years ago. Has Alpine Am/Fm Cd with Ipod cable, Viper pager alarm New! JL Audio Amp & JLAudio sub box custom made. Mtx mids& highs component speakers sparate tweeter. Car runs strong & straight. Just detailed the interior. Exterior should be painted. This car once painted will sell for over $10,000. \r<br />\nCome get a great deal now! offers and trades will be considered. 786-303-6550 Manny", posting3.contents
202
- assert_equal ["south florida craigslist", "miami / dade", "cars & trucks - by owner"], posting3.full_section
203
- assert_equal "300ZX Nissan Twin Turbo 1992 - $5800 (N.Miami/ Hialeah)", posting3.header
204
- assert_equal "300ZX Nissan Twin Turbo 1992", posting3.title
205
- assert_equal "N.Miami/ Hialeah", posting3.location
206
- assert_equal 1130212403, posting3.posting_id
207
- assert_equal "sale-c9bpa-1130212403@craigslist.org", posting3.reply_to
208
- assert_equal [0, 21, 18, 19, 4, 2009, 0, 109, true, "EDT"], posting3.post_time.to_a
209
- assert_equal %w(http://images.craigslist.org/3n23kf3lfZZZZZZZZZ94j1160e7d7b0601934.jpg http://images.craigslist.org/3nc3kf3p2ZZZZZZZZZ94j04fbc71e0a551ace.jpg http://images.craigslist.org/3nc3k33l7ZZZZZZZZZ94k13d8d7b1024e1e0e.jpg http://images.craigslist.org/3n23k63mfZZZZZZZZZ94k7838ae5d48d91eb8.jpg), posting3.images
210
- assert_equal "1992 Twin Turbo 300ZX. This car is pearl white outside and Camel leather interior with suede accents. Motor was re-done from the ground up two years ago. 23,000 on new motor rebuild! New Leather seats and center arm rest done also two years ago. Has Alpine Am/Fm Cd with Ipod cable, Viper pager alarm New! JL Audio Amp & JLAudio sub box custom made. Mtx mids& highs component speakers sparate tweeter. Car runs strong & straight. Just detailed the interior. Exterior should be painted. This car once painted will sell for over $10,000. \r\nCome get a great deal now! offers and trades will be considered. 786-303-6550 Manny",posting3.contents_as_plain
211
- assert_equal 5800.0, posting3.price
212
56
 
213
- # This one ended up being quite a curveball since the user uploaded HTML was such junk:
214
- posting4 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting4.html')
215
- assert_equal 20640, posting4.contents.length
216
- assert_equal ["south florida craigslist", "broward county", "real estate - by broker"], posting4.full_section
217
- assert_equal "$225000 / 3br - Palm Aire Golf Corner Unit!", posting4.header
218
- assert_equal "Palm Aire Golf Corner Unit!", posting4.title
219
- assert_equal nil, posting4.location
220
- assert_equal 1139303170, posting4.posting_id
221
- assert_equal "hous-sk9f2-1139303170@craigslist.org", posting4.reply_to
222
- assert_equal [0, 8, 9, 25, 4, 2009, 6, 115, true, "EDT"], posting4.post_time.to_a
223
- assert_equal [], posting4.images
224
- assert_equal 6399,posting4.contents_as_plain.length
225
- assert_equal 225000.0, posting4.price
57
+ def test_eager_post_loading
58
+ # libcraigscrape is supposed to be 'smart' when downloading postings that don't make 'sense' solely by looking at the listings.
59
+ # I've only seen this on occasion, but it's annoying, and craigslist seems to use a lot of approximations sometimes
60
+ # The test page supplied is slightly adjusted to compensate for the lack of a web server when reading pages from the filesystem.
226
61
 
227
- posting5 = CraigScrape.scrape_full_post relative_uri_for('post_samples/posting5.html')
228
- assert_equal true, posting5.flagged_for_removal?
229
- assert_equal nil, posting5.contents
230
- assert_equal ["south florida craigslist", "palm beach co", "apts/housing for rent"], posting5.full_section
231
- assert_equal "This posting has been <a href=\"http://www.craigslist.org/about/help/flags_and_community_moderation\">flagged</a> for removal", posting5.header
232
- assert_equal nil, posting5.title
233
- assert_equal nil, posting5.location
234
- assert_equal nil, posting5.posting_id
235
- assert_equal nil, posting5.reply_to
236
- assert_equal nil, posting5.post_time
237
- assert_equal [], posting5.images
238
- assert_equal nil, posting5.contents_as_plain
239
- assert_equal nil, posting5.price
62
+ fortmyers_art_index500_060909 = CraigScrape::Listings.new relative_uri_for('listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html')
63
+ fortmyers_art_index500_060909.posts[0..12].each do |l|
64
+ assert_equal 5, l.post_date.month
65
+ assert_equal 16, l.post_date.day
66
+ end
67
+ fortmyers_art_index500_060909.posts[13..36].each do |l|
68
+ assert_equal 5, l.post_date.month
69
+ assert_equal 15, l.post_date.day
70
+ end
71
+ fortmyers_art_index500_060909.posts[37..41].each do |l|
72
+ assert_equal 5, l.post_date.month
73
+ assert_equal 14, l.post_date.day
74
+ end
75
+ fortmyers_art_index500_060909.posts[42..55].each do |l|
76
+ assert_equal 5, l.post_date.month
77
+ assert_equal 13, l.post_date.day
78
+ end
79
+ fortmyers_art_index500_060909.posts[56..65].each do |l|
80
+ assert_equal 5, l.post_date.month
81
+ assert_equal 12, l.post_date.day
82
+ end
83
+ fortmyers_art_index500_060909.posts[66..87].each do |l|
84
+ assert_equal 5, l.post_date.month
85
+ assert_equal 11, l.post_date.day
86
+ end
87
+ fortmyers_art_index500_060909.posts[88..94].each do |l|
88
+ assert_equal 5, l.post_date.month
89
+ assert_equal 10, l.post_date.day
90
+ end
91
+ assert_equal 4, fortmyers_art_index500_060909.posts[95].post_date.month
92
+ assert_equal 8, fortmyers_art_index500_060909.posts[95].post_date.day
93
+ assert_equal 2, fortmyers_art_index500_060909.posts[96].post_date.month
94
+ assert_equal 27, fortmyers_art_index500_060909.posts[96].post_date.day
95
+ assert_equal 2, fortmyers_art_index500_060909.posts[97].post_date.month
96
+ assert_equal 23, fortmyers_art_index500_060909.posts[97].post_date.day
97
+ assert_equal 1, fortmyers_art_index500_060909.posts[98].post_date.month
98
+ assert_equal 14, fortmyers_art_index500_060909.posts[98].post_date.day
99
+ assert_equal 12, fortmyers_art_index500_060909.posts[99].post_date.month
100
+ assert_equal 16, fortmyers_art_index500_060909.posts[99].post_date.day
101
+
102
+ # Now we'll do one of these elusive 'trailer' pages which don't seem to really make much sense.
103
+ # Best I can tell, it only comes after a page like the one tested just above
104
+ fortmyers_art_index600_060909 = CraigScrape::Listings.new relative_uri_for('listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html')
105
+ assert_equal "Husqvarna Viking Rose: Used Embroidery/Sewing Machine. Instruction book, Video, Embroidery Unit, 4\" 4\" hoop, designs, tool box with accessories including 8 feet (A, B, C, D, E, J, P, U and zipper foot). $400.00 Firm. (941) 347-8014 or (352)638-4707.", fortmyers_art_index600_060909.posts[0].contents
106
+ assert_equal "Husqvarna Viking Rose: Used Embroidery/Sewing Machine. Instruction book, Video, Embroidery Unit, 4\" 4\" hoop, designs, tool box with accessories including 8 feet (A, B, C, D, E, J, P, U and zipper foot). $400.00 Firm. (941) 347-8014 or (352)638-4707.", fortmyers_art_index600_060909.posts[0].contents_as_plain
107
+ assert_equal false, fortmyers_art_index600_060909.posts[0].deleted_by_author?
108
+ assert_equal true, fortmyers_art_index600_060909.posts[0].downloaded?
109
+ assert_equal false, fortmyers_art_index600_060909.posts[0].flagged_for_removal?
110
+ assert_equal ["fort myers craigslist", "art & crafts"], fortmyers_art_index600_060909.posts[0].full_section
111
+ assert_equal false, fortmyers_art_index600_060909.posts[0].has_img?
112
+ assert_equal true, fortmyers_art_index600_060909.posts[0].has_pic?
113
+ assert_equal true, fortmyers_art_index600_060909.posts[0].has_pic_or_img?
114
+ assert_equal "Husqvarna Viking Rose Embroidery-Sewing Machine - $400 (Punta Gorda, Charlotte County)", fortmyers_art_index600_060909.posts[0].header
115
+ assert_equal "Husqvarna Viking Rose Embroidery-Sewing Machine - $400 (Punta Gorda, Charlotte County)", fortmyers_art_index600_060909.posts[0].header_as_plain
116
+ assert_equal "897549505.html", fortmyers_art_index600_060909.posts[0].href
117
+ assert_equal [], fortmyers_art_index600_060909.posts[0].images
118
+ assert_equal [:pic], fortmyers_art_index600_060909.posts[0].img_types
119
+ assert_equal "Husqvarna Viking Rose Embroidery-Sewing Machine - $400", fortmyers_art_index600_060909.posts[0].label
120
+ assert_equal "Punta Gorda, Charlotte County", fortmyers_art_index600_060909.posts[0].location
121
+ assert_equal [], fortmyers_art_index600_060909.posts[0].pics
122
+ assert_equal [0, 0, 0, 28, 10, 2008, 2, 302, true, "EDT"], fortmyers_art_index600_060909.posts[0].post_date.to_a
123
+ assert_equal [0, 51, 21, 28, 10, 2008, 2, 302, true, "EDT"], fortmyers_art_index600_060909.posts[0].post_time.to_a
124
+ assert_equal 897549505, fortmyers_art_index600_060909.posts[0].posting_id
125
+ assert_equal 400.0, fortmyers_art_index600_060909.posts[0].price
126
+ assert_equal nil, fortmyers_art_index600_060909.posts[0].reply_to
127
+ assert_equal "art & crafts", fortmyers_art_index600_060909.posts[0].section
128
+ assert_equal false, fortmyers_art_index600_060909.posts[0].system_post?
129
+ assert_equal "Husqvarna Viking Rose Embroidery-Sewing Machine", fortmyers_art_index600_060909.posts[0].title
240
130
 
241
- posting_deleted = CraigScrape.scrape_full_post relative_uri_for('post_samples/this_post_has_been_deleted_by_its_author.html')
242
- assert_equal true, posting_deleted.deleted_by_author?
243
- assert_equal nil, posting_deleted.contents
244
- assert_equal ["south florida craigslist", "broward county", "cars & trucks - by owner"], posting_deleted.full_section
245
- assert_equal "This posting has been deleted by its author.", posting_deleted.header
246
- assert_equal nil, posting_deleted.title
247
- assert_equal nil, posting_deleted.location
248
- assert_equal nil, posting_deleted.posting_id
249
- assert_equal nil, posting_deleted.reply_to
250
- assert_equal nil, posting_deleted.post_time
251
- assert_equal [], posting_deleted.images
252
- assert_equal nil, posting_deleted.contents_as_plain
253
- assert_equal nil, posting_deleted.price
254
-
255
- posting6 = CraigScrape.scrape_full_post relative_uri_for('post_samples/1207457727.html')
256
- assert_equal "<p><br />Call!! asking for a new owner.<br /> no deposit required rent to own properties. <br /> <br /> Defaulting payment records are not a problem, <br /> we will help you protect the previous owners credit history! 202-567-6371 <br /><br /></p>",posting6.contents
257
- assert_equal "Call!! asking for a new owner. no deposit required rent to own properties. Defaulting payment records are not a problem, we will help you protect the previous owners credit history! 202-567-6371 ",posting6.contents_as_plain
258
- assert_equal false,posting6.deleted_by_author?
259
- assert_equal false,posting6.flagged_for_removal?
260
- assert_equal ["south florida craigslist", "broward county", "apts/housing for rent"],posting6.full_section
261
- assert_equal "$1350 / 3br - 2bth for no deposit req (Coral Springs)",posting6.header
262
- assert_equal ["http://images.craigslist.org/3k43pe3o8ZZZZZZZZZ9655022102a3ea51624.jpg", "http://images.craigslist.org/3n13m53p6ZZZZZZZZZ96596515e51237a179c.jpg", "http://images.craigslist.org/3od3p33leZZZZZZZZZ9656d614da8e3a51dd9.jpg", "http://images.craigslist.org/3pb3oa3leZZZZZZZZZ965eb60e4d2344019fb.jpg"],posting6.images
263
- assert_equal 'Coral Springs',posting6.location
264
- assert_equal [0, 56, 18, 5, 6, 2009, 5, 156, true, "EDT"],posting6.post_time.to_a
265
- assert_equal 1207457727,posting6.posting_id
266
- assert_equal 1350.0,posting6.price
267
- assert_equal "hous-ccpap-1207457727@craigslist.org",posting6.reply_to
268
- assert_equal "2bth for no deposit req",posting6.title
269
- end
270
-
271
- private
272
-
273
- def read_as_hpricot(test_file)
274
- Hpricot.parse(
275
- File.open('%s/%s' % [File.dirname(__FILE__), test_file]).read
276
- )
131
+ assert_equal "Multiple artists' moving sale. Lots of unusual items including art, art supplies, ceramics and ceramic glazes, furniture, clothes, books, electronics, cd's and much more. Also for sale is alot of restaurant equpment.\r<br />\n\r<br />\nSale to be held at 3570 Bayshore Dr. next to Bayshore Coffee Co.\r<br />\n\r<br />\nSaturday 8:00 a.m. until 2:00 Rain or shine.\r<br />", fortmyers_art_index600_060909.posts[1].contents
132
+ assert_equal "Multiple artists' moving sale. Lots of unusual items including art, art supplies, ceramics and ceramic glazes, furniture, clothes, books, electronics, cd's and much more. Also for sale is alot of restaurant equpment.\r\n\r\nSale to be held at 3570 Bayshore Dr. next to Bayshore Coffee Co.\r\n\r\nSaturday 8:00 a.m. until 2:00 Rain or shine.\r", fortmyers_art_index600_060909.posts[1].contents_as_plain
133
+ assert_equal false, fortmyers_art_index600_060909.posts[1].deleted_by_author?
134
+ assert_equal true, fortmyers_art_index600_060909.posts[1].downloaded?
135
+ assert_equal false, fortmyers_art_index600_060909.posts[1].flagged_for_removal?
136
+ assert_equal ["fort myers craigslist", "art & crafts"], fortmyers_art_index600_060909.posts[1].full_section
137
+ assert_equal false, fortmyers_art_index600_060909.posts[1].has_img?
138
+ assert_equal false, fortmyers_art_index600_060909.posts[1].has_pic?
139
+ assert_equal false, fortmyers_art_index600_060909.posts[1].has_pic_or_img?
140
+ assert_equal "ARTISTS' MOVING SALE-BAYSHORE (Naples)", fortmyers_art_index600_060909.posts[1].header
141
+ assert_equal "ARTISTS' MOVING SALE-BAYSHORE (Naples)", fortmyers_art_index600_060909.posts[1].header_as_plain
142
+ assert_equal "891513957.html", fortmyers_art_index600_060909.posts[1].href
143
+ assert_equal [], fortmyers_art_index600_060909.posts[1].images
144
+ assert_equal [], fortmyers_art_index600_060909.posts[1].img_types
145
+ assert_equal "ARTISTS' MOVING SALE-BAYSHORE", fortmyers_art_index600_060909.posts[1].label
146
+ assert_equal "Naples", fortmyers_art_index600_060909.posts[1].location
147
+ assert_equal [], fortmyers_art_index600_060909.posts[1].pics
148
+ assert_equal [0, 0, 0, 24, 10, 2008, 5, 298, true, "EDT"], fortmyers_art_index600_060909.posts[1].post_date.to_a
149
+ assert_equal [0, 31, 9, 24, 10, 2008, 5, 298, true, "EDT"], fortmyers_art_index600_060909.posts[1].post_time.to_a
150
+ assert_equal 891513957, fortmyers_art_index600_060909.posts[1].posting_id
151
+ assert_equal nil, fortmyers_art_index600_060909.posts[1].price
152
+ assert_equal "sale-891513957@craigslist.org", fortmyers_art_index600_060909.posts[1].reply_to
153
+ assert_equal "art & crafts", fortmyers_art_index600_060909.posts[1].section
154
+ assert_equal false, fortmyers_art_index600_060909.posts[1].system_post?
155
+ assert_equal "ARTISTS' MOVING SALE-BAYSHORE", fortmyers_art_index600_060909.posts[1].title
156
+
157
+ assert_equal "Tapestry sewing machine and embroidery arm luggage for Viking designer sewing machine. Two years old in excellent condition.", fortmyers_art_index600_060909.posts[2].contents
158
+ assert_equal "Tapestry sewing machine and embroidery arm luggage for Viking designer sewing machine. Two years old in excellent condition.", fortmyers_art_index600_060909.posts[2].contents_as_plain
159
+ assert_equal false, fortmyers_art_index600_060909.posts[2].deleted_by_author?
160
+ assert_equal true, fortmyers_art_index600_060909.posts[2].downloaded?
161
+ assert_equal false, fortmyers_art_index600_060909.posts[2].flagged_for_removal?
162
+ assert_equal ["fort myers craigslist", "art & crafts"], fortmyers_art_index600_060909.posts[2].full_section
163
+ assert_equal false, fortmyers_art_index600_060909.posts[2].has_img?
164
+ assert_equal false, fortmyers_art_index600_060909.posts[2].has_pic?
165
+ assert_equal false, fortmyers_art_index600_060909.posts[2].has_pic_or_img?
166
+ assert_equal "tapestry sewing machine and embroidery arm luggage - $250 (Punta Gorda)", fortmyers_art_index600_060909.posts[2].header
167
+ assert_equal "tapestry sewing machine and embroidery arm luggage - $250 (Punta Gorda)", fortmyers_art_index600_060909.posts[2].header_as_plain
168
+ assert_equal "825684735.html", fortmyers_art_index600_060909.posts[2].href
169
+ assert_equal [], fortmyers_art_index600_060909.posts[2].images
170
+ assert_equal [], fortmyers_art_index600_060909.posts[2].img_types
171
+ assert_equal "tapestry sewing machine and embroidery arm luggage - $250", fortmyers_art_index600_060909.posts[2].label
172
+ assert_equal "Punta Gorda", fortmyers_art_index600_060909.posts[2].location
173
+ assert_equal [], fortmyers_art_index600_060909.posts[2].pics
174
+ assert_equal [0, 0, 0, 3, 9, 2008, 3, 247, true, "EDT"], fortmyers_art_index600_060909.posts[2].post_date.to_a
175
+ assert_equal [0, 31, 15, 3, 9, 2008, 3, 247, true, "EDT"], fortmyers_art_index600_060909.posts[2].post_time.to_a
176
+ assert_equal 825684735, fortmyers_art_index600_060909.posts[2].posting_id
177
+ assert_equal 250.0, fortmyers_art_index600_060909.posts[2].price
178
+ assert_equal "sale-825684735@craigslist.org", fortmyers_art_index600_060909.posts[2].reply_to
179
+ assert_equal "art & crafts", fortmyers_art_index600_060909.posts[2].section
180
+ assert_equal false, fortmyers_art_index600_060909.posts[2].system_post?
181
+ assert_equal "tapestry sewing machine and embroidery arm luggage", fortmyers_art_index600_060909.posts[2].title
182
+
183
+ assert_equal "Gorgeous and one of a kind! Museum-collected artist Jay von Koffler's Aurora Series - cast glass nude sculpture - Aurora. Mounted on marble and enhanced with bronze beak. \r<br />\n\r<br />\nDimensions: 30x16x6\r<br />\nCall for appointment for studio viewing - 239.595.1793", fortmyers_art_index600_060909.posts[3].contents
184
+ assert_equal "Gorgeous and one of a kind! Museum-collected artist Jay von Koffler's Aurora Series - cast glass nude sculpture - Aurora. Mounted on marble and enhanced with bronze beak. \r\n\r\nDimensions: 30x16x6\r\nCall for appointment for studio viewing - 239.595.1793", fortmyers_art_index600_060909.posts[3].contents_as_plain
185
+ assert_equal false, fortmyers_art_index600_060909.posts[3].deleted_by_author?
186
+ assert_equal true, fortmyers_art_index600_060909.posts[3].downloaded?
187
+ assert_equal false, fortmyers_art_index600_060909.posts[3].flagged_for_removal?
188
+ assert_equal ["fort myers craigslist", "art & crafts"], fortmyers_art_index600_060909.posts[3].full_section
189
+ assert_equal false, fortmyers_art_index600_060909.posts[3].has_img?
190
+ assert_equal true, fortmyers_art_index600_060909.posts[3].has_pic?
191
+ assert_equal true, fortmyers_art_index600_060909.posts[3].has_pic_or_img?
192
+ assert_equal "Cast Glass Sculpture - Aurora - $2400 (Naples)", fortmyers_art_index600_060909.posts[3].header
193
+ assert_equal "Cast Glass Sculpture - Aurora - $2400 (Naples)", fortmyers_art_index600_060909.posts[3].header_as_plain
194
+ assert_equal "823516079.html", fortmyers_art_index600_060909.posts[3].href
195
+ assert_equal [], fortmyers_art_index600_060909.posts[3].images
196
+ assert_equal [:pic], fortmyers_art_index600_060909.posts[3].img_types
197
+ assert_equal "Cast Glass Sculpture - Aurora - $2400", fortmyers_art_index600_060909.posts[3].label
198
+ assert_equal "Naples", fortmyers_art_index600_060909.posts[3].location
199
+ assert_equal [], fortmyers_art_index600_060909.posts[3].pics
200
+ assert_equal [0, 0, 0, 2, 9, 2008, 2, 246, true, "EDT"], fortmyers_art_index600_060909.posts[3].post_date.to_a
201
+ assert_equal [0, 35, 10, 2, 9, 2008, 2, 246, true, "EDT"], fortmyers_art_index600_060909.posts[3].post_time.to_a
202
+ assert_equal 823516079, fortmyers_art_index600_060909.posts[3].posting_id
203
+ assert_equal 2400.0, fortmyers_art_index600_060909.posts[3].price
204
+ assert_equal "sale-823516079@craigslist.org", fortmyers_art_index600_060909.posts[3].reply_to
205
+ assert_equal "art & crafts", fortmyers_art_index600_060909.posts[3].section
206
+ assert_equal false, fortmyers_art_index600_060909.posts[3].system_post?
207
+ assert_equal "Cast Glass Sculpture - Aurora", fortmyers_art_index600_060909.posts[3].title
277
208
  end
278
209
 
279
- def relative_uri_for(filename)
280
- 'file://%s/%s' % [File.dirname(File.expand_path(__FILE__)), filename]
281
- end
282
-
283
- def pp_assertions(obj, obj_name)
284
- probable_accessors = (obj.methods-obj.class.superclass.methods)
285
-
286
- puts
287
- probable_accessors.sort.each do |m|
288
- val = obj.send(m.to_sym)
289
-
290
- # There's a good number of transformations worth doing here, I'll just start like this for now:
291
- if val.kind_of? Time
292
- # I've decided this is the the easiest way to understand and test a time
293
- val = val.to_a
294
- m = "#{m}.to_a"
295
- end
296
-
297
- puts "assert_equal %s, %s.%s" % [val.inspect,obj_name,m]
298
- end
210
+ def test_nasty_search_listings
211
+ miami_search_sss_rack900_061809 = CraigScrape::Listings.new relative_uri_for('listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html')
212
+ assert_equal '/search/sss?query=rack&s=1000', miami_search_sss_rack900_061809.next_page_href
213
+
214
+ miami_search_sss_rack1000_061809 = CraigScrape::Listings.new relative_uri_for('listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html')
215
+ assert_equal nil, miami_search_sss_rack1000_061809.next_page_href
299
216
  end
217
+
218
+
300
219
  end