libcraigscrape 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +5 -0
- data/Rakefile +1 -1
- data/lib/libcraigscrape.rb +13 -6
- data/lib/listings.rb +3 -3
- data/lib/scraper.rb +0 -8
- data/test/listing_samples/mia_sss_kittens2.10.10.html +144 -0
- data/test/test_craigslist_listing.rb +5 -6
- metadata +3 -2
data/CHANGELOG
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
== Change Log
|
2
2
|
|
3
|
+
=== Release 0.8.1 (Feb 10, 2010)
|
4
|
+
- Found an odd parsing bug that occurred for the first time today. Scrape sample is in 'listing_samples/mia_sss_kittens2.10.10.html'. Adjusted CraigScrape::Listings::LABEL to fix.
|
5
|
+
- Switched to require "active_support" per the deprecation notices
|
6
|
+
- Little adjustments to fix the rdoc readability
|
7
|
+
|
3
8
|
=== Release 0.8.0 (Oct 22, 2009)
|
4
9
|
- Lots of substantial changes to the API & craigwatch (though backwards compatibility is mostly there)
|
5
10
|
- Added :code_tests to the rakefile
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ include FileUtils
|
|
11
11
|
RbConfig = Config unless defined? RbConfig
|
12
12
|
|
13
13
|
NAME = "libcraigscrape"
|
14
|
-
VERS = ENV['VERSION'] || "0.8.
|
14
|
+
VERS = ENV['VERSION'] || "0.8.1"
|
15
15
|
PKG = "#{NAME}-#{VERS}"
|
16
16
|
|
17
17
|
RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
|
data/lib/libcraigscrape.rb
CHANGED
@@ -3,16 +3,19 @@
|
|
3
3
|
# All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
4
4
|
#
|
5
5
|
|
6
|
+
require 'net/http'
|
7
|
+
require 'zlib'
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'active_support'
|
11
|
+
require 'hpricot'
|
12
|
+
require 'htmlentities'
|
13
|
+
|
6
14
|
# A base class encapsulating the various libcraigscrape objects, and providing most of the
|
7
15
|
# craigslist interaction methods. Currently, we're supporting the old Class methods
|
8
16
|
# in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
|
9
17
|
# create an instance of the Craigslist object, and use its Public Instance methods.
|
10
18
|
# See the README for easy to follow examples.
|
11
|
-
class CraigScrape; end
|
12
|
-
|
13
|
-
require 'listings'
|
14
|
-
require 'posting'
|
15
|
-
require 'geo_listings'
|
16
19
|
|
17
20
|
class CraigScrape
|
18
21
|
cattr_accessor :time_now
|
@@ -202,4 +205,8 @@ class CraigScrape
|
|
202
205
|
ret
|
203
206
|
end
|
204
207
|
|
205
|
-
end
|
208
|
+
end
|
209
|
+
|
210
|
+
require 'listings'
|
211
|
+
require 'posting'
|
212
|
+
require 'geo_listings'
|
data/lib/listings.rb
CHANGED
@@ -9,7 +9,7 @@ require 'scraper'
|
|
9
9
|
|
10
10
|
# Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
|
11
11
|
class CraigScrape::Listings < CraigScrape::Scraper
|
12
|
-
LABEL = /^(.+?)[ ]
|
12
|
+
LABEL = /^(.+?)[ ]*[\-]?$/
|
13
13
|
LOCATION = /^[ ]*\((.*?)\)$/
|
14
14
|
IMG_TYPE = /^[ ]*(.+)[ ]*$/
|
15
15
|
HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
|
@@ -23,7 +23,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
23
23
|
@posts = []
|
24
24
|
|
25
25
|
post_tags = html.get_elements_by_tag_name('p','h4')
|
26
|
-
|
26
|
+
|
27
27
|
# The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
|
28
28
|
post_tags.pop if (
|
29
29
|
post_tags.length > 0 and
|
@@ -36,7 +36,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
36
36
|
case el.name
|
37
37
|
when 'p'
|
38
38
|
post_summary = self.class.parse_summary el, current_date
|
39
|
-
|
39
|
+
|
40
40
|
# Validate that required fields are present:
|
41
41
|
parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
|
42
42
|
|
data/lib/scraper.rb
CHANGED
@@ -9,14 +9,6 @@
|
|
9
9
|
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
10
10
|
#
|
11
11
|
|
12
|
-
require 'net/http'
|
13
|
-
require 'zlib'
|
14
|
-
|
15
|
-
require 'rubygems'
|
16
|
-
require 'activesupport'
|
17
|
-
require 'hpricot'
|
18
|
-
require 'htmlentities'
|
19
|
-
|
20
12
|
# Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
|
21
13
|
# functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
|
22
14
|
# methods. It also contains the http-related cattr_accessors:
|
@@ -0,0 +1,144 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html><head>
|
3
|
+
<title>south florida pets classifieds "kitten" - craigslist</title>
|
4
|
+
|
5
|
+
<meta name="description" content="craigslist pets classifieds for south florida "kitten" " />
|
6
|
+
<meta name="keywords" content="south florida pets craigslist, classifieds, want ads " />
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
<link href="/search/pet?query=kitten&catAbbreviation=pet&hasPic=1&format=rss" title="RSS feed for craigslist | pets "kitten" in south florida " rel="alternate" type="application/rss+xml" />
|
11
|
+
<link href="http://www.craigslist.org/styles/craigslist.css" title="craigslist" rel="stylesheet" media="all" type="text/css" />
|
12
|
+
<link href="http://www.craigslist.org/favicon.ico" rel="shortcut icon" type="image/x-icon" />
|
13
|
+
</head>
|
14
|
+
|
15
|
+
<body class="toc">
|
16
|
+
|
17
|
+
<a name="top"></a>
|
18
|
+
|
19
|
+
<div class="bchead">
|
20
|
+
<span id="ef">
|
21
|
+
|
22
|
+
[ <a href="http://www.craigslist.org/about/help/">help</a> ]
|
23
|
+
[ <a href="https://post.craigslist.org/mia/C">post</a> ]</span>
|
24
|
+
|
25
|
+
<a href="/"> south florida craigslist</a> > <a href="/ccc/">community</a> > <a href="/pet/">pets</a>
|
26
|
+
<div id="satabs"> <b>all south florida</b> <a href="/search/pet/mdc?hasPic=1&query=kitten">miami / dade</a> <a href="/search/pet/brw?hasPic=1&query=kitten">broward county</a> <a href="/search/pet/pbc?hasPic=1&query=kitten">palm beach co</a> </div>
|
27
|
+
|
28
|
+
</div>
|
29
|
+
|
30
|
+
<blockquote>
|
31
|
+
<form action="/search/pet" onsubmit="ckCAbb();" method="get">
|
32
|
+
|
33
|
+
<script type="text/javascript"><!--
|
34
|
+
var cabb = "pet";
|
35
|
+
-->
|
36
|
+
</script>
|
37
|
+
|
38
|
+
<table summary="" cellpadding="2" style="white-space: nowrap; background:#eee; border:1px solid gray;" width="100%">
|
39
|
+
<tr>
|
40
|
+
<td align="right" width="1">search for:</td>
|
41
|
+
|
42
|
+
<td width="30%"><input name="query" size="30" id="query" value="kitten" /> in:
|
43
|
+
<select name="catAbbreviation" id="cAbb">
|
44
|
+
|
45
|
+
<option value="ccc">all community
|
46
|
+
</option><option value="" disabled>--
|
47
|
+
</option><option value="act"> activity partners
|
48
|
+
</option><option value="ats"> artists
|
49
|
+
</option><option value="kid"> childcare
|
50
|
+
</option><option value="com"> general
|
51
|
+
</option><option value="grp"> groups
|
52
|
+
</option><option value="vnn"> local news and views
|
53
|
+
</option><option value="laf"> lost & found
|
54
|
+
</option><option value="muc"> musicians
|
55
|
+
</option><option selected value="pet"> pets
|
56
|
+
</option><option value="pol"> politics
|
57
|
+
</option><option value="rid"> rideshare
|
58
|
+
</option><option value="vol"> volunteers
|
59
|
+
</option><option value="" disabled>--
|
60
|
+
|
61
|
+
</option><option value="eee">all event
|
62
|
+
|
63
|
+
</option><option value="sss">all for sale / wanted
|
64
|
+
|
65
|
+
</option><option value="ggg">all gigs
|
66
|
+
|
67
|
+
</option><option value="hhh">all housing
|
68
|
+
|
69
|
+
</option><option value="jjj">all jobs
|
70
|
+
|
71
|
+
</option><option value="ppp">all personals
|
72
|
+
|
73
|
+
</option><option value="res">all resume
|
74
|
+
|
75
|
+
</option><option value="bbb">all services offered
|
76
|
+
</option></select>
|
77
|
+
<input type="submit" value="Search" />
|
78
|
+
</td><td>
|
79
|
+
<label><input name="srchType" title="check this box to search only posting titles" type="checkbox" value="T" /> only search titles</label>
|
80
|
+
</td>
|
81
|
+
</tr>
|
82
|
+
<tr>
|
83
|
+
<td align="right" width="1"></td>
|
84
|
+
<td></td>
|
85
|
+
<td align="left"><label><input checked name="hasPic" type="checkbox" value="1" /> has image</label></td>
|
86
|
+
</tr>
|
87
|
+
</table>
|
88
|
+
</form>
|
89
|
+
</blockquote>
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
<blockquote>
|
94
|
+
<table summary="" width="100%">
|
95
|
+
<tr>
|
96
|
+
<td valign="top"></td>
|
97
|
+
<td id="messages" valign="top"></td>
|
98
|
+
</tr>
|
99
|
+
</table>
|
100
|
+
|
101
|
+
<div>Sort by:<strong> most recent</strong> <a href="/search/pet?catAbbreviation=pet&hasPic=1&query=kitten&sort=rel">best match</a>
|
102
|
+
<div class="sh" style="text-align: center;">
|
103
|
+
<span style="float: left;"> </span>
|
104
|
+
<span style="float: right;"> </span>
|
105
|
+
<b>Found: 43 Displaying: 1 - 43</b>
|
106
|
+
|
107
|
+
</div>
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
<!-- sphinx: 0.005432, csd: 0.032165, total: 0.037597 via 8p -->
|
112
|
+
|
113
|
+
<p> Feb 10 - <a href="/brw/pet/1594409444.html">Two adorable kittens need new homes</a> - <font size="-1"> (ft lauderdale)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1593057827.html">ALL BLACK KITTEN</a> - <font size="-1"> (Pembroke Pines)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1593019152.html">Beautiful Little Boy Kitten</a> - <font size="-1"> (WESTON)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1593011946.html">Four Paw Declawed Female Cat</a> - <font size="-1"> (WESTON)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1592999746.html">Two Beautiful Kitten Brothers</a> - <font size="-1"> (WESTON)</font> <span class="p"> pic</span></p><p> Feb 7 - <a href="/brw/pet/1590495543.html">kitten - 7 months</a> - <font size="-1"> (coral springs/ coconut creek)</font> <span class="p"> pic</span></p><p> Feb 7 - <a href="/mdc/pet/1590158454.html">Reiki desperately needs a loving forever home</a> - <font size="-1"> (Miami)</font> <span class="p"> pic</span></p><p> Feb 7 - <a href="/brw/pet/1589882794.html">Free Kitten</a> - <font size="-1"> (Broward)</font> <span class="p"> pic</span></p><p> Feb 6 - <a href="/mdc/pet/1589384985.html">Two Beautiful, Lovable Kittens Need a New Home</a> - <font size="-1"> (Miami Beach, FL)</font> <span class="p"> pic</span></p><p> Feb 6 - <a href="/pbc/pet/1588600616.html">Love kittens, want to make a HUGE difference? 
Volunteer!</a> - <font size="-1"> (South Florida)</font> <span class="p"> pic</span></p><p> Feb 5 - <a href="/pbc/pet/1588284560.html">kittens to a good home!!</a> - <font size="-1"> (lake worth)</font> <span class="p"> pic</span></p><p> Feb 3 - <a href="/mdc/pet/1583880286.html">GREATEST KITTEN EVER</a> - <font size="-1"> (MIAMI BEACH)</font> <span class="p"> pic</span></p><p> Jan 30 - <a href="/brw/pet/1578696972.html">Kitten Needs New Home</a> - <font size="-1"> (Pembroke Pines, FL)</font> <span class="p"> pic</span></p><p> Jan 30 - <a href="/brw/pet/1578368357.html">great kitten that needs a good home</a> - <font size="-1"> (hollywood fl)</font> <span class="p"> pic</span></p><p> Jan 30 - <a href="/brw/pet/1577821318.html">Jasper from Twilight looking for home </a> - <font size="-1"> (pompano beach, FL)</font> <span class="p"> pic</span></p><p> Jan 29 - <a href="/pbc/pet/1576749013.html">Foster Moms needed for KITTENS under 12 weeks of age</a> - <font size="-1"> (PBC - Alleys to Eden, Inc.)</font> <span class="p"> pic img</span></p><p> Jan 29 - <a href="/mdc/pet/1575892469.html">Kitten! Kitten! Kitten!</a> - <font size="-1"> (Miami)</font> <span class="p"> pic</span></p><p> Jan 27 - <a href="/brw/pet/1573858828.html">cute cute cute kitten</a> - <font size="-1"> (hollywood fl)</font> <span class="p"> pic</span></p><p> Jan 27 - <a href="/pbc/pet/1573555280.html">Looking for Neonatal Kitten Foster Moms for Kitten Rescue Group</a> - <font size="-1"> (Alleys to Eden, Inc.)</font> <span class="p"> pic img</span></p><p> Jan 27 - <a href="/mdc/pet/1573502704.html">4 Month old kitten for adoption</a> - <span class="p"> pic</span></p><p> Jan 26 - <a href="/mdc/pet/1571904030.html">kendall kittens (:</a> - <font size="-1"> (kendall)</font> <span class="p"> pic</span></p><p> Jan 26 - <a href="/mdc/pet/1571446135.html">Kitten! Kitten! 
Kitten!</a> - <font size="-1"> (Miami)</font> <span class="p"> pic</span></p><p> Jan 26 - <a href="/pbc/pet/1571440092.html">Tigger - ex-feral kitten - needs a BIG animal lover to give him a home</a> - <font size="-1"> (Alleys to Eden, Inc.)</font> <span class="p"> pic img</span></p><p> Jan 26 - <a href="/brw/pet/1570894156.html">Looking for sweet kitten</a> - <span class="p"> pic</span></p><p> Jan 25 - <a href="/pbc/pet/1570181661.html">solid white female kitten </a> - <font size="-1"> (lake worth )</font> <span class="p"> pic</span></p><p> Jan 24 - <a href="/pbc/pet/1567900657.html">A Big Love with 3 Legs =)</a> - <font size="-1"> (Wellington)</font> <span class="p"> pic</span></p><p> Jan 23 - <a href="/pbc/pet/1567667685.html">assorted kittens ~ rescued</a> - <font size="-1"> (southeast florida)</font> <span class="p"> pic</span></p><p> Jan 23 - <a href="/pbc/pet/1567302066.html">FREE Short haired Black Kitten</a> - <font size="-1"> (Juno)</font> <span class="p"> pic</span></p><p> Jan 22 - <a href="/mdc/pet/1565371176.html"> kitten to loving home :)</a> - <font size="-1"> (homestead)</font> <span class="p"> pic</span></p><p> Jan 21 - <a href="/mdc/pet/1563568531.html">Kitten! Kitten! 
Kitten!</a> - <font size="-1"> (Doral Miami)</font> <span class="p"> pic</span></p><p> Jan 19 - <a href="/mdc/pet/1560858677.html">kitten (siamese)</a> - <span class="p"> pic</span></p><p> Jan 18 - <a href="/brw/pet/1558744645.html">6 month old kitten</a> - <font size="-1"> (davie)</font> <span class="p"> pic</span></p><p> Jan 17 - <a href="/brw/pet/1556991233.html">SWEET FEMALE CAT</a> - <font size="-1"> (POMPANO BCH)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/mdc/pet/1556281733.html">Please adopt CJ the kitten</a> - <font size="-1"> (Miami, FL)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/pbc/pet/1555719038.html">Cute little Siamese kitten 6 months old needs a home</a> - <font size="-1"> (West Palm Beach)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/brw/pet/1555517479.html">cute kitten reallly needs a loving home!!</a> - <span class="p"> pic</span></p><p> Jan 16 - <a href="/brw/pet/1555410876.html">FEMALE 8 WEEK OLD KITTEN</a> - <font size="-1"> (POMPANO BCH)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/mdc/pet/1555345484.html">Kitten! Kitten! 
Kitten!</a> - <font size="-1"> (Doral Miami)</font> <span class="p"> pic</span></p><p> Jan 14 - <a href="/brw/pet/1553407913.html">Kitty to good home</a> - <span class="p"> pic</span></p><p> Jan 13 - <a href="/pbc/pet/1551406560.html">Gorgeous Black and Grey Kitten!</a> - <span class="p"> pic</span></p><p> Jan 13 - <a href="/brw/pet/1550606813.html">Help, I need to move temp and a want to keep my cat</a> - <font size="-1"> (Wilton Manors)</font> <span class="p"> pic</span></p><p> Jan 13 - <a href="/mdc/pet/1550593272.html">Adopt these two kittens that survived...</a> - <font size="-1"> (Downtown Miami)</font> <span class="p"> pic</span></p><p> Jan 12 - <a href="/mdc/pet/1549484478.html">Free Kitten to Good Home</a> - <font size="-1"> (miami)</font> <span class="p"> pic</span></p><br /><div>Sort by:<strong> most recent</strong> <a href="/search/pet?catAbbreviation=pet&hasPic=1&query=kitten&sort=rel">best match</a>
|
114
|
+
<div class="sh" style="text-align: center;">
|
115
|
+
<span style="float: left;"> </span>
|
116
|
+
<span style="float: right;"> </span>
|
117
|
+
<b>Found: 43 Displaying: 1 - 43</b>
|
118
|
+
|
119
|
+
</div>
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
<div id="footer">
|
126
|
+
<hr />
|
127
|
+
<span id="copy">
|
128
|
+
Copyright © 2010 craigslist, inc.<br />
|
129
|
+
</span>
|
130
|
+
<span class="rss">
|
131
|
+
<a href="/search/pet?query=kitten&catAbbreviation=pet&hasPic=1&format=rss" class="l">RSS</a>
|
132
|
+
<a href="http://www.craigslist.org/about/rss">(?)</a><br />
|
133
|
+
</span>
|
134
|
+
</div>
|
135
|
+
<br /><br />
|
136
|
+
|
137
|
+
<div id="floater"> </div>
|
138
|
+
|
139
|
+
</div></div></blockquote>
|
140
|
+
<script src="http://www.craigslist.org/js/jquery.js" type="text/javascript"></script>
|
141
|
+
<script src="http://www.craigslist.org/js/tocs.js" type="text/javascript"></script>
|
142
|
+
|
143
|
+
</body>
|
144
|
+
</html>
|
@@ -7,12 +7,6 @@ require File.dirname(__FILE__)+'/libcraigscrape_test_helpers'
|
|
7
7
|
class CraigslistListingTest < Test::Unit::TestCase
|
8
8
|
include LibcraigscrapeTestHelpers
|
9
9
|
|
10
|
-
def test_pukes
|
11
|
-
assert_raise(CraigScrape::Scraper::ParseError) do
|
12
|
-
CraigScrape::Listings.new( relative_uri_for('google.html') ).posts
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
10
|
def test_listings_parse
|
17
11
|
category = CraigScrape::Listings.new relative_uri_for('listing_samples/category_output.html')
|
18
12
|
assert_equal 'index100.html', category.next_page_href
|
@@ -213,6 +207,11 @@ class CraigslistListingTest < Test::Unit::TestCase
|
|
213
207
|
|
214
208
|
miami_search_sss_rack1000_061809 = CraigScrape::Listings.new relative_uri_for('listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html')
|
215
209
|
assert_equal nil, miami_search_sss_rack1000_061809.next_page_href
|
210
|
+
|
211
|
+
# The first post in these results was causing problems at one point, due to the CraigScrape::Listings::LABEL regex
|
212
|
+
mia_sss_kittens021010 = CraigScrape::Listings.new relative_uri_for('listing_samples/mia_sss_kittens2.10.10.html')
|
213
|
+
assert_equal 'Two adorable kittens need new homes', mia_sss_kittens021010.posts[0].label
|
214
|
+
assert_equal '/brw/pet/1594409444.html', mia_sss_kittens021010.posts[0].href
|
216
215
|
end
|
217
216
|
|
218
217
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libcraigscrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris DeRose, DeRose Technologies, Inc.
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-02-10 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -85,6 +85,7 @@ files:
|
|
85
85
|
- test/listing_samples/fortmyers_art_index.060909/1112522674.html
|
86
86
|
- test/listing_samples/fortmyers_art_index.060909/823516079.html
|
87
87
|
- test/listing_samples/category_output_2.html
|
88
|
+
- test/listing_samples/mia_sss_kittens2.10.10.html
|
88
89
|
- test/listing_samples/long_search_output.html
|
89
90
|
- test/listing_samples/miami_search_sss_rack.6.18.09
|
90
91
|
- test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html
|