libcraigscrape 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,5 +1,10 @@
1
1
  == Change Log
2
2
 
3
+ === Release 0.8.1 (Feb 10, 2010)
4
+ - Found an odd parsing bug that occurred for the first time today. Scrape sample is in 'listing_samples/mia_sss_kittens2.10.10.html'. Adjusted CraigScrape::Listings::LABEL to fix.
5
+ - Switched to require "active_support" per the deprecation notices
6
+ - Little adjustments to fix the rdoc readability
7
+
3
8
  === Release 0.8.0 (Oct 22, 2009)
4
9
  - Lots of substantial changes to the API & craigwatch (though backwards compatibility is mostly there)
5
10
  - Added :code_tests to the rakefile
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ include FileUtils
11
11
  RbConfig = Config unless defined? RbConfig
12
12
 
13
13
  NAME = "libcraigscrape"
14
- VERS = ENV['VERSION'] || "0.8.0"
14
+ VERS = ENV['VERSION'] || "0.8.1"
15
15
  PKG = "#{NAME}-#{VERS}"
16
16
 
17
17
  RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
@@ -3,16 +3,19 @@
3
3
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
4
4
  #
5
5
 
6
+ require 'net/http'
7
+ require 'zlib'
8
+
9
+ require 'rubygems'
10
+ require 'active_support'
11
+ require 'hpricot'
12
+ require 'htmlentities'
13
+
6
14
  # A base class encapsulating the various libcraigscrape objects, and providing most of the
7
15
  # craigslist interaction methods. Currently, we're supporting the old Class methods
8
16
  # in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
9
17
  # create an instance of the Craigslist object, and use its Public Instance methods.
10
18
  # See the README for easy to follow examples.
11
- class CraigScrape; end
12
-
13
- require 'listings'
14
- require 'posting'
15
- require 'geo_listings'
16
19
 
17
20
  class CraigScrape
18
21
  cattr_accessor :time_now
@@ -202,4 +205,8 @@ class CraigScrape
202
205
  ret
203
206
  end
204
207
 
205
- end
208
+ end
209
+
210
+ require 'listings'
211
+ require 'posting'
212
+ require 'geo_listings'
data/lib/listings.rb CHANGED
@@ -9,7 +9,7 @@ require 'scraper'
9
9
 
10
10
  # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
11
11
  class CraigScrape::Listings < CraigScrape::Scraper
12
- LABEL = /^(.+?)[ ]*\-$/
12
+ LABEL = /^(.+?)[ ]*[\-]?$/
13
13
  LOCATION = /^[ ]*\((.*?)\)$/
14
14
  IMG_TYPE = /^[ ]*(.+)[ ]*$/
15
15
  HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
@@ -23,7 +23,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
23
23
  @posts = []
24
24
 
25
25
  post_tags = html.get_elements_by_tag_name('p','h4')
26
-
26
+
27
27
  # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
28
28
  post_tags.pop if (
29
29
  post_tags.length > 0 and
@@ -36,7 +36,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
36
36
  case el.name
37
37
  when 'p'
38
38
  post_summary = self.class.parse_summary el, current_date
39
-
39
+
40
40
  # Validate that required fields are present:
41
41
  parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
42
42
 
data/lib/scraper.rb CHANGED
@@ -9,14 +9,6 @@
9
9
  # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
10
10
  #
11
11
 
12
- require 'net/http'
13
- require 'zlib'
14
-
15
- require 'rubygems'
16
- require 'activesupport'
17
- require 'hpricot'
18
- require 'htmlentities'
19
-
20
12
  # Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
21
13
  # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
22
14
  # methods. It also contains the http-related cattr_accessors:
@@ -0,0 +1,144 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html><head>
3
+ <title>south florida pets classifieds &quot;kitten&quot; - craigslist</title>
4
+
5
+ <meta name="description" content="craigslist pets classifieds for south florida &quot;kitten&quot; " />
6
+ <meta name="keywords" content="south florida pets craigslist, classifieds, want ads " />
7
+
8
+
9
+
10
+ <link href="/search/pet?query=kitten&amp;catAbbreviation=pet&amp;hasPic=1&amp;format=rss" title="RSS feed for craigslist | pets &quot;kitten&quot; in south florida " rel="alternate" type="application/rss+xml" />
11
+ <link href="http://www.craigslist.org/styles/craigslist.css" title="craigslist" rel="stylesheet" media="all" type="text/css" />
12
+ <link href="http://www.craigslist.org/favicon.ico" rel="shortcut icon" type="image/x-icon" />
13
+ </head>
14
+
15
+ <body class="toc">
16
+
17
+ <a name="top"></a>
18
+
19
+ <div class="bchead">
20
+ <span id="ef">
21
+
22
+ [ <a href="http://www.craigslist.org/about/help/">help</a> ]
23
+ [ <a href="https://post.craigslist.org/mia/C">post</a> ]</span>
24
+
25
+ <a href="/"> south florida craigslist</a> &gt; <a href="/ccc/">community</a> &gt; <a href="/pet/">pets</a>
26
+ <div id="satabs"> <b>all south florida</b> <a href="/search/pet/mdc?hasPic=1&amp;query=kitten">miami / dade</a> <a href="/search/pet/brw?hasPic=1&amp;query=kitten">broward county</a> <a href="/search/pet/pbc?hasPic=1&amp;query=kitten">palm beach co</a> </div>
27
+
28
+ </div>
29
+
30
+ <blockquote>
31
+ <form action="/search/pet" onsubmit="ckCAbb();" method="get">
32
+
33
+ <script type="text/javascript"><!--
34
+ var cabb = "pet";
35
+ -->
36
+ </script>
37
+
38
+ <table summary="" cellpadding="2" style="white-space: nowrap; background:#eee; border:1px solid gray;" width="100%">
39
+ <tr>
40
+ <td align="right" width="1">search for:</td>
41
+
42
+ <td width="30%"><input name="query" size="30" id="query" value="kitten" /> in:
43
+ <select name="catAbbreviation" id="cAbb">
44
+
45
+ <option value="ccc">all community
46
+ </option><option value="" disabled>--
47
+ </option><option value="act"> activity partners
48
+ </option><option value="ats"> artists
49
+ </option><option value="kid"> childcare
50
+ </option><option value="com"> general
51
+ </option><option value="grp"> groups
52
+ </option><option value="vnn"> local news and views
53
+ </option><option value="laf"> lost &amp; found
54
+ </option><option value="muc"> musicians
55
+ </option><option selected value="pet"> pets
56
+ </option><option value="pol"> politics
57
+ </option><option value="rid"> rideshare
58
+ </option><option value="vol"> volunteers
59
+ </option><option value="" disabled>--
60
+
61
+ </option><option value="eee">all event
62
+
63
+ </option><option value="sss">all for sale / wanted
64
+
65
+ </option><option value="ggg">all gigs
66
+
67
+ </option><option value="hhh">all housing
68
+
69
+ </option><option value="jjj">all jobs
70
+
71
+ </option><option value="ppp">all personals
72
+
73
+ </option><option value="res">all resume
74
+
75
+ </option><option value="bbb">all services offered
76
+ </option></select>
77
+ <input type="submit" value="Search" />
78
+ </td><td>
79
+ <label><input name="srchType" title="check this box to search only posting titles" type="checkbox" value="T" /> only search titles</label>
80
+ </td>
81
+ </tr>
82
+ <tr>
83
+ <td align="right" width="1"></td>
84
+ <td></td>
85
+ <td align="left"><label><input checked name="hasPic" type="checkbox" value="1" /> has image</label></td>
86
+ </tr>
87
+ </table>
88
+ </form>
89
+ </blockquote>
90
+
91
+
92
+
93
+ <blockquote>
94
+ <table summary="" width="100%">
95
+ <tr>
96
+ <td valign="top"></td>
97
+ <td id="messages" valign="top"></td>
98
+ </tr>
99
+ </table>
100
+
101
+ <div>Sort by:<strong> most recent</strong> <a href="/search/pet?catAbbreviation=pet&hasPic=1&query=kitten&sort=rel">best match</a>
102
+ <div class="sh" style="text-align: center;">
103
+ <span style="float: left;">&nbsp;</span>
104
+ <span style="float: right;">&nbsp;</span>
105
+ <b>Found: 43 Displaying: 1 - 43</b>
106
+
107
+ </div>
108
+
109
+
110
+
111
+ <!-- sphinx: 0.005432, csd: 0.032165, total: 0.037597 via 8p -->
112
+
113
+ <p> Feb 10 - <a href="/brw/pet/1594409444.html">Two adorable kittens need new homes</a> - <font size="-1"> (ft lauderdale)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1593057827.html">ALL BLACK KITTEN</a> - <font size="-1"> (Pembroke Pines)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1593019152.html">Beautiful Little Boy Kitten</a> - <font size="-1"> (WESTON)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1593011946.html">Four Paw Declawed Female Cat</a> - <font size="-1"> (WESTON)</font> <span class="p"> pic</span></p><p> Feb 9 - <a href="/brw/pet/1592999746.html">Two Beautiful Kitten Brothers</a> - <font size="-1"> (WESTON)</font> <span class="p"> pic</span></p><p> Feb 7 - <a href="/brw/pet/1590495543.html">kitten - 7 months</a> - <font size="-1"> (coral springs/ coconut creek)</font> <span class="p"> pic</span></p><p> Feb 7 - <a href="/mdc/pet/1590158454.html">Reiki desperately needs a loving forever home</a> - <font size="-1"> (Miami)</font> <span class="p"> pic</span></p><p> Feb 7 - <a href="/brw/pet/1589882794.html">Free Kitten</a> - <font size="-1"> (Broward)</font> <span class="p"> pic</span></p><p> Feb 6 - <a href="/mdc/pet/1589384985.html">Two Beautiful, Lovable Kittens Need a New Home</a> - <font size="-1"> (Miami Beach, FL)</font> <span class="p"> pic</span></p><p> Feb 6 - <a href="/pbc/pet/1588600616.html">Love kittens, want to make a HUGE difference? 
Volunteer!</a> - <font size="-1"> (South Florida)</font> <span class="p"> pic</span></p><p> Feb 5 - <a href="/pbc/pet/1588284560.html">kittens to a good home!!</a> - <font size="-1"> (lake worth)</font> <span class="p"> pic</span></p><p> Feb 3 - <a href="/mdc/pet/1583880286.html">GREATEST KITTEN EVER</a> - <font size="-1"> (MIAMI BEACH)</font> <span class="p"> pic</span></p><p> Jan 30 - <a href="/brw/pet/1578696972.html">Kitten Needs New Home</a> - <font size="-1"> (Pembroke Pines, FL)</font> <span class="p"> pic</span></p><p> Jan 30 - <a href="/brw/pet/1578368357.html">great kitten that needs a good home</a> - <font size="-1"> (hollywood fl)</font> <span class="p"> pic</span></p><p> Jan 30 - <a href="/brw/pet/1577821318.html">Jasper from Twilight looking for home </a> - <font size="-1"> (pompano beach, FL)</font> <span class="p"> pic</span></p><p> Jan 29 - <a href="/pbc/pet/1576749013.html">Foster Moms needed for KITTENS under 12 weeks of age</a> - <font size="-1"> (PBC - Alleys to Eden, Inc.)</font> <span class="p"> pic&nbsp;img</span></p><p> Jan 29 - <a href="/mdc/pet/1575892469.html">Kitten! Kitten! Kitten!</a> - <font size="-1"> (Miami)</font> <span class="p"> pic</span></p><p> Jan 27 - <a href="/brw/pet/1573858828.html">cute cute cute kitten</a> - <font size="-1"> (hollywood fl)</font> <span class="p"> pic</span></p><p> Jan 27 - <a href="/pbc/pet/1573555280.html">Looking for Neonatal Kitten Foster Moms for Kitten Rescue Group</a> - <font size="-1"> (Alleys to Eden, Inc.)</font> <span class="p"> pic&nbsp;img</span></p><p> Jan 27 - <a href="/mdc/pet/1573502704.html">4 Month old kitten for adoption</a> - <span class="p"> pic</span></p><p> Jan 26 - <a href="/mdc/pet/1571904030.html">kendall kittens (:</a> - <font size="-1"> (kendall)</font> <span class="p"> pic</span></p><p> Jan 26 - <a href="/mdc/pet/1571446135.html">Kitten! Kitten! 
Kitten!</a> - <font size="-1"> (Miami)</font> <span class="p"> pic</span></p><p> Jan 26 - <a href="/pbc/pet/1571440092.html">Tigger - ex-feral kitten - needs a BIG animal lover to give him a home</a> - <font size="-1"> (Alleys to Eden, Inc.)</font> <span class="p"> pic&nbsp;img</span></p><p> Jan 26 - <a href="/brw/pet/1570894156.html">Looking for sweet kitten</a> - <span class="p"> pic</span></p><p> Jan 25 - <a href="/pbc/pet/1570181661.html">solid white female kitten </a> - <font size="-1"> (lake worth )</font> <span class="p"> pic</span></p><p> Jan 24 - <a href="/pbc/pet/1567900657.html">A Big Love with 3 Legs =)</a> - <font size="-1"> (Wellington)</font> <span class="p"> pic</span></p><p> Jan 23 - <a href="/pbc/pet/1567667685.html">assorted kittens ~ rescued</a> - <font size="-1"> (southeast florida)</font> <span class="p"> pic</span></p><p> Jan 23 - <a href="/pbc/pet/1567302066.html">FREE Short haired Black Kitten</a> - <font size="-1"> (Juno)</font> <span class="p"> pic</span></p><p> Jan 22 - <a href="/mdc/pet/1565371176.html"> kitten to loving home :)</a> - <font size="-1"> (homestead)</font> <span class="p"> pic</span></p><p> Jan 21 - <a href="/mdc/pet/1563568531.html">Kitten! Kitten! 
Kitten!</a> - <font size="-1"> (Doral Miami)</font> <span class="p"> pic</span></p><p> Jan 19 - <a href="/mdc/pet/1560858677.html">kitten (siamese)</a> - <span class="p"> pic</span></p><p> Jan 18 - <a href="/brw/pet/1558744645.html">6 month old kitten</a> - <font size="-1"> (davie)</font> <span class="p"> pic</span></p><p> Jan 17 - <a href="/brw/pet/1556991233.html">SWEET FEMALE CAT</a> - <font size="-1"> (POMPANO BCH)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/mdc/pet/1556281733.html">Please adopt CJ the kitten</a> - <font size="-1"> (Miami, FL)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/pbc/pet/1555719038.html">Cute little Siamese kitten 6 months old needs a home</a> - <font size="-1"> (West Palm Beach)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/brw/pet/1555517479.html">cute kitten reallly needs a loving home!!</a> - <span class="p"> pic</span></p><p> Jan 16 - <a href="/brw/pet/1555410876.html">FEMALE 8 WEEK OLD KITTEN</a> - <font size="-1"> (POMPANO BCH)</font> <span class="p"> pic</span></p><p> Jan 16 - <a href="/mdc/pet/1555345484.html">Kitten! Kitten! 
Kitten!</a> - <font size="-1"> (Doral Miami)</font> <span class="p"> pic</span></p><p> Jan 14 - <a href="/brw/pet/1553407913.html">Kitty to good home</a> - <span class="p"> pic</span></p><p> Jan 13 - <a href="/pbc/pet/1551406560.html">Gorgeous Black and Grey Kitten!</a> - <span class="p"> pic</span></p><p> Jan 13 - <a href="/brw/pet/1550606813.html">Help, I need to move temp and a want to keep my cat</a> - <font size="-1"> (Wilton Manors)</font> <span class="p"> pic</span></p><p> Jan 13 - <a href="/mdc/pet/1550593272.html">Adopt these two kittens that survived...</a> - <font size="-1"> (Downtown Miami)</font> <span class="p"> pic</span></p><p> Jan 12 - <a href="/mdc/pet/1549484478.html">Free Kitten to Good Home</a> - <font size="-1"> (miami)</font> <span class="p"> pic</span></p><br /><div>Sort by:<strong> most recent</strong> <a href="/search/pet?catAbbreviation=pet&hasPic=1&query=kitten&sort=rel">best match</a>
114
+ <div class="sh" style="text-align: center;">
115
+ <span style="float: left;">&nbsp;</span>
116
+ <span style="float: right;">&nbsp;</span>
117
+ <b>Found: 43 Displaying: 1 - 43</b>
118
+
119
+ </div>
120
+
121
+
122
+
123
+
124
+
125
+ <div id="footer">
126
+ <hr />
127
+ <span id="copy">
128
+ Copyright &copy; 2010 craigslist, inc.<br />
129
+ </span>
130
+ <span class="rss">
131
+ <a href="/search/pet?query=kitten&amp;catAbbreviation=pet&amp;hasPic=1&amp;format=rss" class="l">RSS</a>
132
+ <a href="http://www.craigslist.org/about/rss">(?)</a><br />
133
+ </span>
134
+ </div>
135
+ <br /><br />
136
+
137
+ <div id="floater">&nbsp;</div>
138
+
139
+ </div></div></blockquote>
140
+ <script src="http://www.craigslist.org/js/jquery.js" type="text/javascript"></script>
141
+ <script src="http://www.craigslist.org/js/tocs.js" type="text/javascript"></script>
142
+
143
+ </body>
144
+ </html>
@@ -7,12 +7,6 @@ require File.dirname(__FILE__)+'/libcraigscrape_test_helpers'
7
7
  class CraigslistListingTest < Test::Unit::TestCase
8
8
  include LibcraigscrapeTestHelpers
9
9
 
10
- def test_pukes
11
- assert_raise(CraigScrape::Scraper::ParseError) do
12
- CraigScrape::Listings.new( relative_uri_for('google.html') ).posts
13
- end
14
- end
15
-
16
10
  def test_listings_parse
17
11
  category = CraigScrape::Listings.new relative_uri_for('listing_samples/category_output.html')
18
12
  assert_equal 'index100.html', category.next_page_href
@@ -213,6 +207,11 @@ class CraigslistListingTest < Test::Unit::TestCase
213
207
 
214
208
  miami_search_sss_rack1000_061809 = CraigScrape::Listings.new relative_uri_for('listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html')
215
209
  assert_equal nil, miami_search_sss_rack1000_061809.next_page_href
210
+
211
+ # The first post in these results was causing problems at one point, due to the CraigScrape::Listings::LABEL regex
212
+ mia_sss_kittens021010 = CraigScrape::Listings.new relative_uri_for('listing_samples/mia_sss_kittens2.10.10.html')
213
+ assert_equal 'Two adorable kittens need new homes', mia_sss_kittens021010.posts[0].label
214
+ assert_equal '/brw/pet/1594409444.html', mia_sss_kittens021010.posts[0].href
216
215
  end
217
216
 
218
217
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libcraigscrape
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris DeRose, DeRose Technologies, Inc.
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-22 00:00:00 -04:00
12
+ date: 2010-02-10 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -85,6 +85,7 @@ files:
85
85
  - test/listing_samples/fortmyers_art_index.060909/1112522674.html
86
86
  - test/listing_samples/fortmyers_art_index.060909/823516079.html
87
87
  - test/listing_samples/category_output_2.html
88
+ - test/listing_samples/mia_sss_kittens2.10.10.html
88
89
  - test/listing_samples/long_search_output.html
89
90
  - test/listing_samples/miami_search_sss_rack.6.18.09
90
91
  - test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html