olek-libcraigscrape 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. data/CHANGELOG +94 -0
  2. data/COPYING +674 -0
  3. data/COPYING.LESSER +165 -0
  4. data/README +89 -0
  5. data/Rakefile +125 -0
  6. data/bin/craig_report_schema.yml +68 -0
  7. data/bin/craigwatch +581 -0
  8. data/bin/report_mailer/craigslist_report.html.erb +17 -0
  9. data/bin/report_mailer/craigslist_report.plain.erb +18 -0
  10. data/lib/geo_listings.rb +144 -0
  11. data/lib/libcraigscrape.rb +217 -0
  12. data/lib/listings.rb +160 -0
  13. data/lib/posting.rb +324 -0
  14. data/lib/scraper.rb +212 -0
  15. data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
  16. data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
  17. data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
  18. data/test/geolisting_samples/geo_listing_us070209.html +355 -0
  19. data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
  20. data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
  21. data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
  22. data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
  23. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
  24. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
  25. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
  26. data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
  27. data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
  28. data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
  29. data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
  30. data/test/google.html +8 -0
  31. data/test/libcraigscrape_test_helpers.rb +37 -0
  32. data/test/listing_samples/category_output.html +231 -0
  33. data/test/listing_samples/category_output_2.html +217 -0
  34. data/test/listing_samples/empty_listings.html +128 -0
  35. data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
  36. data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
  37. data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
  38. data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
  39. data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
  40. data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
  41. data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
  42. data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
  43. data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
  44. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
  45. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
  46. data/test/listing_samples/long_search_output.html +137 -0
  47. data/test/listing_samples/mia_fua_index8900.5.21.09.html +226 -0
  48. data/test/listing_samples/mia_search_kitten.3.15.10.html +149 -0
  49. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
  50. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
  51. data/test/listing_samples/new_listing_span.4.17.10.html +769 -0
  52. data/test/listing_samples/short_search_output.html +133 -0
  53. data/test/post_samples/1207457727.html +92 -0
  54. data/test/post_samples/brw_reb_1224008903.html +101 -0
  55. data/test/post_samples/posting0.html +91 -0
  56. data/test/post_samples/posting1.html +106 -0
  57. data/test/post_samples/posting1796890756-061710.html +2318 -0
  58. data/test/post_samples/posting1808219423.html +2473 -0
  59. data/test/post_samples/posting1938291834-090610.html +188 -0
  60. data/test/post_samples/posting2.html +107 -0
  61. data/test/post_samples/posting3.html +92 -0
  62. data/test/post_samples/posting4.html +993 -0
  63. data/test/post_samples/posting5.html +38 -0
  64. data/test/post_samples/sfbay_art_1223614914.html +94 -0
  65. data/test/post_samples/this_post_has_been_deleted_by_its_author.html +37 -0
  66. data/test/post_samples/this_post_has_expired.html +48 -0
  67. data/test/test_craigslist_geolisting.rb +521 -0
  68. data/test/test_craigslist_listing.rb +362 -0
  69. data/test/test_craigslist_posting.rb +426 -0
  70. metadata +273 -0
data/bin/report_mailer/craigslist_report.html.erb ADDED
@@ -0,0 +1,17 @@
+ <h2><%=h @subject %></h2>
+ <%@summaries.each do |summary| %>
+ <h3><%=h summary[:search].name%></h3>
+ <% if summary[:postings].length > 0 %>
+ <%summary[:postings].each do |post|%>
+ <%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
+   h(post.post_date.strftime('%b %d')),
+   post.url,
+   h(post.label),
+   (post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
+   (post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
+ ] -%>
+ <% end %>
+ <% else %>
+ <p><i>No new postings were found which matched the search criteria.</i></p>
+ <% end %>
+ <% end %>
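
As a rough usage sketch (not itself part of this package), a template like the one above can be rendered with Ruby's standard ERB library. The ReportContext class and the summaries value here are hypothetical stand-ins for whatever craigwatch actually supplies as @subject, @summaries and h():

    require 'erb'

    # Hypothetical context object exposing the ivars and the h() escape helper the template expects
    class ReportContext
      def initialize(subject, summaries)
        @subject, @summaries = subject, summaries
      end

      # h() escapes HTML, as ERB::Util does in Rails-style views
      def h(text); ERB::Util.html_escape(text.to_s); end

      def render(template_path)
        # The '-' trim mode honors the -%> markers used in the template
        # (positional trim_mode argument, per the Ruby 1.8-era API this gem targets)
        ERB.new(File.read(template_path), nil, '-').result(binding)
      end
    end

    summaries = []  # would be built by craigwatch from its search results
    html = ReportContext.new('Craigslist Report', summaries).render(
      'bin/report_mailer/craigslist_report.html.erb')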
data/bin/report_mailer/craigslist_report.plain.erb ADDED
@@ -0,0 +1,18 @@
+ CRAIGSLIST REPORTER
+
+ <%@summaries.each do |summary| -%>
+ <%=summary[:search].name %>
+ <% if summary[:postings].length > 0 %>
+ <% summary[:postings].each do |post| -%>
+ <%='%s : %s %s %s %s' % [
+   post.post_date.strftime('%b %d'),
+   post.label,
+   (post.location) ? " (#{post.location})" : '',
+   (post.has_pic_or_img?) ? ' [img]': '',
+   post.url
+ ] -%>
+ <% end %>
+ <% else %>
+ No new postings were found which matched the search criteria.
+ <% end %>
+ <% end -%>
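
For reference, the '%s : %s %s %s %s' format above emits one plain-text line per post; with hypothetical post values it would produce something like:

    # Hypothetical values, mirroring the array built in the template above:
    line = '%s : %s %s %s %s' % [
      'Jun 09',                 # post.post_date.strftime('%b %d')
      'antique rocking chair',  # post.label
      ' (fort myers)',          # location, already wrapped in parens by the template
      ' [img]',                 # pic/img flag
      'http://fortmyers.craigslist.org/art/1046596324.html'
    ]
    # => "Jun 09 : antique rocking chair  (fort myers)  [img] http://fortmyers.craigslist.org/art/1046596324.html"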
data/lib/geo_listings.rb ADDED
@@ -0,0 +1,144 @@
+ # = About geo_listings.rb
+ #
+ # This file contains the parsing code and logic relating to geographic site pages and paths. You
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+ #
+
+ require 'scraper'
+
+ class CraigScrape
+
+   # GeoListings represents a parsed Craigslist geo listing page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+   # These list all the craigslist sites in a given region.
+   class GeoListings < Scraper
+     GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
+
+     LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+     PATH_SCANNER = /(?:\\\/|[^\/])+/
+     URL_HOST_PART = /^[^\:]+\:\/\/([^\/]+)[\/]?$/
+     SITE_PREFIX = /^([^\.]+)/
+     FIND_SITES_PARTS = /^[ ]*([\+|\-]?)[ ]*(.+)[ ]*/
+
+     class BadGeoListingPath < StandardError #:nodoc:
+     end
+
+     # The GeoListings constructor works like all other Scraper objects, in that it accepts a string 'url'.
+     # See GeoListings.find_sites for a more powerful way to find craigslist sites.
+     def initialize(init_via = nil)
+       super(init_via)
+
+       # Validate that required fields are present, at least - if we've downloaded it from a url
+       parse_error! unless location
+     end
+
+     # Returns the GeoLocation's full name
+     def location
+       unless @location
+         cursor = html % 'h3 > b > a:first-of-type'
+         cursor = cursor.next if cursor
+         @location = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+       end
+
+       @location
+     end
+
+     # Returns a hash of site name to urls in the current listing
+     def sites
+       unless @sites
+         @sites = {}
+         (html / 'div#list > a').each do |el_a|
+           site_name = he_decode strip_html(el_a.inner_html)
+           @sites[site_name] = $1 if URL_HOST_PART.match el_a[:href]
+         end
+       end
+
+       @sites
+     end
+
+     # This method will return an array of all possible sites that match the specified location path.
+     # Sample location paths:
+     # - us/ca
+     # - us/fl/miami
+     # - jp/fukuoka
+     # - mx
+     # Here's how location paths work:
+     # - The components of the path are separated by '/'s.
+     # - Up to (and optionally, not including) the last component, the path should correspond to a valid GeoLocation url with the prefix of 'http://geo.craigslist.org/iso/'
+     # - The last component can either be a site's 'prefix' on a GeoLocation page, or a geolocation page itself, in which case all the sites on that page are selected.
+     # - The site prefix is the first dns label of a site listed on a GeoLocation page. (So, for the case of us/fl/miami, the last 'miami' corresponds to the 'south florida' link on {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl])
+     def self.sites_in_path(full_path, base_url = GEOLISTING_BASE_URL)
+       # the base_url parameter is mostly so we can test this method
+
+       # Unfortunately - the easiest way to understand much of this is to see how craigslist returns
+       # these geolocations. Watch what happens when you request us/fl/non-existent/page/here.
+       # I also made this a little forgiving in a couple ways not specified with official support, per
+       # the rules above.
+       full_path_parts = full_path.scan PATH_SCANNER
+
+       # We'll either find a single site in this loop and return that, or we'll find a whole listing
+       # and set the geo_listing object to reflect that
+       geo_listing = nil
+       full_path_parts.each_with_index do |part, i|
+
+         # Let's un-escape the path-part, if needed:
+         part.gsub! "\\/", "/"
+
+         # If they're specifying a single site, this will catch and return it immediately
+         site = geo_listing.sites.find{ |n,s|
+           (SITE_PREFIX.match s and $1 == part) or n == part
+         } if geo_listing
+
+         # This returns the site component of the found array
+         return [site.last] if site
+
+         begin
+           # The URI escape is mostly needed to translate the space characters
+           l = GeoListings.new base_url+full_path_parts[0...i+1].collect{|p| URI.escape p}.join('/')
+         rescue CraigScrape::Scraper::FetchError
+           bad_geo_path! full_path
+         end
+
+         # This probably tells us the first part of the path was 'correct', but not the rest:
+         bad_geo_path! full_path if geo_listing and geo_listing.location == l.location
+
+         geo_listing = l
+       end
+
+       # We found a valid listing page, and we can just return all the sites on it:
+       geo_listing.sites.collect{|n,s| s }
+     end
+
+     # find_sites takes a single array of strings as an argument. Each string is either a location path
+     # (see sites_in_path), or a full site (in canonical form - i.e. "memphis.craigslist.org"). Optionally,
+     # each of these may contain a '+' or '-' prefix to indicate whether the string is supposed to
+     # add sites to the master list, or remove them from it. If no '+' or '-' is
+     # specified, the default assumption is '+'. Strings are processed from left to right, which gives
+     # a high degree of control over the selection set. Examples:
+     # - find_sites ["us/fl", "- miami.craigslist.org"]
+     # - find_sites ["us", "- us/nm"]
+     # - find_sites ["us", "- us/ny", "+ newyork.craigslist.org"]
+     # - find_sites ["us/ny", "us/id", "caribbean.craigslist.org"]
+     # There's a lot of flexibility here; you get the idea.
+     def self.find_sites(specs, base_url = GEOLISTING_BASE_URL)
+       ret = []
+
+       specs.each do |spec|
+         (op,spec = $1,$2) if FIND_SITES_PARTS.match spec
+
+         spec = (spec.include? '.') ? [spec] : sites_in_path(spec, base_url)
+
+         (op == '-') ? ret -= spec : ret |= spec
+       end
+
+       ret
+     end
+
+     private
+
+     def self.bad_geo_path!(path)
+       raise BadGeoListingPath, "Unable to load path #{path.inspect}; either you're having problems connecting to Craigslist, or your path is invalid."
+     end
+
+   end
+ end
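
A short sketch of how these class methods compose, following the path rules documented above (the site lists shown are illustrative):

    require 'libcraigscrape'

    # find_sites takes a single array; '+' is the implied default prefix
    sites = CraigScrape::GeoListings.find_sites [
      'us/fl',                   # add every site on the us/fl geolisting page
      '- miami.craigslist.org',  # then remove one site by its canonical name
      '+ jp/fukuoka'             # and add the sites under another path
    ]
    # sites is now an array of hostnames, e.g. ["fortmyers.craigslist.org", ...]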
data/lib/libcraigscrape.rb ADDED
@@ -0,0 +1,217 @@
+ # = About libcraigscrape.rb
+ #
+ # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+ #
+ require 'rubygems'
+
+ gem 'activesupport', '~> 2.3'
+ gem 'nokogiri', '>= 1.4.4'
+ gem 'htmlentities', '>= 4.0.0'
+
+
+ require 'net/http'
+ require 'zlib'
+ require 'nokogiri'
+ require 'htmlentities'
+ require 'active_support'
+
+
+ # A base class encapsulating the various libcraigscrape objects, and providing most of the
+ # craigslist interaction methods. Currently, we're supporting the old Class methods
+ # in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
+ # create an instance of the CraigScrape object, and use its Public Instance methods.
+ # See the README for easy-to-follow examples.
+
+ class CraigScrape
+   cattr_accessor :time_now
+   cattr_accessor :site_to_url_prefix
+
+   #--
+   # NOTE:
+   # The only reason I took this out is b/c I might want to test with a file://
+   # prefix at some point
+   #++
+   self.site_to_url_prefix = 'http://'
+
+
+   # Takes a variable number of site/path specifiers (strings) as an argument.
+   # This list gets flattened and passed to CraigScrape::GeoListings.find_sites.
+   # See that method's rdoc for a complete set of rules on what arguments are allowed here.
+   def initialize(*args)
+     @sites_specs = args.flatten
+   end
+
+   # Returns which sites are included in any operations performed by this object. This is directly
+   # ascertained from the initial constructor's spec-list
+   def sites
+     @sites ||= GeoListings.find_sites @sites_specs
+     @sites
+   end
+
+   # Determines all listings which can be construed by combining the sites specified in the object
+   # constructor with the provided url-path fragments.
+   #
+   # Passes the <b>first page listing</b> of each of these urls to the provided block.
+   def each_listing(*fragments)
+     listing_urls_for(fragments).each{|url| yield Listings.new(url) }
+   end
+
+   # Determines all listings which can be construed by combining the sites specified in the object
+   # constructor with the provided url-path fragments.
+   #
+   # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
+   def each_page_in_each_listing(*fragments)
+     each_listing(*fragments) do |listing|
+       while listing
+         yield listing
+         listing = listing.next_page
+       end
+     end
+   end
+
+   # Determines all listings which can be construed by combining the sites specified in the object
+   # constructor with the provided url-path fragments.
+   #
+   # Returns the <b>first page listing</b> of each of these urls.
+   def listings(*fragments)
+     listing_urls_for(fragments).collect{|url| Listings.new url }
+   end
+
+   # Determines all listings which can be construed by combining the sites specified in the object
+   # constructor with the provided url-path fragments.
+   #
+   # Passes all posts from each of these urls to the provided block, in the order they're parsed
+   # (for each listing, newest posts are returned first).
+   def each_post(*fragments)
+     each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
+   end
+
+   # Determines all listings which can be construed by combining the sites specified in the object
+   # constructor with the provided url-path fragments.
+   #
+   # Returns all posts from each of these urls, in the order they're parsed
+   # (newest posts first).
+   def posts(*fragments)
+     ret = []
+     each_page_in_each_listing(*fragments){ |l| ret += l.posts }
+     ret
+   end
+
+   # Determines all listings which can be construed by combining the sites specified in the object
+   # constructor with the provided url-path fragments.
+   #
+   # Returns all posts from each of these urls which are newer than the provided 'newer_then' date.
+   # (Returns 'newest' posts first).
+   def posts_since(newer_then, *fragments)
+     ret = []
+     fragments.each do |frag|
+       each_post(frag) do |p|
+         break if p.post_date <= newer_then
+         ret << p
+       end
+     end
+
+     ret
+   end
+
+   class << self # Class methods
+
+     #--
+     # NOTE: These Class methods are all marked for deprecation as of
+     # version 0.8.0, and should not be used with any new project code
+     #++
+
+     # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+     # Instead, consider using CraigScrape::Listings.new
+     #
+     # Scrapes a single listing url and returns a Listings object representing the contents.
+     # Mostly here to preserve backwards-compatibility with the older api; CraigScrape::Listings.new "listing_url" does the same thing
+     def scrape_listing(listing_url)
+       CraigScrape::Listings.new listing_url
+     end
+
+     # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+     # Instead, consider using the CraigScrape#each_post method.
+     #
+     # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
+     # until there are no more 'next page' links available to click on
+     def scrape_until(listing_url, &post_condition)
+       ret = []
+
+       listings = CraigScrape::Listings.new listing_url
+       catch :ScrapeBreak do
+         while listings do
+           listings.posts.each do |post|
+             throw :ScrapeBreak if post_condition.call(post)
+             ret << post
+           end
+
+           listings = listings.next_page
+         end
+       end
+
+       ret
+     end
+
+     # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+     # Instead, consider using CraigScrape::Posting.new
+     #
+     # Scrapes a single Post Url, and returns a Posting object representing its contents.
+     # Mostly here to preserve backwards-compatibility with the older api; CraigScrape::Posting.new "post_url" does the same thing
+     def scrape_full_post(post_url)
+       CraigScrape::Posting.new post_url
+     end
+
+     # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+     # Instead, consider using the CraigScrape#each_post method.
+     #
+     # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
+     # or no more 'next page' links are available to be clicked on. Returns an array of PostSummary objects.
+     def scrape_posts(listing_url, count)
+       count_so_far = 0
+       self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
+     end
+
+     # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+     # Instead, consider using the CraigScrape#posts_since method.
+     #
+     # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are available to be clicked on.
+     # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+     # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method, could achieve
+     # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
+     #
+     # <b>Note:</b> The results will not include post summaries having the newer_then date themselves.
+     def scrape_posts_since(listing_url, newer_then)
+       self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
+     end
+   end
+
+   private
+
+   # This takes a fragments parameter and turns it into actual urls
+   def listing_urls_for(listing_fragments)
+     listing_fragments.collect{ |lf|
+       # This removes any /'s from the beginning of the fragment
+       lf = $1 if /^\/(.*)/.match lf
+       # This adds a '/' to the end of a path, so long as it's not a query we're dealing with...
+       lf += '/' unless lf.index '?'
+       sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
+     }.flatten
+   end
+
+   # Returns the most recently expired time for the provided month and day
+   def self.most_recently_expired_time(month, day) #:nodoc:
+     now = (time_now) ? time_now : Time.now
+
+     # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
+     ret = Time.local now.year, month, day
+     ret = Time.local now.year-1, month, day if ret > now
+
+     ret
+   end
+
+ end
+
+ require 'listings'
+ require 'posting'
+ require 'geo_listings'
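
A minimal sketch of the instance API defined above, assuming a live connection; the site spec, the 'sss' (for-sale) path fragment, and the one-day cutoff are illustrative:

    require 'libcraigscrape'

    # Site specs are resolved lazily, via GeoListings.find_sites, on first use
    scraper = CraigScrape.new 'us/fl/miami'

    # posts_since pages through each listing, stopping at the cutoff date
    scraper.posts_since(Time.now - 86400, 'sss').each do |post|
      puts '%s %s' % [post.post_date.strftime('%b %d'), post.label]
    end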
data/lib/listings.rb ADDED
@@ -0,0 +1,160 @@
+ # = About listings.rb
+ #
+ # This file contains the parsing code and logic relating to post-listing pages. You
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+ #
+ require 'scraper'
+
+ # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
+ class CraigScrape::Listings < CraigScrape::Scraper
+   LABEL = /^(.+?)[ ]*[\-]?$/
+   LOCATION = /^[ ]*\((.*?)\)$/
+   IMG_TYPE = /^[ ]*(.+)[ ]*$/
+   HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
+   SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
+   NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+
+   # Array, PostSummary objects found in the listing
+   def posts
+     unless @posts
+       current_date = nil
+       @posts = []
+
+       # All we care about are p and h4 tags. This seemed to be the only way I could do this in Nokogiri:
+       post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
+
+       # The last p in the list is sometimes a 'next XXX postings' link. We don't want to include this in our PostSummary output:
+       post_tags.pop if (
+         post_tags.length > 0 and
+         post_tags.last.at('a') and
+         NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
+       )
+
+       # Now we iterate through the listings:
+       post_tags.each do |el|
+         case el.name
+         when 'p'
+           post_summary = self.class.parse_summary el, current_date
+
+           # Validate that required fields are present:
+           parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
+
+           post_summary[:url] = url_from_href post_summary[:href]
+
+           @posts << CraigScrape::Posting.new(post_summary)
+         when 'h4'
+           # Let's make sense of the h4 tag, and then read all the p tags below it
+           if HEADER_DATE.match he_decode(el.inner_html)
+             # Generally, the H4 tags contain valid dates. When they do - this is easy:
+             current_date = CraigScrape.most_recently_expired_time $1, $2
+           elsif html.at('h4:last-of-type') == el
+             # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
+             # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
+             # we need to pull up the full post in order to accurately tell the date.
+             # Setting this to nil will achieve the eager-load.
+             current_date = nil
+           end
+         end
+       end
+     end
+
+     @posts
+   end
+
+   # String, URL Path href-fragment of the next page link
+   def next_page_href
+     unless @next_page_href
+       cursor = html.at 'p:last-of-type'
+
+       cursor = cursor.at 'a' if cursor
+
+       # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+       next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+
+       # Search listings put their next page in a link towards the top
+       next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
+
+       # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+       # even though we can see that there's another page listed in the page-number links block at the top
+       # and bottom of the listing page
+       unless next_link
+         cursor = html % 'div.sh:first-of-type > b:last-of-type'
+
+         # If there's no 'a' in the next sibling, we'll have just performed a nil assignment; otherwise
+         # we're looking good.
+         next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+       end
+
+       # We have an anchor tag - so - let's assign the href:
+       @next_page_href = next_link[:href] if next_link
+     end
+
+     @next_page_href
+   end
+
+   # String, Full URL Path of the 'next page' link
+   def next_page_url
+     (next_page_href) ? url_from_href(next_page_href) : nil
+   end
+
+   # Returns a Listings object of the next_page_url on the current listings object
+   def next_page
+     CraigScrape::Listings.new next_page_url if next_page_url
+   end
+
+   # Takes a paragraph element and returns a mostly-parsed Posting.
+   # We separate this from the rest of the parsing both for readability and ease of testing
+   def self.parse_summary(p_element, date = nil) #:nodoc:
+     ret = {}
+
+     title_anchor = nil
+     section_anchor = nil
+
+     # This loop got a little more complicated after Craigslist started inserting weird <span>s in
+     # its list summary postings (See test_new_listing_span051710)
+     p_element.search('a').each do |a_el|
+       # We want the first a-tag that doesn't have spans in it to be the title anchor
+       if title_anchor.nil?
+         title_anchor = a_el if !a_el.at('span')
+       # We want the next a-tag after the title_anchor to be the section anchor
+       elsif section_anchor.nil?
+         section_anchor = a_el
+         # We have no need to traverse these further:
+         break
+       end
+     end
+
+     location_tag = p_element.at 'font'
+     has_pic_tag = p_element.at 'span'
+
+     href = nil
+
+     location = he_decode location_tag.inner_html if location_tag
+     ret[:location] = $1 if location and LOCATION.match location
+
+     ret[:img_types] = []
+     if has_pic_tag
+       img_type = he_decode has_pic_tag.inner_html
+       img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+
+       ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+     end
+
+     ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
+
+     ret[:post_date] = date
+     if SUMMARY_DATE.match he_decode(p_element.children[0])
+       ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
+     end
+
+     if title_anchor
+       label = he_decode title_anchor.inner_html
+       ret[:label] = $1 if LABEL.match label
+
+       ret[:href] = title_anchor[:href]
+     end
+
+     ret
+   end
+ end
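
The next_page accessor above makes manual pagination straightforward. A minimal sketch, with an illustrative listing url, of the same walk each_page_in_each_listing performs internally:

    require 'libcraigscrape'

    listing = CraigScrape::Listings.new 'http://miami.craigslist.org/sss/'
    while listing
      listing.posts.each { |post| puts post.label }
      listing = listing.next_page  # nil once there's no 'next page' link
    end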