olek-libcraigscrape 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +94 -0
- data/COPYING +674 -0
- data/COPYING.LESSER +165 -0
- data/README +89 -0
- data/Rakefile +125 -0
- data/bin/craig_report_schema.yml +68 -0
- data/bin/craigwatch +581 -0
- data/bin/report_mailer/craigslist_report.html.erb +17 -0
- data/bin/report_mailer/craigslist_report.plain.erb +18 -0
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +217 -0
- data/lib/listings.rb +160 -0
- data/lib/posting.rb +324 -0
- data/lib/scraper.rb +212 -0
- data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
- data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
- data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
- data/test/geolisting_samples/geo_listing_us070209.html +355 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/google.html +8 -0
- data/test/libcraigscrape_test_helpers.rb +37 -0
- data/test/listing_samples/category_output.html +231 -0
- data/test/listing_samples/category_output_2.html +217 -0
- data/test/listing_samples/empty_listings.html +128 -0
- data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
- data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
- data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
- data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
- data/test/listing_samples/long_search_output.html +137 -0
- data/test/listing_samples/mia_fua_index8900.5.21.09.html +226 -0
- data/test/listing_samples/mia_search_kitten.3.15.10.html +149 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
- data/test/listing_samples/new_listing_span.4.17.10.html +769 -0
- data/test/listing_samples/short_search_output.html +133 -0
- data/test/post_samples/1207457727.html +92 -0
- data/test/post_samples/brw_reb_1224008903.html +101 -0
- data/test/post_samples/posting0.html +91 -0
- data/test/post_samples/posting1.html +106 -0
- data/test/post_samples/posting1796890756-061710.html +2318 -0
- data/test/post_samples/posting1808219423.html +2473 -0
- data/test/post_samples/posting1938291834-090610.html +188 -0
- data/test/post_samples/posting2.html +107 -0
- data/test/post_samples/posting3.html +92 -0
- data/test/post_samples/posting4.html +993 -0
- data/test/post_samples/posting5.html +38 -0
- data/test/post_samples/sfbay_art_1223614914.html +94 -0
- data/test/post_samples/this_post_has_been_deleted_by_its_author.html +37 -0
- data/test/post_samples/this_post_has_expired.html +48 -0
- data/test/test_craigslist_geolisting.rb +521 -0
- data/test/test_craigslist_listing.rb +362 -0
- data/test/test_craigslist_posting.rb +426 -0
- metadata +273 -0
data/bin/report_mailer/craigslist_report.html.erb
ADDED
@@ -0,0 +1,17 @@
+<h2><%=h @subject %></h2>
+<%@summaries.each do |summary| %>
+<h3><%=h summary[:search].name%></h3>
+<% if summary[:postings].length > 0 %>
+<%summary[:postings].each do |post|%>
+<%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
+  h(post.post_date.strftime('%b %d')),
+  post.url,
+  h(post.label),
+  (post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
+  (post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
+] -%>
+<% end %>
+<% else %>
+<p><i>No new postings were found, which matched the search criteria.</i></p>
+<% end %>
+<% end %>
data/bin/report_mailer/craigslist_report.plain.erb
ADDED
@@ -0,0 +1,18 @@
+CRAIGSLIST REPORTER
+
+<%@summaries.each do |summary| -%>
+<%=summary[:search].name %>
+<% summary[:postings].collect do |post| -%>
+<% if summary[:postings].length > 0 %>
+<%='%s : %s %s %s %s' % [
+  post.post_date.strftime('%b %d'),
+  post.label,
+  (post.location) ? " (#{post.location})" : '',
+  (post.has_pic_or_img?) ? ' [img]': '',
+  post.url
+] -%>
+<% else %>
+No new postings were found, which matched the search criteria.
+<% end %>
+<% end %>
+<% end -%>
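
Both report templates expect an @summaries array of hashes, each holding a :search object that responds to name and a :postings array of post objects; the html template additionally references @subject and an h() escape helper. A minimal rendering sketch outside of craigwatch, where the Post struct, sample values, and file path are hypothetical stand-ins rather than part of the gem:

    require 'erb'
    include ERB::Util  # supplies the h() helper the html template calls

    # A hypothetical post object, shaped the way the templates expect:
    Post = Struct.new(:post_date, :url, :label, :location) do
      def has_pic_or_img?; true; end
    end
    post = Post.new(Time.local(2010, 4, 17), 'http://example.org/123.html', 'sample post', 'miami')

    @subject   = 'Craigslist Report'
    @summaries = [{ :search => Struct.new(:name).new('sample search'), :postings => [post] }]

    # The trim mode honors the templates' -%> tags; the gem's Ruby 1.8-era code
    # would use the positional form ERB.new(src, nil, '-') instead.
    puts ERB.new(File.read('craigslist_report.html.erb'), :trim_mode => '-').result(binding)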
data/lib/geo_listings.rb
ADDED
@@ -0,0 +1,144 @@
+# = About geo_listings.rb
+#
+# This file contains the parsing code, and logic relating to geographic site pages and paths. You
+# should never need to include this file directly, as all of libcraigscrape's objects and methods
+# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+#
+
+require 'scraper'
+
+class CraigScrape
+
+  # GeoListings represents a parsed Craigslist geo listing page. (e.g. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+  # These list all the craigslist sites in a given region.
+  class GeoListings < Scraper
+    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
+
+    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+    PATH_SCANNER = /(?:\\\/|[^\/])+/
+    URL_HOST_PART = /^[^\:]+\:\/\/([^\/]+)[\/]?$/
+    SITE_PREFIX = /^([^\.]+)/
+    FIND_SITES_PARTS = /^[ ]*([\+|\-]?)[ ]*(.+)[ ]*/
+
+    class BadGeoListingPath < StandardError #:nodoc:
+    end
+
+    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
+    # See CraigScrape.find_sites for a more powerful way to find craigslist sites.
+    def initialize(init_via = nil)
+      super(init_via)
+
+      # Validate that required fields are present, at least - if we've downloaded it from a url
+      parse_error! unless location
+    end
+
+    # Returns the GeoLocation's full name
+    def location
+      unless @location
+        cursor = html % 'h3 > b > a:first-of-type'
+        cursor = cursor.next if cursor
+        @location = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+      end
+
+      @location
+    end
+
+    # Returns a hash of site name to urls in the current listing
+    def sites
+      unless @sites
+        @sites = {}
+        (html / 'div#list > a').each do |el_a|
+          site_name = he_decode strip_html(el_a.inner_html)
+          @sites[site_name] = $1 if URL_HOST_PART.match el_a[:href]
+        end
+      end
+
+      @sites
+    end
+
+    # This method will return an array of all possible sites that match the specified location path.
+    # Sample location paths:
+    # - us/ca
+    # - us/fl/miami
+    # - jp/fukuoka
+    # - mx
+    # Here's how location paths work.
+    # - The components of the path are to be separated by '/' 's.
+    # - Up to (and optionally, not including) the last component, the path should correspond to a valid GeoLocation url with the prefix of 'http://geo.craigslist.org/iso/'
+    # - The last component can either be a site's 'prefix' on a GeoLocation page, or the last component can just be a geolocation page itself, in which case all the sites on that page are selected.
+    # - The site prefix is the first dns record in a website listed on a GeoLocation page. (So, for the case of us/fl/miami, the last 'miami' corresponds to the 'south florida' link on {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl])
+    def self.sites_in_path(full_path, base_url = GEOLISTING_BASE_URL)
+      # the base_url parameter is mostly so we can test this method
+
+      # Unfortunately - the easiest way to understand much of this is to see how craigslist returns
+      # these geolocations. Watch what happens when you request us/fl/non-existent/page/here.
+      # I also made this a little forgiving in a couple of ways not specified with official support, per
+      # the rules above.
+      full_path_parts = full_path.scan PATH_SCANNER
+
+      # We'll either find a single site in this loop and return that, or, we'll find a whole listing
+      # and set the geo_listing object to reflect that
+      geo_listing = nil
+      full_path_parts.each_with_index do |part, i|
+
+        # Let's un-escape the path-part, if needed:
+        part.gsub! "\\/", "/"
+
+        # If they're specifying a single site, this will catch and return it immediately
+        site = geo_listing.sites.find{ |n,s|
+          (SITE_PREFIX.match s and $1 == part) or n == part
+        } if geo_listing
+
+        # This returns the site component of the found array
+        return [site.last] if site
+
+        begin
+          # The URI escape is mostly needed to translate the space characters
+          l = GeoListings.new base_url+full_path_parts[0...i+1].collect{|p| URI.escape p}.join('/')
+        rescue CraigScrape::Scraper::FetchError
+          bad_geo_path! full_path
+        end
+
+        # This probably tells us the first part of the path was 'correct', but not the rest:
+        bad_geo_path! full_path if geo_listing and geo_listing.location == l.location
+
+        geo_listing = l
+      end
+
+      # We have a valid listing page we found, and we can just return all the sites on it:
+      geo_listing.sites.collect{|n,s| s }
+    end
+
+    # find_sites takes a single array of strings as an argument. Each string is to be either a location path
+    # (see sites_in_path), or a full site (in canonical form - ie "memphis.craigslist.org"). Optionally,
+    # each of these may contain a '+' or '-' prefix to indicate whether the string is supposed to
+    # include sites from the master list, or remove them from the list. If no '+' or '-' is
+    # specified, the default assumption is '+'. Strings are processed from left to right, which gives
+    # a high degree of control over the selection set. Examples:
+    # - find_sites "us/fl", "- miami.craigslist.org"
+    # - find_sites "us", "- us/nm"
+    # - find_sites "us", "- us/ny", "+ newyork.craigslist.org"
+    # - find_sites "us/ny", "us/id", "caribbean.craigslist.org"
+    # There's a lot of flexibility here; you get the idea.
+    def self.find_sites(specs, base_url = GEOLISTING_BASE_URL)
+      ret = []
+
+      specs.each do |spec|
+        (op,spec = $1,$2) if FIND_SITES_PARTS.match spec
+
+        spec = (spec.include? '.') ? [spec] : sites_in_path(spec, base_url)
+
+        (op == '-') ? ret -= spec : ret |= spec
+      end
+
+      ret
+    end
+
+    private
+
+    def self.bad_geo_path!(path)
+      raise BadGeoListingPath, "Unable to load path #{path.inspect}, either you're having problems connecting to Craiglist, or your path is invalid."
+    end
+
+  end
+end
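
The find_sites rdoc above spells out the include/exclude spec semantics; here is a short sketch of how specs combine, borrowing the doc's own first example (the returned site names are illustrative, since real results depend on the live geo pages):

    require 'libcraigscrape'

    # Specs apply left to right: start with every site under us/fl,
    # then drop the site whose canonical name is miami.craigslist.org.
    sites = CraigScrape::GeoListings.find_sites ['us/fl', '- miami.craigslist.org']
    # => ["ftmyers.craigslist.org", "jacksonville.craigslist.org", ...] (illustrative)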
data/lib/libcraigscrape.rb
ADDED
@@ -0,0 +1,217 @@
+# = About libcraigscrape.rb
+#
+# All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+#
+require 'rubygems'
+
+gem 'activesupport', '~> 2.3'
+gem 'nokogiri', '>= 1.4.4'
+gem 'htmlentities', '>= 4.0.0'
+
+
+require 'net/http'
+require 'zlib'
+require 'nokogiri'
+require 'htmlentities'
+require 'active_support'
+
+
+# A base class encapsulating the various libcraigscrape objects, and providing most of the
+# craigslist interaction methods. Currently, we're supporting the old Class methods
+# in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
+# create an instance of the Craigslist object, and use its Public Instance methods.
+# See the README for easy to follow examples.
+
+class CraigScrape
+  cattr_accessor :time_now
+  cattr_accessor :site_to_url_prefix
+
+  #--
+  # NOTE:
+  # The only reason I took this out is b/c I might want to test with a file://
+  # prefix at some point
+  #++
+  self.site_to_url_prefix = 'http://'
+
+
+  # Takes a variable number of site/path specifiers (strings) as an argument.
+  # This list gets flattened and passed to CraigScrape::GeoListings.find_sites .
+  # See that method's rdoc for a complete set of rules on what arguments are allowed here.
+  def initialize(*args)
+    @sites_specs = args.flatten
+  end
+
+  # Returns which sites are included in any operations performed by this object. This is directly
+  # ascertained from the initial constructor's spec-list
+  def sites
+    @sites ||= GeoListings.find_sites @sites_specs
+    @sites
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Passes the <b>first page listing</b> of each of these urls to the provided block.
+  def each_listing(*fragments)
+    listing_urls_for(fragments).each{|url| yield Listings.new(url) }
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
+  def each_page_in_each_listing(*fragments)
+    each_listing(*fragments) do |listing|
+      while listing
+        yield listing
+        listing = listing.next_page
+      end
+    end
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Returns the <b>first page listing</b> for each of these urls.
+  def listings(*fragments)
+    listing_urls_for(fragments).collect{|url| Listings.new url }
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Passes all posts from each of these urls to the provided block, in the order they're parsed
+  # (for each listing, newest posts are returned first).
+  def each_post(*fragments)
+    each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Returns all posts from each of these urls, in the order they're parsed
+  # (newest posts first).
+  def posts(*fragments)
+    ret = []
+    each_page_in_each_listing(*fragments){ |l| ret += l.posts }
+    ret
+  end
+
+  # Determines all listings which can be construed by combining the sites specified in the object
+  # constructor with the provided url-path fragments.
+  #
+  # Returns all posts from each of these urls, which are newer than the provided 'newer_then' date.
+  # (Returns 'newest' posts first).
+  def posts_since(newer_then, *fragments)
+    ret = []
+    fragments.each do |frag|
+      each_post(frag) do |p|
+        break if p.post_date <= newer_then
+        ret << p
+      end
+    end
+
+    ret
+  end
+
+  class << self # Class methods
+
+    #--
+    # NOTE: These Class methods are all marked for deprecation as of
+    # version 0.8.0, and should not be used with any new project code
+    #++
+
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using CraigScrape::Listings.new
+    #
+    # Scrapes a single listing url and returns a Listings object representing the contents.
+    # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
+    def scrape_listing(listing_url)
+      CraigScrape::Listings.new listing_url
+    end
+
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using the CraigScrape::each_post method.
+    #
+    # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
+    # until there are no more 'next page' links available to click on
+    def scrape_until(listing_url, &post_condition)
+      ret = []
+
+      listings = CraigScrape::Listings.new listing_url
+      catch "ScrapeBreak" do
+        while listings do
+          listings.posts.each do |post|
+            throw "ScrapeBreak" if post_condition.call(post)
+            ret << post
+          end
+
+          listings = listings.next_page
+        end
+      end
+
+      ret
+    end
+
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using CraigScrape::Posting.new
+    #
+    # Scrapes a single Post Url, and returns a Posting object representing its contents.
+    # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Posting.new "post_url" does the same thing
+    def scrape_full_post(post_url)
+      CraigScrape::Posting.new post_url
+    end
+
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using the CraigScrape::each_post method.
+    #
+    # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
+    # or no more 'next page' links are available to be clicked on. Returns an array of PostSummary objects.
+    def scrape_posts(listing_url, count)
+      count_so_far = 0
+      self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
+    end
+
+    # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+    # Instead, consider using the CraigScrape::posts_since method.
+    #
+    # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are available to be clicked on.
+    # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+    # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method, could achieve
+    # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
+    #
+    # <b>Note:</b> The results will not include post summaries having the newer_then date themselves.
+    def scrape_posts_since(listing_url, newer_then)
+      self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
+    end
+  end
+
+  private
+
+  # This takes a fragments parameter, and turns it into actual urls
+  def listing_urls_for(listing_fragments)
+    listing_fragments.collect{ |lf|
+      # This removes any /'s from the beginning of the fragment
+      lf = $1 if /^\/(.*)/.match lf
+      # This adds a '/' to the end of a path, so long as it's not a query we're dealing with...
+      lf += '/' unless lf.index '?'
+      sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
+    }.flatten
+  end
+
+  # Returns the most recently expired time for the provided month and day
+  def self.most_recently_expired_time(month, day) #:nodoc:
+    now = (time_now) ? time_now : Time.now
+
+    # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
+    ret = Time.local now.year, month, day
+    ret = Time.local now.year-1, month, day if ret > now
+
+    ret
+  end
+
+end
+
+require 'listings'
+require 'posting'
+require 'geo_listings'
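
Tying the instance methods above together, a sketch of the non-deprecated usage the class comment recommends; the site spec and the 'sss' path fragment are examples only:

    require 'libcraigscrape'

    craigslist = CraigScrape.new 'us/fl/miami'

    # Walk every post under the given path fragment, newest first, across all listing pages:
    craigslist.each_post('sss') do |post|
      puts '%s: %s' % [post.post_date.strftime('%b %d'), post.label]
    end

    # Or collect only the posts newer than a cutoff date:
    recent = craigslist.posts_since Time.local(2010, 4, 1), 'sss'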
data/lib/listings.rb
ADDED
@@ -0,0 +1,160 @@
+# = About listings.rb
+#
+# This file contains the parsing code, and logic relating to post-listing pages. You
+# should never need to include this file directly, as all of libcraigscrape's objects and methods
+# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+#
+require 'scraper'
+
+# Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
+class CraigScrape::Listings < CraigScrape::Scraper
+  LABEL = /^(.+?)[ ]*[\-]?$/
+  LOCATION = /^[ ]*\((.*?)\)$/
+  IMG_TYPE = /^[ ]*(.+)[ ]*$/
+  HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
+  SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
+  NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+
+  # Array, PostSummary objects found in the listing
+  def posts
+    unless @posts
+      current_date = nil
+      @posts = []
+
+      # All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
+      post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
+
+      # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
+      post_tags.pop if (
+        post_tags.length > 0 and
+        post_tags.last.at('a') and
+        NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
+      )
+
+      # Now we iterate through the listings:
+      post_tags.each do |el|
+        case el.name
+        when 'p'
+          post_summary = self.class.parse_summary el, current_date
+
+          # Validate that required fields are present:
+          parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
+
+          post_summary[:url] = url_from_href post_summary[:href]
+
+          @posts << CraigScrape::Posting.new(post_summary)
+        when 'h4'
+          # Let's make sense of the h4 tag, and then read all the p tags below it
+          if HEADER_DATE.match he_decode(el.inner_html)
+            # Generally, the H4 tags contain valid dates. When they do - this is easy:
+            current_date = CraigScrape.most_recently_expired_time $1, $2
+          elsif html.at('h4:last-of-type') == el
+            # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
+            # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
+            # we need to pull up the full post in order to accurately tell the date.
+            # Setting this to nil will achieve the eager-load.
+            current_date = nil
+          end
+        end
+      end
+    end
+
+    @posts
+  end
+
+  # String, URL Path href-fragment of the next page link
+  def next_page_href
+    unless @next_page_href
+      cursor = html.at 'p:last-of-type'
+
+      cursor = cursor.at 'a' if cursor
+
+      # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+      next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+
+      # Search listings put their next page in a link towards the top
+      next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
+
+      # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+      # even though we can see that there's another page listed in the page-number links block at the top
+      # and bottom of the listing page
+      unless next_link
+        cursor = html % 'div.sh:first-of-type > b:last-of-type'
+
+        # If there's no 'a' in the next sibling, we'll have just performed a nil assignment; otherwise
+        # we're looking good.
+        next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+      end
+
+      # We have an anchor tag - so - let's assign the href:
+      @next_page_href = next_link[:href] if next_link
+    end
+
+    @next_page_href
+  end
+
+  # String, Full URL Path of the 'next page' link
+  def next_page_url
+    (next_page_href) ? url_from_href(next_page_href) : nil
+  end
+
+  # Returns a Listings object of the next_page_url on the current listings object
+  def next_page
+    CraigScrape::Listings.new next_page_url if next_page_url
+  end
+
+  # Takes a paragraph element and returns a mostly-parsed Posting
+  # We separate this from the rest of the parsing both for readability and ease of testing
+  def self.parse_summary(p_element, date = nil) #:nodoc:
+    ret = {}
+
+    title_anchor = nil
+    section_anchor = nil
+
+    # This loop got a little more complicated after Craigslist started inserting weird <span>s in
+    # its list summary postings (See test_new_listing_span051710)
+    p_element.search('a').each do |a_el|
+      # We want the first a-tag that doesn't have spans in it to be the title anchor
+      if title_anchor.nil?
+        title_anchor = a_el if !a_el.at('span')
+      # We want the next a-tag after the title_anchor to be the section anchor
+      elsif section_anchor.nil?
+        section_anchor = a_el
+        # We have no need to traverse these further:
+        break
+      end
+    end
+
+    location_tag = p_element.at 'font'
+    has_pic_tag = p_element.at 'span'
+
+    href = nil
+
+    location = he_decode p_element.at('font').inner_html if location_tag
+    ret[:location] = $1 if location and LOCATION.match location
+
+    ret[:img_types] = []
+    if has_pic_tag
+      img_type = he_decode has_pic_tag.inner_html
+      img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+
+      ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+    end
+
+    ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
+
+    ret[:post_date] = date
+    if SUMMARY_DATE.match he_decode(p_element.children[0])
+      ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
+    end
+
+    if title_anchor
+      label = he_decode title_anchor.inner_html
+      ret[:label] = $1 if LABEL.match label
+
+      ret[:href] = title_anchor[:href]
+    end
+
+    ret
+  end
+end
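
Finally, a sketch of driving the Listings class above by hand, following next_page until it returns nil; the listing url is illustrative:

    require 'libcraigscrape'

    listing = CraigScrape::Listings.new 'http://miami.craigslist.org/sss/'

    while listing
      listing.posts.each { |post| puts post.label }
      listing = listing.next_page  # nil once no 'next page' link is found
    end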