libcraigscrape 1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/libcraigscrape.rb CHANGED
@@ -3,38 +3,34 @@
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
  #
  require 'rubygems'
-
- gem 'activesupport', '~> 2.3'
- gem 'nokogiri', '~> 1.4.4'
- gem 'htmlentities', '~> 4.0.0'
-
-
- require 'net/http'
- require 'zlib'
- require 'nokogiri'
+ require 'time'
+ require 'uri'
  require 'htmlentities'
- require 'active_support'
-
+ require 'active_support/core_ext/class/attribute_accessors'
+ require 'active_support/core_ext/time/calculations'
+ require 'htmlentities'
+ require 'nokogiri'
+ require 'typhoeus'
+ require 'money'

- # A base class encapsulating the various libcraigscrape objects, and providing most of the
- # craigslist interaction methods. Currently, we're supporting the old Class methods
+ # A base class encapsulating the various libcraigscrape objects, and providing most of the
+ # craigslist interaction methods. Currently, we're supporting the old Class methods
  # in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
  # create an instance of the Craigslist object, and use its Public Instance methods.
  # See the README for easy to follow examples.

  class CraigScrape
- cattr_accessor :time_now
  cattr_accessor :site_to_url_prefix
-
+
  #--
  # NOTE:
- # The only reason I took this out is b/c I might want to test with a file://
+ # The only reason I took this out is b/c I might want to test with a file://
  # prefix at some point
  #++
  self.site_to_url_prefix = 'http://'

-
- # Takes a variable number of site/path specifiers (strings) as an argument.
+
+ # Takes a variable number of site/path specifiers (strings) as an argument.
  # This list gets flattened and passed to CraigScrape::GeoListings.find_sites .
  # See that method's rdoc for a complete set of rules on what arguments are allowed here.
  def initialize(*args)
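The require block above is the headline change in 1.1.0: the inline `gem` version pins are gone (dependency constraints presumably move to the gemspec), net/http and zlib give way to typhoeus for HTTP transport, money arrives for the new price parsing, and the blanket `require 'active_support'` is narrowed to the two core extensions the gem actually uses. A minimal sketch of what those two targeted requires provide; the Example class is illustrative, not part of the gem:

require 'active_support/core_ext/class/attribute_accessors'
require 'active_support/core_ext/time/calculations'

class Example
  # cattr_accessor comes from class/attribute_accessors: it defines class-level
  # and instance-level readers/writers backed by one shared class variable.
  cattr_accessor :site_to_url_prefix
end

Example.site_to_url_prefix = 'http://'
Example.new.site_to_url_prefix       # => "http://"

# time/calculations adds Time arithmetic helpers, e.g.:
Time.now.advance(:days => -7)        # => one week ago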
@@ -44,50 +40,50 @@ class CraigScrape
  # Returns which sites are included in any operations performed by this object. This is directly
  # ascertained from the initial constructor's spec-list
  def sites
- @sites ||= GeoListings.find_sites @sites_specs
+ @sites ||= GeoListings.find_sites @sites_specs
  @sites
  end
-
+
  # Determines all listings which can be construed by combining the sites specified in the object
- # constructor with the provided url-path fragments.
+ # constructor with the provided url-path fragments.
  #
  # Passes the <b>first page listing</b> of each of these urls to the provided block.
  def each_listing(*fragments)
  listing_urls_for(fragments).each{|url| yield Listings.new(url) }
  end
-
+
  # Determines all listings which can be construed by combining the sites specified in the object
- # constructor with the provided url-path fragments.
+ # constructor with the provided url-path fragments.
  #
  # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
  def each_page_in_each_listing(*fragments)
  each_listing(*fragments) do |listing|
  while listing
  yield listing
- listing = listing.next_page
+ listing = listing.next_page
  end
  end
  end
-
+
  # Determines all listings which can be construed by combining the sites specified in the object
- # constructor with the provided url-path fragments.
+ # constructor with the provided url-path fragments.
  #
  # Returns the <b>first page listing</b> of each of these urls to the provided block.
  def listings(*fragments)
  listing_urls_for(fragments).collect{|url| Listings.new url }
  end
-
+
  # Determines all listings which can be construed by combining the sites specified in the object
- # constructor with the provided url-path fragments.
+ # constructor with the provided url-path fragments.
  #
  # Passes all posts from each of these urls to the provided block, in the order they're parsed
  # (for each listing, newest posts are returned first).
  def each_post(*fragments)
  each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
  end
-
+
  # Determines all listings which can be construed by combining the sites specified in the object
- # constructor with the provided url-path fragments.
+ # constructor with the provided url-path fragments.
  #
  # Returns all posts from each of these urls, in the order they're parsed
  # (newest posts first).
@@ -96,24 +92,32 @@ class CraigScrape
  each_page_in_each_listing(*fragments){ |l| ret += l.posts }
  ret
  end
-
+
  # Determines all listings which can be construed by combining the sites specified in the object
- # constructor with the provided url-path fragments.
+ # constructor with the provided url-path fragments.
  #
- # Returns all posts from each of these urls, which are newer than the provider 'newer_then' date.
+ # Returns all posts from each of these urls, which are newer than (or equal to) the provider 'newer_then' date.
  # (Returns 'newest' posts first).
+ #
+ # NOTE: New to version 1.1, if newer_then is a date, we compare to the post_date
+ # if newer_then is a Time, we compare to post_time. Be aware that post_time
+ # requires the entire post be loaded, and not just the summary - which will
+ # take longer to download.
  def posts_since(newer_then, *fragments)
+ accessor = (newer_then.kind_of? Date) ? :post_date : :post_time
  ret = []
  fragments.each do |frag|
  each_post(frag) do |p|
- break if p.post_date <= newer_then
+ # We have to try the comparison, since post_time could conceivably be nil
+ # for the case of a system_post?
+ break if p.send(accessor).try(:<=, newer_then)
  ret << p
  end
  end

- ret
+ ret
  end
-
+
  class << self # Class methods

  #--
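The NOTE in the hunk above is the behavioral heart of this release: posts_since now picks its comparison accessor from the class of newer_then. A usage sketch; the site spec 'us/fl/miami' and the 'sss' path fragment are illustrative values:

require 'libcraigscrape'

scraper = CraigScrape.new 'us/fl/miami'

# A Date cutoff compares against post_date, which is available straight from
# the listing summaries - no extra page fetches:
posts = scraper.posts_since Date.today - 1, 'sss'

# A Time cutoff compares against post_time, which forces each full post to be
# downloaded before the cutoff can be checked:
posts = scraper.posts_since Time.now - 3600, 'sss'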
@@ -122,11 +126,11 @@ class CraigScrape
  #++

  # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
- # Instead, consider using CraigScrape::Listings.new
+ # Instead, consider using CraigScrape::Listings.new
  #
- # Scrapes a single listing url and returns a Listings object representing the contents.
+ # Scrapes a single listing url and returns a Listings object representing the contents.
  # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
- def scrape_listing(listing_url)
+ def scrape_listing(listing_url)
  CraigScrape::Listings.new listing_url
  end

@@ -137,24 +141,24 @@ class CraigScrape
  # until there's no more 'next page' links available to click on
  def scrape_until(listing_url, &post_condition)
  ret = []
-
+
  listings = CraigScrape::Listings.new listing_url
  catch "ScrapeBreak" do
- while listings do
+ while listings do
  listings.posts.each do |post|
  throw "ScrapeBreak" if post_condition.call(post)
  ret << post
  end
-
+
  listings = listings.next_page
  end
  end
-
+
  ret
  end

  # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
- # Instead, consider using CraigScrape::Posting.new
+ # Instead, consider using CraigScrape::Posting.new
  #
  # Scrapes a single Post Url, and returns a Posting object representing its contents.
  # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
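scrape_until, above, pages through listings until the supplied block returns true, using the catch/throw pair to break out of both the post loop and the page loop at once. A legacy-style usage sketch; the listing URL is illustrative:

require 'libcraigscrape'

# Collect posts until one is more than a week old, then stop paging:
cutoff = Date.today - 7
posts = CraigScrape.scrape_until('http://miami.craigslist.org/sss/') do |post|
  post.post_date <= cutoff
end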
@@ -176,7 +180,7 @@ class CraigScrape
  # Instead, consider using the CraigScrape::posts_since method.
  #
  # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are avialable to be clicked on.
- # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+ # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
  # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
  # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
  #
@@ -185,9 +189,9 @@ class CraigScrape
  self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
  end
  end
-
+
  private
-
+
  # This takes a fragments paramter, and turns it into actual urls
  def listing_urls_for(listing_fragments)
  listing_fragments.collect{ |lf|
@@ -198,20 +202,9 @@ class CraigScrape
  sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
  }.flatten
  end
-
- # Returns the most recentlt expired time for the provided month and day
- def self.most_recently_expired_time(month, day) #:nodoc:
- now = (time_now) ? time_now : Time.now
-
- # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
- ret = Time.local now.year, month, day
- ret = Time.local now.year-1, month, day if ret > now
-
- ret
- end

  end

  require 'listings'
  require 'posting'
- require 'geo_listings'
+ require 'geo_listings'
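The deleted most_recently_expired_time helper existed to pin a year onto craigslist's month/day datestamps while always resolving to a moment in the past; it goes away here along with its time_now test hook (removed in the first hunk). The listings.rb changes below replace it with Date.parse. A sketch of the two behaviors, with 'Dec 31' as the sample datestamp; exact results depend on the current date:

require 'date'

# 1.0 approach (removed above): guess the current year, then step back a year
# if the guess lands in the future, so a "Dec 31" header scraped on Jan 1
# still resolves to the December that just passed:
ret = Time.local Time.now.year, 'dec', 31
ret = Time.local Time.now.year - 1, 'dec', 31 if ret > Time.now

# 1.1.0 approach (see data/lib/listings.rb below): hand the month/day pair to
# Date.parse, which fills in the current year:
Date.parse ['Dec', '31'].join('/')   # => December 31 of the current year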
data/lib/listings.rb CHANGED
@@ -13,7 +13,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
  IMG_TYPE = /^[ ]*(.+)[ ]*$/
  HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
  SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
- NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+ NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
+
+ XPATH_POST_DATE = "*[@class='itemdate']"
+ XPATH_POST_IMGPIC = "*[@class='itempx']/*[@class='p']"
+ XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
+ # There's a couple places that the price hangs out. We search in this order
+ XPATHS_POST_PRICE = ["*[@class='itempp']", "*[@class='itemph']"]

  # Array, PostSummary objects found in the listing
  def posts
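The new XPATH_* constants target the summary markup craigslist rolled out in early December (the 'Post 12/3' comments in later hunks), where each field of a summary <p> carries an item* class. A sketch of how they are used; the sample HTML is hypothetical, inferred only from the class names in the constants:

require 'nokogiri'

p_element = Nokogiri::HTML(<<-HTML).at('p')
  <p><span class="itemdate">Dec 5</span>
     <span class="itempp">$250</span>
     <span class="itempx"><span class="p">pic</span></span></p>
HTML

p_element.at_xpath("*[@class='itemdate']").content.strip        # => "Dec 5"
p_element.at_xpath("*[@class='itempp']").content                # => "$250"
p_element.at_xpath("*[@class='itempx']/*[@class='p']").content  # => "pic"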
@@ -35,7 +41,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
  post_tags.each do |el|
  case el.name
  when 'p'
- post_summary = self.class.parse_summary el, current_date
+ post_summary = parse_summary el, current_date

  # Validate that required fields are present:
  parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
@@ -47,7 +53,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
  # Let's make sense of the h4 tag, and then read all the p tags below it
  if HEADER_DATE.match he_decode(el.inner_html)
  # Generally, the H4 tags contain valid dates. When they do - this is easy:
- current_date = CraigScrape.most_recently_expired_time $1, $2
+ current_date = Date.parse [$1, $2].join('/')
  elsif html.at('h4:last-of-type') == el
  # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
  # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
@@ -65,29 +71,37 @@ class CraigScrape::Listings < CraigScrape::Scraper
  # String, URL Path href-fragment of the next page link
  def next_page_href
  unless @next_page_href
- cursor = html.at 'p:last-of-type'
-
- cursor = cursor.at 'a' if cursor
-
- # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
- next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
-
- # Search listings put their next page in a link towards the top
- next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
-
- # Some search pages have a bug, whereby a 'next page' link isn't displayed,
- # even though we can see that theres another page listed in the page-number links block at the top
- # and bottom of the listing page
- unless next_link
- cursor = html % 'div.sh:first-of-type > b:last-of-type'
-
- # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
- # We're looking good.
- next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+
+ if html.at_xpath(XPATH_PAGENAV_LINKS)
+ # Post 12/3
+ next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
+ @next_page_href = next_link[:href] if next_link
+ else
+ # Old style
+ cursor = html.at 'p:last-of-type'
+
+ cursor = cursor.at 'a' if cursor
+
+ # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+ next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+
+ # Search listings put their next page in a link towards the top
+ next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
+
+ # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+ # even though we can see that theres another page listed in the page-number links block at the top
+ # and bottom of the listing page
+ unless next_link
+ cursor = html % 'div.sh:first-of-type > b:last-of-type'
+
+ # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
+ # We're looking good.
+ next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+ end
+
+ # We have an anchor tag - so - let's assign the href:
+ @next_page_href = next_link[:href] if next_link
  end
-
- # We have an anchor tag - so - let's assign the href:
- @next_page_href = next_link[:href] if next_link
  end

  @next_page_href
@@ -100,12 +114,14 @@ class CraigScrape::Listings < CraigScrape::Scraper

  # Returns a Listings object of the next_page_url on the current listings object
  def next_page
- CraigScrape::Listings.new next_page_url if next_page_url
+ CraigScrape::Listings.new URI.encode(next_page_url) if next_page_url
  end
-
+
+ private
+
  # Takes a paragraph element and returns a mostly-parsed Posting
  # We separate this from the rest of the parsing both for readability and ease of testing
- def self.parse_summary(p_element, date = nil) #:nodoc:
+ def parse_summary(p_element, date = nil) #:nodoc:
  ret = {}

  title_anchor = nil
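next_page now runs the href through URI.encode before handing it to Listings.new, which guards against scraped links that contain characters invalid in a raw request URL, spaces in search queries being the usual offender. For instance, with an illustrative URL:

require 'uri'

URI.encode 'http://miami.craigslist.org/search/sss?query=mountain bike'
# => "http://miami.craigslist.org/search/sss?query=mountain%20bike"

URI.encode is an alias of URI.escape; it was current in the Rubies this release targeted, though later Rubies deprecated and eventually removed it.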
@@ -126,26 +142,45 @@ class CraigScrape::Listings < CraigScrape::Scraper
  end

  location_tag = p_element.at 'font'
- has_pic_tag = p_element.at 'span'

  href = nil

  location = he_decode p_element.at('font').inner_html if location_tag
  ret[:location] = $1 if location and LOCATION.match location

- ret[:img_types] = []
- if has_pic_tag
- img_type = he_decode has_pic_tag.inner_html
- img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+ price_path = XPATHS_POST_PRICE.find{|path|
+ content = p_element.at_xpath(path).try(:content)
+ (!content.nil? and !content.empty?)
+ }
+ ret[:price] = Money.new($1.to_i * 100, 'USD') if price_path and
+ /\$([\d]+)/.match(p_element.at_xpath(price_path).content)

- ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+ ret[:img_types] = []
+ if p_element.at_xpath XPATH_POST_IMGPIC
+ # Post 12/3
+ ret[:img_types] = p_element.at_xpath(XPATH_POST_IMGPIC).content.scan(/\w+/).collect(&:to_sym)
+ else
+ # Old style:
+ has_pic_tag = p_element.at 'span'
+ if has_pic_tag
+ img_type = he_decode has_pic_tag.inner_html
+ img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+
+ ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+ end
  end

- ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
-
+ ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
+
  ret[:post_date] = date
- if SUMMARY_DATE.match he_decode(p_element.children[0])
- ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
+ if p_element.at_xpath(XPATH_POST_DATE)
+ # Post 12/3
+ if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
+ ret[:post_date] = Date.parse [$1, $2].join('/')
+ end
+ elsif SUMMARY_DATE.match he_decode(p_element.children[0])
+ # Old style
+ ret[:post_date] = Date.parse [$1, $2].join('/')
  end

  if title_anchor
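Price extraction is new in 1.1.0: the first XPATHS_POST_PRICE path that yields content is scanned for a dollar figure, which is stored as a Money object denominated in cents. The conversion step in isolation; the sample content string is illustrative:

require 'money'

content = '$250'
# The pattern captures whole dollars only; any cents in the summary are dropped:
price = Money.new($1.to_i * 100, 'USD') if /\$([\d]+)/.match(content)
price.cents    # => 25000
price.format   # => "$250.00"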
@@ -157,4 +192,5 @@ class CraigScrape::Listings < CraigScrape::Scraper

  ret
  end
- end
+
+ end