libcraigscrape 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +19 -0
- data/README +27 -11
- data/Rakefile +44 -2
- data/bin/craig_report_schema.yml +30 -21
- data/bin/craigwatch +232 -67
- data/bin/report_mailer/craigslist_report.html.erb +12 -9
- data/bin/report_mailer/craigslist_report.plain.erb +4 -1
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +158 -650
- data/lib/listings.rb +144 -0
- data/lib/posting.rb +293 -0
- data/lib/scraper.rb +203 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/test_craigslist_geolisting.rb +476 -380
- metadata +28 -2
data/CHANGELOG
CHANGED
@@ -1,5 +1,24 @@
|
|
1
1
|
== Change Log
|
2
2
|
|
3
|
+
=== Release 0.8.0 (Oct 22, 2009)
|
4
|
+
- Lots of substantial changes to the API & craigwatch (though backwards compatibility is mostly there)
|
5
|
+
- Added :code_tests to the rakefile
|
6
|
+
- Report definitions don't need a full path to the :dbfile, the parameter here can now be relative to the yaml file itself
|
7
|
+
- Added a Listings::next_page
|
8
|
+
- craigwatch: When not specifying a regex in _has or _has_no, we now perform an insensitive search
|
9
|
+
- Created CraigScrape::GeoListings.find_sites, & CraigScrape::GeoListings.sites_in_path methods
|
10
|
+
- <b>Large API changes</b> Added a constructor to CraigScrape, and changed a number of ways that sites are scraped
|
11
|
+
- Changed the format of the craigwatch tracking db - you'll need to delete any db's you already have and let the migrations re-run
|
12
|
+
- craigwatch is *much* more efficient with memory. Feel free to scrape the whole world now!
|
13
|
+
- craigwatch's yml changed a bit - documented in craigwatch
|
14
|
+
- We'll more or less automatically figure out the tracking_database in craigwatch if none is specified (will default to sqlite and auto-generated filename)
|
15
|
+
- craigwatch report_name is optional too now and can largely figure itself out
|
16
|
+
- Added summary_or_full_post_has and summary_or_full_post_has_no as craigwatch report parameters
|
17
|
+
- If a craigwatch search comes up empty - we now indicate that no results were found...
|
18
|
+
- Added location_has, location_has_no to craigwatch
|
19
|
+
- Cleaned up the rdoc to clarify all the new syntax/features
|
20
|
+
- Added Scraper::retries_on_404_fail, Scraper::sleep_between_404_retries to help deal with some of the subtleties in handling connection reset errors different than the 404's
|
21
|
+
|
3
22
|
=== Release 0.7.0 (Jul 5, 2009)
|
4
23
|
- A good bit of refactoring
|
5
24
|
- Eager-loading in the Post object without the need of the full_post method
|
data/README
CHANGED
@@ -17,31 +17,47 @@ Install via RubyGems:
|
|
17
17
|
|
18
18
|
== Usage
|
19
19
|
|
20
|
-
=== Scrape Craigslist Listings since
|
20
|
+
=== Scrape Craigslist Listings since Sep 10
|
21
21
|
|
22
|
-
|
22
|
+
On the 'miami.craigslist.org' site, using the query "search/sss?query=apple"
|
23
23
|
|
24
24
|
require 'rubygems'
|
25
25
|
require 'libcraigscrape'
|
26
26
|
require 'date'
|
27
27
|
require 'pp'
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
pp post
|
29
|
+
miami_cl = CraigScrape.new 'us/fl/miami'
|
30
|
+
miami_cl.posts_since(Time.parse('Sep 10'), 'search/sss?query=apple').each do |post|
|
31
|
+
pp post
|
32
32
|
end
|
33
33
|
|
34
34
|
=== Scrape Last 225 Craigslist Listings
|
35
35
|
|
36
|
-
|
36
|
+
On the 'miami.craigslist.org' under the 'apa' category
|
37
37
|
|
38
38
|
require 'rubygems'
|
39
39
|
require 'libcraigscrape'
|
40
40
|
require 'pp'
|
41
41
|
|
42
|
-
|
43
|
-
|
44
|
-
|
42
|
+
i=1
|
43
|
+
CraigScrape.new('us/fl/miami').each_post('apa') do |post|
|
44
|
+
break if i > 225
|
45
|
+
i+=1
|
46
|
+
pp post
|
47
|
+
end
|
48
|
+
|
49
|
+
=== Multiple sites with multiple section/search enumeration of posts
|
50
|
+
|
51
|
+
In Florida, with the exception of 'miami.craigslist.org' & 'keys.craigslist.org' sites, output each post in
|
52
|
+
the 'crg' category and for the search 'artist needed'
|
53
|
+
|
54
|
+
require 'rubygems'
|
55
|
+
require 'libcraigscrape'
|
56
|
+
require 'pp'
|
57
|
+
|
58
|
+
non_sfl_sites = CraigScrape.new('us/fl', '- us/fl/miami', '- us/fl/keys')
|
59
|
+
non_sfl_sites.each_post('crg', 'search/sss?query=artist+needed') do |post|
|
60
|
+
pp post
|
45
61
|
end
|
46
62
|
|
47
63
|
=== Scrape Single Craigslist Posting
|
@@ -51,7 +67,7 @@ This grabs the full details under the specific post http://miami.craigslist.org/
|
|
51
67
|
require 'rubygems'
|
52
68
|
require 'libcraigscrape'
|
53
69
|
|
54
|
-
post = CraigScrape.
|
70
|
+
post = CraigScrape::Posting.new 'http://miami.craigslist.org/mdc/sys/1140808860.html'
|
55
71
|
puts "(%s) %s:\n %s" % [ post.post_time.strftime('%b %d'), post.title, post.contents_as_plain ]
|
56
72
|
|
57
73
|
=== Scrape Single Craigslist Listing
|
@@ -61,7 +77,7 @@ This grabs the post summaries of the single listings at http://miami.craigslist.
|
|
61
77
|
require 'rubygems'
|
62
78
|
require 'libcraigscrape'
|
63
79
|
|
64
|
-
listing = CraigScrape.
|
80
|
+
listing = CraigScrape::Listings.new 'http://miami.craigslist.org/search/sss?query=laptop'
|
65
81
|
puts 'Found %d posts for the search "laptop" on this page' % listing.posts.length
|
66
82
|
|
67
83
|
== Author
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ include FileUtils
|
|
11
11
|
RbConfig = Config unless defined? RbConfig
|
12
12
|
|
13
13
|
NAME = "libcraigscrape"
|
14
|
-
VERS = ENV['VERSION'] || "0.
|
14
|
+
VERS = ENV['VERSION'] || "0.8.0"
|
15
15
|
PKG = "#{NAME}-#{VERS}"
|
16
16
|
|
17
17
|
RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
|
@@ -53,7 +53,8 @@ Rake::RDocTask.new do |rdoc|
|
|
53
53
|
rdoc.rdoc_dir = 'doc/rdoc'
|
54
54
|
rdoc.options += RDOC_OPTS
|
55
55
|
rdoc.main = "README"
|
56
|
-
|
56
|
+
# NOTE: If you don't put libcraigscrape.rb at the beginning, the rdoc ends up looking a little screwy
|
57
|
+
rdoc.rdoc_files.add RDOC_FILES+Dir.glob('lib/*.rb').sort_by{|a,b| (a == 'lib/libcraigscrape.rb') ? -1 : 0 }
|
57
58
|
end
|
58
59
|
|
59
60
|
Rake::GemPackageTask.new(SPEC) do |p|
|
@@ -77,3 +78,44 @@ task :uninstall => [:clean] do
|
|
77
78
|
sh %{sudo gem uninstall #{NAME}}
|
78
79
|
end
|
79
80
|
|
81
|
+
require 'roodi'
|
82
|
+
require 'roodi_task'
|
83
|
+
|
84
|
+
namespace :code_tests do
|
85
|
+
desc "Analyze for code complexity"
|
86
|
+
task :flog do
|
87
|
+
require 'flog'
|
88
|
+
|
89
|
+
flog = Flog.new
|
90
|
+
flog.flog_files ['lib']
|
91
|
+
threshold = 105
|
92
|
+
|
93
|
+
bad_methods = flog.totals.select do |name, score|
|
94
|
+
score > threshold
|
95
|
+
end
|
96
|
+
|
97
|
+
bad_methods.sort { |a,b| a[1] <=> b[1] }.each do |name, score|
|
98
|
+
puts "%8.1f: %s" % [score, name]
|
99
|
+
end
|
100
|
+
|
101
|
+
puts "WARNING : #{bad_methods.size} methods have a flog complexity > #{threshold}" unless bad_methods.empty?
|
102
|
+
end
|
103
|
+
|
104
|
+
desc "Analyze for code duplication"
|
105
|
+
require 'flay'
|
106
|
+
task :flay do
|
107
|
+
threshold = 25
|
108
|
+
flay = Flay.new({:fuzzy => false, :verbose => false, :mass => threshold})
|
109
|
+
flay.process(*Flay.expand_dirs_to_files(['lib']))
|
110
|
+
|
111
|
+
flay.report
|
112
|
+
|
113
|
+
raise "#{flay.masses.size} chunks of code have a duplicate mass > #{threshold}" unless flay.masses.empty?
|
114
|
+
end
|
115
|
+
|
116
|
+
RoodiTask.new 'roodi', ['lib/*.rb'], 'roodi.yml'
|
117
|
+
end
|
118
|
+
|
119
|
+
desc "Run all code tests"
|
120
|
+
task :code_tests => %w(code_tests:flog code_tests:flay code_tests:roodi)
|
121
|
+
|
data/bin/craig_report_schema.yml
CHANGED
@@ -5,19 +5,19 @@ mapping:
|
|
5
5
|
"debug_mailer": { type: bool, required: no }
|
6
6
|
"debug_craigscrape": { type: bool, required: no }
|
7
7
|
|
8
|
-
"report_name":
|
9
|
-
"email_to":
|
10
|
-
"email_from":
|
8
|
+
"report_name": { type: str, required: no }
|
9
|
+
"email_to": { type: str, required: yes }
|
10
|
+
"email_from": { type: str, required: no }
|
11
11
|
"smtp_settings":
|
12
12
|
type: map
|
13
13
|
required: no
|
14
14
|
mapping:
|
15
|
-
"address":
|
16
|
-
"port":
|
17
|
-
"user_name":
|
18
|
-
"domain":
|
19
|
-
"password":
|
20
|
-
"authentication":
|
15
|
+
"address": { type: str, required: yes }
|
16
|
+
"port": { type: int, required: no, default: 25 }
|
17
|
+
"user_name": { type: str, required: no }
|
18
|
+
"domain": { type: str, required: no }
|
19
|
+
"password": { type: str, required: no }
|
20
|
+
"authentication": { type: str, required: no }
|
21
21
|
"tracking_database":
|
22
22
|
type: map
|
23
23
|
mapping:
|
@@ -34,22 +34,31 @@ mapping:
|
|
34
34
|
- type: map
|
35
35
|
class: CraigReportDefinition::SearchDefinition
|
36
36
|
mapping:
|
37
|
-
"name":
|
38
|
-
"has_image":
|
39
|
-
"newest_first":
|
40
|
-
"price_required":
|
41
|
-
"price_greater_than":
|
42
|
-
"price_less_than":
|
43
|
-
"full_post_has":
|
44
|
-
"full_post_has_no":
|
45
|
-
"summary_post_has":
|
46
|
-
"summary_post_has_no":
|
47
|
-
"
|
37
|
+
"name": {type: str, required: yes, unique: yes}
|
38
|
+
"has_image": {type: bool, required: no}
|
39
|
+
"newest_first": {type: bool, required: no, default: no}
|
40
|
+
"price_required": {type: bool, required: no, default: no}
|
41
|
+
"price_greater_than": {type: int, required: no}
|
42
|
+
"price_less_than": {type: int, required: no}
|
43
|
+
"full_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
44
|
+
"full_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
45
|
+
"summary_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
46
|
+
"summary_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
47
|
+
"summary_or_full_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
48
|
+
"summary_or_full_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
49
|
+
"location_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
50
|
+
"location_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
|
51
|
+
"sites":
|
52
|
+
type: seq
|
53
|
+
required: yes
|
54
|
+
sequence:
|
55
|
+
- type: str
|
56
|
+
unique: yes
|
57
|
+
"listings":
|
48
58
|
type: seq
|
49
59
|
required: yes
|
50
60
|
sequence:
|
51
61
|
- type: str
|
52
|
-
pattern: /^http[s]?\:\/\//
|
53
62
|
unique: yes
|
54
63
|
"starting":
|
55
64
|
type: str
|