libcraigscrape 0.7.0 → 0.8.0

data/CHANGELOG CHANGED
@@ -1,5 +1,24 @@
  == Change Log

+ === Release 0.8.0 (Oct 22, 2009)
+ - Lots of substantial changes to the API & craigwatch (though backwards compatibility is mostly preserved)
+ - Added :code_tests to the rakefile
+ - Report definitions no longer need a full path for the :dbfile; the parameter can now be relative to the yaml file itself
+ - Added Listings::next_page
+ - craigwatch: when no regex is given in _has or _has_no, we now perform a case-insensitive search
+ - Created the CraigScrape::GeoListings.find_sites & CraigScrape::GeoListings.sites_in_path methods
+ - <b>Large API changes</b>: added a constructor to CraigScrape, and changed a number of ways that sites are scraped (see the sketch after this hunk)
+ - Changed the format of the craigwatch tracking db - you'll need to delete any dbs you already have and let the migrations re-run
+ - craigwatch is *much* more efficient with memory. Feel free to scrape the whole world now!
+ - craigwatch's yml format changed a bit - documented in craigwatch
+ - craigwatch will more or less automatically figure out the tracking_database if none is specified (defaulting to sqlite and an auto-generated filename)
+ - craigwatch's report_name is optional now too, and can largely figure itself out
+ - Added summary_or_full_post_has and summary_or_full_post_has_no as craigwatch report parameters
+ - If a craigwatch search comes up empty, we now indicate that no results were found
+ - Added location_has and location_has_no to craigwatch
+ - Cleaned up the rdoc to clarify all the new syntax/features
+ - Added Scraper::retries_on_404_fail and Scraper::sleep_between_404_retries to help deal with some of the subtleties of handling connection reset errors differently than 404s
+
  === Release 0.7.0 (Jul 5, 2009)
  - A good bit of refactoring
  - Eager-loading in the Post object without the need of the full_post method
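
The 0.8.0 entries above name several new calls without showing them in use. A minimal sketch of how they might fit together; the constructor and each_post usage mirrors the README below, but the GeoListings argument shape and the Scraper accessors being class-level setters are assumptions from the names alone:

  require 'rubygems'
  require 'libcraigscrape'
  require 'pp'

  # ASSUMPTION: class-level accessors, inferred from the Scraper:: naming above
  CraigScrape::Scraper.retries_on_404_fail = 3        # hypothetical value
  CraigScrape::Scraper.sleep_between_404_retries = 4  # hypothetical value

  # ASSUMPTION: find_sites resolves geo paths like those the constructor takes
  pp CraigScrape::GeoListings.find_sites(['us/fl'])

  # Grounded in the README examples: construct against a site path, scrape a category
  CraigScrape.new('us/fl/miami').each_post('apa') { |post| puts post.title }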
data/README CHANGED
@@ -17,31 +17,47 @@ Install via RubyGems:

  == Usage

- === Scrape Craigslist Listings since Apr 26
+ === Scrape Craigslist Listings since Sep 10

- Using the search url http://miami.craigslist.org/search/sss?query=apple
+ On the 'miami.craigslist.org' site, using the query "search/sss?query=apple"

  require 'rubygems'
  require 'libcraigscrape'
  require 'date'
  require 'pp'

- posts = CraigScrape.scrape_posts_since 'http://miami.craigslist.org/search/sss?query=apple', Time.parse('Apr 25')
- posts.each do |post|
-   pp post
+ miami_cl = CraigScrape.new 'us/fl/miami'
+ miami_cl.posts_since(Time.parse('Sep 10'), 'search/sss?query=apple').each do |post|
+   pp post
  end

  === Scrape Last 225 Craigslist Listings

- Under the category url http://miami.craigslist.org/apa/
+ On the 'miami.craigslist.org' site, under the 'apa' category

  require 'rubygems'
  require 'libcraigscrape'
  require 'pp'

- posts = CraigScrape.scrape_posts 'http://miami.craigslist.org/apa/', 225
- posts.each do |post|
-   pp post
+ i = 1
+ CraigScrape.new('us/fl/miami').each_post('apa') do |post|
+   break if i > 225
+   i += 1
+   pp post
+ end
+
+ === Enumerating posts across multiple sites and sections/searches
+
+ For every Florida site except 'miami.craigslist.org' & 'keys.craigslist.org', output each post in
+ the 'crg' category and in the search for 'artist needed'
+
+ require 'rubygems'
+ require 'libcraigscrape'
+ require 'pp'
+
+ non_sfl_sites = CraigScrape.new('us/fl', '- us/fl/miami', '- us/fl/keys')
+ non_sfl_sites.each_post('crg', 'search/sss?query=artist+needed') do |post|
+   pp post
  end

  === Scrape Single Craigslist Posting
@@ -51,7 +67,7 @@ This grabs the full details under the specific post http://miami.craigslist.org/
  require 'rubygems'
  require 'libcraigscrape'

- post = CraigScrape.scrape_full_post 'http://miami.craigslist.org/mdc/sys/1140808860.html'
+ post = CraigScrape::Posting.new 'http://miami.craigslist.org/mdc/sys/1140808860.html'
  puts "(%s) %s:\n %s" % [ post.post_time.strftime('%b %d'), post.title, post.contents_as_plain ]

  === Scrape Single Craigslist Listing
@@ -61,7 +77,7 @@ This grabs the post summaries of the single listings at http://miami.craigslist.
  require 'rubygems'
  require 'libcraigscrape'

- listing = CraigScrape.scrape_listing 'http://miami.craigslist.org/search/sss?query=laptop'
+ listing = CraigScrape::Listings.new 'http://miami.craigslist.org/search/sss?query=laptop'
  puts 'Found %d posts for the search "laptop" on this page' % listing.posts.length

  == Author
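
The changelog's new Listings::next_page pairs naturally with the listing example in the hunk above. A minimal pagination sketch, assuming next_page returns the next Listings object and nil on the last page (the diff doesn't show its return value):

  require 'rubygems'
  require 'libcraigscrape'

  listing = CraigScrape::Listings.new 'http://miami.craigslist.org/search/sss?query=laptop'

  # ASSUMPTION: next_page yields the following results page, or nil at the end
  while listing
    listing.posts.each { |post| puts post.title }
    listing = listing.next_page
  end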
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ include FileUtils
  RbConfig = Config unless defined? RbConfig

  NAME = "libcraigscrape"
- VERS = ENV['VERSION'] || "0.7.0"
+ VERS = ENV['VERSION'] || "0.8.0"
  PKG = "#{NAME}-#{VERS}"

  RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
@@ -53,7 +53,8 @@ Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = 'doc/rdoc'
  rdoc.options += RDOC_OPTS
  rdoc.main = "README"
- rdoc.rdoc_files.add RDOC_FILES+['lib/**/*.rb']
+ # NOTE: If you don't put libcraigscrape.rb at the beginning, the rdoc ends up looking a little screwy
+ rdoc.rdoc_files.add RDOC_FILES + Dir.glob('lib/*.rb').sort_by { |f| f == 'lib/libcraigscrape.rb' ? -1 : 0 }
  end

  Rake::GemPackageTask.new(SPEC) do |p|
@@ -77,3 +78,44 @@ task :uninstall => [:clean] do
  sh %{sudo gem uninstall #{NAME}}
  end

+ require 'roodi'
+ require 'roodi_task'
+
+ namespace :code_tests do
+   desc "Analyze for code complexity"
+   task :flog do
+     require 'flog'
+
+     flog = Flog.new
+     flog.flog_files ['lib']
+     threshold = 105
+
+     # Collect any methods whose flog score exceeds the threshold
+     bad_methods = flog.totals.select do |name, score|
+       score > threshold
+     end
+
+     bad_methods.sort { |a, b| a[1] <=> b[1] }.each do |name, score|
+       puts "%8.1f: %s" % [score, name]
+     end
+
+     puts "WARNING: #{bad_methods.size} methods have a flog complexity > #{threshold}" unless bad_methods.empty?
+   end
+
+   desc "Analyze for code duplication"
+   task :flay do
+     require 'flay'
+
+     threshold = 25
+     flay = Flay.new(:fuzzy => false, :verbose => false, :mass => threshold)
+     flay.process(*Flay.expand_dirs_to_files(['lib']))
+
+     flay.report
+
+     raise "#{flay.masses.size} chunks of code have a duplicate mass > #{threshold}" unless flay.masses.empty?
+   end
+
+   RoodiTask.new 'roodi', ['lib/*.rb'], 'roodi.yml'
+ end
+
+ desc "Run all code tests"
+ task :code_tests => %w(code_tests:flog code_tests:flay code_tests:roodi)
+
@@ -5,19 +5,19 @@ mapping:
    "debug_mailer": { type: bool, required: no }
    "debug_craigscrape": { type: bool, required: no }

-   "report_name": { type: str, required: yes }
-   "email_to": { type: str, required: yes }
-   "email_from": { type: str, required: no }
+   "report_name": { type: str, required: no }
+   "email_to": { type: str, required: yes }
+   "email_from": { type: str, required: no }
    "smtp_settings":
      type: map
      required: no
      mapping:
-       "address": { type: str, required: yes }
-       "port": { type: int, required: no, default: 25 }
-       "user_name": { type: str, required: no }
-       "domain": { type: str, required: no }
-       "password": { type: str, required: no }
-       "authentication": { type: str, required: no }
+       "address": { type: str, required: yes }
+       "port": { type: int, required: no, default: 25 }
+       "user_name": { type: str, required: no }
+       "domain": { type: str, required: no }
+       "password": { type: str, required: no }
+       "authentication": { type: str, required: no }
    "tracking_database":
      type: map
      mapping:
@@ -34,22 +34,31 @@ mapping:
      - type: map
        class: CraigReportDefinition::SearchDefinition
        mapping:
-         "name": {type: str, required: yes, unique: yes}
-         "has_image": {type: bool, required: no}
-         "newest_first": {type: bool, required: no, default: no}
-         "price_required": {type: bool, required: no, default: no}
-         "price_greater_than": {type: int, required: no}
-         "price_less_than": {type: int, required: no}
-         "full_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
-         "full_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
-         "summary_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
-         "summary_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
-         "listing":
+         "name": {type: str, required: yes, unique: yes}
+         "has_image": {type: bool, required: no}
+         "newest_first": {type: bool, required: no, default: no}
+         "price_required": {type: bool, required: no, default: no}
+         "price_greater_than": {type: int, required: no}
+         "price_less_than": {type: int, required: no}
+         "full_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "full_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "summary_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "summary_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "summary_or_full_post_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "summary_or_full_post_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "location_has": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "location_has_no": {type: seq, required: no, sequence: [ {type: str, unique: yes} ]}
+         "sites":
+           type: seq
+           required: yes
+           sequence:
+             - type: str
+               unique: yes
+         "listings":
            type: seq
            required: yes
            sequence:
              - type: str
-               pattern: /^http[s]?\:\/\//
                unique: yes
          "starting":
            type: str
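
The schema changes are easiest to read as an instance. Here is a sketch of a craigwatch report definition that would satisfy the revised schema; all values are illustrative, and the enclosing "searches" key is an assumption, since this hunk begins below it:

  email_to: you@example.com      # still required; report_name no longer is

  searches:                      # ASSUMPTION: enclosing key, not shown in this hunk
    - name: cheap laptops
      sites: [ us/fl/miami ]                 # new in 0.8.0: site paths rather than URLs
      listings: [ crg ]                      # the http:// pattern requirement is gone
      location_has: [ brickell ]             # new in 0.8.0
      summary_or_full_post_has: [ laptop ]   # new in 0.8.0
      starting: Sep 10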