spidey 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -3,3 +3,4 @@
3
3
  Gemfile.lock
4
4
  pkg/*
5
5
  .rspec
6
+ .rbenv-version
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - "1.9.2"
4
+ - "1.9.3"
5
+ - "2.0.0"
6
+ - rbx-19mode
7
+ matrix:
8
+ allow_failures:
9
+ - rvm: rbx-19mode
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- Spidey
1
+ Spidey [![Build Status](https://travis-ci.org/joeyAghion/spidey.png?branch=master)](https://travis-ci.org/joeyAghion/spidey)
2
2
  ======
3
3
 
4
4
  Spidey provides a bare-bones framework for crawling and scraping web sites. Its goal is to keep boilerplate scraping logic out of your code.
@@ -55,6 +55,12 @@ By default, the lists of URLs being crawled, results scraped, and errors encount
55
55
  Add the [spidey-mongo](https://github.com/joeyAghion/spidey-mongo) gem and include `Spidey::Strategies::Mongo` in your spider to instead use MongoDB to persist these data. [See the docs](https://github.com/joeyAghion/spidey-mongo) for more information. Or, you can implement your own strategy by overriding the appropriate methods from `AbstractSpider`.
56
56
 
57
57
 
58
+ Logging
59
+ -------
60
+
61
+ You may set `Spidey.logger` to a logger of your choosing. When used in a Rails environment, the logger defaults to the Rails logger. Otherwise, it's directed to STDOUT.
62
+
63
+
58
64
  Contributing
59
65
  ------------
60
66
 
@@ -68,4 +74,4 @@ To Do
68
74
 
69
75
  Copyright
70
76
  ---------
71
- Copyright (c) 2012 Joey Aghion, Art.sy Inc. See [LICENSE.txt](LICENSE.txt) for further details.
77
+ Copyright (c) 2012 [Joey Aghion](http://halfamind.aghion.com), [Artsy Inc](http://artsy.net). See [LICENSE.txt](LICENSE.txt) for further details.
data/Rakefile CHANGED
@@ -1 +1,7 @@
1
1
  require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new
5
+
6
+ task :test => :spec
7
+ task :default => :spec
data/lib/spidey.rb CHANGED
@@ -1,2 +1,12 @@
1
+ require "logger"
1
2
  require "spidey/version"
2
3
  require "spidey/abstract_spider"
4
+
5
+ module Spidey
6
+ extend self
7
+ attr_accessor :logger
8
+ end
9
+
10
+ Spidey.logger = Logger.new(STDOUT)
11
+
12
+ require 'spidey/railtie' if defined?(::Rails)
data/lib/spidey/abstract_spider.rb CHANGED
@@ -3,7 +3,7 @@ require 'mechanize'
3
3
 
4
4
  module Spidey
5
5
  class AbstractSpider
6
- attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors
6
+ attr_accessor :urls, :handlers, :results, :request_interval, :errors
7
7
 
8
8
  DEFAULT_REQUEST_INTERVAL = 3 # seconds
9
9
 
@@ -14,14 +14,12 @@ module Spidey
14
14
 
15
15
  # Accepts:
16
16
  # request_interval: number of seconds to wait between requests (default: 3)
17
- # verbose: prints debugging and progress information if true
18
17
  def initialize(attrs = {})
19
18
  @urls = []
20
19
  @handlers = {}
21
20
  @results = []
22
21
  self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
23
22
  @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
24
- @verbose = !!attrs[:verbose]
25
23
  end
26
24
 
27
25
  # Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
@@ -33,7 +31,7 @@ module Spidey
33
31
  break if options[:max_urls] && i >= options[:max_urls]
34
32
  begin
35
33
  page = agent.get(url)
36
- $stderr.puts "Handling #{url.inspect}" if verbose
34
+ Spidey.logger.info "Handling #{url.inspect}"
37
35
  send handler, page, default_data
38
36
  rescue => ex
39
37
  add_error url: url, handler: handler, error: ex
@@ -60,17 +58,17 @@ module Spidey
60
58
  yield url, handlers[url].first, handlers[url].last
61
59
  end
62
60
  end
63
-
61
+
64
62
  # Override this for custom result storage.
65
63
  def record(data)
66
64
  results << data
67
- $stderr.puts "Recording #{data.inspect}" if verbose
65
+ Spidey.logger.info "Recording #{data.inspect}"
68
66
  end
69
-
67
+
70
68
  # Override this for custom error-handling.
71
69
  def add_error(attrs)
72
70
  @errors << attrs
73
- $stderr.puts "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" if verbose
71
+ Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
74
72
  end
75
73
 
76
74
  def resolve_url(href, page)
@@ -98,5 +96,5 @@ module Spidey
98
96
  end
99
97
 
100
98
  end
101
-
99
+
102
100
  end
data/lib/spidey/railtie.rb ADDED
@@ -0,0 +1,7 @@
1
+ module Spidey
2
+ class Railtie < ::Rails::Railtie
3
+ initializer 'spidey.configure_rails_logger' do
4
+ Spidey.logger = ::Rails.logger
5
+ end
6
+ end
7
+ end
data/lib/spidey/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidey
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -11,17 +11,16 @@ Gem::Specification.new do |s|
11
11
  s.summary = %q{A loose framework for crawling and scraping web sites.}
12
12
  s.description = %q{A loose framework for crawling and scraping web sites.}
13
13
  s.license = 'MIT'
14
-
14
+
15
15
  s.rubyforge_project = "spidey"
16
-
16
+
17
17
  s.files = `git ls-files`.split("\n")
18
18
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
20
  s.require_paths = ["lib"]
21
-
21
+
22
22
  s.add_development_dependency "rake"
23
23
  s.add_development_dependency "rspec"
24
- s.add_development_dependency "ruby-debug19"
25
-
24
+
26
25
  s.add_runtime_dependency "mechanize"
27
26
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-21 00:00:00.000000000 Z
12
+ date: 2013-05-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -43,22 +43,6 @@ dependencies:
43
43
  - - ! '>='
44
44
  - !ruby/object:Gem::Version
45
45
  version: '0'
46
- - !ruby/object:Gem::Dependency
47
- name: ruby-debug19
48
- requirement: !ruby/object:Gem::Requirement
49
- none: false
50
- requirements:
51
- - - ! '>='
52
- - !ruby/object:Gem::Version
53
- version: '0'
54
- type: :development
55
- prerelease: false
56
- version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
46
  - !ruby/object:Gem::Dependency
63
47
  name: mechanize
64
48
  requirement: !ruby/object:Gem::Requirement
@@ -83,6 +67,7 @@ extensions: []
83
67
  extra_rdoc_files: []
84
68
  files:
85
69
  - .gitignore
70
+ - .travis.yml
86
71
  - Gemfile
87
72
  - LICENSE.txt
88
73
  - README.md
@@ -90,6 +75,7 @@ files:
90
75
  - examples/ebay_pet_supplies_spider.rb
91
76
  - lib/spidey.rb
92
77
  - lib/spidey/abstract_spider.rb
78
+ - lib/spidey/railtie.rb
93
79
  - lib/spidey/version.rb
94
80
  - spec/spec_helper.rb
95
81
  - spec/spidey/abstract_spider_spec.rb
@@ -109,7 +95,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
109
95
  version: '0'
110
96
  segments:
111
97
  - 0
112
- hash: -3162500508741796001
98
+ hash: 101937807007286306
113
99
  required_rubygems_version: !ruby/object:Gem::Requirement
114
100
  none: false
115
101
  requirements:
@@ -118,10 +104,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
104
  version: '0'
119
105
  segments:
120
106
  - 0
121
- hash: -3162500508741796001
107
+ hash: 101937807007286306
122
108
  requirements: []
123
109
  rubyforge_project: spidey
124
- rubygems_version: 1.8.24
110
+ rubygems_version: 1.8.25
125
111
  signing_key:
126
112
  specification_version: 3
127
113
  summary: A loose framework for crawling and scraping web sites.