spidey 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -3,3 +3,4 @@
3
3
  Gemfile.lock
4
4
  pkg/*
5
5
  .rspec
6
+ .rbenv-version
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - "1.9.2"
4
+ - "1.9.3"
5
+ - "2.0.0"
6
+ - rbx-19mode
7
+ matrix:
8
+ allow_failures:
9
+ - rvm: rbx-19mode
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- Spidey
1
+ Spidey [![Build Status](https://travis-ci.org/joeyAghion/spidey.png?branch=master)](https://travis-ci.org/joeyAghion/spidey)
2
2
  ======
3
3
 
4
4
  Spidey provides a bare-bones framework for crawling and scraping web sites. Its goal is to keep boilerplate scraping logic out of your code.
@@ -55,6 +55,12 @@ By default, the lists of URLs being crawled, results scraped, and errors encount
55
55
  Add the [spidey-mongo](https://github.com/joeyAghion/spidey-mongo) gem and include `Spidey::Strategies::Mongo` in your spider to instead use MongoDB to persist these data. [See the docs](https://github.com/joeyAghion/spidey-mongo) for more information. Or, you can implement your own strategy by overriding the appropriate methods from `AbstractSpider`.
56
56
 
57
57
 
58
+ Logging
59
+ -------
60
+
61
+ You may set `Spidey.logger` to a logger of your choosing. When used in a Rails environment, the logger defaults to the Rails logger. Otherwise, it's directed to STDOUT.
62
+
63
+
58
64
  Contributing
59
65
  ------------
60
66
 
@@ -68,4 +74,4 @@ To Do
68
74
 
69
75
  Copyright
70
76
  ---------
71
- Copyright (c) 2012 Joey Aghion, Art.sy Inc. See [LICENSE.txt](LICENSE.txt) for further details.
77
+ Copyright (c) 2012 [Joey Aghion](http://halfamind.aghion.com), [Artsy Inc](http://artsy.net). See [LICENSE.txt](LICENSE.txt) for further details.
data/Rakefile CHANGED
@@ -1 +1,7 @@
1
1
  require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new
5
+
6
+ task :test => :spec
7
+ task :default => :spec
data/lib/spidey.rb CHANGED
@@ -1,2 +1,12 @@
1
+ require "logger"
1
2
  require "spidey/version"
2
3
  require "spidey/abstract_spider"
4
+
5
+ module Spidey
6
+ extend self
7
+ attr_accessor :logger
8
+ end
9
+
10
+ Spidey.logger = Logger.new(STDOUT)
11
+
12
+ require 'spidey/railtie' if defined?(::Rails)
data/lib/spidey/abstract_spider.rb CHANGED
@@ -3,7 +3,7 @@ require 'mechanize'
3
3
 
4
4
  module Spidey
5
5
  class AbstractSpider
6
- attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors
6
+ attr_accessor :urls, :handlers, :results, :request_interval, :errors
7
7
 
8
8
  DEFAULT_REQUEST_INTERVAL = 3 # seconds
9
9
 
@@ -14,14 +14,12 @@ module Spidey
14
14
 
15
15
  # Accepts:
16
16
  # request_interval: number of seconds to wait between requests (default: 3)
17
- # verbose: prints debugging and progress information if true
18
17
  def initialize(attrs = {})
19
18
  @urls = []
20
19
  @handlers = {}
21
20
  @results = []
22
21
  self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
23
22
  @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
24
- @verbose = !!attrs[:verbose]
25
23
  end
26
24
 
27
25
  # Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
@@ -33,7 +31,7 @@ module Spidey
33
31
  break if options[:max_urls] && i >= options[:max_urls]
34
32
  begin
35
33
  page = agent.get(url)
36
- $stderr.puts "Handling #{url.inspect}" if verbose
34
+ Spidey.logger.info "Handling #{url.inspect}"
37
35
  send handler, page, default_data
38
36
  rescue => ex
39
37
  add_error url: url, handler: handler, error: ex
@@ -60,17 +58,17 @@ module Spidey
60
58
  yield url, handlers[url].first, handlers[url].last
61
59
  end
62
60
  end
63
-
61
+
64
62
  # Override this for custom result storage.
65
63
  def record(data)
66
64
  results << data
67
- $stderr.puts "Recording #{data.inspect}" if verbose
65
+ Spidey.logger.info "Recording #{data.inspect}"
68
66
  end
69
-
67
+
70
68
  # Override this for custom error-handling.
71
69
  def add_error(attrs)
72
70
  @errors << attrs
73
- $stderr.puts "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" if verbose
71
+ Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
74
72
  end
75
73
 
76
74
  def resolve_url(href, page)
@@ -98,5 +96,5 @@ module Spidey
98
96
  end
99
97
 
100
98
  end
101
-
99
+
102
100
  end
data/lib/spidey/railtie.rb ADDED
@@ -0,0 +1,7 @@
1
+ module Spidey
2
+ class Railtie < ::Rails::Railtie
3
+ initializer 'spidey.configure_rails_logger' do
4
+ Spidey.logger = ::Rails.logger
5
+ end
6
+ end
7
+ end
data/lib/spidey/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidey
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
data/spidey.gemspec CHANGED
@@ -11,17 +11,16 @@ Gem::Specification.new do |s|
11
11
  s.summary = %q{A loose framework for crawling and scraping web sites.}
12
12
  s.description = %q{A loose framework for crawling and scraping web sites.}
13
13
  s.license = 'MIT'
14
-
14
+
15
15
  s.rubyforge_project = "spidey"
16
-
16
+
17
17
  s.files = `git ls-files`.split("\n")
18
18
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
20
  s.require_paths = ["lib"]
21
-
21
+
22
22
  s.add_development_dependency "rake"
23
23
  s.add_development_dependency "rspec"
24
- s.add_development_dependency "ruby-debug19"
25
-
24
+
26
25
  s.add_runtime_dependency "mechanize"
27
26
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-21 00:00:00.000000000 Z
12
+ date: 2013-05-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -43,22 +43,6 @@ dependencies:
43
43
  - - ! '>='
44
44
  - !ruby/object:Gem::Version
45
45
  version: '0'
46
- - !ruby/object:Gem::Dependency
47
- name: ruby-debug19
48
- requirement: !ruby/object:Gem::Requirement
49
- none: false
50
- requirements:
51
- - - ! '>='
52
- - !ruby/object:Gem::Version
53
- version: '0'
54
- type: :development
55
- prerelease: false
56
- version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
46
  - !ruby/object:Gem::Dependency
63
47
  name: mechanize
64
48
  requirement: !ruby/object:Gem::Requirement
@@ -83,6 +67,7 @@ extensions: []
83
67
  extra_rdoc_files: []
84
68
  files:
85
69
  - .gitignore
70
+ - .travis.yml
86
71
  - Gemfile
87
72
  - LICENSE.txt
88
73
  - README.md
@@ -90,6 +75,7 @@ files:
90
75
  - examples/ebay_pet_supplies_spider.rb
91
76
  - lib/spidey.rb
92
77
  - lib/spidey/abstract_spider.rb
78
+ - lib/spidey/railtie.rb
93
79
  - lib/spidey/version.rb
94
80
  - spec/spec_helper.rb
95
81
  - spec/spidey/abstract_spider_spec.rb
@@ -109,7 +95,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
109
95
  version: '0'
110
96
  segments:
111
97
  - 0
112
- hash: -3162500508741796001
98
+ hash: 101937807007286306
113
99
  required_rubygems_version: !ruby/object:Gem::Requirement
114
100
  none: false
115
101
  requirements:
@@ -118,10 +104,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
104
  version: '0'
119
105
  segments:
120
106
  - 0
121
- hash: -3162500508741796001
107
+ hash: 101937807007286306
122
108
  requirements: []
123
109
  rubyforge_project: spidey
124
- rubygems_version: 1.8.24
110
+ rubygems_version: 1.8.25
125
111
  signing_key:
126
112
  specification_version: 3
127
113
  summary: A loose framework for crawling and scraping web sites.