scrapey 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/scrapey/browser.rb ADDED
@@ -0,0 +1,11 @@
+ require 'selenium-webdriver'
+ require 'capybara/dsl'
+ require 'capybara-webkit'
+
+ Capybara.run_server = false
+ Capybara.current_driver = :webkit
+
+ class Browser
+   include Capybara::DSL
+ end
+ @browser = Browser.new
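
The new browser helper is not documented in this diff; as a rough usage sketch (the URL and the Nokogiri hand-off are illustrative, not part of the gem), the instance exposes the Capybara DSL, so a script can drive the webkit driver and parse the rendered page:

  # Illustrative only: visit a page through the Capybara webkit driver and
  # hand the rendered HTML to Nokogiri for scraping.
  @browser.visit 'http://www.bing.com/'
  doc = Nokogiri::HTML @browser.page.body
  puts doc.at('title').text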
@@ -13,7 +13,7 @@ module Scrapey
  return nil unless File::exists?(filename)
  debug "Loading #{filename} from cache"
  begin
- Nokogiri::HTML Marshal.load(File.read(filename))
+ Nokogiri::HTML Marshal.load(File.open(filename, "rb"){|f| f.read})
  rescue Exception => e
  puts e.message
  end
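
The only change here is reading the cached file in binary mode before Marshal.load. Marshal dumps are binary data, so a text-mode File.read on Windows translates line endings and corrupts the dump. A minimal round trip illustrating the point (the filename is made up):

  require 'nokogiri'

  filename = 'page_cache.bin'   # illustrative cache file
  File.open(filename, 'wb') {|f| f.write Marshal.dump('<html><body>hi</body></html>')}
  doc = Nokogiri::HTML Marshal.load(File.open(filename, 'rb') {|f| f.read})
  puts doc.at('body').text      # => "hi"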
@@ -1,5 +1,5 @@
  module Scrapey
- VERSION = "0.0.7"
+ VERSION = "0.0.8"
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
  URL = "https://github.com/monkeysuffrage/scrapey"
  end
@@ -22,7 +22,15 @@ module Scrapey
  end
 
  def init_db
- ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all', 'active_support/multibyte/chars'].each{|lib| require lib}
+ [
+   'active_record',
+   'active_record/schema',
+   'active_record/connection_adapters/abstract/schema_definitions',
+   @config['database']['adapter'],
+   'tzinfo',
+   'active_support/all',
+   'active_support/multibyte/chars'
+ ].each{|lib| require lib}
  ActiveRecord::Base.establish_connection(@config['database'])
  end
  end
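
init_db now pre-loads the ActiveRecord schema classes alongside the adapter named in the config. For reference, a hypothetical config (shown as the hash YAML::load would produce; the sqlite3 values are illustrative, not from the gem):

  # Assumed shape of config/config.yml after YAML::load; init_db requires the
  # 'adapter' value as a library and passes the whole hash to establish_connection.
  @config = {
    'database' => {
      'adapter'  => 'sqlite3',          # illustrative adapter gem
      'database' => 'scrapey.sqlite3'   # illustrative database file
    }
  }
  init_db if @config['database']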
data/lib/scrapey/multi.rb CHANGED
@@ -1,18 +1,23 @@
  require 'httpclient'
 
+ # monkey patch to remove annoying httpclient warnings
+ class HTTPClient; def warn str; end; end
+
  module Scrapey
  def multi_get_or_post method, all_urls, options = {}
  all_urls.reject!{|url| is_cached? url} if @use_cache
  return unless all_urls.size > 0
 
- threads = options[:threads] || [10, all_urls.size].min
+ threads = options[:threads] || 20
  on_success = options[:on_success] || :on_success
  on_error = options[:on_error] || :on_error
  user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  proxy = options[:proxy] || nil
+ timeout = options[:timeout] || 1000
 
  @lock ||= Mutex.new
- @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout = OpenSSL::SSL::VERIFY_NONE, 10000}}
+ @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout, c.ssl_config.verify_callback = OpenSSL::SSL::VERIFY_NONE, timeout, proc{true}}}
+
  debug 'starting multi'
 
  all_urls.each_slice(threads) do |urls|
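
The thread count now defaults to 20, and the new :timeout option (default 1000 seconds) is applied to each client's receive_timeout. A usage sketch, assumed rather than taken from the gem's docs (URLs and option values are illustrative):

  # Fetch a batch of URLs with the defaults overridden; results are handed to
  # methods named on_success / on_error unless other handlers are passed in.
  urls = ['http://www.bing.com/', 'http://www.google.com/', 'http://www.bing.com/404.html']
  multi_get_or_post 'get', urls,
    :threads => 5,    # pool size; the new default is 20
    :timeout => 120   # seconds, set on each client's receive_timeout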
@@ -52,7 +52,7 @@ module Scrapey
  end
  case
  when item.is_a?(Array) then @csv << item
- when item.is_a?(Hash)
+ when item.is_a?(Hash) || item.is_a?(CSV::Row)
  raise 'No fields defined!' unless @fields
  @csv << @fields.map{|f| item[f]}
  else raise "unsupported type: #{item.class}"
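
This lets items be CSV::Row objects (which support the same item[field] lookup as a Hash) whenever @fields is defined. A small sketch of the new case, with assumed field names:

  require 'csv'

  @fields = ['url', 'status']                                  # illustrative fields
  row = CSV::Row.new(@fields, ['http://www.bing.com/', 200])
  save row   # appended to @csv as @fields.map{|f| row[f]}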
data/lib/scrapey/tee.rb ADDED
@@ -0,0 +1,20 @@
+ module Scrapey
+   class Tee
+     def initialize *targets
+       @targets = targets
+     end
+
+     def write *args
+       @targets.each {|t| t.write(*args)}
+     end
+
+     def flush *args
+       @targets.each {|t| t.flush(*args)}
+     end
+
+     def close
+       @targets.each(&:close)
+     end
+   end
+ end
+
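Tee simply fans every write, flush, and close out to all of its targets. A usage sketch mirroring the wiring added to lib/scrapey.rb later in this diff (the log path is illustrative):

  log = File.open('errors.log', 'a')       # illustrative log file
  $stderr = Scrapey::Tee.new(STDERR, log)
  $stderr.write "something went wrong\n"   # shows up on the console and in the log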
data/lib/scrapey.rb CHANGED
@@ -8,6 +8,7 @@ require "scrapey/constants"
8
8
  require "scrapey/cache"
9
9
  require "scrapey/database"
10
10
  require "scrapey/multi"
11
+ require "scrapey/tee"
11
12
 
12
13
  include Scrapey
13
14
 
@@ -23,3 +24,5 @@ config_file = "#{BASEDIR}/config/config.yml"
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
 
  init_db if @config['database']
+
+ $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "a"))
data/scrapey.gemspec CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |gem|
  gem.require_paths = ["lib"]
  gem.version = Scrapey::VERSION
  gem.add_dependency(%q<mechanize>)
+ gem.add_dependency(%q<httpclient>)
  gem.add_dependency(%q<json>, ["~> 1.7.0"])
  end
 
@@ -11,7 +11,7 @@ require 'pry'
  def scrape div
  a = div.at('a')
  url = URI.join(@url, a[:href]).to_s
- return unless visited? url
+ return if visited? url
  item = {}
 
  save item
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: scrapey
  version: !ruby/object:Gem::Version
- version: 0.0.7
+ version: 0.0.8
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-08-27 00:00:00.000000000 Z
+ date: 2012-09-30 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mechanize
@@ -27,6 +27,22 @@ dependencies:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ - !ruby/object:Gem::Dependency
+ name: httpclient
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
  - !ruby/object:Gem::Dependency
  name: json
  requirement: !ruby/object:Gem::Requirement
@@ -62,6 +78,7 @@ files:
  - examples/output.csv
  - examples/redis.rb
  - examples/status_check.rb
+ - lib/scrapey/browser.rb
  - lib/scrapey/cache/disk.rb
  - lib/scrapey/cache/redis.rb
  - lib/scrapey/cache.rb
@@ -69,9 +86,9 @@ files:
  - lib/scrapey/database.rb
  - lib/scrapey/multi.rb
  - lib/scrapey/scrapey.rb
+ - lib/scrapey/tee.rb
  - lib/scrapey/template.rb
  - lib/scrapey.rb
- - output.csv
  - scrapey.gemspec
  - template/config/config.yml
  - template/Gemfile
data/output.csv DELETED
@@ -1,5 +0,0 @@
- url,status
- http://www.bing.com/,200
- http://www.bing.com/404.html,404
- http://locahlost2/foo,getaddrinfo: No such host is known. (http://locahlost2:80)
- http://www.google.com/,302