scrapey 0.0.7 → 0.0.8

data/lib/scrapey/browser.rb ADDED
@@ -0,0 +1,11 @@
+ require 'selenium-webdriver'
+ require 'capybara/dsl'
+ require 'capybara-webkit'
+
+ Capybara.run_server = false
+ Capybara.current_driver = :webkit
+
+ class Browser
+   include Capybara::DSL
+ end
+ @browser = Browser.new
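
The new browser.rb above wires Capybara's DSL into a plain object so a script can drive a headless webkit session when a page needs JavaScript. A minimal usage sketch (the URL and selector are placeholders, not part of the gem):

    @browser.visit 'http://example.com/listings'   # Capybara::DSL method on the Browser instance
    puts @browser.all('div.listing').size          # Capybara finders work directly
    doc = Nokogiri::HTML(@browser.html)            # or hand the rendered page to Nokogiri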
@@ -13,7 +13,7 @@ module Scrapey
    return nil unless File::exists?(filename)
    debug "Loading #{filename} from cache"
    begin
-     Nokogiri::HTML Marshal.load(File.read(filename))
+     Nokogiri::HTML Marshal.load(File.open(filename, "rb"){|f| f.read})
    rescue Exception => e
      puts e.message
    end
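
Reading the cache file in binary mode ("rb") keeps Marshal data intact on Windows, where the default text mode would translate line endings and corrupt the dump. The write side needs the same treatment; a hedged sketch, assuming a cache_filename helper that maps a URL to a path (not shown in this diff):

    def save_cache url, doc
      File.open(cache_filename(url), "wb"){|f| f.write Marshal.dump(doc.to_s)}  # "wb" mirrors the "rb" read above
    end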
@@ -1,5 +1,5 @@
  module Scrapey
-   VERSION = "0.0.7"
+   VERSION = "0.0.8"
    BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
    URL = "https://github.com/monkeysuffrage/scrapey"
  end
@@ -22,7 +22,15 @@ module Scrapey
    end

    def init_db
-     ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all', 'active_support/multibyte/chars'].each{|lib| require lib}
+     [
+       'active_record',
+       'active_record/schema',
+       'active_record/connection_adapters/abstract/schema_definitions',
+       @config['database']['adapter'],
+       'tzinfo',
+       'active_support/all',
+       'active_support/multibyte/chars'
+     ].each{|lib| require lib}
      ActiveRecord::Base.establish_connection(@config['database'])
    end
  end
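
init_db now requires ActiveRecord's schema pieces explicitly along with the adapter named in config.yml, then opens the connection. An illustrative sketch of what a script can do once it runs (the model name and columns are assumptions, not part of the gem):

    # @config['database'] might look like {"adapter"=>"sqlite3", "database"=>"scrapey.db"}
    init_db

    class Page < ActiveRecord::Base            # hypothetical model backed by an existing `pages` table
    end

    Page.create :url => @url, :status => 200   # columns are illustrative only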
data/lib/scrapey/multi.rb CHANGED
@@ -1,18 +1,23 @@
  require 'httpclient'

+ # monkey patch to remove annoying httpclient warnings
+ class HTTPClient; def warn str; end; end
+
  module Scrapey
    def multi_get_or_post method, all_urls, options = {}
      all_urls.reject!{|url| is_cached? url} if @use_cache
      return unless all_urls.size > 0

-     threads = options[:threads] || [10, all_urls.size].min
+     threads = options[:threads] || 20
      on_success = options[:on_success] || :on_success
      on_error = options[:on_error] || :on_error
      user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
      proxy = options[:proxy] || nil
+     timeout = options[:timeout] || 1000
 
      @lock ||= Mutex.new
-     @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout = OpenSSL::SSL::VERIFY_NONE, 10000}}
+     @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout, c.ssl_config.verify_callback = OpenSSL::SSL::VERIFY_NONE, timeout, proc{true}}}
+
      debug 'starting multi'

      all_urls.each_slice(threads) do |urls|
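
The pool now defaults to 20 threads, honors a :timeout option for HTTPClient's receive_timeout, and installs a permissive SSL verify callback alongside VERIFY_NONE. A hedged sketch of calling into this from a script; the multi_get wrapper, the callback signatures, and the proxy URLs are assumptions for illustration:

    def on_success url, response
      doc = Nokogiri::HTML response.content        # callback arguments assumed: requested URL plus HTTPClient response
      debug "#{url}: #{doc.title}"
    end

    def on_error url, exception
      debug "#{url} failed: #{exception.message}"
    end

    multi_get urls, :threads => 10, :timeout => 60, :proxies => ['http://proxy1:8080', 'http://proxy2:8080']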
@@ -52,7 +52,7 @@ module Scrapey
      end
      case
      when item.is_a?(Array) then @csv << item
-     when item.is_a?(Hash)
+     when item.is_a?(Hash) || item.is_a?(CSV::Row)
        raise 'No fields defined!' unless @fields
        @csv << @fields.map{|f| item[f]}
      else raise "unsupported type: #{item.class}"
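
The save path now treats CSV::Row like a Hash, pulling values out in @fields order. A small sketch, not a verbatim excerpt from the gem; the fields helper and the field names are taken as illustrative:

    fields 'url', 'status'                               # assumed to populate @fields and the CSV header
    save({'url' => @url, 'status' => 200})               # Hash: values emitted in @fields order

    row = CSV::Row.new(['url', 'status'], [@url, 200])
    save row                                             # CSV::Row indexes by header, so item[f] works the same way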
data/lib/scrapey/tee.rb ADDED
@@ -0,0 +1,20 @@
+ module Scrapey
+   class Tee
+     def initialize *targets
+       @targets = targets
+     end
+
+     def write *args
+       @targets.each {|t| t.write(*args)}
+     end
+
+     def flush *args
+       @targets.each {|t| t.flush(*args)}
+     end
+
+     def close
+       @targets.each(&:close)
+     end
+   end
+ end
+
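
Scrapey::Tee fans write/flush/close out to any number of IO-like targets, which is how the loader below mirrors $stderr into errors.log. A standalone sketch:

    log = File.open('errors.log', 'a')
    tee = Scrapey::Tee.new(STDERR, log)
    tee.write "something went wrong\n"   # shows up on the terminal and in errors.log
    tee.flush
    tee.close                            # note: this closes every target, including STDERR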
data/lib/scrapey.rb CHANGED
@@ -8,6 +8,7 @@ require "scrapey/constants"
  require "scrapey/cache"
  require "scrapey/database"
  require "scrapey/multi"
+ require "scrapey/tee"

  include Scrapey

@@ -23,3 +24,5 @@ config_file = "#{BASEDIR}/config/config.yml"
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}

  init_db if @config['database']
+
+ $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "a"))
data/scrapey.gemspec CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |gem|
    gem.require_paths = ["lib"]
    gem.version = Scrapey::VERSION
    gem.add_dependency(%q<mechanize>)
+   gem.add_dependency(%q<httpclient>)
    gem.add_dependency(%q<json>, ["~> 1.7.0"])
  end

@@ -11,7 +11,7 @@ require 'pry'
  def scrape div
    a = div.at('a')
    url = URI.join(@url, a[:href]).to_s
-   return unless visited? url
+   return if visited? url
    item = {}

    save item
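
The one-character change above fixes an inverted guard: "return unless visited? url" skipped every URL that had not been seen yet, i.e. all new work, while "return if visited? url" skips only duplicates. Assuming visited? reports URLs already recorded:

    visited? url   #=> false the first time, so the scrape proceeds and the item is saved
    visited? url   #=> true on a repeat, so scrape returns early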
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: scrapey
  version: !ruby/object:Gem::Version
-   version: 0.0.7
+   version: 0.0.8
    prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-08-27 00:00:00.000000000 Z
+ date: 2012-09-30 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mechanize
@@ -27,6 +27,22 @@ dependencies:
      - - ! '>='
        - !ruby/object:Gem::Version
          version: '0'
+ - !ruby/object:Gem::Dependency
+   name: httpclient
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
  - !ruby/object:Gem::Dependency
    name: json
    requirement: !ruby/object:Gem::Requirement
@@ -62,6 +78,7 @@ files:
  - examples/output.csv
  - examples/redis.rb
  - examples/status_check.rb
+ - lib/scrapey/browser.rb
  - lib/scrapey/cache/disk.rb
  - lib/scrapey/cache/redis.rb
  - lib/scrapey/cache.rb
@@ -69,9 +86,9 @@ files:
  - lib/scrapey/database.rb
  - lib/scrapey/multi.rb
  - lib/scrapey/scrapey.rb
+ - lib/scrapey/tee.rb
  - lib/scrapey/template.rb
  - lib/scrapey.rb
- - output.csv
  - scrapey.gemspec
  - template/config/config.yml
  - template/Gemfile
data/output.csv DELETED
@@ -1,5 +0,0 @@
- url,status
- http://www.bing.com/,200
- http://www.bing.com/404.html,404
- http://locahlost2/foo,getaddrinfo: No such host is known. (http://locahlost2:80)
- http://www.google.com/,302
- http://www.google.com/,302