scrapey 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/scrapey/browser.rb +11 -0
- data/lib/scrapey/cache/disk.rb +1 -1
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/database.rb +9 -1
- data/lib/scrapey/multi.rb +7 -2
- data/lib/scrapey/scrapey.rb +1 -1
- data/lib/scrapey/tee.rb +20 -0
- data/lib/scrapey.rb +3 -0
- data/scrapey.gemspec +1 -0
- data/template/src/template.rb +1 -1
- metadata +20 -3
- data/output.csv +0 -5
data/lib/scrapey/cache/disk.rb
CHANGED
@@ -13,7 +13,7 @@ module Scrapey
|
|
13
13
|
return nil unless File::exists?(filename)
|
14
14
|
debug "Loading #{filename} from cache"
|
15
15
|
begin
|
16
|
-
Nokogiri::HTML Marshal.load(File.
|
16
|
+
Nokogiri::HTML Marshal.load(File.open(filename, "rb"){|f| f.read})
|
17
17
|
rescue Exception => e
|
18
18
|
puts e.message
|
19
19
|
end
|
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/database.rb
CHANGED
@@ -22,7 +22,15 @@ module Scrapey
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def init_db
|
25
|
-
[
|
25
|
+
[
|
26
|
+
'active_record',
|
27
|
+
'active_record/schema',
|
28
|
+
'active_record/connection_adapters/abstract/schema_definitions',
|
29
|
+
@config['database']['adapter'],
|
30
|
+
'tzinfo',
|
31
|
+
'active_support/all',
|
32
|
+
'active_support/multibyte/chars'
|
33
|
+
].each{|lib| require lib}
|
26
34
|
ActiveRecord::Base.establish_connection(@config['database'])
|
27
35
|
end
|
28
36
|
end
|
data/lib/scrapey/multi.rb
CHANGED
@@ -1,18 +1,23 @@
|
|
1
1
|
require 'httpclient'
|
2
2
|
|
3
|
+
# monkey patch to remove annoying httpclient warnings
|
4
|
+
class HTTPClient; def warn str; end; end
|
5
|
+
|
3
6
|
module Scrapey
|
4
7
|
def multi_get_or_post method, all_urls, options = {}
|
5
8
|
all_urls.reject!{|url| is_cached? url} if @use_cache
|
6
9
|
return unless all_urls.size > 0
|
7
10
|
|
8
|
-
threads = options[:threads] ||
|
11
|
+
threads = options[:threads] || 20
|
9
12
|
on_success = options[:on_success] || :on_success
|
10
13
|
on_error = options[:on_error] || :on_error
|
11
14
|
user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
|
12
15
|
proxy = options[:proxy] || nil
|
16
|
+
timeout = options[:timeout] || 1000
|
13
17
|
|
14
18
|
@lock ||= Mutex.new
|
15
|
-
@http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout = OpenSSL::SSL::VERIFY_NONE,
|
19
|
+
@http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout, c.ssl_config.verify_callback = OpenSSL::SSL::VERIFY_NONE, timeout, proc{true}}}
|
20
|
+
|
16
21
|
debug 'starting multi'
|
17
22
|
|
18
23
|
all_urls.each_slice(threads) do |urls|
|
data/lib/scrapey/scrapey.rb
CHANGED
@@ -52,7 +52,7 @@ module Scrapey
|
|
52
52
|
end
|
53
53
|
case
|
54
54
|
when item.is_a?(Array) then @csv << item
|
55
|
-
when item.is_a?(Hash)
|
55
|
+
when item.is_a?(Hash) || item.is_a?(CSV::Row)
|
56
56
|
raise 'No fields defined!' unless @fields
|
57
57
|
@csv << @fields.map{|f| item[f]}
|
58
58
|
else raise "unsupported type: #{item.class}"
|
data/lib/scrapey/tee.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module Scrapey
|
2
|
+
class Tee
|
3
|
+
def initialize *targets
|
4
|
+
@targets = targets
|
5
|
+
end
|
6
|
+
|
7
|
+
def write *args
|
8
|
+
@targets.each {|t| t.write(*args)}
|
9
|
+
end
|
10
|
+
|
11
|
+
def flush *args
|
12
|
+
@targets.each {|t| t.flush(*args)}
|
13
|
+
end
|
14
|
+
|
15
|
+
def close
|
16
|
+
@targets.each(&:close)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
data/lib/scrapey.rb
CHANGED
@@ -8,6 +8,7 @@ require "scrapey/constants"
|
|
8
8
|
require "scrapey/cache"
|
9
9
|
require "scrapey/database"
|
10
10
|
require "scrapey/multi"
|
11
|
+
require "scrapey/tee"
|
11
12
|
|
12
13
|
include Scrapey
|
13
14
|
|
@@ -23,3 +24,5 @@ config_file = "#{BASEDIR}/config/config.yml"
|
|
23
24
|
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
24
25
|
|
25
26
|
init_db if @config['database']
|
27
|
+
|
28
|
+
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "a"))
|
data/scrapey.gemspec
CHANGED
data/template/src/template.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: httpclient
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: json
|
32
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -62,6 +78,7 @@ files:
|
|
62
78
|
- examples/output.csv
|
63
79
|
- examples/redis.rb
|
64
80
|
- examples/status_check.rb
|
81
|
+
- lib/scrapey/browser.rb
|
65
82
|
- lib/scrapey/cache/disk.rb
|
66
83
|
- lib/scrapey/cache/redis.rb
|
67
84
|
- lib/scrapey/cache.rb
|
@@ -69,9 +86,9 @@ files:
|
|
69
86
|
- lib/scrapey/database.rb
|
70
87
|
- lib/scrapey/multi.rb
|
71
88
|
- lib/scrapey/scrapey.rb
|
89
|
+
- lib/scrapey/tee.rb
|
72
90
|
- lib/scrapey/template.rb
|
73
91
|
- lib/scrapey.rb
|
74
|
-
- output.csv
|
75
92
|
- scrapey.gemspec
|
76
93
|
- template/config/config.yml
|
77
94
|
- template/Gemfile
|