scrapey 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/scrapey/browser.rb +11 -0
- data/lib/scrapey/cache/disk.rb +1 -1
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/database.rb +9 -1
- data/lib/scrapey/multi.rb +7 -2
- data/lib/scrapey/scrapey.rb +1 -1
- data/lib/scrapey/tee.rb +20 -0
- data/lib/scrapey.rb +3 -0
- data/scrapey.gemspec +1 -0
- data/template/src/template.rb +1 -1
- metadata +20 -3
- data/output.csv +0 -5
data/lib/scrapey/cache/disk.rb
CHANGED
@@ -13,7 +13,7 @@ module Scrapey
|
|
13
13
|
return nil unless File::exists?(filename)
|
14
14
|
debug "Loading #{filename} from cache"
|
15
15
|
begin
|
16
|
-
Nokogiri::HTML Marshal.load(File.
|
16
|
+
Nokogiri::HTML Marshal.load(File.open(filename, "rb"){|f| f.read})
|
17
17
|
rescue Exception => e
|
18
18
|
puts e.message
|
19
19
|
end
|
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/database.rb
CHANGED
@@ -22,7 +22,15 @@ module Scrapey
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def init_db
|
25
|
-
[
|
25
|
+
[
|
26
|
+
'active_record',
|
27
|
+
'active_record/schema',
|
28
|
+
'active_record/connection_adapters/abstract/schema_definitions',
|
29
|
+
@config['database']['adapter'],
|
30
|
+
'tzinfo',
|
31
|
+
'active_support/all',
|
32
|
+
'active_support/multibyte/chars'
|
33
|
+
].each{|lib| require lib}
|
26
34
|
ActiveRecord::Base.establish_connection(@config['database'])
|
27
35
|
end
|
28
36
|
end
|
data/lib/scrapey/multi.rb
CHANGED
@@ -1,18 +1,23 @@
|
|
1
1
|
require 'httpclient'
|
2
2
|
|
3
|
+
# monkey patch to remove annoying httpclient warnings
|
4
|
+
class HTTPClient; def warn str; end; end
|
5
|
+
|
3
6
|
module Scrapey
|
4
7
|
def multi_get_or_post method, all_urls, options = {}
|
5
8
|
all_urls.reject!{|url| is_cached? url} if @use_cache
|
6
9
|
return unless all_urls.size > 0
|
7
10
|
|
8
|
-
threads = options[:threads] ||
|
11
|
+
threads = options[:threads] || 20
|
9
12
|
on_success = options[:on_success] || :on_success
|
10
13
|
on_error = options[:on_error] || :on_error
|
11
14
|
user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
|
12
15
|
proxy = options[:proxy] || nil
|
16
|
+
timeout = options[:timeout] || 1000
|
13
17
|
|
14
18
|
@lock ||= Mutex.new
|
15
|
-
@http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout = OpenSSL::SSL::VERIFY_NONE,
|
19
|
+
@http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout, c.ssl_config.verify_callback = OpenSSL::SSL::VERIFY_NONE, timeout, proc{true}}}
|
20
|
+
|
16
21
|
debug 'starting multi'
|
17
22
|
|
18
23
|
all_urls.each_slice(threads) do |urls|
|
data/lib/scrapey/scrapey.rb
CHANGED
@@ -52,7 +52,7 @@ module Scrapey
|
|
52
52
|
end
|
53
53
|
case
|
54
54
|
when item.is_a?(Array) then @csv << item
|
55
|
-
when item.is_a?(Hash)
|
55
|
+
when item.is_a?(Hash) || item.is_a?(CSV::Row)
|
56
56
|
raise 'No fields defined!' unless @fields
|
57
57
|
@csv << @fields.map{|f| item[f]}
|
58
58
|
else raise "unsupported type: #{item.class}"
|
data/lib/scrapey/tee.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module Scrapey
|
2
|
+
class Tee
|
3
|
+
def initialize *targets
|
4
|
+
@targets = targets
|
5
|
+
end
|
6
|
+
|
7
|
+
def write *args
|
8
|
+
@targets.each {|t| t.write(*args)}
|
9
|
+
end
|
10
|
+
|
11
|
+
def flush *args
|
12
|
+
@targets.each {|t| t.flush(*args)}
|
13
|
+
end
|
14
|
+
|
15
|
+
def close
|
16
|
+
@targets.each(&:close)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
data/lib/scrapey.rb
CHANGED
@@ -8,6 +8,7 @@ require "scrapey/constants"
|
|
8
8
|
require "scrapey/cache"
|
9
9
|
require "scrapey/database"
|
10
10
|
require "scrapey/multi"
|
11
|
+
require "scrapey/tee"
|
11
12
|
|
12
13
|
include Scrapey
|
13
14
|
|
@@ -23,3 +24,5 @@ config_file = "#{BASEDIR}/config/config.yml"
|
|
23
24
|
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
24
25
|
|
25
26
|
init_db if @config['database']
|
27
|
+
|
28
|
+
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "a"))
|
data/scrapey.gemspec
CHANGED
data/template/src/template.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: httpclient
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: json
|
32
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -62,6 +78,7 @@ files:
|
|
62
78
|
- examples/output.csv
|
63
79
|
- examples/redis.rb
|
64
80
|
- examples/status_check.rb
|
81
|
+
- lib/scrapey/browser.rb
|
65
82
|
- lib/scrapey/cache/disk.rb
|
66
83
|
- lib/scrapey/cache/redis.rb
|
67
84
|
- lib/scrapey/cache.rb
|
@@ -69,9 +86,9 @@ files:
|
|
69
86
|
- lib/scrapey/database.rb
|
70
87
|
- lib/scrapey/multi.rb
|
71
88
|
- lib/scrapey/scrapey.rb
|
89
|
+
- lib/scrapey/tee.rb
|
72
90
|
- lib/scrapey/template.rb
|
73
91
|
- lib/scrapey.rb
|
74
|
-
- output.csv
|
75
92
|
- scrapey.gemspec
|
76
93
|
- template/config/config.yml
|
77
94
|
- template/Gemfile
|