daimon_skycrawlers 0.12.0 → 1.0.0.pre.rc1
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +0 -2
- data/Rakefile +3 -1
- data/daimon_skycrawlers.gemspec +1 -1
- data/lib/daimon_skycrawlers/callbacks.rb +19 -0
- data/lib/daimon_skycrawlers/commands/enqueue.rb +40 -0
- data/lib/daimon_skycrawlers/commands/runner.rb +1 -1
- data/lib/daimon_skycrawlers/crawler/base.rb +18 -3
- data/lib/daimon_skycrawlers/crawler/default.rb +7 -7
- data/lib/daimon_skycrawlers/generator/new.rb +4 -1
- data/lib/daimon_skycrawlers/generator/processor.rb +1 -1
- data/lib/daimon_skycrawlers/processor/base.rb +28 -1
- data/lib/daimon_skycrawlers/processor/default.rb +0 -1
- data/lib/daimon_skycrawlers/processor/spider.rb +0 -1
- data/lib/daimon_skycrawlers/sitemap_parser.rb +13 -6
- data/lib/daimon_skycrawlers/storage/base.rb +1 -1
- data/lib/daimon_skycrawlers/storage/file.rb +24 -13
- data/lib/daimon_skycrawlers/storage/null.rb +1 -1
- data/lib/daimon_skycrawlers/storage/rdb.rb +19 -6
- data/lib/daimon_skycrawlers/timer.rb +6 -1
- data/lib/daimon_skycrawlers/version.rb +1 -1
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c9a394115d873b122e18a6970e62dfbc897dd690
+  data.tar.gz: 55fc07c13fa04c82c5e1c6a1075ffd97793b01f9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5ae484de037763f04487ad8c1ea4047a78a726156781cffeebf0c4c14ae0dfb70029b5f834c1414af9ee43c2b67fb7b137e5d5e335c2dee01abe9aae6ced1099
+  data.tar.gz: 63c85408015248ca66448025b2eb8ff6b65080c3da375b2e5b9723e944e3836f73285cb083088a31c1c8d60fe6176f57c47bc2d894fc7618c5853b3aebc3d500
data/.travis.yml
CHANGED
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -6,7 +6,9 @@ require "daimon_skycrawlers/tasks"
 Rake::TestTask.new(:test) do |t|
   t.libs << "test"
   t.libs << "lib"
-  t.test_files = FileList["test
+  t.test_files = FileList["test/**/test_*.rb"]
+  t.verbose = false
+  t.warning = false
 end

 task :default => [:test]
data/daimon_skycrawlers.gemspec
CHANGED
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "webrobots"

   spec.add_development_dependency "bundler", "~> 1.11"
-  spec.add_development_dependency "rake", "~>
+  spec.add_development_dependency "rake", "~> 12.0"
   spec.add_development_dependency "test-unit"
   spec.add_development_dependency "test-unit-rr"
   spec.add_development_dependency "test-unit-notify"
data/lib/daimon_skycrawlers/callbacks.rb
CHANGED
@@ -3,6 +3,7 @@ module DaimonSkycrawlers
     def initialize
       super
       @before_process_callbacks = []
+      @after_process_callbacks = []
     end

     def before_process(callback = nil, &block)
@@ -22,5 +23,23 @@ module DaimonSkycrawlers
     def clear_before_process_callbacks
       @before_process_callbacks = []
     end
+
+    def after_process(callback = nil, &block)
+      if block_given?
+        @after_process_callbacks << block
+      else
+        @after_process_callbacks << callback if callback.respond_to?(:call)
+      end
+    end
+
+    def run_after_callbacks(message)
+      @after_process_callbacks.each do |callback|
+        callback.call(message)
+      end
+    end
+
+    def clear_after_process_callbacks
+      @after_process_callbacks = []
+    end
   end
 end
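The new `after_process` hooks mirror the existing `before_process` API: a callback registers either as a block or as any object responding to `call`, and `run_after_callbacks` invokes each one in order with the given message. A minimal usage sketch (the `Notifier` class and message contents are hypothetical; only the module above is from the diff):

    require "daimon_skycrawlers/callbacks"

    # Hypothetical consumer of the Callbacks module.
    class Notifier
      include DaimonSkycrawlers::Callbacks

      def work(message)
        run_after_callbacks(message)
      end
    end

    notifier = Notifier.new
    notifier.after_process { |message| puts "block: #{message[:url]}" }     # block form
    notifier.after_process(->(message) { puts "lambda: #{message[:url]}" }) # call-able form
    notifier.work(url: "http://example.com/")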
data/lib/daimon_skycrawlers/commands/enqueue.rb
CHANGED
@@ -3,6 +3,7 @@ require "daimon_skycrawlers/crawler"
 require "daimon_skycrawlers/processor"
 require "daimon_skycrawlers/sitemap_parser"
 require "daimon_skycrawlers/version"
+require "thor"
 require "webrobots"

 module DaimonSkycrawlers
@@ -47,6 +48,45 @@ module DaimonSkycrawlers
         end
       end

+      desc "list PATH", "Enqueue URLs from PATH. PATH content includes a URL per line"
+      method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
+      def list(path)
+        load_init
+        File.open(path, "r") do |file|
+          file.each_line do |line|
+            line.chomp!
+            next if /\A#/ =~ line
+            case options["type"]
+            when "response"
+              DaimonSkycrawlers::Processor.enqueue_http_response(line)
+            when "url"
+              DaimonSkycrawlers::Crawler.enqueue_url(line)
+            else
+              raise ArgumentError, "Unknown type: #{options["type"]}"
+            end
+          end
+        end
+      end
+
+      desc "yaml PATH", "Enqueue URLs from PATH."
+      method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
+      def yaml(path)
+        load_init
+        YAML.load_file(path).each do |hash|
+          url = hash["url"]
+          message = hash["message"] || {}
+          raise "Could not find URL: #{hash}" unless url
+          case options["type"]
+          when "response"
+            DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+          when "url"
+            DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+          else
+            raise ArgumentError, "Unknown type: #{options["type"]}"
+          end
+        end
+      end
+
       private

       def load_init
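The two new Thor commands read either a plain text file (one URL per line, `#` lines skipped) or a YAML file whose entries carry a `url` plus an optional `message` hash; `--type`/`-t` switches between enqueuing crawl requests (`url`, the default) and stored HTTP responses (`response`). A sketch of both input formats and invocations (the `daimon_skycrawlers enqueue` command path is an assumption; the CLI wiring lives in commands/runner.rb, whose diff is not shown):

    # urls.txt -- one URL per line; lines starting with "#" are skipped
    http://example.com/
    http://example.com/news

    # urls.yml -- each entry needs "url"; "message" is optional
    - url: http://example.com/
      message:
        key: example-top

    $ daimon_skycrawlers enqueue list urls.txt --type url
    $ daimon_skycrawlers enqueue yaml urls.yml -t response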
data/lib/daimon_skycrawlers/crawler/base.rb
CHANGED
@@ -1,5 +1,6 @@
 require "uri"
 require "faraday"
+require "typhoeus/adapters/faraday"

 require "daimon_skycrawlers/logger"
 require "daimon_skycrawlers/config"
@@ -10,6 +11,8 @@ require "daimon_skycrawlers/processor"
 require "daimon_skycrawlers/filter/update_checker"
 require "daimon_skycrawlers/filter/robots_txt_checker"

+Faraday.default_adapter = :typhoeus
+
 module DaimonSkycrawlers
   module Crawler
     #
@@ -88,10 +91,10 @@ module DaimonSkycrawlers
         @n_processed_urls += 1

         setup_default_filters
+        setup_default_post_processes

         proceeding = run_before_callbacks(message)
         unless proceeding
-          @skipped = true
           skip(message[:url])
           return
         end
@@ -101,7 +104,10 @@ module DaimonSkycrawlers
         url = (URI(connection.url_prefix) + url).to_s

         @prepare.call(connection)
-        fetch(url, message, &block)
+        response = fetch(url, message, &block)
+        data = { url: url, message: message, response: response }
+        run_after_callbacks(data)
+        data
       end

       def fetch(path, message = {})
@@ -137,8 +143,17 @@ module DaimonSkycrawlers
         end
       end

+      def setup_default_post_processes
+        after_process do |data|
+          storage.save(data)
+          message = data[:message]
+          url = data[:url]
+          schedule_to_process(url, message)
+        end
+      end
+
       def skip(url)
-        log.info("
+        log.info("Skipped '#{url}' by '#{self.class}'")
         @skipped = true
         schedule_to_process(url.to_s, heartbeat: true)
       end
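Taken together: a successful fetch now flows through `run_after_callbacks` with a `{ url:, message:, response: }` hash, and the default post-process persists the page and schedules it for processing. Extra hooks can piggyback on the same hash; a sketch (the crawler instance and the `status` accessor on the Faraday response are assumptions):

    crawler.after_process do |data|
      response = data[:response]
      puts "#{data[:url]} -> HTTP #{response.status}"
    end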
data/lib/daimon_skycrawlers/crawler/default.rb
CHANGED
@@ -9,13 +9,13 @@
     #
     class Default < Base
       def fetch(url, message)
-
-
-
-
-
-
-
+        params = message[:params] || {}
+        method = message[:method] || "GET"
+        if method == "POST"
+          post(url, params)
+        else
+          get(url, params)
+        end
       end
     end
   end
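`Default#fetch` now takes the HTTP method and parameters from the enqueued message, defaulting to a GET, so a POST crawl can be requested entirely through the message. A sketch (URL and params are illustrative; `enqueue_url(url, message)` is the signature used by the enqueue commands above):

    DaimonSkycrawlers::Crawler.enqueue_url(
      "http://example.com/search",
      method: "POST",
      params: { "q" => "ruby" }
    )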
data/lib/daimon_skycrawlers/generator/new.rb
CHANGED
@@ -39,6 +39,7 @@
       }
       invoke(MigrationGenerator, [
         "CreatePages",
+        "key:string",
         "url:string",
         "headers:text",
         "body:binary",
@@ -56,6 +57,8 @@ module DaimonSkycrawlers
       <<-CODE.chomp
 #{indent}t.timestamps

+#{indent}t.index [:key]
+#{indent}t.index [:key, :updated_at]
 #{indent}t.index [:url]
 #{indent}t.index [:url, :updated_at]
       CODE
@@ -95,7 +98,7 @@ module DaimonSkycrawlers

     def display_post_message
       puts <<MESSAGE
-Check .env and .env.db before
+Check .env and .env.db before running `docker-compose build` or `docker-compose up`.
 MESSAGE
     end
   end
data/lib/daimon_skycrawlers/generator/processor.rb
CHANGED
@@ -24,7 +24,7 @@
 You can register your processor in `app/processor.rb` to run your processor.
 Following code snippet is useful:

-    processor = #{
+    processor = #{name.classify}.new
     DaimonSkycrawlers.register_processor(processor)

 MESSAGE
data/lib/daimon_skycrawlers/processor/base.rb
CHANGED
@@ -11,9 +11,23 @@
       include DaimonSkycrawlers::Callbacks
       include DaimonSkycrawlers::Configurable

+      def initialize
+        super
+        @skipped = false
+      end
+
+      def skipped?
+        @skipped
+      end
+
       def process(message)
+        @skipped = false
+        setup_default_filters
         proceeding = run_before_callbacks(message)
-
+        unless proceeding
+          skip(message[:url])
+          return
+        end
         call(message)
       end

@@ -24,6 +38,19 @@ module DaimonSkycrawlers
       def storage
         @storage ||= DaimonSkycrawlers::Storage::RDB.new
       end
+
+      private
+
+      def setup_default_filters
+        before_process do |m|
+          !m[:heartbeat]
+        end
+      end
+
+      def skip(url)
+        log.info("Skipped '#{url}' by '#{self.class}'")
+        @skipped = true
+      end
     end
   end
 end
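Processors now track skip state the same way crawlers do: a default `before_process` filter drops heartbeat messages, and `skipped?` reports whether the last `process` call was short-circuited. A sketch (the `MyProcessor` subclass and its `call` body are hypothetical):

    class MyProcessor < DaimonSkycrawlers::Processor::Base
      def call(message)
        puts "processing #{message[:url]}"
      end
    end

    processor = MyProcessor.new
    processor.process(url: "http://example.com/", heartbeat: true)
    processor.skipped? # => true, the heartbeat filter stopped it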
data/lib/daimon_skycrawlers/sitemap_parser.rb
CHANGED
@@ -6,6 +6,9 @@ require "uri"
 module DaimonSkycrawlers
   # Based on https://github.com/benbalter/sitemap-parser
   class SitemapParser
+    class Error < StandardError
+    end
+
     def initialize(urls, options = {})
       @urls = urls
     end
@@ -14,7 +17,8 @@ module DaimonSkycrawlers
       hydra = Typhoeus::Hydra.new(max_concurrency: 1)
       sitemap_urls = []
       @urls.each do |url|
-
+        uri = URI(url)
+        if uri.scheme && uri.scheme.start_with?("http")
           request = Typhoeus::Request.new(url, followlocation: true)
           request.on_complete do |response|
             sitemap_urls.concat(on_complete(response))
@@ -22,18 +26,21 @@ module DaimonSkycrawlers
           hydra.queue(request)
         else
           if File.exist?(url)
-            extract_urls(File.read(url))
+            sitemap_urls.concat(extract_urls(File.read(url)))
           end
         end
       end
-
+      loop do
+        hydra.run
+        break if hydra.queued_requests.empty?
+      end
       sitemap_urls
     end

     private

     def on_complete(response)
-      raise "HTTP requset to #{response.effective_url} failed" unless response.success?
+      raise Error, "HTTP requset to #{response.effective_url} failed. status: #{response.code}" unless response.success?
       raw_sitemap = inflate_response(response)
       extract_urls(raw_sitemap)
     end
@@ -51,7 +58,7 @@ module DaimonSkycrawlers
           url.at("loc").content
         end
       else
-        raise "Malformed sitemap.xml no <sitemapindex> or <urlset>"
+        raise Error, "Malformed sitemap.xml no <sitemapindex> or <urlset>"
       end
     end

@@ -72,7 +79,7 @@ module DaimonSkycrawlers
       when "deflate", "gzip", "x-gzip"
         true
       else
-        signature = response.body[0, 2]
+        signature = response.body[0, 2].b
         signature == "\x1F\x8B".b
       end
     end
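Parser failures are now raised as the dedicated `DaimonSkycrawlers::SitemapParser::Error` (with the HTTP status appended) rather than a bare RuntimeError, so callers can rescue them narrowly. A sketch (the name of the parsing entry point is an assumption; only the class and error type are from the diff):

    parser = DaimonSkycrawlers::SitemapParser.new(["http://example.com/sitemap.xml"])
    begin
      urls = parser.parse # entry-point method name assumed
    rescue DaimonSkycrawlers::SitemapParser::Error => e
      warn e.message # includes the failing URL and status code
    end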
data/lib/daimon_skycrawlers/storage/file.rb
CHANGED
@@ -11,35 +11,46 @@
         @base_dir = Pathname(base_dir)
       end

-      def save(
+      def save(data)
+        url = data[:url]
+        message = data[:message]
+        key = message[:key]
+        response = data[:response]
+        headers = response.headers
+        body = response.body
         @base_dir.mkpath
-        body_path(url).dirname.mkpath
-        body_path(url).open("wb+") do |file|
+        body_path(url, key).dirname.mkpath
+        body_path(url, key).open("wb+") do |file|
           file.write(body)
         end
-        headers_path(url).open("wb+") do |file|
+        headers_path(url, key).open("wb+") do |file|
           file.write(JSON.generate(headers))
         end
       end

-      def read(url)
-
-
-
+      def read(url, message)
+        key = message[:key]
+        headers = JSON.parse(headers_path(url, key).read)
+        body = body_path(url, key).read
+        Page.new(url, key, headers, body, headers["last-modified"], headers["etag"])
       end

-      Page = Struct.new(:url, :headers, :body, :last_modified, :etag)
+      Page = Struct.new(:url, :key, :headers, :body, :last_modified, :etag)

       private

-      def body_path(url)
+      def body_path(url, key)
         url = URI(url)
-
+        if key
+          @base_dir + key
+        else
+          @base_dir + ".#{url.path}"
+        end
       end

-      def headers_path(url)
+      def headers_path(url, key)
         url = URI(url)
-        Pathname("#{body_path(url)}-headers.json")
+        Pathname("#{body_path(url, key)}-headers.json")
       end
     end
   end
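`Storage::File#save` now takes a single data hash and prefers `message[:key]` over a URL-derived path when naming files; `read` takes the message too and returns a `Page` that includes the key. A round-trip sketch (assuming the gem is loaded and that the constructor accepts a base directory, as the `Pathname(base_dir)` line suggests; the response stand-in mimics just the `headers`/`body` accessors that `save` uses):

    storage = DaimonSkycrawlers::Storage::File.new("tmp/pages")
    response = Struct.new(:headers, :body).new({ "etag" => "abc" }, "<html></html>")
    storage.save(url: "http://example.com/",
                 message: { key: "example-top" },
                 response: response)
    page = storage.read("http://example.com/", key: "example-top")
    page.key  # => "example-top"
    page.etag # => "abc"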
data/lib/daimon_skycrawlers/storage/rdb.rb
CHANGED
@@ -16,12 +16,20 @@
       #
       # Save
       #
-      # @param [
-      #
-      #
+      # @param [Hash] data has following keys
+      # * :url: URL
+      # * :message: Given message
+      # * :response: HTTP response
       #
-      def save(
+      def save(data)
+        url = data[:url]
+        message = data[:message]
+        key = message[:key] || url
+        response = data[:response]
+        headers = response.headers
+        body = response.body
         Page.create(url: url,
+                    key: key,
                     headers: JSON.generate(headers),
                     body: body,
                     last_modified_at: headers["last-modified"],
@@ -33,8 +41,13 @@ module DaimonSkycrawlers
       #
       # @param [String] url identity of the page
       #
-      def find(url)
-
+      def find(url, message = {})
+        key = message[:key]
+        if key
+          Page.where(key: key).order(updated_at: :desc).limit(1).first
+        else
+          Page.where(url: url).order(updated_at: :desc).limit(1).first
+        end
       end

       class Base < ActiveRecord::Base
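With the new `key` column (also added to the generated migration above), `RDB#find` prefers a key lookup and returns the most recently updated row, falling back to the URL when no key is given. Sketch (key value illustrative):

    storage = DaimonSkycrawlers::Storage::RDB.new
    latest = storage.find("http://example.com/", key: "example-top")
    latest ||= storage.find("http://example.com/") # URL fallback without a key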
data/lib/daimon_skycrawlers/timer.rb
CHANGED
@@ -8,7 +8,11 @@
     def setup_shutdown_timer(queue_name_prefix, interval: 10)
       timers = Timers::Group.new
       timer = timers.after(interval) do
-
+        if block_given?
+          yield
+        else
+          Process.kill(:INT, 0)
+        end
       end
       Thread.new(timers) do |t|
         loop { t.wait }
@@ -19,6 +23,7 @@ module DaimonSkycrawlers
         DaimonSkycrawlers.configuration.logger.debug("Reset timer: consume message #{queue_name}")
         timer.reset
       end
+      timers
     end
   end
 end
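`setup_shutdown_timer` now accepts an optional block to customize what happens after `interval` seconds of queue inactivity (the default remains sending INT to the process group), and it returns the `Timers::Group` so the caller keeps a handle on it. A sketch (assuming it is called from a class that mixes in the module defining this method):

    timers = setup_shutdown_timer("daimon", interval: 30) do
      puts "no messages for 30 seconds; shutting down"
      exit
    end
    # `timers` is the Timers::Group driving the shutdown timer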
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: daimon_skycrawlers
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 1.0.0.pre.rc1
 platform: ruby
 authors:
 - daimon developers
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2017-01-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -184,14 +184,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.0'
 - !ruby/object:Gem::Dependency
   name: test-unit
   requirement: !ruby/object:Gem::Requirement
@@ -442,12 +442,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - "
+  - - ">"
     - !ruby/object:Gem::Version
-      version:
+      version: 1.3.1
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.
+rubygems_version: 2.6.8
 signing_key:
 specification_version: 4
 summary: This is a crawler framework.