daimon_skycrawlers 0.4.0 → 0.5.0
- checksums.yaml +4 -4
- data/lib/daimon_skycrawlers/cli.rb +2 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +1 -1
- data/lib/daimon_skycrawlers/consumer/url.rb +1 -4
- data/lib/daimon_skycrawlers/crawler/base.rb +48 -7
- data/lib/daimon_skycrawlers/crawler/default.rb +2 -30
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +9 -0
- data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +2 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +10 -4
- data/lib/daimon_skycrawlers/generator/crawler.rb +22 -0
- data/lib/daimon_skycrawlers/generator/generate.rb +12 -0
- data/lib/daimon_skycrawlers/generator/new.rb +7 -4
- data/lib/daimon_skycrawlers/generator/processor.rb +22 -0
- data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb +13 -0
- data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb +13 -0
- data/lib/daimon_skycrawlers/processor/base.rb +26 -0
- data/lib/daimon_skycrawlers/processor/spider.rb +7 -7
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/sample/spider/app/processors/blog_spider.rb +4 -4
- metadata +7 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cbbd476464b9b44d5bc5f71bc999820126379e2d
+  data.tar.gz: 473ac2bd6f9c63a7307357aa2fae7da2be6efde7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 788e296ef3fbd73c39db3ca0f6e6507e9c2074893c4895598c3e418849cf70a8951cc65c5fbff88d1a832c721edd53bd7d4078699090b6da36efaf460027a944
+  data.tar.gz: 42ba6b8ad282060811d2817d85c8b276ccc0890c4a15c546571f22594cb1a552be2838c166ced8fa609807dc0399b910acc53ba7f56e927222955af2008a831a
data/lib/daimon_skycrawlers/cli.rb CHANGED
@@ -1,5 +1,6 @@
 require "thor"
 require "daimon_skycrawlers/generator/new"
+require "daimon_skycrawlers/generator/generate"
 require "daimon_skycrawlers/commands/enqueue"
 require "daimon_skycrawlers/commands/runner"
 require "daimon_skycrawlers/version"
@@ -7,6 +8,7 @@ require "daimon_skycrawlers/version"
 module DaimonSkycrawlers
   class CLI < Thor
     register(Generator::New, "new", "new NAME", "Create new project")
+    register(Generator::Generate, "generate", "generate COMMAND", "Generate new code")
     register(Commands::Runner, "exec", "exec [COMMAND]", "Execute crawler/processor")
     register(Commands::Enqueue, "enqueue", "enqueue [TYPE] URL [messages...]", "Enqueue URL")

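The new `generate` subcommand sits beside `new`, `exec`, and `enqueue`. A minimal usage sketch, assuming the bundled executable is named `daimon_skycrawlers` (the binary name is an assumption; the subcommand names come from the registrations above):

    $ daimon_skycrawlers generate crawler blog      # -> app/crawlers/blog.rb
    $ daimon_skycrawlers generate processor blog    # -> app/processors/blog.rb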
data/lib/daimon_skycrawlers/consumer/url.rb CHANGED
@@ -44,14 +44,11 @@ module DaimonSkycrawlers
       # @private
       #
       def process(message)
-        url = message[:url]
-        depth = Integer(message[:depth] || 0)
-
        crawler_interval = DaimonSkycrawlers.configuration.crawler_interval

        # XXX When several crawlers are registered, how should they behave?
        self.class.crawlers.each do |crawler|
-          crawler.
+          crawler.process(message)
          if crawler.skipped?
            sleep(crawler_interval) if crawler.n_processed_urls % 50 == 0
          else
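The URL consumer now hands the whole message hash to the crawler instead of unpacking `:url` and `:depth` itself. A sketch of the resulting contract, with a hypothetical message:

    crawler = DaimonSkycrawlers::Crawler::Default.new("http://example.com")
    crawler.process(url: "/blog/", depth: 2)  # resolves the URL, applies filters, fetches
    crawler.skipped?          # => true when a filter rejected the URL
    crawler.n_processed_urls  # => running count the consumer uses for interval throttling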
data/lib/daimon_skycrawlers/crawler/base.rb CHANGED
@@ -20,13 +20,19 @@ module DaimonSkycrawlers
       # @return [void]
       attr_writer :storage

+      # @!attribute [r] n_processed_urls
+      #   The number of processed URLs.
+      #   @return [Integer]
+      attr_reader :n_processed_urls
+
       #
       # @param [String] Base URL for crawler
       # @param [Hash] options for Faraday
       #
-      def initialize(base_url = nil, options
+      def initialize(base_url = nil, faraday_options: {}, options: {})
         super()
         @base_url = base_url
+        @faraday_options = faraday_options
         @options = options
         @prepare = ->(connection) {}
         @skipped = false
@@ -41,7 +47,9 @@ module DaimonSkycrawlers
       # @yieldparam faraday [Faraday]
       #
       def setup_connection(options = {})
-
+        merged_options = @faraday_options.merge(options)
+        faraday_options = merged_options.empty? ? nil : merged_options
+        @connection = Faraday.new(@base_url, faraday_options) do |faraday|
           yield faraday
         end
       end
@@ -66,10 +74,26 @@ module DaimonSkycrawlers
       end

       def connection
-        @connection ||= Faraday.new(@base_url, @
+        @connection ||= Faraday.new(@base_url, @faraday_options)
+      end
+
+      def process(message, &block)
+        url = message.delete(:url)
+
+        @skipped = false
+        @n_processed_urls += 1
+        # url can be a path
+        url = connection.url_prefix + url
+
+        apply_filters(url)
+
+        unless skipped?
+          @prepare.call(connection)
+          fetch(url, message, &block)
+        end
       end

-      def fetch(path,
+      def fetch(path, message = {})
         raise NotImplementedError, "Must implement this method in subclass"
       end

@@ -81,11 +105,28 @@ module DaimonSkycrawlers
         @connection.post(path, params)
       end

-
-
+      private
+
+      def apply_filters(url)
+        if @options[:obey_robots_txt]
+          robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
+          unless robots_txt_checker.allowed?(url)
+            skip(url)
+            return
+          end
+        end
+        update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
+        unless update_checker.updated?(url.to_s, connection: connection)
+          skip(url)
+          return
+        end
       end

-
+      def skip(url)
+        log.info("Skip #{url}")
+        @skipped = true
+        schedule_to_process(url.to_s, heartbeat: true)
+      end

       def schedule_to_process(url, message = {})
         DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
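Subclasses now implement only `fetch`; `Base#process` centralizes URL resolution, the robots.txt and update checks, skip bookkeeping, and the `@prepare` hook. A minimal sketch of a crawler against the new API (the class name and option values are illustrative, not from the source):

    require "daimon_skycrawlers/crawler/base"

    class MyCrawler < DaimonSkycrawlers::Crawler::Base
      # By the time fetch runs, process(message) has resolved the URL
      # and every filter has passed.
      def fetch(url, message)
        response = get(url)
        storage.save(url.to_s, response.headers, response.body)
        schedule_to_process(url.to_s, message)
      end
    end

    # faraday_options: goes straight to Faraday.new; options: carries
    # crawler-level flags such as obey_robots_txt.
    crawler = MyCrawler.new("http://example.com",
                            faraday_options: { request: { timeout: 10 } },
                            options: { obey_robots_txt: true })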
data/lib/daimon_skycrawlers/crawler/default.rb CHANGED
@@ -10,43 +10,15 @@ module DaimonSkycrawlers
     # This crawler can GET given URL and store response to storage
     #
     class Default < Base
-      def fetch(
-
-        @skipped = false
-        url = connection.url_prefix + path
-        if @options[:obey_robots_txt]
-          robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
-          unless robots_txt_checker.call(url)
-            skip(url)
-            return
-          end
-        end
-        update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
-        unless update_checker.call(url.to_s, connection: connection)
-          skip(url)
-          return
-        end
-        @prepare.call(connection)
-        response = get(path)
+      def fetch(url, message)
+        response = get(url)
         data = [url.to_s, response.headers, response.body]

         yield(*data) if block_given?

         storage.save(*data)
-        message = {
-          depth: depth
-        }
-        message = message.merge(kw)
         schedule_to_process(url.to_s, message)
       end
-
-      private
-
-      def skip(url)
-        log.info("Skip #{url}")
-        @skipped = true
-        schedule_to_process(url.to_s, heartbeat: true)
-      end
     end
   end
 end
data/lib/daimon_skycrawlers/filter/duplicate_checker.rb CHANGED
@@ -28,6 +28,15 @@ module DaimonSkycrawlers
         @urls << url
         true
       end
+
+      #
+      # @param [String] url to check duplication. If given URL is
+      #   relative URL, use `@base_url + url` as absolute URL.
+      # @return [true|false] Return true when duplicated, otherwise return false.
+      #
+      def duplicated?(url)
+        !call(url)
+      end
     end
   end
 end
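`duplicated?` is the readable negation of `call`, and it shares `call`'s side effect of recording the URL on first sight. A short sketch (the URL is illustrative):

    checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://example.com/")
    checker.duplicated?("/blog/")  # => false on first sight (URL is recorded)
    checker.duplicated?("/blog/")  # => true on every later call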
data/lib/daimon_skycrawlers/filter/update_checker.rb CHANGED
@@ -32,11 +32,17 @@ module DaimonSkycrawlers
         else
           headers = Faraday.head(url)
         end
-
-
-
-
+        case
+        when headers.key?("etag") && page.etag
+          headers["etag"] != page.etag
+        when headers.key?("last-modified") && page.last_modified_at
+          headers["last-modified"] > page.last_modified_at
+        else
+          true
+        end
       end
+
+      alias updated? call
     end
   end
 end
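The rewritten body prefers the ETag validator, falls back to Last-Modified, and conservatively reports "updated" when the stored page carries neither. The `updated?` alias is what `Crawler::Base#apply_filters` now calls:

    checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
    # Reuses the crawler's Faraday connection for the HEAD request when given
    checker.updated?("http://example.com/blog/", connection: connection)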
data/lib/daimon_skycrawlers/generator/crawler.rb ADDED
@@ -0,0 +1,22 @@
+require "thor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Crawler < Thor::Group
+      include Thor::Actions
+
+      argument :name
+
+      def self.source_root
+        File.join(__dir__, "templates")
+      end
+
+      def create_files
+        config = {
+          class_name: name.classify,
+        }
+        template("crawler.rb.erb", "app/crawlers/#{name.underscore}.rb", config)
+      end
+    end
+  end
+end
data/lib/daimon_skycrawlers/generator/generate.rb ADDED
@@ -0,0 +1,12 @@
+require "thor"
+require "daimon_skycrawlers/generator/crawler"
+require "daimon_skycrawlers/generator/processor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Generate < Thor
+      register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
+      register(Processor, "processor", "processor NAME", "Generate new processor")
+    end
+  end
+end
data/lib/daimon_skycrawlers/generator/new.rb CHANGED
@@ -22,16 +22,19 @@ module DaimonSkycrawlers
       ].each do |path|
         template("#{path}.erb", "#{name}/#{path}")
       end
+      migration_options = {
+        destination_root: File.join(destination_root, name),
+        timestamps: true
+      }
       invoke(MigrationGenerator, [
         "CreatePage",
         "url:string",
         "headers:text",
         "body:binary",
         "last_modified_at:datetime",
-        "etag:string"
-        "timestamps"
+        "etag:string"
       ],
-
+      migration_options)
     end

     def copy_files
@@ -56,7 +59,7 @@ module DaimonSkycrawlers
       set_local_assigns!
       validate_file_name!
       dest = options[:destination_root]
-      migration_template
+      migration_template(@migration_template, "#{dest}/db/migrate/#{file_name}.rb")
     end
   end
 end
data/lib/daimon_skycrawlers/generator/processor.rb ADDED
@@ -0,0 +1,22 @@
+require "thor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Processor < Thor::Group
+      include Thor::Actions
+
+      argument :name
+
+      def self.source_root
+        File.join(__dir__, "templates")
+      end
+
+      def create_files
+        config = {
+          class_name: name.classify,
+        }
+        template("processor.rb.erb", "app/processors/#{name.underscore}.rb", config)
+      end
+    end
+  end
+end
data/lib/daimon_skycrawlers/generator/templates/crawler.rb.erb ADDED
@@ -0,0 +1,13 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/crawler/base"
+
+class <%= config[:class_name] %> < DaimonSkycrawlers::Crawler::Base
+  def fetch(path, **kw)
+    # Implement your crawler here
+  end
+end
+
+base_url = ""
+crawler = <%= config[:class_name] %>.new(base_url)
+DaimonSkycrawlers.register_crawler(crawler)
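For reference, running the generator with a name such as `blog` renders this template roughly as follows; `classify` and `underscore` come from ActiveSupport, so `blog` becomes class `Blog` written to `app/crawlers/blog.rb` (the name is illustrative):

    require "daimon_skycrawlers"
    require "daimon_skycrawlers/crawler"
    require "daimon_skycrawlers/crawler/base"

    class Blog < DaimonSkycrawlers::Crawler::Base
      def fetch(path, **kw)
        # Implement your crawler here
      end
    end

    base_url = ""
    crawler = Blog.new(base_url)
    DaimonSkycrawlers.register_crawler(crawler)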
data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb ADDED
@@ -0,0 +1,13 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/processor"
+require "daimon_skycrawlers/processor/base"
+
+class <%= config[:class_name] %> < DaimonSkycrawlers::Processor::Base
+  def call(message)
+    # Implement your processor here
+  end
+end
+
+base_url = ""
+processor = <%= config[:class_name] %>.new(base_url)
+DaimonSkycrawlers.register_processor(processor)
data/lib/daimon_skycrawlers/processor/base.rb CHANGED
@@ -7,6 +7,24 @@ module DaimonSkycrawlers
       include DaimonSkycrawlers::LoggerMixin
       include DaimonSkycrawlers::ConfigMixin

+      def initialize
+        super
+        @before_process_filters = []
+      end
+
+      def before_process(filter = nil, &block)
+        if block_given?
+          @before_process_filters << block
+        else
+          @before_process_filters << filter if filter.respond_to?(:call)
+        end
+      end
+
+      def process(message)
+        return unless apply_before_filters(message[:url])
+        call(message)
+      end
+
       def call(message)
         raise "Implement this method in subclass"
       end
@@ -14,6 +32,14 @@ module DaimonSkycrawlers
       def storage
         @storage ||= DaimonSkycrawlers::Storage::RDB.new
       end
+
+      private
+
+      def apply_before_filters(url)
+        @before_process_filters.all? do |filter|
+          filter.call(url)
+        end
+      end
     end
   end
 end
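`before_process` mirrors the spider's link filters: `process` runs every registered filter against `message[:url]` and only dispatches to `call` when all of them return truthy. A sketch with a hypothetical subclass:

    class MyProcessor < DaimonSkycrawlers::Processor::Base
      def call(message)
        log.info("processing #{message[:url]}")
      end
    end

    processor = MyProcessor.new
    # Accepts a block, or any object responding to #call
    processor.before_process do |url|
      url.start_with?("http://example.com/")
    end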
data/lib/daimon_skycrawlers/processor/spider.rb CHANGED
@@ -8,17 +8,17 @@ module DaimonSkycrawlers

       def initialize
         super
-        @
+        @link_filters = []
         @doc = nil
         @links = nil
         @enqueue = true
       end

-      def
+      def append_link_filter(filter = nil, &block)
         if block_given?
-          @
+          @link_filters << block
         else
-          @
+          @link_filters << filter if filter.respond_to?(:call)
         end
       end

@@ -53,15 +53,15 @@ module DaimonSkycrawlers
           element["href"]
         end
         urls.uniq!
-
+        apply_link_filters(urls) || []
       end

-      def
+      def apply_link_filters(urls)
         return if urls.nil?
         return if urls.empty?
         log.debug("Candidate URLs: #{urls.size}")
         urls = urls.select do |url|
-          @
+          @link_filters.all? {|filter| filter.call(url) }
         end
         log.debug("Filtered URLs: #{urls.size}")
         urls
data/sample/spider/app/processors/blog_spider.rb CHANGED
@@ -6,11 +6,11 @@ require "daimon_skycrawlers/filter/update_checker"
 default_processor = DaimonSkycrawlers::Processor::Default.new
 spider = DaimonSkycrawlers::Processor::Spider.new
 #spider.enqueue = false
-spider.append_filter do |url|
+spider.append_link_filter do |url|
   uri = URI(url)
   uri.host.nil? || uri.host == "www.clear-code.com"
 end
-spider.append_filter do |url|
+spider.append_link_filter do |url|
   case url
   when %r!\A(\.\./|/|#)!
     false
@@ -19,9 +19,9 @@ spider.append_filter do |url|
   end
 end
 duplicate_checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
-spider.append_filter(duplicate_checker)
+spider.append_link_filter(duplicate_checker)
 update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(base_url: "http://www.clear-code.com/blog/")
-spider.append_filter(update_checker)
+spider.append_link_filter(update_checker)

 DaimonSkycrawlers.register_processor(default_processor)
 DaimonSkycrawlers.register_processor(spider)
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: daimon_skycrawlers
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Ryunosuke SATO
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-
+date: 2016-10-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -324,7 +324,11 @@ files:
 - lib/daimon_skycrawlers/filter/duplicate_checker.rb
 - lib/daimon_skycrawlers/filter/robots_txt_checker.rb
 - lib/daimon_skycrawlers/filter/update_checker.rb
+- lib/daimon_skycrawlers/generator/crawler.rb
+- lib/daimon_skycrawlers/generator/generate.rb
 - lib/daimon_skycrawlers/generator/new.rb
+- lib/daimon_skycrawlers/generator/processor.rb
+- lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
 - lib/daimon_skycrawlers/generator/templates/new/Gemfile
 - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
 - lib/daimon_skycrawlers/generator/templates/new/Rakefile
@@ -332,6 +336,7 @@ files:
 - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
 - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
 - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
+- lib/daimon_skycrawlers/generator/templates/processor.rb.erb
 - lib/daimon_skycrawlers/logger.rb
 - lib/daimon_skycrawlers/processor.rb
 - lib/daimon_skycrawlers/processor/base.rb