daimon_skycrawlers 1.0.0.pre.rc1 → 1.0.0.pre.rc2
- checksums.yaml +4 -4
- data/.yardopts +11 -0
- data/daimon_skycrawlers.gemspec +1 -0
- data/lib/daimon_skycrawlers.rb +32 -2
- data/lib/daimon_skycrawlers/callbacks.rb +32 -2
- data/lib/daimon_skycrawlers/cli.rb +4 -0
- data/lib/daimon_skycrawlers/commands/enqueue.rb +4 -1
- data/lib/daimon_skycrawlers/commands/runner.rb +2 -0
- data/lib/daimon_skycrawlers/config.rb +1 -0
- data/lib/daimon_skycrawlers/configurable.rb +6 -1
- data/lib/daimon_skycrawlers/consumer.rb +3 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +5 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +1 -1
- data/lib/daimon_skycrawlers/consumer/url.rb +1 -1
- data/lib/daimon_skycrawlers/crawler.rb +5 -2
- data/lib/daimon_skycrawlers/crawler/base.rb +56 -8
- data/lib/daimon_skycrawlers/crawler/default.rb +9 -1
- data/lib/daimon_skycrawlers/filter.rb +3 -0
- data/lib/daimon_skycrawlers/filter/base.rb +12 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +2 -2
- data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +1 -1
- data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -2
- data/lib/daimon_skycrawlers/generator/crawler.rb +4 -1
- data/lib/daimon_skycrawlers/generator/filter.rb +4 -1
- data/lib/daimon_skycrawlers/generator/generate.rb +3 -0
- data/lib/daimon_skycrawlers/generator/new.rb +5 -1
- data/lib/daimon_skycrawlers/generator/processor.rb +4 -1
- data/lib/daimon_skycrawlers/logger.rb +8 -0
- data/lib/daimon_skycrawlers/processor.rb +5 -2
- data/lib/daimon_skycrawlers/processor/base.rb +28 -2
- data/lib/daimon_skycrawlers/processor/default.rb +7 -1
- data/lib/daimon_skycrawlers/processor/proc.rb +6 -0
- data/lib/daimon_skycrawlers/processor/spider.rb +2 -2
- data/lib/daimon_skycrawlers/queue.rb +31 -0
- data/lib/daimon_skycrawlers/sitemap_parser.rb +23 -1
- data/lib/daimon_skycrawlers/storage.rb +3 -0
- data/lib/daimon_skycrawlers/storage/base.rb +21 -1
- data/lib/daimon_skycrawlers/storage/file.rb +16 -0
- data/lib/daimon_skycrawlers/storage/null.rb +2 -2
- data/lib/daimon_skycrawlers/storage/rdb.rb +25 -7
- data/lib/daimon_skycrawlers/timer.rb +9 -0
- data/lib/daimon_skycrawlers/version.rb +4 -1
- data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -1
- data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
- data/{lib/daimon_skycrawlers/generator/templates → templates}/crawler.rb.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/filter.rb.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile.db +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Gemfile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/README.md.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Rakefile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/crawler.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/processor.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/database.yml.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/init.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/docker-compose.yml.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.db.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/common/docker-entrypoint.sh +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/db/init-user-db.sh +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/processor.rb.erb +0 -0
- metadata +34 -19
data/lib/daimon_skycrawlers/crawler/default.rb:

```diff
@@ -5,9 +5,17 @@ module DaimonSkycrawlers
     #
     # The default crawler
     #
-    # This crawler can GET given URL and store response to storage
+    # This crawler can GET/POST given URL and store response to storage
     #
     class Default < Base
+      #
+      # GET/POST given url
+      #
+      # @param url [String] URI or path
+      # @param message [Hash] message can include anything
+      #
+      # @return [Faraday::Response] HTTP response
+      #
       def fetch(url, message)
         params = message[:params] || {}
         method = message[:method] || "GET"
```
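The new `:method`/`:params` message keys can be exercised when enqueueing a URL. A minimal sketch, assuming the `DaimonSkycrawlers::Crawler.enqueue_url` helper described in the project README (URL and parameters are illustrative):

```ruby
require "daimon_skycrawlers/crawler"

# Default#fetch reads message[:method] (default "GET") and
# message[:params], so a POST with form parameters looks like:
DaimonSkycrawlers::Crawler.enqueue_url("https://example.com/search",
                                       method: "POST",
                                       params: { q: "ruby" })
```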
data/lib/daimon_skycrawlers/filter/base.rb:

```diff
@@ -20,10 +20,22 @@ module DaimonSkycrawlers
         @storage = storage
       end
 
+      #
+      # Retrieve storage instance
+      #
       def storage
         @storage ||= DaimonSkycrawlers::Storage::RDB.new
       end
 
+      #
+      # Filter message
+      #
+      # Override this method in subclass.
+      #
+      # @param message [Hash] message can include anything
+      #
+      # @return [true|false] process the message if true otherwise skip message.
+      #
       def call(message)
         raise NotImplementedError, "Must implement this method in subclass"
       end
```
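The newly documented contract, sketched as a hypothetical subclass (class name and logic are illustrative, not part of the gem):

```ruby
require "uri"
require "daimon_skycrawlers/filter/base"

# Accept only HTTPS URLs; anything else is skipped.
class HTTPSOnlyFilter < DaimonSkycrawlers::Filter::Base
  # @param message [Hash] expects message[:url]
  # @return [true|false]
  def call(message)
    URI(message[:url].to_s).scheme == "https"
  end
end
```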
data/lib/daimon_skycrawlers/filter/duplicate_checker.rb:

```diff
@@ -16,7 +16,7 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message to check duplication. If given URL is
+      # @param message [Hash] message to check duplication. If given URL is
       # relative URL, use `@base_url + url` as absolute URL.
       # @return [true|false] Return false when duplicated, otherwise return true.
       #
@@ -28,7 +28,7 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message to check duplication. If given URL is
+      # @param message [Hash] message to check duplication. If given URL is
       # relative URL, use `@base_url + url` as absolute URL.
       # @return [true|false] Return true when duplicated, otherwise return false.
       #
```
data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb:

```diff
@@ -16,7 +16,7 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash]
+      # @param message [Hash] check given URL is allowed or not by robots.txt
       # @return [true|false] Return true when web site allows to fetch the URL, otherwise return false
       #
       def call(message)
```
data/lib/daimon_skycrawlers/filter/update_checker.rb:

```diff
@@ -17,13 +17,13 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message
+      # @param message [Hash] message includes `:url`
       # @param connection [Faraday]
       # @return [true|false] Return true when need update, otherwise return false
       #
       def call(message, connection: nil)
         url = normalize_url(message[:url])
-        page = storage.
+        page = storage.read(url, message)
         return true unless page
         if connection
           response = connection.head(url)
```
data/lib/daimon_skycrawlers/generator/crawler.rb:

```diff
@@ -1,14 +1,17 @@
 require "thor"
+require "pathname"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Crawler < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-
+        (Pathname(__dir__) + "../../../templates").to_s
       end
 
       def create_files
```
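The same `source_root` change recurs in the filter, `new`, and processor generators below: the templates now live at the gem root (see the renames in the file list above), and `Pathname#+` resolves the `..` segments lexically. A quick illustration with a made-up install path:

```ruby
require "pathname"

# Hypothetical gem install location, for illustration only.
dir = Pathname("/gems/daimon_skycrawlers-1.0.0/lib/daimon_skycrawlers/generator")
(dir + "../../../templates").to_s
# => "/gems/daimon_skycrawlers-1.0.0/templates"
# Pathname#+ cancels each ".." against a trailing path component,
# without touching the filesystem.
```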
data/lib/daimon_skycrawlers/generator/filter.rb:

```diff
@@ -1,14 +1,17 @@
 require "thor"
+require "pathname"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Filter < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-
+        (Pathname(__dir__) + "../../../templates").to_s
       end
 
       def create_files
```
data/lib/daimon_skycrawlers/generator/generate.rb:

```diff
@@ -1,10 +1,13 @@
 require "thor"
+require "pathname"
 require "daimon_skycrawlers/generator/crawler"
 require "daimon_skycrawlers/generator/processor"
 require "daimon_skycrawlers/generator/filter"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Generate < Thor
       register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
       register(Processor, "processor", "processor NAME", "Generate new processor")
```
data/lib/daimon_skycrawlers/generator/new.rb:

```diff
@@ -1,19 +1,22 @@
 require "securerandom"
 require "thor"
+require "pathname"
 require "rails/generators"
 require "rails/generators/actions"
 require "rails/generators/active_record"
 require "rails/generators/active_record/migration/migration_generator"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class New < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-
+        (Pathname(__dir__) + "../../../templates/new").to_s
       end
 
       def create_files
@@ -103,6 +106,7 @@ MESSAGE
       end
     end
 
+    # @private
    class MigrationGenerator < ActiveRecord::Generators::MigrationGenerator
      def self.source_root
        ActiveRecord::Generators::MigrationGenerator.source_root
```
data/lib/daimon_skycrawlers/generator/processor.rb:

```diff
@@ -1,14 +1,17 @@
 require "thor"
+require "pathname"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Processor < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-
+        (Pathname(__dir__) + "../../../templates").to_s
       end
 
       def create_files
```
data/lib/daimon_skycrawlers/logger.rb:

```diff
@@ -2,13 +2,20 @@ require "delegate"
 require "logger"
 
 module DaimonSkycrawlers
+  #
+  # Logger for daimon_skycrawlers
+  #
   class Logger < SimpleDelegator
     class << self
+      #
+      # Default logger
+      #
       def default
         @default ||= DaimonSkycrawlers::Logger.new(STDOUT)
       end
     end
 
+    # @private
     def initialize(logdev, shift_age = 0, shift_size = 1048576)
       @log = ::Logger.new(logdev, shift_age, shift_size)
       super(@log)
@@ -16,6 +23,7 @@ module DaimonSkycrawlers
   end
 
   module LoggerMixin
+    # @private
     def initialize
       super
       @log = DaimonSkycrawlers::Logger.default
```
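A sketch of how the mixin is meant to be consumed (the class is hypothetical; `@log` is the instance variable the mixin assigns, as shown above):

```ruby
require "daimon_skycrawlers/logger"

class MyComponent
  include DaimonSkycrawlers::LoggerMixin

  def initialize
    super # LoggerMixin#initialize binds @log to Logger.default
  end

  def run
    @log.info("running") # forwarded to ::Logger via SimpleDelegator
  end
end

MyComponent.new.run
```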
data/lib/daimon_skycrawlers/processor.rb:

```diff
@@ -4,6 +4,9 @@ require "daimon_skycrawlers/timer"
 require "daimon_skycrawlers/consumer/http_response"
 
 module DaimonSkycrawlers
+  #
+  # Name space for processors
+  #
   module Processor
     class << self
       #
@@ -21,8 +24,8 @@ module DaimonSkycrawlers
       #
       # Enqueue a URL to processor queue
       #
-      # @param [String] Specify absolute URL
-      # @param [Hash] Extra parameters for crawler
+      # @param url [String] Specify absolute URL
+      # @param message [Hash] Extra parameters for crawler
       # @return [void]
       def enqueue_http_response(url, message = {})
         message[:url] = url
```
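A short usage sketch for the API documented above (the URL and extra key are illustrative; extra message keys reach the processor unchanged):

```ruby
require "daimon_skycrawlers/processor"

# Push a fetched URL onto the processor queue with an extra parameter.
DaimonSkycrawlers::Processor.enqueue_http_response("https://example.com/", depth: 2)
```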
data/lib/daimon_skycrawlers/processor/base.rb:

```diff
@@ -5,6 +5,12 @@ require "daimon_skycrawlers/configurable"
 
 module DaimonSkycrawlers
   module Processor
+    #
+    # The base class of processor
+    #
+    # A processor implementation can inherit this class and override
+    # `#call` in the class.
+    #
     class Base
       include DaimonSkycrawlers::LoggerMixin
       include DaimonSkycrawlers::ConfigMixin
@@ -14,16 +20,26 @@ module DaimonSkycrawlers
       def initialize
         super
         @skipped = false
+
+        setup_default_filters
       end
 
+      # @private
       def skipped?
         @skipped
       end
 
+      #
+      # Process processor sequence
+      #
+      # 1. Run registered filters
+      # 1. Process HTTP response from message
+      #
+      # @param message [Hash] parameters for processor
+      #
       def process(message)
         @skipped = false
-
-        proceeding = run_before_callbacks(message)
+        proceeding = run_before_process_callbacks(message)
         unless proceeding
           skip(message[:url])
           return
@@ -31,10 +47,20 @@ module DaimonSkycrawlers
         call(message)
       end
 
+      #
+      # Process message
+      #
+      # Override this method in subclass
+      #
+      # @param message [Hash] parameters for processor
+      #
       def call(message)
         raise "Implement this method in subclass"
       end
 
+      #
+      # Retrieve storage instance
+      #
       def storage
         @storage ||= DaimonSkycrawlers::Storage::RDB.new
       end
```
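The subclassing contract documented above, as a hypothetical processor; `storage.read(url, message)` and `page.body` mirror the calls introduced in the hunks below:

```ruby
require "daimon_skycrawlers/processor/base"

class PageSizePrinter < DaimonSkycrawlers::Processor::Base
  # @param message [Hash] expects message[:url]
  def call(message)
    page = storage.read(message[:url], message)
    puts "#{message[:url]}: #{page.body.bytesize} bytes"
  end
end
```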
data/lib/daimon_skycrawlers/processor/default.rb:

```diff
@@ -3,10 +3,16 @@ require "daimon_skycrawlers/processor/base"
 
 module DaimonSkycrawlers
   module Processor
+    #
+    # Very simple processor
+    #
     class Default < Base
+      #
+      # Display page information
+      #
       def call(message)
         url = message[:url]
-        page = storage.
+        page = storage.read(url, message)
         headers = JSON.parse(page.headers)
         headers_string = headers.map {|key, value| " #{key}: #{value}" }.join("\n")
         dumped_message = <<LOG
```
data/lib/daimon_skycrawlers/processor/proc.rb:

```diff
@@ -2,12 +2,18 @@ require "daimon_skycrawlers/processor/base"
 
 module DaimonSkycrawlers
   module Processor
+    #
+    # Processor for Proc
+    #
     class Proc < Base
       def initialize(handler)
         super()
         @handler = handler
       end
 
+      #
+      # Process message
+      #
       def call(message)
         @handler.call(message)
       end
```
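A short sketch of the `Proc` processor: wrap any callable and `#call` forwards the message to it (`#process` would additionally run the default filters first):

```ruby
require "daimon_skycrawlers/processor/proc"

handler = lambda do |message|
  puts "processing #{message[:url]}"
end

processor = DaimonSkycrawlers::Processor::Proc.new(handler)
processor.call(url: "https://example.com/")
```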
data/lib/daimon_skycrawlers/processor/spider.rb:

```diff
@@ -96,13 +96,13 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash]
+      # @param message [Hash] Must have key :url, :depth
       #
       def call(message)
         key_url = message[:url]
         depth = Integer(message[:depth] || 2)
         return if depth <= 1
-        page = storage.
+        page = storage.read(key_url, message)
         @doc = Nokogiri::HTML(page.body)
         new_message = {
           depth: depth - 1,
```
data/lib/daimon_skycrawlers/queue.rb:

```diff
@@ -1,8 +1,14 @@
 require "songkick_queue"
 
 module DaimonSkycrawlers
+  #
+  # Wrapper for queue configuration class
+  #
   class Queue
     class << self
+      #
+      # Configuration for queue
+      #
       def configuration
         @configuration ||= SongkickQueue.configure do |config|
           config.logger = Logger.new(STDOUT)
@@ -16,6 +22,31 @@ module DaimonSkycrawlers
         end
       end
 
+      #
+      # Configure queue
+      #
+      # ```ruby
+      # DaimonSkycrawlers::Queue.configure do |config|
+      #   config.logger = Logger.new(STDOUT)
+      #   config.host = "127.0.0.1"
+      #   config.port = 5672
+      #   # config.username = 'guest'
+      #   # config.password = 'guest'
+      #   config.vhost = "/"
+      #   config.max_reconnect_attempts = 10
+      #   config.network_recovery_interval = 1.0
+      # end
+      # ```
+      #
+      # * logger: logger instance for queue system
+      # * host: RabbitMQ host
+      # * port: RabbitMQ port
+      # * username: RabbitMQ username
+      # * password: RabbitMQ password
+      # * vhost: virtual host used for connection
+      # * max_reconnect_attempts: The maximum number of reconnection attempts
+      # * network_recovery_interval: reconnection interval for TCP connection failures
+      #
       def configure
         yield configuration
       end
```
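A sketch of overriding the defaults, for example from a generated project's `config/init.rb` (the environment variable names are illustrative):

```ruby
require "daimon_skycrawlers/queue"

# Point the queue at a non-default RabbitMQ instance.
DaimonSkycrawlers::Queue.configure do |config|
  config.host = ENV["RABBITMQ_HOST"] || "127.0.0.1"
  config.port = Integer(ENV["RABBITMQ_PORT"] || 5672)
end
```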
data/lib/daimon_skycrawlers/sitemap_parser.rb:

```diff
@@ -4,15 +4,37 @@ require "zlib"
 require "uri"
 
 module DaimonSkycrawlers
+  #
+  # Parser for sitemap.xml
+  #
   # Based on https://github.com/benbalter/sitemap-parser
+  # See also https://www.sitemaps.org/
+  #
+  # ```ruby
+  # urls = ["https://example.com/sitemap.xml"]
+  # sitemap_parser = DaimonSkycrawlers::SitemapParser.new(urls)
+  # sitemap_urls = sitemap_parser.parse
+  # ```
+  #
   class SitemapParser
+    #
+    # Error class for SitemapParser
+    #
     class Error < StandardError
     end
 
-
+    #
+    # @param urls [Array] List of sitemap.xml URL
+    #
+    def initialize(urls)
       @urls = urls
     end
 
+    #
+    # Fetch and parse sitemap.xml
+    #
+    # @return [Array] URLs in sitemap.xml
+    #
     def parse
       hydra = Typhoeus::Hydra.new(max_concurrency: 1)
       sitemap_urls = []
```
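A sketch combining the parser with the crawler queue (assuming `DaimonSkycrawlers::Crawler.enqueue_url` as in the README; the sitemap URL is illustrative):

```ruby
require "daimon_skycrawlers/sitemap_parser"
require "daimon_skycrawlers/crawler"

# Fetch the sitemap, then feed every listed URL to the crawler.
parser = DaimonSkycrawlers::SitemapParser.new(["https://example.com/sitemap.xml"])
parser.parse.each do |url|
  DaimonSkycrawlers::Crawler.enqueue_url(url)
end
```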