daimon_skycrawlers 1.0.0.pre.rc1 → 1.0.0.pre.rc2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +11 -0
  3. data/daimon_skycrawlers.gemspec +1 -0
  4. data/lib/daimon_skycrawlers.rb +32 -2
  5. data/lib/daimon_skycrawlers/callbacks.rb +32 -2
  6. data/lib/daimon_skycrawlers/cli.rb +4 -0
  7. data/lib/daimon_skycrawlers/commands/enqueue.rb +4 -1
  8. data/lib/daimon_skycrawlers/commands/runner.rb +2 -0
  9. data/lib/daimon_skycrawlers/config.rb +1 -0
  10. data/lib/daimon_skycrawlers/configurable.rb +6 -1
  11. data/lib/daimon_skycrawlers/consumer.rb +3 -0
  12. data/lib/daimon_skycrawlers/consumer/base.rb +5 -0
  13. data/lib/daimon_skycrawlers/consumer/http_response.rb +1 -1
  14. data/lib/daimon_skycrawlers/consumer/url.rb +1 -1
  15. data/lib/daimon_skycrawlers/crawler.rb +5 -2
  16. data/lib/daimon_skycrawlers/crawler/base.rb +56 -8
  17. data/lib/daimon_skycrawlers/crawler/default.rb +9 -1
  18. data/lib/daimon_skycrawlers/filter.rb +3 -0
  19. data/lib/daimon_skycrawlers/filter/base.rb +12 -0
  20. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +2 -2
  21. data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +1 -1
  22. data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -2
  23. data/lib/daimon_skycrawlers/generator/crawler.rb +4 -1
  24. data/lib/daimon_skycrawlers/generator/filter.rb +4 -1
  25. data/lib/daimon_skycrawlers/generator/generate.rb +3 -0
  26. data/lib/daimon_skycrawlers/generator/new.rb +5 -1
  27. data/lib/daimon_skycrawlers/generator/processor.rb +4 -1
  28. data/lib/daimon_skycrawlers/logger.rb +8 -0
  29. data/lib/daimon_skycrawlers/processor.rb +5 -2
  30. data/lib/daimon_skycrawlers/processor/base.rb +28 -2
  31. data/lib/daimon_skycrawlers/processor/default.rb +7 -1
  32. data/lib/daimon_skycrawlers/processor/proc.rb +6 -0
  33. data/lib/daimon_skycrawlers/processor/spider.rb +2 -2
  34. data/lib/daimon_skycrawlers/queue.rb +31 -0
  35. data/lib/daimon_skycrawlers/sitemap_parser.rb +23 -1
  36. data/lib/daimon_skycrawlers/storage.rb +3 -0
  37. data/lib/daimon_skycrawlers/storage/base.rb +21 -1
  38. data/lib/daimon_skycrawlers/storage/file.rb +16 -0
  39. data/lib/daimon_skycrawlers/storage/null.rb +2 -2
  40. data/lib/daimon_skycrawlers/storage/rdb.rb +25 -7
  41. data/lib/daimon_skycrawlers/timer.rb +9 -0
  42. data/lib/daimon_skycrawlers/version.rb +4 -1
  43. data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -1
  44. data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
  45. data/{lib/daimon_skycrawlers/generator/templates → templates}/crawler.rb.erb +0 -0
  46. data/{lib/daimon_skycrawlers/generator/templates → templates}/filter.rb.erb +0 -0
  47. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile +0 -0
  48. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile.db +0 -0
  49. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Gemfile +0 -0
  50. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/README.md.erb +0 -0
  51. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Rakefile +0 -0
  52. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/crawler.rb +0 -0
  53. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/processor.rb +0 -0
  54. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/database.yml.erb +0 -0
  55. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/init.rb +0 -0
  56. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/docker-compose.yml.erb +0 -0
  57. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.db.erb +0 -0
  58. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.erb +0 -0
  59. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/common/docker-entrypoint.sh +0 -0
  60. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/db/init-user-db.sh +0 -0
  61. data/{lib/daimon_skycrawlers/generator/templates → templates}/processor.rb.erb +0 -0
  62. metadata +34 -19
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c9a394115d873b122e18a6970e62dfbc897dd690
4
- data.tar.gz: 55fc07c13fa04c82c5e1c6a1075ffd97793b01f9
3
+ metadata.gz: 648bff4fe2019f82836bcf21c75dc6e29d45abc8
4
+ data.tar.gz: f7691acca87a0686190171806a6a07aaa8f4015f
5
5
  SHA512:
6
- metadata.gz: 5ae484de037763f04487ad8c1ea4047a78a726156781cffeebf0c4c14ae0dfb70029b5f834c1414af9ee43c2b67fb7b137e5d5e335c2dee01abe9aae6ced1099
7
- data.tar.gz: 63c85408015248ca66448025b2eb8ff6b65080c3da375b2e5b9723e944e3836f73285cb083088a31c1c8d60fe6176f57c47bc2d894fc7618c5853b3aebc3d500
6
+ metadata.gz: 378fe0060bacead511a87f637702cd5d24ce79c5acf3cba0bfac123d9dfc2cfcde41261a813dce69f9e11126d988df609a92df126f82235610db7c5a7332539b
7
+ data.tar.gz: '0790c05562fc71542a1cf33ca0971bbf356659d20012f7968c6c4ffd9ec9df521f0e6d009aa8256ec05a7d1b3e494baaec187d1faa8bbd34102fb73875edcfd0'
data/.yardopts ADDED
@@ -0,0 +1,11 @@
1
+ --markup markdown
2
+ --markup-provider redcarpet
3
+ --format html
4
+ --exclude /test/
5
+ --exclude /sample/
6
+ --exclude /db/
7
+ --exclude /bin/
8
+ --no-private
9
+ -
10
+ README.md
11
+ LICENSE.txt
@@ -36,6 +36,7 @@ Gem::Specification.new do |spec|
36
36
  spec.add_development_dependency "test-unit-rr"
37
37
  spec.add_development_dependency "test-unit-notify"
38
38
  spec.add_development_dependency "pry"
39
+ spec.add_development_dependency "redcarpet"
39
40
  spec.add_development_dependency "tapp"
40
41
  spec.add_development_dependency "simplecov"
41
42
  spec.add_development_dependency "sqlite3"
@@ -3,7 +3,13 @@ require "bundler/setup"
3
3
  require "daimon_skycrawlers/version"
4
4
  require "daimon_skycrawlers/logger"
5
5
 
6
+ #
7
+ # Name space for this library
8
+ #
6
9
  module DaimonSkycrawlers
10
+ #
11
+ # Configuration class
12
+ #
7
13
  Configuration = Struct.new(
8
14
  :logger,
9
15
  :queue_name_prefix,
@@ -15,7 +21,7 @@ module DaimonSkycrawlers
15
21
  # Register a processor
16
22
  #
17
23
  # @overload register_processor(processor)
18
- # @param [Processor] processor instance which implements `call` method
24
+ # @param processor [Processor] instance which implements `call` method
19
25
  # @return [void]
20
26
  #
21
27
  # @overload register_processor
@@ -31,7 +37,7 @@ module DaimonSkycrawlers
31
37
  #
32
38
  # Register a crawler
33
39
  #
34
- # @param [Crawler] crawler instance which implements `fetch` method
40
+ # @param crawler [Crawler] instance which implements `fetch` method
35
41
  # @return [void]
36
42
  #
37
43
  def register_crawler(crawler)
@@ -55,6 +61,20 @@ module DaimonSkycrawlers
55
61
  #
56
62
  # Configure DaimonSkycrawlers
57
63
  #
64
+ # ```ruby
65
+ # DaimonSkycrawlers.configure do |config|
66
+ # config.logger = DaimonSkycrawlers::Logger.default
67
+ # config.queue_name_prefix = "daimon-skycrawlers"
68
+ # config.crawler_interval = 1
69
+ # config.shutdown_interval = 10
70
+ # end
71
+ # ```
72
+ #
73
+ # * logger: logger instance
74
+ # * queue_name_prefix: prefix of queue name.
75
+ # * crawler_interval: crawling interval
76
+ # * shutdown_interval: interval to wait after the queue becomes empty before shutting down
77
+ #
58
78
  # @return [void]
59
79
  # @yield [configuration] configure DaimonSkycrawlers
60
80
  # @yieldparam configuration [DaimonSkycrawlers::Configuration] configuration object
@@ -75,6 +95,11 @@ module DaimonSkycrawlers
75
95
  exit(false)
76
96
  end
77
97
 
98
+ #
99
+ # Load "app/crawlers/**/*.rb"
100
+ #
101
+ # @return [void]
102
+ #
78
103
  def load_crawlers
79
104
  Dir.glob("app/crawlers/**/*.rb") do |path|
80
105
  require(File.expand_path(path, Dir.pwd)) &&
@@ -82,6 +107,11 @@ module DaimonSkycrawlers
82
107
  end
83
108
  end
84
109
 
110
+ #
111
+ # Load "app/processors/**/*.rb"
112
+ #
113
+ # @return [void]
114
+ #
85
115
  def load_processors
86
116
  Dir.glob("app/processors/**/*.rb") do |path|
87
117
  require(File.expand_path(path, Dir.pwd)) &&
@@ -1,11 +1,22 @@
1
1
  module DaimonSkycrawlers
2
+ #
3
+ # This module provides a simple callback system
4
+ #
2
5
  module Callbacks
6
+ # @private
3
7
  def initialize
4
8
  super
5
9
  @before_process_callbacks = []
6
10
  @after_process_callbacks = []
7
11
  end
8
12
 
13
+ #
14
+ # Register before process callback
15
+ #
16
+ # @param callback [Object] This object must respond to call
17
+ # @yield [message]
18
+ # @yieldparam message [Hash]
19
+ #
9
20
  def before_process(callback = nil, &block)
10
21
  if block_given?
11
22
  @before_process_callbacks << block
@@ -14,16 +25,29 @@ module DaimonSkycrawlers
14
25
  end
15
26
  end
16
27
 
17
- def run_before_callbacks(message)
28
+ #
29
+ # Run registered before process callbacks
30
+ #
31
+ def run_before_process_callbacks(message)
18
32
  @before_process_callbacks.all? do |callback|
19
33
  callback.call(message)
20
34
  end
21
35
  end
22
36
 
37
+ #
38
+ # Clear all before process callbacks
39
+ #
23
40
  def clear_before_process_callbacks
24
41
  @before_process_callbacks = []
25
42
  end
26
43
 
44
+ #
45
+ # Register after process callback
46
+ #
47
+ # @param callback [Object] This object must respond to call
48
+ # @yield [message]
49
+ # @yieldparam message [Hash]
50
+ #
27
51
  def after_process(callback = nil, &block)
28
52
  if block_given?
29
53
  @after_process_callbacks << block
@@ -32,12 +56,18 @@ module DaimonSkycrawlers
32
56
  end
33
57
  end
34
58
 
35
- def run_after_callbacks(message)
59
+ #
60
+ # Run registered after process callbacks
61
+ #
62
+ def run_after_process_callbacks(message)
36
63
  @after_process_callbacks.each do |callback|
37
64
  callback.call(message)
38
65
  end
39
66
  end
40
67
 
68
+ #
69
+ # Clear all after process callbacks
70
+ #
41
71
  def clear_after_process_callbacks
42
72
  @after_process_callbacks = []
43
73
  end
@@ -6,6 +6,10 @@ require "daimon_skycrawlers/commands/runner"
6
6
  require "daimon_skycrawlers/version"
7
7
 
8
8
  module DaimonSkycrawlers
9
+ #
10
+ # This class provides CLI (Command Line Interface)
11
+ #
12
+ # @private
9
13
  class CLI < Thor
10
14
  register(Generator::New, "new", "new NAME", "Create new project")
11
15
  register(Generator::Generate, "generate", "generate COMMAND", "Generate new code")
@@ -7,7 +7,9 @@ require "thor"
7
7
  require "webrobots"
8
8
 
9
9
  module DaimonSkycrawlers
10
+ # @private
10
11
  module Commands
12
+ # @private
11
13
  class Enqueue < Thor
12
14
  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
13
15
  def url(url, *rest)
@@ -72,7 +74,8 @@ module DaimonSkycrawlers
72
74
  method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
73
75
  def yaml(path)
74
76
  load_init
75
- YAML.load_file(path).each do |hash|
77
+ yaml_text = ERB.new(File.read(path), nil, "-").result(binding)
78
+ YAML.load(yaml_text).each do |hash|
76
79
  url = hash["url"]
77
80
  message = hash["message"] || {}
78
81
  raise "Could not find URL: #{hash}" unless url
@@ -3,7 +3,9 @@ require "daimon_skycrawlers"
3
3
  require "daimon_skycrawlers/crawler"
4
4
 
5
5
  module DaimonSkycrawlers
6
+ # @private
6
7
  module Commands
8
+ # @private
7
9
  class Runner < Thor
8
10
  namespace "exec"
9
11
 
@@ -1,5 +1,6 @@
1
1
  module DaimonSkycrawlers
2
2
  module ConfigMixin
3
+ # @private
3
4
  def initialize
4
5
  super
5
6
  @log = DaimonSkycrawlers.configuration.logger
@@ -1,8 +1,13 @@
1
1
  module DaimonSkycrawlers
2
+ #
3
+ # This module provides `#configure` to set up an instance
4
+ #
2
5
  module Configurable
3
6
  #
4
- # Configure spider instance
7
+ # Configure instance
5
8
  #
9
+ # @yield [instance] give instance to the block
10
+ # @yieldparam instance [DaimonSkycrawlers::Crawler::Base|DaimonSkycrawlers::Processor::Base] self
6
11
  # @return [DaimonSkycrawlers::Crawler::Base|DaimonSkycrawlers::Processor::Base] self
7
12
  #
8
13
  def configure
@@ -1,4 +1,7 @@
1
1
  module DaimonSkycrawlers
2
+ #
3
+ # Name space for consumer related classes
4
+ #
2
5
  module Consumer
3
6
  end
4
7
  end
@@ -4,10 +4,15 @@ require "daimon_skycrawlers/config"
4
4
 
5
5
  module DaimonSkycrawlers
6
6
  module Consumer
7
+ #
8
+ # Base class for consumer
9
+ #
10
+ # @private
7
11
  class Base
8
12
  include DaimonSkycrawlers::LoggerMixin
9
13
  include DaimonSkycrawlers::ConfigMixin
10
14
 
15
+ # @private
11
16
  def process(message)
12
17
  raise NotImplementedError, "Must implement in subclass"
13
18
  end
@@ -17,7 +17,7 @@ module DaimonSkycrawlers
17
17
  # Register a processor
18
18
  #
19
19
  # @overload register(processor)
20
- # @param [Processor] processor instance which implements `call` method
20
+ # @param processor [Processor] processor instance which implements `call` method
21
21
  # @return [void]
22
22
  #
23
23
  # @overload register
@@ -14,7 +14,7 @@ module DaimonSkycrawlers
14
14
  #
15
15
  # Register a given crawler
16
16
  #
17
- # @param [Crawler] crawler instance which implements `fetch` method
17
+ # @param crawler [Crawler] crawler instance which implements `fetch` method
18
18
  # @return [void]
19
19
  #
20
20
  def register(crawler)
@@ -4,6 +4,9 @@ require "daimon_skycrawlers/timer"
4
4
  require "daimon_skycrawlers/consumer/url"
5
5
 
6
6
  module DaimonSkycrawlers
7
+ #
8
+ # Name space for crawler related classes
9
+ #
7
10
  module Crawler
8
11
  class << self
9
12
  #
@@ -21,8 +24,8 @@ module DaimonSkycrawlers
21
24
  #
22
25
  # Enqueue a URL to crawler queue
23
26
  #
24
- # @param [String] Specify absolute URL
25
- # @param [Hash] Extra parameters for crawler
27
+ # @param url [String] Specify absolute URL
28
+ # @param message [Hash] Extra parameters for crawler
26
29
  # @return [void]
27
30
  def enqueue_url(url, message = {})
28
31
  message[:url] = url
@@ -18,6 +18,9 @@ module DaimonSkycrawlers
18
18
  #
19
19
  # The base class of crawler
20
20
  #
21
+ # A crawler implementation can inherit this class and override
22
+ # `#fetch` in the class.
23
+ #
21
24
  class Base
22
25
  include DaimonSkycrawlers::LoggerMixin
23
26
  include DaimonSkycrawlers::ConfigMixin
@@ -35,8 +38,9 @@ module DaimonSkycrawlers
35
38
  attr_reader :n_processed_urls
36
39
 
37
40
  #
38
- # @param [String] Base URL for crawler
39
- # @param [Hash] options for Faraday
41
+ # @param base_url [String] Base URL for crawler
42
+ # @param faraday_options [Hash] options for Faraday
43
+ # @param options [Hash] options for crawler
40
44
  #
41
45
  def initialize(base_url = nil, faraday_options: {}, options: {})
42
46
  super()
@@ -46,12 +50,15 @@ module DaimonSkycrawlers
46
50
  @prepare = ->(connection) {}
47
51
  @skipped = false
48
52
  @n_processed_urls = 0
53
+
54
+ setup_default_filters
55
+ setup_default_post_processes
49
56
  end
50
57
 
51
58
  #
52
59
  # Set up connection
53
60
  #
54
- # @param [Hash] options for Faraday
61
+ # @param options [Hash] options for Faraday
55
62
  # @yield [faraday]
56
63
  # @yieldparam faraday [Faraday]
57
64
  #
@@ -67,6 +74,8 @@ module DaimonSkycrawlers
67
74
  # Call this method before DaimonSkycrawlers.register_crawler
68
75
  # For example, you can login before fetch URL
69
76
  #
77
+ # @yield [connection]
78
+ #
70
79
  def prepare(&block)
71
80
  @prepare = block
72
81
  end
@@ -74,26 +83,39 @@ module DaimonSkycrawlers
74
83
  #
75
84
  # Retrieve storage instance
76
85
  #
86
+ # @return [DaimonSkycrawlers::Storage::Base]
87
+ #
77
88
  def storage
78
89
  @storage ||= Storage::RDB.new
79
90
  end
80
91
 
92
+ #
93
+ # @return [true|false]
94
+ #
81
95
  def skipped?
82
96
  @skipped
83
97
  end
84
98
 
99
+ #
100
+ # @return [Faraday]
101
+ #
85
102
  def connection
86
103
  @connection ||= Faraday.new(@base_url, @faraday_options)
87
104
  end
88
105
 
106
+ #
107
+ # Process crawler sequence
108
+ #
109
+ # 1. Run registered filters
110
+ # 1. Prepare connection
111
+ # 1. Download(fetch) data from given URL
112
+ # 1. Run post processes (store downloaded data to storage)
113
+ #
89
114
  def process(message, &block)
90
115
  @skipped = false
91
116
  @n_processed_urls += 1
92
117
 
93
- setup_default_filters
94
- setup_default_post_processes
95
-
96
- proceeding = run_before_callbacks(message)
118
+ proceeding = run_before_process_callbacks(message)
97
119
  unless proceeding
98
120
  skip(message[:url])
99
121
  return
@@ -106,18 +128,44 @@ module DaimonSkycrawlers
106
128
  @prepare.call(connection)
107
129
  response = fetch(url, message, &block)
108
130
  data = { url: url, message: message, response: response }
109
- run_after_callbacks(data)
131
+ run_after_process_callbacks(data)
110
132
  data
111
133
  end
112
134
 
135
+ #
136
+ # Fetch URL
137
+ #
138
+ # Override this method in subclass.
139
+ #
140
+ # @param path [String] URI or path
141
+ # @param message [Hash] message can include anything
142
+ #
143
+ # @return [Faraday::Response] HTTP response
144
+ #
113
145
  def fetch(path, message = {})
114
146
  raise NotImplementedError, "Must implement this method in subclass"
115
147
  end
116
148
 
149
+ #
150
+ # GET URL with params
151
+ #
152
+ # @param path [String] URI or path
153
+ # @param params [Hash] query parameters
154
+ #
155
+ # @return [Faraday::Response] HTTP response
156
+ #
117
157
  def get(path, params = {})
118
158
  @connection.get(path, params)
119
159
  end
120
160
 
161
+ #
162
+ # POST URL with params
163
+ #
164
+ # @param path [String] URI or path
165
+ # @param params [Hash] request body parameters
166
+ #
167
+ # @return [Faraday::Response] HTTP response
168
+ #
121
169
  def post(path, params = {})
122
170
  @connection.post(path, params)
123
171
  end