daimon_skycrawlers 1.0.0.pre.rc1 → 1.0.0.pre.rc2

Files changed (62)
  1. checksums.yaml +4 -4
  2. data/.yardopts +11 -0
  3. data/daimon_skycrawlers.gemspec +1 -0
  4. data/lib/daimon_skycrawlers.rb +32 -2
  5. data/lib/daimon_skycrawlers/callbacks.rb +32 -2
  6. data/lib/daimon_skycrawlers/cli.rb +4 -0
  7. data/lib/daimon_skycrawlers/commands/enqueue.rb +4 -1
  8. data/lib/daimon_skycrawlers/commands/runner.rb +2 -0
  9. data/lib/daimon_skycrawlers/config.rb +1 -0
  10. data/lib/daimon_skycrawlers/configurable.rb +6 -1
  11. data/lib/daimon_skycrawlers/consumer.rb +3 -0
  12. data/lib/daimon_skycrawlers/consumer/base.rb +5 -0
  13. data/lib/daimon_skycrawlers/consumer/http_response.rb +1 -1
  14. data/lib/daimon_skycrawlers/consumer/url.rb +1 -1
  15. data/lib/daimon_skycrawlers/crawler.rb +5 -2
  16. data/lib/daimon_skycrawlers/crawler/base.rb +56 -8
  17. data/lib/daimon_skycrawlers/crawler/default.rb +9 -1
  18. data/lib/daimon_skycrawlers/filter.rb +3 -0
  19. data/lib/daimon_skycrawlers/filter/base.rb +12 -0
  20. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +2 -2
  21. data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +1 -1
  22. data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -2
  23. data/lib/daimon_skycrawlers/generator/crawler.rb +4 -1
  24. data/lib/daimon_skycrawlers/generator/filter.rb +4 -1
  25. data/lib/daimon_skycrawlers/generator/generate.rb +3 -0
  26. data/lib/daimon_skycrawlers/generator/new.rb +5 -1
  27. data/lib/daimon_skycrawlers/generator/processor.rb +4 -1
  28. data/lib/daimon_skycrawlers/logger.rb +8 -0
  29. data/lib/daimon_skycrawlers/processor.rb +5 -2
  30. data/lib/daimon_skycrawlers/processor/base.rb +28 -2
  31. data/lib/daimon_skycrawlers/processor/default.rb +7 -1
  32. data/lib/daimon_skycrawlers/processor/proc.rb +6 -0
  33. data/lib/daimon_skycrawlers/processor/spider.rb +2 -2
  34. data/lib/daimon_skycrawlers/queue.rb +31 -0
  35. data/lib/daimon_skycrawlers/sitemap_parser.rb +23 -1
  36. data/lib/daimon_skycrawlers/storage.rb +3 -0
  37. data/lib/daimon_skycrawlers/storage/base.rb +21 -1
  38. data/lib/daimon_skycrawlers/storage/file.rb +16 -0
  39. data/lib/daimon_skycrawlers/storage/null.rb +2 -2
  40. data/lib/daimon_skycrawlers/storage/rdb.rb +25 -7
  41. data/lib/daimon_skycrawlers/timer.rb +9 -0
  42. data/lib/daimon_skycrawlers/version.rb +4 -1
  43. data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -1
  44. data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
  45. data/{lib/daimon_skycrawlers/generator/templates → templates}/crawler.rb.erb +0 -0
  46. data/{lib/daimon_skycrawlers/generator/templates → templates}/filter.rb.erb +0 -0
  47. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile +0 -0
  48. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile.db +0 -0
  49. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Gemfile +0 -0
  50. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/README.md.erb +0 -0
  51. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Rakefile +0 -0
  52. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/crawler.rb +0 -0
  53. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/processor.rb +0 -0
  54. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/database.yml.erb +0 -0
  55. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/init.rb +0 -0
  56. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/docker-compose.yml.erb +0 -0
  57. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.db.erb +0 -0
  58. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.erb +0 -0
  59. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/common/docker-entrypoint.sh +0 -0
  60. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/db/init-user-db.sh +0 -0
  61. data/{lib/daimon_skycrawlers/generator/templates → templates}/processor.rb.erb +0 -0
  62. metadata +34 -19
--- a/data/lib/daimon_skycrawlers/crawler/default.rb
+++ b/data/lib/daimon_skycrawlers/crawler/default.rb
@@ -5,9 +5,17 @@ module DaimonSkycrawlers
     #
     # The default crawler
     #
-    # This crawler can GET given URL and store response to storage
+    # This crawler can GET/POST given URL and store response to storage
     #
     class Default < Base
+      #
+      # GET/POST given url
+      #
+      # @param url [String] URI or path
+      # @param message [Hash] message can include anything
+      #
+      # @return [Faraday::Response] HTTP response
+      #
       def fetch(url, message)
         params = message[:params] || {}
         method = message[:method] || "GET"
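
Since `Default#fetch` now reads `:method` and `:params` from the message hash, a POST crawl can be requested when enqueueing a URL. A minimal sketch; `DaimonSkycrawlers::Crawler.enqueue_url` is assumed from the gem's README and is not part of this diff:

```ruby
require "daimon_skycrawlers/crawler"

# :method falls back to "GET" and :params to {} (see Default#fetch above).
DaimonSkycrawlers::Crawler.enqueue_url("http://example.com/search",
                                       method: "POST",
                                       params: { q: "ruby" })
```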
--- a/data/lib/daimon_skycrawlers/filter.rb
+++ b/data/lib/daimon_skycrawlers/filter.rb
@@ -1,4 +1,7 @@
 module DaimonSkycrawlers
+  #
+  # Name space for filters
+  #
   module Filter
   end
 end
--- a/data/lib/daimon_skycrawlers/filter/base.rb
+++ b/data/lib/daimon_skycrawlers/filter/base.rb
@@ -20,10 +20,22 @@ module DaimonSkycrawlers
         @storage = storage
       end
 
+      #
+      # Retrieve storage instance
+      #
       def storage
         @storage ||= DaimonSkycrawlers::Storage::RDB.new
       end
 
+      #
+      # Filter message
+      #
+      # Override this method in subclass.
+      #
+      # @param message [Hash] message can include anything
+      #
+      # @return [true|false] process the message if true otherwise skip message.
+      #
       def call(message)
         raise NotImplementedError, "Must implement this method in subclass"
       end
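
The `call` contract documented above (return true to process, false to skip) makes custom filters one-method subclasses. A hypothetical example:

```ruby
require "daimon_skycrawlers/filter/base"

# Skip image URLs; returning false tells the pipeline to skip the message.
class ImageFilter < DaimonSkycrawlers::Filter::Base
  def call(message)
    !message[:url].to_s.end_with?(".png", ".jpg", ".gif")
  end
end
```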
--- a/data/lib/daimon_skycrawlers/filter/duplicate_checker.rb
+++ b/data/lib/daimon_skycrawlers/filter/duplicate_checker.rb
@@ -16,7 +16,7 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message to check duplication. If given URL is
+      # @param message [Hash] message to check duplication. If given URL is
       #   relative URL, use `@base_url + url` as absolute URL.
       # @return [true|false] Return false when duplicated, otherwise return true.
       #
@@ -28,7 +28,7 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message to check duplication. If given URL is
+      # @param message [Hash] message to check duplication. If given URL is
       #   relative URL, use `@base_url + url` as absolute URL.
       # @return [true|false] Return true when duplicated, otherwise return false.
       #
--- a/data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb
+++ b/data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb
@@ -16,7 +16,7 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message
+      # @param message [Hash] check given URL is allowed or not by robots.txt
       # @return [true|false] Return true when web site allows to fetch the URL, otherwise return false
       #
       def call(message)
--- a/data/lib/daimon_skycrawlers/filter/update_checker.rb
+++ b/data/lib/daimon_skycrawlers/filter/update_checker.rb
@@ -17,13 +17,13 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message
+      # @param message [Hash] message includes `:url`
       # @param connection [Faraday]
       # @return [true|false] Return true when need update, otherwise return false
       #
       def call(message, connection: nil)
         url = normalize_url(message[:url])
-        page = storage.find(url)
+        page = storage.read(url, message)
         return true unless page
         if connection
           response = connection.head(url)
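
Note the storage API change here: `storage.find(url)` becomes `storage.read(url, message)`, and the same substitution appears in `processor/default.rb` and `processor/spider.rb` below. Custom storages therefore need the two-argument lookup. A hypothetical in-memory storage, sketched against only what this diff shows (the rest of `Storage::Base` is not visible here):

```ruby
require "daimon_skycrawlers/storage/base"

# Illustrative only: keeps pages in a Hash keyed by URL.
class MemoryStorage < DaimonSkycrawlers::Storage::Base
  # Filters and processors now call storage.read(url, message).
  def read(url, message)
    pages[url]
  end

  private

  def pages
    @pages ||= {}
  end
end
```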
--- a/data/lib/daimon_skycrawlers/generator/crawler.rb
+++ b/data/lib/daimon_skycrawlers/generator/crawler.rb
@@ -1,14 +1,17 @@
 require "thor"
+require "pathname"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Crawler < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-        File.join(__dir__, "templates")
+        (Pathname(__dir__) + "../../../templates").to_s
       end
 
       def create_files
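
The `source_root` changes in all the generators follow from the template move in files 45-61 above: templates now live in the gem's top-level `templates/` directory, three levels up from `lib/daimon_skycrawlers/generator/`. The path arithmetic, using an illustrative install path:

```ruby
require "pathname"

# Pathname#+ collapses ".." segments without touching the filesystem.
dir = Pathname("/gems/daimon_skycrawlers/lib/daimon_skycrawlers/generator")
puts (dir + "../../../templates").to_s
# => /gems/daimon_skycrawlers/templates
```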
--- a/data/lib/daimon_skycrawlers/generator/filter.rb
+++ b/data/lib/daimon_skycrawlers/generator/filter.rb
@@ -1,14 +1,17 @@
 require "thor"
+require "pathname"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Filter < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-        File.join(__dir__, "templates")
+        (Pathname(__dir__) + "../../../templates").to_s
       end
 
       def create_files
--- a/data/lib/daimon_skycrawlers/generator/generate.rb
+++ b/data/lib/daimon_skycrawlers/generator/generate.rb
@@ -1,10 +1,13 @@
 require "thor"
+require "pathname"
 require "daimon_skycrawlers/generator/crawler"
 require "daimon_skycrawlers/generator/processor"
 require "daimon_skycrawlers/generator/filter"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Generate < Thor
       register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
       register(Processor, "processor", "processor NAME", "Generate new processor")
--- a/data/lib/daimon_skycrawlers/generator/new.rb
+++ b/data/lib/daimon_skycrawlers/generator/new.rb
@@ -1,19 +1,22 @@
 require "securerandom"
 require "thor"
+require "pathname"
 require "rails/generators"
 require "rails/generators/actions"
 require "rails/generators/active_record"
 require "rails/generators/active_record/migration/migration_generator"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class New < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-        File.join(__dir__, "templates", "new")
+        (Pathname(__dir__) + "../../../templates/new").to_s
       end
 
       def create_files
@@ -103,6 +106,7 @@ MESSAGE
       end
     end
 
+    # @private
     class MigrationGenerator < ActiveRecord::Generators::MigrationGenerator
       def self.source_root
         ActiveRecord::Generators::MigrationGenerator.source_root
--- a/data/lib/daimon_skycrawlers/generator/processor.rb
+++ b/data/lib/daimon_skycrawlers/generator/processor.rb
@@ -1,14 +1,17 @@
 require "thor"
+require "pathname"
 
 module DaimonSkycrawlers
+  # @private
   module Generator
+    # @private
     class Processor < Thor::Group
       include Thor::Actions
 
       argument :name
 
       def self.source_root
-        File.join(__dir__, "templates")
+        (Pathname(__dir__) + "../../../templates").to_s
       end
 
       def create_files
--- a/data/lib/daimon_skycrawlers/logger.rb
+++ b/data/lib/daimon_skycrawlers/logger.rb
@@ -2,13 +2,20 @@ require "delegate"
 require "logger"
 
 module DaimonSkycrawlers
+  #
+  # Logger for daimon_skycrawlers
+  #
   class Logger < SimpleDelegator
     class << self
+      #
+      # Default logger
+      #
       def default
         @default ||= DaimonSkycrawlers::Logger.new(STDOUT)
       end
     end
 
+    # @private
     def initialize(logdev, shift_age = 0, shift_size = 1048576)
       @log = ::Logger.new(logdev, shift_age, shift_size)
       super(@log)
@@ -16,6 +23,7 @@ module DaimonSkycrawlers
   end
 
   module LoggerMixin
+    # @private
     def initialize
       super
       @log = DaimonSkycrawlers::Logger.default
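
`Logger.default` memoizes a STDOUT logger, and since `Logger` is a `SimpleDelegator` around `::Logger`, the standard severity methods pass straight through:

```ruby
require "daimon_skycrawlers/logger"

# Delegates to ::Logger, so the usual API applies.
DaimonSkycrawlers::Logger.default.info("crawl started")
DaimonSkycrawlers::Logger.default.warn("retrying fetch")
```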
--- a/data/lib/daimon_skycrawlers/processor.rb
+++ b/data/lib/daimon_skycrawlers/processor.rb
@@ -4,6 +4,9 @@ require "daimon_skycrawlers/timer"
 require "daimon_skycrawlers/consumer/http_response"
 
 module DaimonSkycrawlers
+  #
+  # Name space for processors
+  #
   module Processor
     class << self
       #
@@ -21,8 +24,8 @@ module DaimonSkycrawlers
       #
       # Enqueue a URL to processor queue
       #
-      # @param [String] Specify absolute URL
-      # @param [Hash] Extra parameters for crawler
+      # @param url [String] Specify absolute URL
+      # @param message [Hash] Extra parameters for crawler
       # @return [void]
       def enqueue_http_response(url, message = {})
         message[:url] = url
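
The corrected tags make the call shape explicit: a URL plus an optional message hash that the method merges with `url:`. For example:

```ruby
require "daimon_skycrawlers/processor"

# Extra keys (here :depth) travel with the message to the processors.
DaimonSkycrawlers::Processor.enqueue_http_response("http://example.com/", depth: 2)
```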
--- a/data/lib/daimon_skycrawlers/processor/base.rb
+++ b/data/lib/daimon_skycrawlers/processor/base.rb
@@ -5,6 +5,12 @@ require "daimon_skycrawlers/configurable"
 
 module DaimonSkycrawlers
   module Processor
+    #
+    # The base class of processor
+    #
+    # A processor implementation can inherit this class and override
+    # `#call` in the class.
+    #
     class Base
       include DaimonSkycrawlers::LoggerMixin
       include DaimonSkycrawlers::ConfigMixin
@@ -14,16 +20,26 @@ module DaimonSkycrawlers
       def initialize
         super
         @skipped = false
+
+        setup_default_filters
       end
 
+      # @private
       def skipped?
         @skipped
       end
 
+      #
+      # Process processor sequence
+      #
+      # 1. Run registered filters
+      # 1. Process HTTP response from message
+      #
+      # @param message [Hash] parameters for processor
+      #
       def process(message)
         @skipped = false
-        setup_default_filters
-        proceeding = run_before_callbacks(message)
+        proceeding = run_before_process_callbacks(message)
         unless proceeding
           skip(message[:url])
           return
@@ -31,10 +47,20 @@ module DaimonSkycrawlers
         call(message)
       end
 
+      #
+      # Process message
+      #
+      # Override this method in subclass
+      #
+      # @param message [Hash] parameters for processor
+      #
       def call(message)
         raise "Implement this method in subclass"
       end
 
+      #
+      # Retrieve storage instance
+      #
       def storage
         @storage ||= DaimonSkycrawlers::Storage::RDB.new
       end
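
Per the documented sequence (`process` runs the registered filters, then hands the message to `call`), a processor only overrides `call`. A hypothetical subclass built from pieces shown in this diff (`storage.read`, the `@log` set by `LoggerMixin`, and Nokogiri usage mirroring `processor/spider.rb`):

```ruby
require "nokogiri"
require "daimon_skycrawlers/processor/base"

# Logs the <title> of each stored page.
class TitleLogger < DaimonSkycrawlers::Processor::Base
  def call(message)
    page = storage.read(message[:url], message)
    @log.info(Nokogiri::HTML(page.body).title)
  end
end
```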
--- a/data/lib/daimon_skycrawlers/processor/default.rb
+++ b/data/lib/daimon_skycrawlers/processor/default.rb
@@ -3,10 +3,16 @@ require "daimon_skycrawlers/processor/base"
 
 module DaimonSkycrawlers
   module Processor
+    #
+    # Very simple processor
+    #
     class Default < Base
+      #
+      # Display page information
+      #
       def call(message)
         url = message[:url]
-        page = storage.find(url)
+        page = storage.read(url, message)
         headers = JSON.parse(page.headers)
         headers_string = headers.map {|key, value| " #{key}: #{value}" }.join("\n")
         dumped_message = <<LOG
--- a/data/lib/daimon_skycrawlers/processor/proc.rb
+++ b/data/lib/daimon_skycrawlers/processor/proc.rb
@@ -2,12 +2,18 @@ require "daimon_skycrawlers/processor/base"
 
 module DaimonSkycrawlers
   module Processor
+    #
+    # Processor for Proc
+    #
     class Proc < Base
       def initialize(handler)
         super()
         @handler = handler
       end
 
+      #
+      # Process message
+      #
       def call(message)
         @handler.call(message)
       end
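
`Processor::Proc` wraps any callable, so one-off processors don't need a class definition:

```ruby
require "daimon_skycrawlers/processor/proc"

handler = ->(message) { puts "processed: #{message[:url]}" }
processor = DaimonSkycrawlers::Processor::Proc.new(handler)
processor.call(url: "http://example.com/")
```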
--- a/data/lib/daimon_skycrawlers/processor/spider.rb
+++ b/data/lib/daimon_skycrawlers/processor/spider.rb
@@ -96,13 +96,13 @@ module DaimonSkycrawlers
       end
 
       #
-      # @param [Hash] message Must have key :url, :depth
+      # @param message [Hash] Must have key :url, :depth
       #
       def call(message)
         key_url = message[:url]
         depth = Integer(message[:depth] || 2)
         return if depth <= 1
-        page = storage.find(key_url)
+        page = storage.read(key_url, message)
         @doc = Nokogiri::HTML(page.body)
         new_message = {
           depth: depth - 1,
--- a/data/lib/daimon_skycrawlers/queue.rb
+++ b/data/lib/daimon_skycrawlers/queue.rb
@@ -1,8 +1,14 @@
 require "songkick_queue"
 
 module DaimonSkycrawlers
+  #
+  # Wrapper for queue configuration class
+  #
   class Queue
     class << self
+      #
+      # Configuration for queue
+      #
       def configuration
         @configuration ||= SongkickQueue.configure do |config|
           config.logger = Logger.new(STDOUT)
@@ -16,6 +22,31 @@ module DaimonSkycrawlers
         end
       end
 
+      #
+      # Configure queue
+      #
+      # ```ruby
+      # DaimonSkycrawlers::Queue.configure do |config|
+      #   config.logger = Logger.new(STDOUT)
+      #   config.host = "127.0.0.1"
+      #   config.port = 5672
+      #   # config.username = 'guest'
+      #   # config.password = 'guest'
+      #   config.vhost = "/"
+      #   config.max_reconnect_attempts = 10
+      #   config.network_recovery_interval = 1.0
+      # end
+      # ```
+      #
+      # * logger: logger instance for queue system
+      # * host: RabbitMQ host
+      # * port: RabbitMQ port
+      # * username: RabbitMQ username
+      # * passowrd: RabbitMQ password
+      # * vhost: virtual host used for connection
+      # * max_reconnect_attempts: The maximum number of reconnection attempts
+      # * network_recovery_interval: reconnection interval for TCP connection failures
+      #
       def configure
         yield configuration
       end
--- a/data/lib/daimon_skycrawlers/sitemap_parser.rb
+++ b/data/lib/daimon_skycrawlers/sitemap_parser.rb
@@ -4,15 +4,37 @@ require "zlib"
 require "uri"
 
 module DaimonSkycrawlers
+  #
+  # Parser for sitemap.xml
+  #
   # Based on https://github.com/benbalter/sitemap-parser
+  # See also https://www.sitemaps.org/
+  #
+  # ```ruby
+  # urls = ["https://example.com/sitemap.xml"]
+  # sitemap_parser = DaimonSkycrawlers::SitemapParser.new(urls)
+  # sitemap_urls = sitemap_parser.parse
+  # ```
+  #
   class SitemapParser
+    #
+    # Error class for SitemapParser
+    #
     class Error < StandardError
     end
 
-    def initialize(urls, options = {})
+    #
+    # @param urls [Array] List of sitemap.xml URL
+    #
+    def initialize(urls)
       @urls = urls
     end
 
+    #
+    # Fetch and parse sitemap.xml
+    #
+    # @return [Array] URLs in sitemap.xml
+    #
     def parse
       hydra = Typhoeus::Hydra.new(max_concurrency: 1)
       sitemap_urls = []