daimon_skycrawlers 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +30 -22
  3. data/bin/daimon-skycrawlers +1 -1
  4. data/daimon_skycrawlers.gemspec +3 -1
  5. data/lib/daimon_skycrawlers/consumer/http_response.rb +28 -0
  6. data/lib/daimon_skycrawlers/consumer/url.rb +20 -0
  7. data/lib/daimon_skycrawlers/crawler/base.rb +20 -0
  8. data/lib/daimon_skycrawlers/crawler/default.rb +5 -0
  9. data/lib/daimon_skycrawlers/crawler.rb +26 -0
  10. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +8 -0
  11. data/lib/daimon_skycrawlers/filter/update_checker.rb +8 -0
  12. data/lib/daimon_skycrawlers/generator/new.rb +34 -11
  13. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +4 -4
  14. data/lib/daimon_skycrawlers/generator/templates/new/{crawler.rb → app/crawlers/sample_crawler.rb} +0 -6
  15. data/lib/daimon_skycrawlers/generator/templates/new/{processor.rb → app/processors/sample_processor.rb} +0 -6
  16. data/lib/daimon_skycrawlers/generator/templates/new/bin/crawler +10 -0
  17. data/{sample/spider/enqueue.rb → lib/daimon_skycrawlers/generator/templates/new/bin/enqueue} +1 -2
  18. data/lib/daimon_skycrawlers/generator/templates/new/bin/processor +10 -0
  19. data/lib/daimon_skycrawlers/generator/templates/new/{init.rb → config/init.rb} +1 -0
  20. data/lib/daimon_skycrawlers/storage/file.rb +46 -0
  21. data/lib/daimon_skycrawlers/storage/null.rb +12 -0
  22. data/lib/daimon_skycrawlers/storage/rdb.rb +16 -0
  23. data/lib/daimon_skycrawlers/storage.rb +1 -0
  24. data/lib/daimon_skycrawlers/version.rb +1 -1
  25. data/lib/daimon_skycrawlers.rb +31 -0
  26. data/sample/spider/README.md +4 -4
  27. data/sample/spider/{crawler.rb → app/crawlers/blog_crawler.rb} +0 -7
  28. data/sample/spider/{processor.rb → app/processors/blog_spider.rb} +0 -7
  29. data/sample/spider/bin/crawler +10 -0
  30. data/{lib/daimon_skycrawlers/generator/templates/new/enqueue.rb → sample/spider/bin/enqueue} +1 -2
  31. data/sample/spider/bin/processor +11 -0
  32. data/sample/spider/{init.rb → config/init.rb} +0 -0
  33. metadata +51 -19
  34. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +0 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f03168cc7d465dd69da00eabc00c5bd1d4654455
4
- data.tar.gz: e55f902ab4f78340ee5df80499f581cefb2bad96
3
+ metadata.gz: 411d6b35b2c909712f9f79cd5e45440226ebba28
4
+ data.tar.gz: afa7bdc3cc98a28742c64ba46d7237fcce981b92
5
5
  SHA512:
6
- metadata.gz: 0bd1e1e832766d27779e91bba3ae70c70d44863373cd5172d2c67e40ad6ded2dd1329a68f9652aa58fd00bac7e38fd75e3c43092fbeaec4bca42fada779b2ffd
7
- data.tar.gz: 99dc774a495bfec4e8b0b0693333340fc57be9321ef9fb5d9b2d64f61856ab6defa10c1e8292dc13d90bcca4a69e80e5f2a4aeec033c7c1a345e64d2c965eaec
6
+ metadata.gz: f8844e9ebd88e7be7344a607866c15ea74be64a8a405f05f196d54f0125bf66be6a81c4ff27ea3ee4480a82989abbedd79cd934c122c7ac1bee5d83bbfa1b6d3
7
+ data.tar.gz: 37fe8376a4165aef7f29aa29af51bb637402c48d9d5327c44ea05c37dac2418c39b1f5e55303ec8c66e9cfe8eb3e77f0dc264648caa262abc551c72f06944b55
data/README.md CHANGED
@@ -33,44 +33,52 @@ Or install it yourself as:
33
33
 
34
34
  1. Create project
35
35
 
36
- ```
37
- $ bundle exec daimon-skycrawlers new mycrawlers
38
- $ cd mycrawlers
39
- ```
36
+ ```
37
+ $ bundle exec daimon-skycrawlers new mycrawlers
38
+ $ cd mycrawlers
39
+ ```
40
+ or
41
+ ```
42
+ $ daimon-skycrawlers new mycrawlers
43
+ $ cd mycrawlers
44
+ ```
40
45
 
41
46
  2. Install dependencies
42
47
 
43
- ```
44
- $ bundle install
45
- ```
48
+ ```
49
+ $ bundle install
50
+ ```
46
51
 
47
52
  3. Create database
48
53
 
49
- ```
50
- $ bundle exec rake db:create
51
- $ bundle exec rake db:migrate
52
- ```
54
+ ```
55
+ $ bundle exec rake db:create
56
+ $ bundle exec rake db:migrate
57
+ ```
53
58
 
54
59
  4. Open new terminal and run crawler/processor
55
60
 
56
- ```
57
- $ bundle exec ruby crawler.rb # on new terminal
58
- $ bundle exec ruby processor.rb # on new terminal
59
- ```
61
+ ```
62
+ $ bin/crawler # on new terminal
63
+ $ bin/processor # on new terminal
64
+ ```
65
+
66
+ NOTE: Execute step 5 as soon as possible, because bin/crawler and
67
+ bin/processor will stop after 10 seconds by default if their
68
+ queues are empty.
60
69
 
61
70
  5. Enqueue task
62
71
 
63
- ```
64
- $ bundle exec ruby enqueue.rb url http://example.com/
65
- ```
72
+ ```
73
+ $ bin/enqueue url http://example.com/
74
+ ```
66
75
 
67
76
  6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
68
-
69
77
  7. You can re-enqueue task for processor
70
78
 
71
- ```
72
- $ bundle exec ruby enqueue.rb response http://example.com/
73
- ```
79
+ ```
80
+ $ bin/enqueue response http://example.com/
81
+ ```
74
82
 
75
83
  Display `It works with 'http://example.com'` again on your terminal which runs your processor.
76
84
 
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  if File.exist?(File.expand_path("../.git", __dir__))
4
- $LOAD_PATH << File.expand_path("../lib", __dir__)
4
+ $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
5
5
  end
6
6
 
7
7
  require "daimon_skycrawlers/cli"
@@ -18,16 +18,17 @@ Gem::Specification.new do |spec|
18
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
19
  spec.require_paths = ["lib"]
20
20
 
21
+ spec.add_dependency "bundler", "~> 1.11"
21
22
  spec.add_dependency "thor"
22
23
  spec.add_dependency "songkick_queue"
23
24
  spec.add_dependency "faraday"
24
25
  spec.add_dependency "faraday_middleware"
25
26
  spec.add_dependency "nokogiri"
26
27
  spec.add_dependency "activerecord"
28
+ spec.add_dependency "railties"
27
29
  spec.add_dependency "pg"
28
30
  spec.add_dependency "timers"
29
31
 
30
- spec.add_development_dependency "bundler", "~> 1.11"
31
32
  spec.add_development_dependency "rake", "~> 10.0"
32
33
  spec.add_development_dependency "test-unit"
33
34
  spec.add_development_dependency "test-unit-rr"
@@ -35,4 +36,5 @@ Gem::Specification.new do |spec|
35
36
  spec.add_development_dependency "pry"
36
37
  spec.add_development_dependency "tapp"
37
38
  spec.add_development_dependency "sqlite3"
39
+ spec.add_development_dependency "yard"
38
40
  end
@@ -5,10 +5,26 @@ require "daimon_skycrawlers/processor/default"
5
5
 
6
6
  module DaimonSkycrawlers
7
7
  module Consumer
8
+ #
9
+ # HTTP response consumer class
10
+ #
8
11
  class HTTPResponse < Base
9
12
  include SongkickQueue::Consumer
10
13
 
11
14
  class << self
15
+ #
16
+ # Register a processor
17
+ #
18
+ # @overload register(processor)
19
+ # @param [Processor] processor instance which implements `call` method
20
+ # @return [void]
21
+ #
22
+ # @overload register
23
+ # @return [void]
24
+ # @yield [message] register given block as a processor
25
+ # @yieldparam message [Hash] A message from queue
26
+ # @yieldreturn [void]
27
+ #
12
28
  def register(processor = nil, &block)
13
29
  if block_given?
14
30
  processors << block
@@ -17,14 +33,23 @@ module DaimonSkycrawlers
17
33
  end
18
34
  end
19
35
 
36
+ #
37
+ # @private
38
+ #
20
39
  def processors
21
40
  @processors ||= []
22
41
  end
23
42
 
43
+ #
44
+ # @private
45
+ #
24
46
  def default_processor
25
47
  DaimonSkycrawlers::Processor::Default.new
26
48
  end
27
49
 
50
+ #
51
+ # @private
52
+ #
28
53
  def queue_name
29
54
  "#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
30
55
  end
@@ -32,6 +57,9 @@ module DaimonSkycrawlers
32
57
 
33
58
  consume_from_queue queue_name
34
59
 
60
+ #
61
+ # @private
62
+ #
35
63
  def process(message)
36
64
  if self.class.processors.empty?
37
65
  processors = [self.class.default_processor]
@@ -4,18 +4,35 @@ require "daimon_skycrawlers/consumer/base"
4
4
 
5
5
  module DaimonSkycrawlers
6
6
  module Consumer
7
+ #
8
+ # URL consumer class
9
+ #
7
10
  class URL < Base
8
11
  include SongkickQueue::Consumer
9
12
 
10
13
  class << self
14
+ #
15
+ # Register a given crawler
16
+ #
17
+ # @param [Crawler] crawler instance which implements `fetch` method
18
+ # @return [void]
19
+ #
11
20
  def register(crawler)
12
21
  crawlers << crawler
13
22
  end
14
23
 
24
+ #
25
+ # Returns registered crawlers
26
+ #
27
+ # @return [Array<Crawler>]
28
+ #
15
29
  def crawlers
16
30
  @crawlers ||= []
17
31
  end
18
32
 
33
+ #
34
+ # @private
35
+ #
19
36
  def queue_name
20
37
  "#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
21
38
  end
@@ -23,6 +40,9 @@ module DaimonSkycrawlers
23
40
 
24
41
  consume_from_queue queue_name
25
42
 
43
+ #
44
+ # @private
45
+ #
26
46
  def process(message)
27
47
  url = message[:url]
28
48
  depth = Integer(message[:depth] || 0)
@@ -8,12 +8,22 @@ require "daimon_skycrawlers/processor"
8
8
 
9
9
  module DaimonSkycrawlers
10
10
  module Crawler
11
+ #
12
+ # The base class of crawler
13
+ #
11
14
  class Base
12
15
  include DaimonSkycrawlers::LoggerMixin
13
16
  include DaimonSkycrawlers::ConfigMixin
14
17
 
18
+ # @!attribute [w] storage
19
+ # Set storage to crawler instance.
20
+ # @return [void]
15
21
  attr_writer :storage
16
22
 
23
+ #
24
+ # @param [String] base_url Base URL for crawler
25
+ # @param [Hash] options for Faraday
26
+ #
17
27
  def initialize(base_url = nil, options = {})
18
28
  super()
19
29
  @base_url = base_url
@@ -23,6 +33,13 @@ module DaimonSkycrawlers
23
33
  @n_processed_urls = 0
24
34
  end
25
35
 
36
+ #
37
+ # Set up connection
38
+ #
39
+ # @param [Hash] options for Faraday
40
+ # @yield [faraday]
41
+ # @yieldparam faraday [Faraday]
42
+ #
26
43
  def setup_connection(options = {})
27
44
  @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
28
45
  yield faraday
@@ -37,6 +54,9 @@ module DaimonSkycrawlers
37
54
  @prepare = block
38
55
  end
39
56
 
57
+ #
58
+ # Retrieve storage instance
59
+ #
40
60
  def storage
41
61
  @storage ||= Storage::RDB.new
42
62
  end
@@ -3,6 +3,11 @@ require "daimon_skycrawlers/filter/update_checker"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Crawler
6
+ #
7
+ # The default crawler
8
+ #
9
+ # This crawler can GET given URL and store response to storage
10
+ #
6
11
  class Default < Base
7
12
  def fetch(path, depth: 3, **kw)
8
13
  @n_processed_urls += 1
@@ -6,24 +6,50 @@ require "daimon_skycrawlers/consumer/url"
6
6
  module DaimonSkycrawlers
7
7
  module Crawler
8
8
  class << self
9
+ #
10
+ # Run registered crawlers
11
+ #
12
+ # @param process_name [String] Process name
13
+ #
9
14
  def run(process_name: default_process_name)
10
15
  DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
11
16
  SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
12
17
  end
13
18
 
19
+ #
20
+ # Enqueue a URL to crawler queue
21
+ #
22
+ # @param [String] url Specify absolute URL
23
+ # @param [Hash] message Extra parameters for crawler
24
+ # @return [void]
14
25
  def enqueue_url(url, message = {})
15
26
  message[:url] = url
16
27
  SongkickQueue.publish(queue_name, message)
17
28
  end
18
29
 
30
+ #
31
+ # Shortcut of DaimonSkycrawlers.configuration
32
+ #
33
+ # @return [DaimonSkycrawlers::Configuration]
34
+ #
19
35
  def config
20
36
  DaimonSkycrawlers.configuration
21
37
  end
22
38
 
39
+ #
40
+ # Queue name for crawler
41
+ #
42
+ # @return [String] Queue name
43
+ #
23
44
  def queue_name
24
45
  "#{config.queue_name_prefix}.url"
25
46
  end
26
47
 
48
+ #
49
+ # Default process name
50
+ #
51
+ # @return [String] Default process name
52
+ #
27
53
  def default_process_name
28
54
  "#{config.queue_name_prefix}:url"
29
55
  end
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Filter
6
+ #
7
+ # This filter provides duplication checker for given URL.
8
+ #
6
9
  class DuplicateChecker < Base
7
10
  def initialize(base_url: nil)
8
11
  @base_url = nil
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
10
13
  @urls = Set.new
11
14
  end
12
15
 
16
+ #
17
+ # @param [String] url to check duplication. If given URL is
18
+ # relative URL, use `@base_url + url` as absolute URL.
19
+ # @return [true|false] Return false when duplicated, otherwise return true.
20
+ #
13
21
  def call(url)
14
22
  unless URI(url).absolute?
15
23
  url = (@base_url + url).to_s
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Filter
6
+ #
7
+ # This filter provides update checker for given URL.
8
+ #
6
9
  class UpdateChecker < Base
7
10
  def initialize(storage: nil, base_url: nil)
8
11
  super(storage: storage)
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
10
13
  @base_url = URI(base_url) if base_url
11
14
  end
12
15
 
16
+ #
17
+ # @param [String] url
18
+ # @param connection [Faraday]
19
+ # @return [true|false] Return true when an update is needed, otherwise return false
20
+ #
13
21
  def call(url, connection: nil)
14
22
  unless URI(url).absolute?
15
23
  url = (@base_url + url).to_s
@@ -1,4 +1,8 @@
1
1
  require "thor"
2
+ require "rails/generators"
3
+ require "rails/generators/actions"
4
+ require "rails/generators/active_record"
5
+ require "rails/generators/active_record/migration/migration_generator"
2
6
 
3
7
  module DaimonSkycrawlers
4
8
  module Generator
@@ -18,26 +22,45 @@ module DaimonSkycrawlers
18
22
  ].each do |path|
19
23
  template("#{path}.erb", "#{name}/#{path}")
20
24
  end
25
+ invoke(MigrationGenerator, [
26
+ "CreatePage",
27
+ "url:string",
28
+ "headers:text",
29
+ "body:binary",
30
+ "last_modified_at:datetime",
31
+ "etag:string",
32
+ "timestamps"
33
+ ],
34
+ { destination_root: File.join(destination_root, name) })
21
35
  end
22
36
 
23
37
  def copy_files
24
38
  [
25
39
  "Gemfile",
26
40
  "Rakefile",
27
- "crawler.rb",
28
- "enqueue.rb",
29
- "init.rb",
30
- "processor.rb",
41
+ "app/crawlers/sample_crawler.rb",
42
+ "app/processors/sample_processor.rb",
43
+ "bin/crawler",
44
+ "bin/enqueue",
45
+ "bin/processor",
46
+ "config/init.rb",
31
47
  ].each do |path|
32
- copy_file(path, "#{name}/#{path}")
33
- end
34
- [
35
- "db/migrate/create_pages.rb",
36
- ].each do |path|
37
- migration = "#{Time.now.strftime("%Y%m%d%H%M%S")}_#{File.basename(path)}"
38
- copy_file(path, "#{name}/db/migrate/#{migration}")
48
+ copy_file(path, "#{name}/#{path}", mode: :preserve)
39
49
  end
40
50
  end
41
51
  end
52
+
53
+ class MigrationGenerator < ActiveRecord::Generators::MigrationGenerator
54
+ def self.source_root
55
+ ActiveRecord::Generators::MigrationGenerator.source_root
56
+ end
57
+
58
+ def create_migration_file
59
+ set_local_assigns!
60
+ validate_file_name!
61
+ dest = options[:destination_root]
62
+ migration_template @migration_template, "#{dest}/db/migrate/#{file_name}.rb"
63
+ end
64
+ end
42
65
  end
43
66
  end
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
29
29
  3. Open new terminal and run crawler/processor
30
30
 
31
31
  ```
32
- $ bundle exec ruby crawler.rb # on new terminal
33
- $ bundle exec ruby processor.rb # on new terminal
32
+ $ bin/crawler # on new terminal
33
+ $ bin/processor # on new terminal
34
34
  ```
35
35
 
36
36
  4. Enqueue task
37
37
 
38
38
  ```
39
- $ bundle exec ruby enqueue.rb http://example.com/
39
+ $ bin/enqueue url http://example.com/
40
40
  ```
41
41
 
42
42
  5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
44
44
  6. You can re-enqueue task for processor
45
45
 
46
46
  ```
47
- $ bundle exec ruby enqueue.rb response http://example.com/
47
+ $ bin/enqueue response http://example.com/
48
48
  ```
49
49
 
50
50
  Display `It works with 'http://example.com'` again on your terminal which runs your processor.
@@ -1,14 +1,8 @@
1
- #!/usr/bin/env ruby
2
-
3
1
  require "daimon_skycrawlers/crawler"
4
2
  require "daimon_skycrawlers/crawler/default"
5
3
 
6
- require_relative "./init"
7
-
8
4
  base_url = "http://example.com"
9
5
 
10
6
  crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
11
7
 
12
8
  DaimonSkycrawlers.register_crawler(crawler)
13
-
14
- DaimonSkycrawlers::Crawler.run
@@ -1,11 +1,5 @@
1
- #!/usr/bin/env ruby
2
-
3
1
  require "daimon_skycrawlers/processor"
4
2
 
5
- require_relative "./init"
6
-
7
3
  DaimonSkycrawlers.register_processor do |data|
8
4
  p "It works with '#{data[:url]}'"
9
5
  end
10
-
11
- DaimonSkycrawlers::Processor.run
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+ require "daimon_skycrawlers/crawler"
5
+
6
+ Dir.glob("app/crawlers/**/*.rb") do |path|
7
+ require(File.expand_path(path, File.dirname(__dir__)))
8
+ end
9
+
10
+ DaimonSkycrawlers::Crawler.run
@@ -2,11 +2,10 @@
2
2
 
3
3
  require "thor"
4
4
 
5
+ require_relative "../config/init"
5
6
  require "daimon_skycrawlers/crawler"
6
7
  require "daimon_skycrawlers/processor"
7
8
 
8
- require_relative "./init"
9
-
10
9
  class Enqueue < Thor
11
10
  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
12
11
  def url(url, *rest)
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+ require "daimon_skycrawlers/processor"
5
+
6
+ Dir.glob("app/processors/**/*.rb") do |path|
7
+ require(File.expand_path(path, File.dirname(__dir__)))
8
+ end
9
+
10
+ DaimonSkycrawlers::Processor.run
@@ -1,3 +1,4 @@
1
+ require "bundler/setup"
1
2
  require "daimon_skycrawlers"
2
3
  require "daimon_skycrawlers/logger"
3
4
  require "daimon_skycrawlers/queue"
@@ -0,0 +1,46 @@
1
+ require "daimon_skycrawlers/storage/base"
2
+
3
+ module DaimonSkycrawlers
4
+ module Storage
5
+ #
6
+ # Storage for files
7
+ #
8
+ class File < Base
9
+ def initialize(base_dir)
10
+ super()
11
+ @base_dir = Pathname(base_dir)
12
+ end
13
+
14
+ def save(url, headers, body)
15
+ @base_dir.mkpath
16
+ body_path(url).dirname.mkpath
17
+ body_path(url).open("wb+") do |file|
18
+ file.write(body)
19
+ end
20
+ headers_path(url).open("wb+") do |file|
21
+ file.write(JSON.generate(headers))
22
+ end
23
+ end
24
+
25
+ def read(url)
26
+ headers = JSON.parse(headers_path(url).read)
27
+ body = body_path(url).read
28
+ Page.new(url, headers, body, headers["last-modified"], headers["etag"])
29
+ end
30
+
31
+ Page = Struct.new(:url, :headers, :body, :last_modified, :etag)
32
+
33
+ private
34
+
35
+ def body_path(url)
36
+ url = URI(url)
37
+ @base_dir + ".#{url.path}"
38
+ end
39
+
40
+ def headers_path(url)
41
+ url = URI(url)
42
+ Pathname("#{body_path(url)}-headers.json")
43
+ end
44
+ end
45
+ end
46
+ end
@@ -1,9 +1,21 @@
1
1
  module DaimonSkycrawlers
2
2
  module Storage
3
+ #
4
+ # The null storage.
5
+ #
6
+ # This storage is useful for testing.
7
+ #
3
8
  class Null < Base
9
+
10
+ #
11
+ # Save nothing
12
+ #
4
13
  def save(url, headers, body)
5
14
  end
6
15
 
16
+ #
17
+ # Find nothing
18
+ #
7
19
  def find(url)
8
20
  end
9
21
  end
@@ -3,13 +3,24 @@ require "active_record"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Storage
6
+ #
7
+ # Storage for RDBMS
8
+ #
6
9
  class RDB < Base
7
10
  def initialize(config_path = "config/database.yml")
11
+ super()
8
12
  config = YAML.load_file(config_path)
9
13
  environment = ENV["SKYCRAWLERS_ENV"] || "development"
10
14
  ActiveRecord::Base.establish_connection(config[environment])
11
15
  end
12
16
 
17
+ #
18
+ # Save
19
+ #
20
+ # @param [String] url identity of the page
21
+ # @param [Hash] headers headers of the URL
22
+ # @param [String] body
23
+ #
13
24
  def save(url, headers, body)
14
25
  Page.create(url: url,
15
26
  headers: JSON.generate(headers),
@@ -18,6 +29,11 @@ module DaimonSkycrawlers
18
29
  etag: headers["etag"])
19
30
  end
20
31
 
32
+ #
33
+ # Fetch page identified by url
34
+ #
35
+ # @param [String] url identity of the page
36
+ #
21
37
  def find(url)
22
38
  Page.where(url: url).order(last_modified_at: :desc).limit(1).first
23
39
  end
@@ -6,3 +6,4 @@ end
6
6
  require "daimon_skycrawlers/storage/base"
7
7
  require "daimon_skycrawlers/storage/rdb"
8
8
  require "daimon_skycrawlers/storage/null"
9
+ require "daimon_skycrawlers/storage/file"
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -11,14 +11,38 @@ module DaimonSkycrawlers
11
11
  :shutdown_interval
12
12
  )
13
13
  class << self
14
+ #
15
+ # Register a processor
16
+ #
17
+ # @overload register_processor(processor)
18
+ # @param [Processor] processor instance which implements `call` method
19
+ # @return [void]
20
+ #
21
+ # @overload register_processor
22
+ # @return [void]
23
+ # @yield [message] Register given block as a processor.
24
+ # @yieldparam message [Hash] A message from queue
25
+ # @yieldreturn [void]
26
+ #
14
27
  def register_processor(processor = nil, &block)
15
28
  DaimonSkycrawlers::Consumer::HTTPResponse.register(processor, &block)
16
29
  end
17
30
 
31
+ #
32
+ # Register a crawler
33
+ #
34
+ # @param [Crawler] crawler instance which implements `fetch` method
35
+ # @return [void]
36
+ #
18
37
  def register_crawler(crawler)
19
38
  DaimonSkycrawlers::Consumer::URL.register(crawler)
20
39
  end
21
40
 
41
+ #
42
+ # Retrieve configuration object
43
+ #
44
+ # @return [DaimonSkycrawlers::Configuration]
45
+ #
22
46
  def configuration
23
47
  @configuration ||= DaimonSkycrawlers::Configuration.new.tap do |config|
24
48
  config.logger = DaimonSkycrawlers::Logger.default
@@ -28,6 +52,13 @@ module DaimonSkycrawlers
28
52
  end
29
53
  end
30
54
 
55
+ #
56
+ # Configure DaimonSkycrawlers
57
+ #
58
+ # @return [void]
59
+ # @yield [configuration] configure DaimonSkycrawlers
60
+ # @yieldparam configuration [DaimonSkycrawlers::Configuration] configuration object
61
+ # @yieldreturn [void]
31
62
  def configure
32
63
  yield configuration
33
64
  end
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
29
29
  3. Open new terminal and run crawler/processor
30
30
 
31
31
  ```
32
- $ bundle exec ruby crawler.rb # on new terminal
33
- $ bundle exec ruby processor.rb # on new terminal
32
+ $ bin/crawler # on new terminal
33
+ $ bin/processor # on new terminal
34
34
  ```
35
35
 
36
36
  4. Enqueue task
37
37
 
38
38
  ```
39
- $ bundle exec ruby enqueue.rb http://example.com/
39
+ $ bin/enqueue url http://example.com/
40
40
  ```
41
41
 
42
42
  5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
44
44
  6. You can re-enqueue task for processor
45
45
 
46
46
  ```
47
- $ bundle exec ruby enqueue.rb response http://example.com/
47
+ $ bin/enqueue response http://example.com/
48
48
  ```
49
49
 
50
50
  Display `It works with 'http://example.com'` again on your terminal which runs your processor.
@@ -1,14 +1,7 @@
1
- #!/usr/bin/env ruby
2
-
3
- require "daimon_skycrawlers/crawler"
4
1
  require "daimon_skycrawlers/crawler/default"
5
2
 
6
- require_relative "./init"
7
-
8
3
  base_url = "http://www.clear-code.com/blog/"
9
4
 
10
5
  crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
11
6
 
12
7
  DaimonSkycrawlers.register_crawler(crawler)
13
-
14
- DaimonSkycrawlers::Crawler.run
@@ -1,13 +1,8 @@
1
- #!/usr/bin/env ruby
2
-
3
- require "daimon_skycrawlers/processor"
4
1
  require "daimon_skycrawlers/processor/spider"
5
2
  require "daimon_skycrawlers/filter"
6
3
  require "daimon_skycrawlers/filter/duplicate_checker"
7
4
  require "daimon_skycrawlers/filter/update_checker"
8
5
 
9
- require_relative "./init"
10
-
11
6
  default_processor = DaimonSkycrawlers::Processor::Default.new
12
7
  spider = DaimonSkycrawlers::Processor::Spider.new
13
8
  #spider.enqueue = false
@@ -30,5 +25,3 @@ spider.append_filter(update_checker)
30
25
 
31
26
  DaimonSkycrawlers.register_processor(default_processor)
32
27
  DaimonSkycrawlers.register_processor(spider)
33
-
34
- DaimonSkycrawlers::Processor.run
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+ require "daimon_skycrawlers/crawler"
5
+
6
+ Dir.glob("app/crawlers/**/*.rb") do |path|
7
+ require(File.expand_path(path, File.dirname(__dir__)))
8
+ end
9
+
10
+ DaimonSkycrawlers::Crawler.run
@@ -2,11 +2,10 @@
2
2
 
3
3
  require "thor"
4
4
 
5
+ require_relative "../config/init"
5
6
  require "daimon_skycrawlers/crawler"
6
7
  require "daimon_skycrawlers/processor"
7
8
 
8
- require_relative "./init"
9
-
10
9
  class Enqueue < Thor
11
10
  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
12
11
  def url(url, *rest)
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+
5
+ require "daimon_skycrawlers/processor"
6
+
7
+ Dir.glob("app/processors/**/*.rb") do |path|
8
+ require(File.expand_path(path, File.dirname(__dir__)))
9
+ end
10
+
11
+ DaimonSkycrawlers::Processor.run
File without changes
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryunosuke SATO
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-09-15 00:00:00.000000000 Z
11
+ date: 2016-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: thor
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -95,7 +109,7 @@ dependencies:
95
109
  - !ruby/object:Gem::Version
96
110
  version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
- name: pg
112
+ name: railties
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - ">="
@@ -109,7 +123,7 @@ dependencies:
109
123
  - !ruby/object:Gem::Version
110
124
  version: '0'
111
125
  - !ruby/object:Gem::Dependency
112
- name: timers
126
+ name: pg
113
127
  requirement: !ruby/object:Gem::Requirement
114
128
  requirements:
115
129
  - - ">="
@@ -123,19 +137,19 @@ dependencies:
123
137
  - !ruby/object:Gem::Version
124
138
  version: '0'
125
139
  - !ruby/object:Gem::Dependency
126
- name: bundler
140
+ name: timers
127
141
  requirement: !ruby/object:Gem::Requirement
128
142
  requirements:
129
- - - "~>"
143
+ - - ">="
130
144
  - !ruby/object:Gem::Version
131
- version: '1.11'
132
- type: :development
145
+ version: '0'
146
+ type: :runtime
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
- - - "~>"
150
+ - - ">="
137
151
  - !ruby/object:Gem::Version
138
- version: '1.11'
152
+ version: '0'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: rake
141
155
  requirement: !ruby/object:Gem::Requirement
@@ -234,6 +248,20 @@ dependencies:
234
248
  - - ">="
235
249
  - !ruby/object:Gem::Version
236
250
  version: '0'
251
+ - !ruby/object:Gem::Dependency
252
+ name: yard
253
+ requirement: !ruby/object:Gem::Requirement
254
+ requirements:
255
+ - - ">="
256
+ - !ruby/object:Gem::Version
257
+ version: '0'
258
+ type: :development
259
+ prerelease: false
260
+ version_requirements: !ruby/object:Gem::Requirement
261
+ requirements:
262
+ - - ">="
263
+ - !ruby/object:Gem::Version
264
+ version: '0'
237
265
  description: This is a crawler framework.
238
266
  email:
239
267
  - tricknotes.rs@gmail.com
@@ -269,12 +297,13 @@ files:
269
297
  - lib/daimon_skycrawlers/generator/templates/new/Gemfile
270
298
  - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
271
299
  - lib/daimon_skycrawlers/generator/templates/new/Rakefile
300
+ - lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
301
+ - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
302
+ - lib/daimon_skycrawlers/generator/templates/new/bin/crawler
303
+ - lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
304
+ - lib/daimon_skycrawlers/generator/templates/new/bin/processor
272
305
  - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
273
- - lib/daimon_skycrawlers/generator/templates/new/crawler.rb
274
- - lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
275
- - lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
276
- - lib/daimon_skycrawlers/generator/templates/new/init.rb
277
- - lib/daimon_skycrawlers/generator/templates/new/processor.rb
306
+ - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
278
307
  - lib/daimon_skycrawlers/logger.rb
279
308
  - lib/daimon_skycrawlers/processor.rb
280
309
  - lib/daimon_skycrawlers/processor/base.rb
@@ -283,6 +312,7 @@ files:
283
312
  - lib/daimon_skycrawlers/queue.rb
284
313
  - lib/daimon_skycrawlers/storage.rb
285
314
  - lib/daimon_skycrawlers/storage/base.rb
315
+ - lib/daimon_skycrawlers/storage/file.rb
286
316
  - lib/daimon_skycrawlers/storage/null.rb
287
317
  - lib/daimon_skycrawlers/storage/rdb.rb
288
318
  - lib/daimon_skycrawlers/tasks.rb
@@ -292,13 +322,15 @@ files:
292
322
  - sample/spider/Gemfile
293
323
  - sample/spider/README.md
294
324
  - sample/spider/Rakefile
325
+ - sample/spider/app/crawlers/blog_crawler.rb
326
+ - sample/spider/app/processors/blog_spider.rb
327
+ - sample/spider/bin/crawler
328
+ - sample/spider/bin/enqueue
329
+ - sample/spider/bin/processor
295
330
  - sample/spider/config/database.yml
296
- - sample/spider/crawler.rb
331
+ - sample/spider/config/init.rb
297
332
  - sample/spider/db/migrate/20160830155803_create_pages.rb
298
333
  - sample/spider/db/schema.rb
299
- - sample/spider/enqueue.rb
300
- - sample/spider/init.rb
301
- - sample/spider/processor.rb
302
334
  homepage: https://github.com/bm-sms/daimon-skycrawlers
303
335
  licenses:
304
336
  - MIT
@@ -1,13 +0,0 @@
1
- class CreatePages < ActiveRecord::Migration
2
- def change
3
- create_table :pages do |t|
4
- t.string :url
5
- t.text :headers
6
- t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
7
- t.datetime :last_modified_at
8
- t.string :etag
9
-
10
- t.timestamps null: false
11
- end
12
- end
13
- end