daimon_skycrawlers 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +30 -22
  3. data/bin/daimon-skycrawlers +1 -1
  4. data/daimon_skycrawlers.gemspec +3 -1
  5. data/lib/daimon_skycrawlers/consumer/http_response.rb +28 -0
  6. data/lib/daimon_skycrawlers/consumer/url.rb +20 -0
  7. data/lib/daimon_skycrawlers/crawler/base.rb +20 -0
  8. data/lib/daimon_skycrawlers/crawler/default.rb +5 -0
  9. data/lib/daimon_skycrawlers/crawler.rb +26 -0
  10. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +8 -0
  11. data/lib/daimon_skycrawlers/filter/update_checker.rb +8 -0
  12. data/lib/daimon_skycrawlers/generator/new.rb +34 -11
  13. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +4 -4
  14. data/lib/daimon_skycrawlers/generator/templates/new/{crawler.rb → app/crawlers/sample_crawler.rb} +0 -6
  15. data/lib/daimon_skycrawlers/generator/templates/new/{processor.rb → app/processors/sample_processor.rb} +0 -6
  16. data/lib/daimon_skycrawlers/generator/templates/new/bin/crawler +10 -0
  17. data/{sample/spider/enqueue.rb → lib/daimon_skycrawlers/generator/templates/new/bin/enqueue} +1 -2
  18. data/lib/daimon_skycrawlers/generator/templates/new/bin/processor +10 -0
  19. data/lib/daimon_skycrawlers/generator/templates/new/{init.rb → config/init.rb} +1 -0
  20. data/lib/daimon_skycrawlers/storage/file.rb +46 -0
  21. data/lib/daimon_skycrawlers/storage/null.rb +12 -0
  22. data/lib/daimon_skycrawlers/storage/rdb.rb +16 -0
  23. data/lib/daimon_skycrawlers/storage.rb +1 -0
  24. data/lib/daimon_skycrawlers/version.rb +1 -1
  25. data/lib/daimon_skycrawlers.rb +31 -0
  26. data/sample/spider/README.md +4 -4
  27. data/sample/spider/{crawler.rb → app/crawlers/blog_crawler.rb} +0 -7
  28. data/sample/spider/{processor.rb → app/processors/blog_spider.rb} +0 -7
  29. data/sample/spider/bin/crawler +10 -0
  30. data/{lib/daimon_skycrawlers/generator/templates/new/enqueue.rb → sample/spider/bin/enqueue} +1 -2
  31. data/sample/spider/bin/processor +11 -0
  32. data/sample/spider/{init.rb → config/init.rb} +0 -0
  33. metadata +51 -19
  34. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +0 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f03168cc7d465dd69da00eabc00c5bd1d4654455
4
- data.tar.gz: e55f902ab4f78340ee5df80499f581cefb2bad96
3
+ metadata.gz: 411d6b35b2c909712f9f79cd5e45440226ebba28
4
+ data.tar.gz: afa7bdc3cc98a28742c64ba46d7237fcce981b92
5
5
  SHA512:
6
- metadata.gz: 0bd1e1e832766d27779e91bba3ae70c70d44863373cd5172d2c67e40ad6ded2dd1329a68f9652aa58fd00bac7e38fd75e3c43092fbeaec4bca42fada779b2ffd
7
- data.tar.gz: 99dc774a495bfec4e8b0b0693333340fc57be9321ef9fb5d9b2d64f61856ab6defa10c1e8292dc13d90bcca4a69e80e5f2a4aeec033c7c1a345e64d2c965eaec
6
+ metadata.gz: f8844e9ebd88e7be7344a607866c15ea74be64a8a405f05f196d54f0125bf66be6a81c4ff27ea3ee4480a82989abbedd79cd934c122c7ac1bee5d83bbfa1b6d3
7
+ data.tar.gz: 37fe8376a4165aef7f29aa29af51bb637402c48d9d5327c44ea05c37dac2418c39b1f5e55303ec8c66e9cfe8eb3e77f0dc264648caa262abc551c72f06944b55
data/README.md CHANGED
@@ -33,44 +33,52 @@ Or install it yourself as:
33
33
 
34
34
  1. Create project
35
35
 
36
- ```
37
- $ bundle exec daimon-skycrawlers new mycrawlers
38
- $ cd mycrawlers
39
- ```
36
+ ```
37
+ $ bundle exec daimon-skycrawlers new mycrawlers
38
+ $ cd mycrawlers
39
+ ```
40
+ or
41
+ ```
42
+ $ daimon-skycrawlers new mycrawlers
43
+ $ cd mycrawlers
44
+ ```
40
45
 
41
46
  2. Install dependencies
42
47
 
43
- ```
44
- $ bundle install
45
- ```
48
+ ```
49
+ $ bundle install
50
+ ```
46
51
 
47
52
  3. Create database
48
53
 
49
- ```
50
- $ bundle exec rake db:create
51
- $ bundle exec rake db:migrate
52
- ```
54
+ ```
55
+ $ bundle exec rake db:create
56
+ $ bundle exec rake db:migrate
57
+ ```
53
58
 
54
59
  4. Open new terminal and run crawler/processor
55
60
 
56
- ```
57
- $ bundle exec ruby crawler.rb # on new terminal
58
- $ bundle exec ruby processor.rb # on new terminal
59
- ```
61
+ ```
62
+ $ bin/crawler # on new terminal
63
+ $ bin/processor # on new terminal
64
+ ```
65
+
66
+ NOTE: Execute step 5 as soon as possible, because bin/crawler and
67
+ bin/processor will stop after 10 seconds by default if their
68
+ queues are empty.
60
69
 
61
70
  5. Enqueue task
62
71
 
63
- ```
64
- $ bundle exec ruby enqueue.rb url http://example.com/
65
- ```
72
+ ```
73
+ $ bin/enqueue url http://example.com/
74
+ ```
66
75
 
67
76
  6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
68
-
69
77
  7. You can re-enqueue task for processor
70
78
 
71
- ```
72
- $ bundle exec ruby enqueue.rb response http://example.com/
73
- ```
79
+ ```
80
+ $ bin/enqueue response http://example.com/
81
+ ```
74
82
 
75
83
  Display `It works with 'http://example.com'` again on your terminal which runs your processor.
76
84
 
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  if File.exist?(File.expand_path("../.git", __dir__))
4
- $LOAD_PATH << File.expand_path("../lib", __dir__)
4
+ $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
5
5
  end
6
6
 
7
7
  require "daimon_skycrawlers/cli"
@@ -18,16 +18,17 @@ Gem::Specification.new do |spec|
18
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
19
  spec.require_paths = ["lib"]
20
20
 
21
+ spec.add_dependency "bundler", "~> 1.11"
21
22
  spec.add_dependency "thor"
22
23
  spec.add_dependency "songkick_queue"
23
24
  spec.add_dependency "faraday"
24
25
  spec.add_dependency "faraday_middleware"
25
26
  spec.add_dependency "nokogiri"
26
27
  spec.add_dependency "activerecord"
28
+ spec.add_dependency "railties"
27
29
  spec.add_dependency "pg"
28
30
  spec.add_dependency "timers"
29
31
 
30
- spec.add_development_dependency "bundler", "~> 1.11"
31
32
  spec.add_development_dependency "rake", "~> 10.0"
32
33
  spec.add_development_dependency "test-unit"
33
34
  spec.add_development_dependency "test-unit-rr"
@@ -35,4 +36,5 @@ Gem::Specification.new do |spec|
35
36
  spec.add_development_dependency "pry"
36
37
  spec.add_development_dependency "tapp"
37
38
  spec.add_development_dependency "sqlite3"
39
+ spec.add_development_dependency "yard"
38
40
  end
@@ -5,10 +5,26 @@ require "daimon_skycrawlers/processor/default"
5
5
 
6
6
  module DaimonSkycrawlers
7
7
  module Consumer
8
+ #
9
+ # HTTP response consumer class
10
+ #
8
11
  class HTTPResponse < Base
9
12
  include SongkickQueue::Consumer
10
13
 
11
14
  class << self
15
+ #
16
+ # Register a processor
17
+ #
18
+ # @overload register(processor)
19
+ # @param [Processor] processor instance which implements `call` method
20
+ # @return [void]
21
+ #
22
+ # @overload register
23
+ # @return [void]
24
+ # @yield [message] register given block as a processor
25
+ # @yieldparam message [Hash] A message from queue
26
+ # @yieldreturn [void]
27
+ #
12
28
  def register(processor = nil, &block)
13
29
  if block_given?
14
30
  processors << block
@@ -17,14 +33,23 @@ module DaimonSkycrawlers
17
33
  end
18
34
  end
19
35
 
36
+ #
37
+ # @private
38
+ #
20
39
  def processors
21
40
  @processors ||= []
22
41
  end
23
42
 
43
+ #
44
+ # @private
45
+ #
24
46
  def default_processor
25
47
  DaimonSkycrawlers::Processor::Default.new
26
48
  end
27
49
 
50
+ #
51
+ # @private
52
+ #
28
53
  def queue_name
29
54
  "#{DaimonSkycrawlers.configuration.queue_name_prefix}.http-response"
30
55
  end
@@ -32,6 +57,9 @@ module DaimonSkycrawlers
32
57
 
33
58
  consume_from_queue queue_name
34
59
 
60
+ #
61
+ # @private
62
+ #
35
63
  def process(message)
36
64
  if self.class.processors.empty?
37
65
  processors = [self.class.default_processor]
@@ -4,18 +4,35 @@ require "daimon_skycrawlers/consumer/base"
4
4
 
5
5
  module DaimonSkycrawlers
6
6
  module Consumer
7
+ #
8
+ # URL consumer class
9
+ #
7
10
  class URL < Base
8
11
  include SongkickQueue::Consumer
9
12
 
10
13
  class << self
14
+ #
15
+ # Register a given crawler
16
+ #
17
+ # @param [Crawler] crawler instance which implements `fetch` method
18
+ # @return [void]
19
+ #
11
20
  def register(crawler)
12
21
  crawlers << crawler
13
22
  end
14
23
 
24
+ #
25
+ # Returns registered crawlers
26
+ #
27
+ # @return [Array<Crawler>]
28
+ #
15
29
  def crawlers
16
30
  @crawlers ||= []
17
31
  end
18
32
 
33
+ #
34
+ # @private
35
+ #
19
36
  def queue_name
20
37
  "#{DaimonSkycrawlers.configuration.queue_name_prefix}.url"
21
38
  end
@@ -23,6 +40,9 @@ module DaimonSkycrawlers
23
40
 
24
41
  consume_from_queue queue_name
25
42
 
43
+ #
44
+ # @private
45
+ #
26
46
  def process(message)
27
47
  url = message[:url]
28
48
  depth = Integer(message[:depth] || 0)
@@ -8,12 +8,22 @@ require "daimon_skycrawlers/processor"
8
8
 
9
9
  module DaimonSkycrawlers
10
10
  module Crawler
11
+ #
12
+ # The base class of crawler
13
+ #
11
14
  class Base
12
15
  include DaimonSkycrawlers::LoggerMixin
13
16
  include DaimonSkycrawlers::ConfigMixin
14
17
 
18
+ # @!attribute [w] storage
19
+ # Set storage to crawler instance.
20
+ # @return [void]
15
21
  attr_writer :storage
16
22
 
23
+ #
24
+ # @param [String] base_url Base URL for crawler
25
+ # @param [Hash] options for Faraday
26
+ #
17
27
  def initialize(base_url = nil, options = {})
18
28
  super()
19
29
  @base_url = base_url
@@ -23,6 +33,13 @@ module DaimonSkycrawlers
23
33
  @n_processed_urls = 0
24
34
  end
25
35
 
36
+ #
37
+ # Set up connection
38
+ #
39
+ # @param [Hash] options for Faraday
40
+ # @yield [faraday]
41
+ # @yieldparam faraday [Faraday]
42
+ #
26
43
  def setup_connection(options = {})
27
44
  @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
28
45
  yield faraday
@@ -37,6 +54,9 @@ module DaimonSkycrawlers
37
54
  @prepare = block
38
55
  end
39
56
 
57
+ #
58
+ # Retrieve storage instance
59
+ #
40
60
  def storage
41
61
  @storage ||= Storage::RDB.new
42
62
  end
@@ -3,6 +3,11 @@ require "daimon_skycrawlers/filter/update_checker"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Crawler
6
+ #
7
+ # The default crawler
8
+ #
9
+ # This crawler can GET given URL and store response to storage
10
+ #
6
11
  class Default < Base
7
12
  def fetch(path, depth: 3, **kw)
8
13
  @n_processed_urls += 1
@@ -6,24 +6,50 @@ require "daimon_skycrawlers/consumer/url"
6
6
  module DaimonSkycrawlers
7
7
  module Crawler
8
8
  class << self
9
+ #
10
+ # Run registered crawlers
11
+ #
12
+ # @param process_name [String] Process name
13
+ #
9
14
  def run(process_name: default_process_name)
10
15
  DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
11
16
  SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::URL]).run
12
17
  end
13
18
 
19
+ #
20
+ # Enqueue a URL to crawler queue
21
+ #
22
+ # @param [String] url Specify absolute URL
23
+ # @param [Hash] message Extra parameters for crawler
24
+ # @return [void]
14
25
  def enqueue_url(url, message = {})
15
26
  message[:url] = url
16
27
  SongkickQueue.publish(queue_name, message)
17
28
  end
18
29
 
30
+ #
31
+ # Shortcut of DaimonSkycrawlers.configuration
32
+ #
33
+ # @return [DaimonSkycrawlers::Configuration]
34
+ #
19
35
  def config
20
36
  DaimonSkycrawlers.configuration
21
37
  end
22
38
 
39
+ #
40
+ # Queue name for crawler
41
+ #
42
+ # @return [String] Queue name
43
+ #
23
44
  def queue_name
24
45
  "#{config.queue_name_prefix}.url"
25
46
  end
26
47
 
48
+ #
49
+ # Default process name
50
+ #
51
+ # @return [String] Default process name
52
+ #
27
53
  def default_process_name
28
54
  "#{config.queue_name_prefix}:url"
29
55
  end
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Filter
6
+ #
7
+ # This filter provides duplication checker for given URL.
8
+ #
6
9
  class DuplicateChecker < Base
7
10
  def initialize(base_url: nil)
8
11
  @base_url = nil
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
10
13
  @urls = Set.new
11
14
  end
12
15
 
16
+ #
17
+ # @param [String] url to check duplication. If given URL is
18
+ # relative URL, use `@base_url + url` as absolute URL.
19
+ # @return [true|false] Return false when duplicated, otherwise return true.
20
+ #
13
21
  def call(url)
14
22
  unless URI(url).absolute?
15
23
  url = (@base_url + url).to_s
@@ -3,6 +3,9 @@ require "daimon_skycrawlers/filter/base"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Filter
6
+ #
7
+ # This filter provides update checker for given URL.
8
+ #
6
9
  class UpdateChecker < Base
7
10
  def initialize(storage: nil, base_url: nil)
8
11
  super(storage: storage)
@@ -10,6 +13,11 @@ module DaimonSkycrawlers
10
13
  @base_url = URI(base_url) if base_url
11
14
  end
12
15
 
16
+ #
17
+ # @param [String] url
18
+ # @param connection [Faraday]
19
+ # @return [true|false] Return true when need update, otherwise return false
20
+ #
13
21
  def call(url, connection: nil)
14
22
  unless URI(url).absolute?
15
23
  url = (@base_url + url).to_s
@@ -1,4 +1,8 @@
1
1
  require "thor"
2
+ require "rails/generators"
3
+ require "rails/generators/actions"
4
+ require "rails/generators/active_record"
5
+ require "rails/generators/active_record/migration/migration_generator"
2
6
 
3
7
  module DaimonSkycrawlers
4
8
  module Generator
@@ -18,26 +22,45 @@ module DaimonSkycrawlers
18
22
  ].each do |path|
19
23
  template("#{path}.erb", "#{name}/#{path}")
20
24
  end
25
+ invoke(MigrationGenerator, [
26
+ "CreatePage",
27
+ "url:string",
28
+ "headers:text",
29
+ "body:binary",
30
+ "last_modified_at:datetime",
31
+ "etag:string",
32
+ "timestamps"
33
+ ],
34
+ { destination_root: File.join(destination_root, name) })
21
35
  end
22
36
 
23
37
  def copy_files
24
38
  [
25
39
  "Gemfile",
26
40
  "Rakefile",
27
- "crawler.rb",
28
- "enqueue.rb",
29
- "init.rb",
30
- "processor.rb",
41
+ "app/crawlers/sample_crawler.rb",
42
+ "app/processors/sample_processor.rb",
43
+ "bin/crawler",
44
+ "bin/enqueue",
45
+ "bin/processor",
46
+ "config/init.rb",
31
47
  ].each do |path|
32
- copy_file(path, "#{name}/#{path}")
33
- end
34
- [
35
- "db/migrate/create_pages.rb",
36
- ].each do |path|
37
- migration = "#{Time.now.strftime("%Y%m%d%H%M%S")}_#{File.basename(path)}"
38
- copy_file(path, "#{name}/db/migrate/#{migration}")
48
+ copy_file(path, "#{name}/#{path}", mode: :preserve)
39
49
  end
40
50
  end
41
51
  end
52
+
53
+ class MigrationGenerator < ActiveRecord::Generators::MigrationGenerator
54
+ def self.source_root
55
+ ActiveRecord::Generators::MigrationGenerator.source_root
56
+ end
57
+
58
+ def create_migration_file
59
+ set_local_assigns!
60
+ validate_file_name!
61
+ dest = options[:destination_root]
62
+ migration_template @migration_template, "#{dest}/db/migrate/#{file_name}.rb"
63
+ end
64
+ end
42
65
  end
43
66
  end
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
29
29
  3. Open new terminal and run crawler/processor
30
30
 
31
31
  ```
32
- $ bundle exec ruby crawler.rb # on new terminal
33
- $ bundle exec ruby processor.rb # on new terminal
32
+ $ bin/crawler # on new terminal
33
+ $ bin/processor # on new terminal
34
34
  ```
35
35
 
36
36
  4. Enqueue task
37
37
 
38
38
  ```
39
- $ bundle exec ruby enqueue.rb http://example.com/
39
+ $ bin/enqueue url http://example.com/
40
40
  ```
41
41
 
42
42
  5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
44
44
  6. You can re-enqueue task for processor
45
45
 
46
46
  ```
47
- $ bundle exec ruby enqueue.rb response http://example.com/
47
+ $ bin/enqueue response http://example.com/
48
48
  ```
49
49
 
50
50
  Display `It works with 'http://example.com'` again on your terminal which runs your processor.
@@ -1,14 +1,8 @@
1
- #!/usr/bin/env ruby
2
-
3
1
  require "daimon_skycrawlers/crawler"
4
2
  require "daimon_skycrawlers/crawler/default"
5
3
 
6
- require_relative "./init"
7
-
8
4
  base_url = "http://example.com"
9
5
 
10
6
  crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
11
7
 
12
8
  DaimonSkycrawlers.register_crawler(crawler)
13
-
14
- DaimonSkycrawlers::Crawler.run
@@ -1,11 +1,5 @@
1
- #!/usr/bin/env ruby
2
-
3
1
  require "daimon_skycrawlers/processor"
4
2
 
5
- require_relative "./init"
6
-
7
3
  DaimonSkycrawlers.register_processor do |data|
8
4
  p "It works with '#{data[:url]}'"
9
5
  end
10
-
11
- DaimonSkycrawlers::Processor.run
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+ require "daimon_skycrawlers/crawler"
5
+
6
+ Dir.glob("app/crawlers/**/*.rb") do |path|
7
+ require(File.expand_path(path, File.dirname(__dir__)))
8
+ end
9
+
10
+ DaimonSkycrawlers::Crawler.run
@@ -2,11 +2,10 @@
2
2
 
3
3
  require "thor"
4
4
 
5
+ require_relative "../config/init"
5
6
  require "daimon_skycrawlers/crawler"
6
7
  require "daimon_skycrawlers/processor"
7
8
 
8
- require_relative "./init"
9
-
10
9
  class Enqueue < Thor
11
10
  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
12
11
  def url(url, *rest)
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+ require "daimon_skycrawlers/processor"
5
+
6
+ Dir.glob("app/processors/**/*.rb") do |path|
7
+ require(File.expand_path(path, File.dirname(__dir__)))
8
+ end
9
+
10
+ DaimonSkycrawlers::Processor.run
@@ -1,3 +1,4 @@
1
+ require "bundler/setup"
1
2
  require "daimon_skycrawlers"
2
3
  require "daimon_skycrawlers/logger"
3
4
  require "daimon_skycrawlers/queue"
@@ -0,0 +1,46 @@
1
+ require "daimon_skycrawlers/storage/base"
2
+
3
+ module DaimonSkycrawlers
4
+ module Storage
5
+ #
6
+ # Storage for files
7
+ #
8
+ class File < Base
9
+ def initialize(base_dir)
10
+ super()
11
+ @base_dir = Pathname(base_dir)
12
+ end
13
+
14
+ def save(url, headers, body)
15
+ @base_dir.mkpath
16
+ body_path(url).dirname.mkpath
17
+ body_path(url).open("wb+") do |file|
18
+ file.write(body)
19
+ end
20
+ headers_path(url).open("wb+") do |file|
21
+ file.write(JSON.generate(headers))
22
+ end
23
+ end
24
+
25
+ def read(url)
26
+ headers = JSON.parse(headers_path(url).read)
27
+ body = body_path(url).read
28
+ Page.new(url, headers, body, headers["last-modified"], headers["etag"])
29
+ end
30
+
31
+ Page = Struct.new(:url, :headers, :body, :last_modified, :etag)
32
+
33
+ private
34
+
35
+ def body_path(url)
36
+ url = URI(url)
37
+ @base_dir + ".#{url.path}"
38
+ end
39
+
40
+ def headers_path(url)
41
+ url = URI(url)
42
+ Pathname("#{body_path(url)}-headers.json")
43
+ end
44
+ end
45
+ end
46
+ end
@@ -1,9 +1,21 @@
1
1
  module DaimonSkycrawlers
2
2
  module Storage
3
+ #
4
+ # The null storage.
5
+ #
6
+ # This storage is useful for testing.
7
+ #
3
8
  class Null < Base
9
+
10
+ #
11
+ # Save nothing
12
+ #
4
13
  def save(url, headers, body)
5
14
  end
6
15
 
16
+ #
17
+ # Find nothing
18
+ #
7
19
  def find(url)
8
20
  end
9
21
  end
@@ -3,13 +3,24 @@ require "active_record"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Storage
6
+ #
7
+ # Storage for RDBMS
8
+ #
6
9
  class RDB < Base
7
10
  def initialize(config_path = "config/database.yml")
11
+ super()
8
12
  config = YAML.load_file(config_path)
9
13
  environment = ENV["SKYCRAWLERS_ENV"] || "development"
10
14
  ActiveRecord::Base.establish_connection(config[environment])
11
15
  end
12
16
 
17
+ #
18
+ # Save
19
+ #
20
+ # @param [String] url identity of the page
21
+ # @param [Hash] headers headers for the URL
22
+ # @param [String] body
23
+ #
13
24
  def save(url, headers, body)
14
25
  Page.create(url: url,
15
26
  headers: JSON.generate(headers),
@@ -18,6 +29,11 @@ module DaimonSkycrawlers
18
29
  etag: headers["etag"])
19
30
  end
20
31
 
32
+ #
33
+ # Fetch page identified by url
34
+ #
35
+ # @param [String] url identity of the page
36
+ #
21
37
  def find(url)
22
38
  Page.where(url: url).order(last_modified_at: :desc).limit(1).first
23
39
  end
@@ -6,3 +6,4 @@ end
6
6
  require "daimon_skycrawlers/storage/base"
7
7
  require "daimon_skycrawlers/storage/rdb"
8
8
  require "daimon_skycrawlers/storage/null"
9
+ require "daimon_skycrawlers/storage/file"
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -11,14 +11,38 @@ module DaimonSkycrawlers
11
11
  :shutdown_interval
12
12
  )
13
13
  class << self
14
+ #
15
+ # Register a processor
16
+ #
17
+ # @overload register_processor(processor)
18
+ # @param [Processor] processor instance which implements `call` method
19
+ # @return [void]
20
+ #
21
+ # @overload register_processor
22
+ # @return [void]
23
+ # @yield [message] Register given block as a processor.
24
+ # @yieldparam message [Hash] A message from queue
25
+ # @yieldreturn [void]
26
+ #
14
27
  def register_processor(processor = nil, &block)
15
28
  DaimonSkycrawlers::Consumer::HTTPResponse.register(processor, &block)
16
29
  end
17
30
 
31
+ #
32
+ # Register a crawler
33
+ #
34
+ # @param [Crawler] crawler instance which implements `fetch` method
35
+ # @return [void]
36
+ #
18
37
  def register_crawler(crawler)
19
38
  DaimonSkycrawlers::Consumer::URL.register(crawler)
20
39
  end
21
40
 
41
+ #
42
+ # Retrieve configuration object
43
+ #
44
+ # @return [DaimonSkycrawlers::Configuration]
45
+ #
22
46
  def configuration
23
47
  @configuration ||= DaimonSkycrawlers::Configuration.new.tap do |config|
24
48
  config.logger = DaimonSkycrawlers::Logger.default
@@ -28,6 +52,13 @@ module DaimonSkycrawlers
28
52
  end
29
53
  end
30
54
 
55
+ #
56
+ # Configure DaimonSkycrawlers
57
+ #
58
+ # @return [void]
59
+ # @yield [configuration] configure DaimonSkycrawlers
60
+ # @yieldparam configuration [DaimonSkycrawlers::Configuration] configuration object
61
+ # @yieldreturn [void]
31
62
  def configure
32
63
  yield configuration
33
64
  end
@@ -29,14 +29,14 @@ $ bundle exec rake db:migrate
29
29
  3. Open new terminal and run crawler/processor
30
30
 
31
31
  ```
32
- $ bundle exec ruby crawler.rb # on new terminal
33
- $ bundle exec ruby processor.rb # on new terminal
32
+ $ bin/crawler # on new terminal
33
+ $ bin/processor # on new terminal
34
34
  ```
35
35
 
36
36
  4. Enqueue task
37
37
 
38
38
  ```
39
- $ bundle exec ruby enqueue.rb http://example.com/
39
+ $ bin/enqueue url http://example.com/
40
40
  ```
41
41
 
42
42
  5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
@@ -44,7 +44,7 @@ $ bundle exec ruby enqueue.rb http://example.com/
44
44
  6. You can re-enqueue task for processor
45
45
 
46
46
  ```
47
- $ bundle exec ruby enqueue.rb response http://example.com/
47
+ $ bin/enqueue response http://example.com/
48
48
  ```
49
49
 
50
50
  Display `It works with 'http://example.com'` again on your terminal which runs your processor.
@@ -1,14 +1,7 @@
1
- #!/usr/bin/env ruby
2
-
3
- require "daimon_skycrawlers/crawler"
4
1
  require "daimon_skycrawlers/crawler/default"
5
2
 
6
- require_relative "./init"
7
-
8
3
  base_url = "http://www.clear-code.com/blog/"
9
4
 
10
5
  crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
11
6
 
12
7
  DaimonSkycrawlers.register_crawler(crawler)
13
-
14
- DaimonSkycrawlers::Crawler.run
@@ -1,13 +1,8 @@
1
- #!/usr/bin/env ruby
2
-
3
- require "daimon_skycrawlers/processor"
4
1
  require "daimon_skycrawlers/processor/spider"
5
2
  require "daimon_skycrawlers/filter"
6
3
  require "daimon_skycrawlers/filter/duplicate_checker"
7
4
  require "daimon_skycrawlers/filter/update_checker"
8
5
 
9
- require_relative "./init"
10
-
11
6
  default_processor = DaimonSkycrawlers::Processor::Default.new
12
7
  spider = DaimonSkycrawlers::Processor::Spider.new
13
8
  #spider.enqueue = false
@@ -30,5 +25,3 @@ spider.append_filter(update_checker)
30
25
 
31
26
  DaimonSkycrawlers.register_processor(default_processor)
32
27
  DaimonSkycrawlers.register_processor(spider)
33
-
34
- DaimonSkycrawlers::Processor.run
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+ require "daimon_skycrawlers/crawler"
5
+
6
+ Dir.glob("app/crawlers/**/*.rb") do |path|
7
+ require(File.expand_path(path, File.dirname(__dir__)))
8
+ end
9
+
10
+ DaimonSkycrawlers::Crawler.run
@@ -2,11 +2,10 @@
2
2
 
3
3
  require "thor"
4
4
 
5
+ require_relative "../config/init"
5
6
  require "daimon_skycrawlers/crawler"
6
7
  require "daimon_skycrawlers/processor"
7
8
 
8
- require_relative "./init"
9
-
10
9
  class Enqueue < Thor
11
10
  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
12
11
  def url(url, *rest)
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../config/init"
4
+
5
+ require "daimon_skycrawlers/processor"
6
+
7
+ Dir.glob("app/processors/**/*.rb") do |path|
8
+ require(File.expand_path(path, File.dirname(__dir__)))
9
+ end
10
+
11
+ DaimonSkycrawlers::Processor.run
File without changes
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryunosuke SATO
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-09-15 00:00:00.000000000 Z
11
+ date: 2016-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: thor
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -95,7 +109,7 @@ dependencies:
95
109
  - !ruby/object:Gem::Version
96
110
  version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
- name: pg
112
+ name: railties
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - ">="
@@ -109,7 +123,7 @@ dependencies:
109
123
  - !ruby/object:Gem::Version
110
124
  version: '0'
111
125
  - !ruby/object:Gem::Dependency
112
- name: timers
126
+ name: pg
113
127
  requirement: !ruby/object:Gem::Requirement
114
128
  requirements:
115
129
  - - ">="
@@ -123,19 +137,19 @@ dependencies:
123
137
  - !ruby/object:Gem::Version
124
138
  version: '0'
125
139
  - !ruby/object:Gem::Dependency
126
- name: bundler
140
+ name: timers
127
141
  requirement: !ruby/object:Gem::Requirement
128
142
  requirements:
129
- - - "~>"
143
+ - - ">="
130
144
  - !ruby/object:Gem::Version
131
- version: '1.11'
132
- type: :development
145
+ version: '0'
146
+ type: :runtime
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
- - - "~>"
150
+ - - ">="
137
151
  - !ruby/object:Gem::Version
138
- version: '1.11'
152
+ version: '0'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: rake
141
155
  requirement: !ruby/object:Gem::Requirement
@@ -234,6 +248,20 @@ dependencies:
234
248
  - - ">="
235
249
  - !ruby/object:Gem::Version
236
250
  version: '0'
251
+ - !ruby/object:Gem::Dependency
252
+ name: yard
253
+ requirement: !ruby/object:Gem::Requirement
254
+ requirements:
255
+ - - ">="
256
+ - !ruby/object:Gem::Version
257
+ version: '0'
258
+ type: :development
259
+ prerelease: false
260
+ version_requirements: !ruby/object:Gem::Requirement
261
+ requirements:
262
+ - - ">="
263
+ - !ruby/object:Gem::Version
264
+ version: '0'
237
265
  description: This is a crawler framework.
238
266
  email:
239
267
  - tricknotes.rs@gmail.com
@@ -269,12 +297,13 @@ files:
269
297
  - lib/daimon_skycrawlers/generator/templates/new/Gemfile
270
298
  - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
271
299
  - lib/daimon_skycrawlers/generator/templates/new/Rakefile
300
+ - lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
301
+ - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
302
+ - lib/daimon_skycrawlers/generator/templates/new/bin/crawler
303
+ - lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
304
+ - lib/daimon_skycrawlers/generator/templates/new/bin/processor
272
305
  - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
273
- - lib/daimon_skycrawlers/generator/templates/new/crawler.rb
274
- - lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
275
- - lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
276
- - lib/daimon_skycrawlers/generator/templates/new/init.rb
277
- - lib/daimon_skycrawlers/generator/templates/new/processor.rb
306
+ - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
278
307
  - lib/daimon_skycrawlers/logger.rb
279
308
  - lib/daimon_skycrawlers/processor.rb
280
309
  - lib/daimon_skycrawlers/processor/base.rb
@@ -283,6 +312,7 @@ files:
283
312
  - lib/daimon_skycrawlers/queue.rb
284
313
  - lib/daimon_skycrawlers/storage.rb
285
314
  - lib/daimon_skycrawlers/storage/base.rb
315
+ - lib/daimon_skycrawlers/storage/file.rb
286
316
  - lib/daimon_skycrawlers/storage/null.rb
287
317
  - lib/daimon_skycrawlers/storage/rdb.rb
288
318
  - lib/daimon_skycrawlers/tasks.rb
@@ -292,13 +322,15 @@ files:
292
322
  - sample/spider/Gemfile
293
323
  - sample/spider/README.md
294
324
  - sample/spider/Rakefile
325
+ - sample/spider/app/crawlers/blog_crawler.rb
326
+ - sample/spider/app/processors/blog_spider.rb
327
+ - sample/spider/bin/crawler
328
+ - sample/spider/bin/enqueue
329
+ - sample/spider/bin/processor
295
330
  - sample/spider/config/database.yml
296
- - sample/spider/crawler.rb
331
+ - sample/spider/config/init.rb
297
332
  - sample/spider/db/migrate/20160830155803_create_pages.rb
298
333
  - sample/spider/db/schema.rb
299
- - sample/spider/enqueue.rb
300
- - sample/spider/init.rb
301
- - sample/spider/processor.rb
302
334
  homepage: https://github.com/bm-sms/daimon-skycrawlers
303
335
  licenses:
304
336
  - MIT
@@ -1,13 +0,0 @@
1
- class CreatePages < ActiveRecord::Migration
2
- def change
3
- create_table :pages do |t|
4
- t.string :url
5
- t.text :headers
6
- t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
7
- t.datetime :last_modified_at
8
- t.string :etag
9
-
10
- t.timestamps null: false
11
- end
12
- end
13
- end