daimon_skycrawlers 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 4e4bd1308a554f55bce802a5d0a038cbb0f0470e
- data.tar.gz: a484737c74cc9ff3304a9fda0714282f9f6b0e61
+ metadata.gz: cbbd476464b9b44d5bc5f71bc999820126379e2d
+ data.tar.gz: 473ac2bd6f9c63a7307357aa2fae7da2be6efde7
  SHA512:
- metadata.gz: c9d4b01cb37808ce43a9786324e6e0d842d67bc44269c39964e4570c5dbab1f0b818449ac6a67864d6834e2b67a1a9f857252fd0d3cc91e87961ad252eccc785
- data.tar.gz: c1babe4300e744678672482e3a82c8bc5fa37a5e52ebe8e634b3ae0cdaea17adc75c5132a5a8918c2b3dce76b47c783fedea8e911b18f2e6f0bd7adee0d0525b
+ metadata.gz: 788e296ef3fbd73c39db3ca0f6e6507e9c2074893c4895598c3e418849cf70a8951cc65c5fbff88d1a832c721edd53bd7d4078699090b6da36efaf460027a944
+ data.tar.gz: 42ba6b8ad282060811d2817d85c8b276ccc0890c4a15c546571f22594cb1a552be2838c166ced8fa609807dc0399b910acc53ba7f56e927222955af2008a831a
@@ -1,5 +1,6 @@
  require "thor"
  require "daimon_skycrawlers/generator/new"
+ require "daimon_skycrawlers/generator/generate"
  require "daimon_skycrawlers/commands/enqueue"
  require "daimon_skycrawlers/commands/runner"
  require "daimon_skycrawlers/version"
@@ -7,6 +8,7 @@ require "daimon_skycrawlers/version"
  module DaimonSkycrawlers
  class CLI < Thor
  register(Generator::New, "new", "new NAME", "Create new project")
+ register(Generator::Generate, "generate", "generate COMMAND", "Generate new code")
  register(Commands::Runner, "exec", "exec [COMMAND]", "Execute crawler/processor")
  register(Commands::Enqueue, "enqueue", "enqueue [TYPE] URL [messages...]", "Enqueue URL")

@@ -67,7 +67,7 @@ module DaimonSkycrawlers
  processors = self.class.processors
  end
  processors.each do |processor|
- processor.call(message)
+ processor.process(message)
  end
  end
  end
@@ -44,14 +44,11 @@ module DaimonSkycrawlers
  # @private
  #
  def process(message)
- url = message[:url]
- depth = Integer(message[:depth] || 0)
-
  crawler_interval = DaimonSkycrawlers.configuration.crawler_interval

  # XXX When several crawlers are registered, how should they behave?
  self.class.crawlers.each do |crawler|
- crawler.fetch(url, depth: depth)
+ crawler.process(message)
  if crawler.skipped?
  sleep(crawler_interval) if crawler.n_processed_urls % 50 == 0
  else
@@ -20,13 +20,19 @@ module DaimonSkycrawlers
  # @return [void]
  attr_writer :storage

+ # @!attribute [r] n_processed_urls
+ # The number of processed URLs.
+ # @return [Integer]
+ attr_reader :n_processed_urls
+
  #
  # @param [String] Base URL for crawler
  # @param [Hash] options for Faraday
  #
- def initialize(base_url = nil, options = {})
+ def initialize(base_url = nil, faraday_options: {}, options: {})
  super()
  @base_url = base_url
+ @faraday_options = faraday_options
  @options = options
  @prepare = ->(connection) {}
  @skipped = false
@@ -41,7 +47,9 @@ module DaimonSkycrawlers
  # @yieldparam faraday [Faraday]
  #
  def setup_connection(options = {})
- @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
+ merged_options = @faraday_options.merge(options)
+ faraday_options = merged_options.empty? ? nil : merged_options
+ @connection = Faraday.new(@base_url, faraday_options) do |faraday|
  yield faraday
  end
  end
@@ -66,10 +74,26 @@ module DaimonSkycrawlers
  end

  def connection
- @connection ||= Faraday.new(@base_url, @options)
+ @connection ||= Faraday.new(@base_url, @faraday_options)
+ end
+
+ def process(message, &block)
+ url = message.delete(:url)
+
+ @skipped = false
+ @n_processed_urls += 1
+ # url can be a path
+ url = connection.url_prefix + url
+
+ apply_filters(url)
+
+ unless skipped?
+ @prepare.call(connection)
+ fetch(url, message, &block)
+ end
  end

- def fetch(path, params = {}, **kw)
+ def fetch(path, message = {})
  raise NotImplementedError, "Must implement this method in subclass"
  end

@@ -81,11 +105,28 @@ module DaimonSkycrawlers
  @connection.post(path, params)
  end

- def n_processed_urls
- @n_processed_urls
+ private
+
+ def apply_filters(url)
+ if @options[:obey_robots_txt]
+ robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
+ unless robots_txt_checker.allowed?(url)
+ skip(url)
+ return
+ end
+ end
+ update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
+ unless update_checker.updated?(url.to_s, connection: connection)
+ skip(url)
+ return
+ end
  end

- private
+ def skip(url)
+ log.info("Skip #{url}")
+ @skipped = true
+ schedule_to_process(url.to_s, heartbeat: true)
+ end

  def schedule_to_process(url, message = {})
  DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
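
The hunks above split Faraday's connection options from the crawler's own options and make #process the crawler's entry point: it pulls :url out of the message, counts and filter-checks the URL (robots.txt, update check), and only then hands off to #fetch. A minimal sketch of constructing and driving a crawler under the new signature; the base URL, the timeout value, and the require path for Crawler::Default are illustrative assumptions:

    require "daimon_skycrawlers/crawler/default"  # assumed require path for Crawler::Default

    crawler = DaimonSkycrawlers::Crawler::Default.new(
      "http://www.clear-code.com/blog/",             # example base URL
      faraday_options: { request: { timeout: 10 } }, # forwarded to Faraday.new
      options: { obey_robots_txt: true }             # read by apply_filters
    )

    # Consumers now call #process with the whole message; it deletes :url from the
    # message, applies the filters, and delegates to #fetch(url, message).
    crawler.process(url: "/2016/10/18/example.html")  # example path
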
@@ -10,43 +10,15 @@ module DaimonSkycrawlers
  # This crawler can GET given URL and store response to storage
  #
  class Default < Base
- def fetch(path, depth: 3, **kw)
- @n_processed_urls += 1
- @skipped = false
- url = connection.url_prefix + path
- if @options[:obey_robots_txt]
- robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
- unless robots_txt_checker.call(url)
- skip(url)
- return
- end
- end
- update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
- unless update_checker.call(url.to_s, connection: connection)
- skip(url)
- return
- end
- @prepare.call(connection)
- response = get(path)
+ def fetch(url, message)
+ response = get(url)
  data = [url.to_s, response.headers, response.body]

  yield(*data) if block_given?

  storage.save(*data)
- message = {
- depth: depth
- }
- message = message.merge(kw)
  schedule_to_process(url.to_s, message)
  end
-
- private
-
- def skip(url)
- log.info("Skip #{url}")
- @skipped = true
- schedule_to_process(url.to_s, heartbeat: true)
- end
  end
  end
  end
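
With the filtering moved up into Base#process, Default shrinks to a plain GET-and-store, and a custom crawler now only has to implement #fetch(url, message). A rough sketch of a subclass under the new contract (the class name is illustrative; the body mirrors Default):

    require "daimon_skycrawlers"
    require "daimon_skycrawlers/crawler"
    require "daimon_skycrawlers/crawler/base"

    class ArchiveCrawler < DaimonSkycrawlers::Crawler::Base
      # url arrives already resolved against connection.url_prefix and already
      # cleared by the robots.txt/update filters in Base#process.
      def fetch(url, message)
        response = get(url)
        data = [url.to_s, response.headers, response.body]
        yield(*data) if block_given?
        storage.save(*data)
        schedule_to_process(url.to_s, message)
      end
    end

    DaimonSkycrawlers.register_crawler(ArchiveCrawler.new("http://www.clear-code.com/blog/"))
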
@@ -28,6 +28,15 @@ module DaimonSkycrawlers
  @urls << url
  true
  end
+
+ #
+ # @param [String] url to check duplication. If given URL is
+ # relative URL, use `@base_url + url` as absolute URL.
+ # @return [true|false] Return true when duplicated, otherwise return false.
+ #
+ def duplicated?(url)
+ !call(url)
+ end
  end
  end
  end
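
DuplicateChecker keeps its #call contract (true for a URL it has not seen yet, recording it as a side effect) and adds duplicated? as the negated, intention-revealing reading. A small sketch; the URL values are placeholders:

    require "daimon_skycrawlers/filter/duplicate_checker"

    checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")

    checker.call("/2016/10/18/example.html")        # => true, URL recorded as seen
    checker.duplicated?("/2016/10/18/example.html") # => true, it was recorded above
    checker.duplicated?("/2016/10/19/other.html")   # => false (and this URL is now recorded too)
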
@@ -24,6 +24,8 @@ module DaimonSkycrawlers
  end
  @webrobots.allowed?(url)
  end
+
+ alias allowed? call
  end
  end
  end
@@ -32,11 +32,17 @@ module DaimonSkycrawlers
  else
  headers = Faraday.head(url)
  end
- return false if headers["etag"] && page.etag && headers["etag"] == page.etag
- return false if headers["last-modified"].nil? && page.last_modified_at.nil?
- return false if headers["last-modified"] <= page.last_modified_at
- true
+ case
+ when headers.key?("etag") && page.etag
+ headers["etag"] != page.etag
+ when headers.key?("last-modified") && page.last_modified_at
+ headers["last-modified"] > page.last_modified_at
+ else
+ true
+ end
  end
+
+ alias updated? call
  end
  end
  end
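
UpdateChecker#call is now a single case expression: compare ETags when both sides have one, otherwise compare Last-Modified, and treat a page with neither usable header as updated; updated? is the alias Crawler::Base#apply_filters calls. A sketch of standalone use; the Storage::RDB require path is an assumption:

    require "daimon_skycrawlers/filter/update_checker"
    require "daimon_skycrawlers/storage/rdb"  # assumed require path for Storage::RDB

    storage = DaimonSkycrawlers::Storage::RDB.new
    update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)

    # true  -> live ETag/Last-Modified differ from the stored page (or neither
    #          header is usable), so the crawler fetches the URL again
    # false -> the stored copy is still current, so Crawler::Base skips the URL
    update_checker.updated?("http://www.clear-code.com/blog/", connection: nil)  # nil falls back to Faraday.head
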
@@ -0,0 +1,22 @@
+ require "thor"
+
+ module DaimonSkycrawlers
+ module Generator
+ class Crawler < Thor::Group
+ include Thor::Actions
+
+ argument :name
+
+ def self.source_root
+ File.join(__dir__, "templates")
+ end
+
+ def create_files
+ config = {
+ class_name: name.classify,
+ }
+ template("crawler.rb.erb", "app/crawlers/#{name.underscore}.rb", config)
+ end
+ end
+ end
+ end
@@ -0,0 +1,12 @@
+ require "thor"
+ require "daimon_skycrawlers/generator/crawler"
+ require "daimon_skycrawlers/generator/processor"
+
+ module DaimonSkycrawlers
+ module Generator
+ class Generate < Thor
+ register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
+ register(Processor, "processor", "processor NAME", "Generate new processor")
+ end
+ end
+ end
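
Generate is a Thor namespace that the CLI mounts as "generate" (see the register call in the CLI hunk above), so a project can scaffold crawlers and processors from the generator templates added below. A sketch of driving it through Thor's start method; the require path and the command-line form (something like "daimon_skycrawlers generate crawler blog_crawler") are assumptions, and the generator names are placeholders:

    require "daimon_skycrawlers/cli"  # assumed require path for DaimonSkycrawlers::CLI

    # Writes app/crawlers/blog_crawler.rb from templates/crawler.rb.erb,
    # with config[:class_name] set to "BlogCrawler".
    DaimonSkycrawlers::CLI.start(%w[generate crawler blog_crawler])

    # Likewise writes app/processors/link_collector.rb from templates/processor.rb.erb.
    DaimonSkycrawlers::CLI.start(%w[generate processor link_collector])
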
@@ -22,16 +22,19 @@ module DaimonSkycrawlers
  ].each do |path|
  template("#{path}.erb", "#{name}/#{path}")
  end
+ migration_options = {
+ destination_root: File.join(destination_root, name),
+ timestamps: true
+ }
  invoke(MigrationGenerator, [
  "CreatePage",
  "url:string",
  "headers:text",
  "body:binary",
  "last_modified_at:datetime",
- "etag:string",
- "timestamps"
+ "etag:string"
  ],
- { destination_root: File.join(destination_root, name) })
+ migration_options)
  end

  def copy_files
@@ -56,7 +59,7 @@ module DaimonSkycrawlers
  set_local_assigns!
  validate_file_name!
  dest = options[:destination_root]
- migration_template @migration_template, "#{dest}/db/migrate/#{file_name}.rb"
+ migration_template(@migration_template, "#{dest}/db/migrate/#{file_name}.rb")
  end
  end
  end
@@ -0,0 +1,22 @@
+ require "thor"
+
+ module DaimonSkycrawlers
+ module Generator
+ class Processor < Thor::Group
+ include Thor::Actions
+
+ argument :name
+
+ def self.source_root
+ File.join(__dir__, "templates")
+ end
+
+ def create_files
+ config = {
+ class_name: name.classify,
+ }
+ template("processor.rb.erb", "app/processors/#{name.underscore}.rb", config)
+ end
+ end
+ end
+ end
@@ -0,0 +1,13 @@
+ require "daimon_skycrawlers"
+ require "daimon_skycrawlers/crawler"
+ require "daimon_skycrawlers/crawler/base"
+
+ class <%= config[:class_name] %> < DaimonSkycrawlers::Crawler::Base
+ def fetch(path, **kw)
+ # Implement your crawler here
+ end
+ end
+
+ base_url = ""
+ crawler = <%= config[:class_name] %>.new(base_url)
+ DaimonSkycrawlers.register_crawler(crawler)
@@ -0,0 +1,13 @@
+ require "daimon_skycrawlers"
+ require "daimon_skycrawlers/processor"
+ require "daimon_skycrawlers/processor/base"
+
+ class <%= config[:class_name] %> < DaimonSkycrawlers::Processor::Base
+ def call(message)
+ # Implement your processor here
+ end
+ end
+
+ base_url = ""
+ processor = <%= config[:class_name] %>.new(base_url)
+ DaimonSkycrawlers.register_processor(processor)
@@ -7,6 +7,24 @@ module DaimonSkycrawlers
  include DaimonSkycrawlers::LoggerMixin
  include DaimonSkycrawlers::ConfigMixin

+ def initialize
+ super
+ @before_process_filters = []
+ end
+
+ def before_process(filter = nil, &block)
+ if block_given?
+ @before_process_filters << block
+ else
+ @before_process_filters << filter if filter.respond_to?(:call)
+ end
+ end
+
+ def process(message)
+ return unless apply_before_filters(message[:url])
+ call(message)
+ end
+
  def call(message)
  raise "Implement this method in subclass"
  end
@@ -14,6 +32,14 @@ module DaimonSkycrawlers
  def storage
  @storage ||= DaimonSkycrawlers::Storage::RDB.new
  end
+
+ private
+
+ def apply_before_filters(url)
+ @before_process_filters.all? do |filter|
+ filter.call(url)
+ end
+ end
  end
  end
  end
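
Processor::Base gets the same entry-point split as the crawler side: the consumer now calls #process (see the processor.process change above), which runs every before_process filter against message[:url] and only invokes #call when all of them return true; subclasses still just implement #call. A rough sketch of wiring filters onto a custom processor (the class and filter logic are illustrative):

    require "daimon_skycrawlers"
    require "daimon_skycrawlers/processor"
    require "daimon_skycrawlers/processor/base"
    require "daimon_skycrawlers/filter/duplicate_checker"
    require "uri"

    class UrlLogger < DaimonSkycrawlers::Processor::Base
      def call(message)
        log.info("Processing #{message[:url]}")
      end
    end

    processor = UrlLogger.new

    # Block form: only process pages on www.clear-code.com
    processor.before_process do |url|
      host = URI(url).host
      host.nil? || host == "www.clear-code.com"
    end

    # Callable form: anything responding to #call works, e.g. the duplicate checker
    processor.before_process(
      DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
    )

    DaimonSkycrawlers.register_processor(processor)
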
@@ -8,17 +8,17 @@ module DaimonSkycrawlers

  def initialize
  super
- @filters = []
+ @link_filters = []
  @doc = nil
  @links = nil
  @enqueue = true
  end

- def append_filter(filter = nil, &block)
+ def append_link_filter(filter = nil, &block)
  if block_given?
- @filters << block
+ @link_filters << block
  else
- @filters << filter
+ @link_filters << filter if filter.respond_to?(:call)
  end
  end

@@ -53,15 +53,15 @@ module DaimonSkycrawlers
  element["href"]
  end
  urls.uniq!
- apply_filters(urls) || []
+ apply_link_filters(urls) || []
  end

- def apply_filters(urls)
+ def apply_link_filters(urls)
  return if urls.nil?
  return if urls.empty?
  log.debug("Candidate URLs: #{urls.size}")
  urls = urls.select do |url|
- @filters.inject(true) {|memo, filter| memo & filter.call(url) }
+ @link_filters.all? {|filter| filter.call(url) }
  end
  log.debug("Filtered URLs: #{urls.size}")
  urls
@@ -1,3 +1,3 @@
  module DaimonSkycrawlers
- VERSION = "0.4.0"
+ VERSION = "0.5.0"
  end
@@ -6,11 +6,11 @@ require "daimon_skycrawlers/filter/update_checker"
  default_processor = DaimonSkycrawlers::Processor::Default.new
  spider = DaimonSkycrawlers::Processor::Spider.new
  #spider.enqueue = false
- spider.append_filter do |url|
+ spider.append_link_filter do |url|
  uri = URI(url)
  uri.host.nil? || uri.host == "www.clear-code.com"
  end
- spider.append_filter do |url|
+ spider.append_link_filter do |url|
  case url
  when %r!\A(\.\./|/|#)!
  false
@@ -19,9 +19,9 @@ spider.append_filter do |url|
  end
  end
  duplicate_checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
- spider.append_filter(duplicate_checker)
+ spider.append_link_filter(duplicate_checker)
  update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(base_url: "http://www.clear-code.com/blog/")
- spider.append_filter(update_checker)
+ spider.append_link_filter(update_checker)

  DaimonSkycrawlers.register_processor(default_processor)
  DaimonSkycrawlers.register_processor(spider)
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: daimon_skycrawlers
  version: !ruby/object:Gem::Version
- version: 0.4.0
+ version: 0.5.0
  platform: ruby
  authors:
  - Ryunosuke SATO
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-10-04 00:00:00.000000000 Z
+ date: 2016-10-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -324,7 +324,11 @@ files:
  - lib/daimon_skycrawlers/filter/duplicate_checker.rb
  - lib/daimon_skycrawlers/filter/robots_txt_checker.rb
  - lib/daimon_skycrawlers/filter/update_checker.rb
+ - lib/daimon_skycrawlers/generator/crawler.rb
+ - lib/daimon_skycrawlers/generator/generate.rb
  - lib/daimon_skycrawlers/generator/new.rb
+ - lib/daimon_skycrawlers/generator/processor.rb
+ - lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
  - lib/daimon_skycrawlers/generator/templates/new/Gemfile
  - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
  - lib/daimon_skycrawlers/generator/templates/new/Rakefile
@@ -332,6 +336,7 @@ files:
  - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
  - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
  - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
+ - lib/daimon_skycrawlers/generator/templates/processor.rb.erb
  - lib/daimon_skycrawlers/logger.rb
  - lib/daimon_skycrawlers/processor.rb
  - lib/daimon_skycrawlers/processor/base.rb