daimon_skycrawlers 0.4.0 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 4e4bd1308a554f55bce802a5d0a038cbb0f0470e
-  data.tar.gz: a484737c74cc9ff3304a9fda0714282f9f6b0e61
+  metadata.gz: cbbd476464b9b44d5bc5f71bc999820126379e2d
+  data.tar.gz: 473ac2bd6f9c63a7307357aa2fae7da2be6efde7
 SHA512:
-  metadata.gz: c9d4b01cb37808ce43a9786324e6e0d842d67bc44269c39964e4570c5dbab1f0b818449ac6a67864d6834e2b67a1a9f857252fd0d3cc91e87961ad252eccc785
-  data.tar.gz: c1babe4300e744678672482e3a82c8bc5fa37a5e52ebe8e634b3ae0cdaea17adc75c5132a5a8918c2b3dce76b47c783fedea8e911b18f2e6f0bd7adee0d0525b
+  metadata.gz: 788e296ef3fbd73c39db3ca0f6e6507e9c2074893c4895598c3e418849cf70a8951cc65c5fbff88d1a832c721edd53bd7d4078699090b6da36efaf460027a944
+  data.tar.gz: 42ba6b8ad282060811d2817d85c8b276ccc0890c4a15c546571f22594cb1a552be2838c166ced8fa609807dc0399b910acc53ba7f56e927222955af2008a831a
@@ -1,5 +1,6 @@
 require "thor"
 require "daimon_skycrawlers/generator/new"
+require "daimon_skycrawlers/generator/generate"
 require "daimon_skycrawlers/commands/enqueue"
 require "daimon_skycrawlers/commands/runner"
 require "daimon_skycrawlers/version"
@@ -7,6 +8,7 @@ require "daimon_skycrawlers/version"
 module DaimonSkycrawlers
   class CLI < Thor
     register(Generator::New, "new", "new NAME", "Create new project")
+    register(Generator::Generate, "generate", "generate COMMAND", "Generate new code")
     register(Commands::Runner, "exec", "exec [COMMAND]", "Execute crawler/processor")
     register(Commands::Enqueue, "enqueue", "enqueue [TYPE] URL [messages...]", "Enqueue URL")
 
@@ -67,7 +67,7 @@ module DaimonSkycrawlers
         processors = self.class.processors
       end
       processors.each do |processor|
-        processor.call(message)
+        processor.process(message)
       end
     end
   end
@@ -44,14 +44,11 @@ module DaimonSkycrawlers
     # @private
     #
     def process(message)
-      url = message[:url]
-      depth = Integer(message[:depth] || 0)
-
       crawler_interval = DaimonSkycrawlers.configuration.crawler_interval
 
       # XXX When several crawlers are registered, how should they behave?
       self.class.crawlers.each do |crawler|
-        crawler.fetch(url, depth: depth)
+        crawler.process(message)
         if crawler.skipped?
           sleep(crawler_interval) if crawler.n_processed_urls % 50 == 0
         else
@@ -20,13 +20,19 @@ module DaimonSkycrawlers
       # @return [void]
       attr_writer :storage
 
+      # @!attribute [r] n_processed_urls
+      #   The number of processed URLs.
+      #   @return [Integer]
+      attr_reader :n_processed_urls
+
       #
       # @param [String] Base URL for crawler
       # @param [Hash] options for Faraday
       #
-      def initialize(base_url = nil, options = {})
+      def initialize(base_url = nil, faraday_options: {}, options: {})
        super()
        @base_url = base_url
+        @faraday_options = faraday_options
        @options = options
        @prepare = ->(connection) {}
        @skipped = false
@@ -41,7 +47,9 @@ module DaimonSkycrawlers
       # @yieldparam faraday [Faraday]
       #
       def setup_connection(options = {})
-        @connection = Faraday.new(@base_url, @options.merge(options)) do |faraday|
+        merged_options = @faraday_options.merge(options)
+        faraday_options = merged_options.empty? ? nil : merged_options
+        @connection = Faraday.new(@base_url, faraday_options) do |faraday|
           yield faraday
         end
       end
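
Note on the new constructor: 0.5.0 separates the options handed to Faraday from the crawler's own options. A minimal construction sketch under the new keyword signature (the URL, the timeout value, and the require path for the default crawler are illustrative assumptions, not taken from this diff):

    require "daimon_skycrawlers/crawler/default"  # assumed path for Crawler::Default

    # faraday_options: is passed through to Faraday.new; options: keeps
    # crawler-level flags such as :obey_robots_txt used when filtering URLs.
    crawler = DaimonSkycrawlers::Crawler::Default.new(
      "http://example.com/",
      faraday_options: { request: { timeout: 10 } },
      options: { obey_robots_txt: true }
    )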
@@ -66,10 +74,26 @@ module DaimonSkycrawlers
       end
 
       def connection
-        @connection ||= Faraday.new(@base_url, @options)
+        @connection ||= Faraday.new(@base_url, @faraday_options)
+      end
+
+      def process(message, &block)
+        url = message.delete(:url)
+
+        @skipped = false
+        @n_processed_urls += 1
+        # url can be a path
+        url = connection.url_prefix + url
+
+        apply_filters(url)
+
+        unless skipped?
+          @prepare.call(connection)
+          fetch(url, message, &block)
+        end
       end
 
-      def fetch(path, params = {}, **kw)
+      def fetch(path, message = {})
         raise NotImplementedError, "Must implement this method in subclass"
       end
 
@@ -81,11 +105,28 @@ module DaimonSkycrawlers
         @connection.post(path, params)
       end
 
-      def n_processed_urls
-        @n_processed_urls
+      private
+
+      def apply_filters(url)
+        if @options[:obey_robots_txt]
+          robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
+          unless robots_txt_checker.allowed?(url)
+            skip(url)
+            return
+          end
+        end
+        update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
+        unless update_checker.updated?(url.to_s, connection: connection)
+          skip(url)
+          return
+        end
       end
 
-      private
+      def skip(url)
+        log.info("Skip #{url}")
+        @skipped = true
+        schedule_to_process(url.to_s, heartbeat: true)
+      end
 
       def schedule_to_process(url, message = {})
         DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
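
With filtering, skip handling, and URL bookkeeping centralized in Base#process, a subclass now only has to implement fetch(url, message). A minimal sketch of a custom crawler under the new contract (the class name and its behavior are hypothetical; it assumes Base's get/storage/schedule_to_process helpers shown in this diff):

    require "daimon_skycrawlers/crawler/base"

    class HeadersOnlyCrawler < DaimonSkycrawlers::Crawler::Base
      # Base#process has already applied the robots.txt / update filters and
      # resolved the URL before this method is called.
      def fetch(url, message = {})
        response = get(url)
        storage.save(url.to_s, response.headers, "") # deliberately drop the body
        schedule_to_process(url.to_s, message)
      end
    end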
@@ -10,43 +10,15 @@ module DaimonSkycrawlers
     # This crawler can GET given URL and store response to storage
     #
     class Default < Base
-      def fetch(path, depth: 3, **kw)
-        @n_processed_urls += 1
-        @skipped = false
-        url = connection.url_prefix + path
-        if @options[:obey_robots_txt]
-          robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
-          unless robots_txt_checker.call(url)
-            skip(url)
-            return
-          end
-        end
-        update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
-        unless update_checker.call(url.to_s, connection: connection)
-          skip(url)
-          return
-        end
-        @prepare.call(connection)
-        response = get(path)
+      def fetch(url, message)
+        response = get(url)
         data = [url.to_s, response.headers, response.body]
 
         yield(*data) if block_given?
 
         storage.save(*data)
-        message = {
-          depth: depth
-        }
-        message = message.merge(kw)
         schedule_to_process(url.to_s, message)
       end
-
-      private
-
-      def skip(url)
-        log.info("Skip #{url}")
-        @skipped = true
-        schedule_to_process(url.to_s, heartbeat: true)
-      end
     end
   end
 end
@@ -28,6 +28,15 @@ module DaimonSkycrawlers
         @urls << url
         true
       end
+
+      #
+      # @param [String] url to check duplication. If given URL is
+      #   relative URL, use `@base_url + url` as absolute URL.
+      # @return [true|false] Return true when duplicated, otherwise return false.
+      #
+      def duplicated?(url)
+        !call(url)
+      end
     end
   end
 end
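
duplicated? is simply the negation of call, which records each URL it is asked about. A usage sketch (the base_url and paths are made up):

    require "daimon_skycrawlers/filter/duplicate_checker"

    checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://example.com/")
    checker.duplicated?("/about.html") # => false the first time (the URL gets recorded)
    checker.duplicated?("/about.html") # => true on every later check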
@@ -24,6 +24,8 @@ module DaimonSkycrawlers
        end
        @webrobots.allowed?(url)
      end
+
+      alias allowed? call
    end
  end
 end
@@ -32,11 +32,17 @@ module DaimonSkycrawlers
        else
          headers = Faraday.head(url)
        end
-        return false if headers["etag"] && page.etag && headers["etag"] == page.etag
-        return false if headers["last-modified"].nil? && page.last_modified_at.nil?
-        return false if headers["last-modified"] <= page.last_modified_at
-        true
+        case
+        when headers.key?("etag") && page.etag
+          headers["etag"] != page.etag
+        when headers.key?("last-modified") && page.last_modified_at
+          headers["last-modified"] > page.last_modified_at
+        else
+          true
+        end
      end
+
+      alias updated? call
    end
  end
 end
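
The rewritten call now reads as a decision table: compare ETags when both the response and the stored page have one, fall back to Last-Modified, and treat the page as updated when neither header is comparable. A hedged usage sketch via the new alias (the storage require path and backend are assumptions):

    require "daimon_skycrawlers/storage/rdb"            # assumed storage backend
    require "daimon_skycrawlers/filter/update_checker"

    storage = DaimonSkycrawlers::Storage::RDB.new
    checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)

    # Issues a HEAD request and compares it against the stored page:
    # false when the stored ETag still matches (or Last-Modified is not newer),
    # true otherwise.
    checker.updated?("http://example.com/index.html")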
@@ -0,0 +1,22 @@
+require "thor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Crawler < Thor::Group
+      include Thor::Actions
+
+      argument :name
+
+      def self.source_root
+        File.join(__dir__, "templates")
+      end
+
+      def create_files
+        config = {
+          class_name: name.classify,
+        }
+        template("crawler.rb.erb", "app/crawlers/#{name.underscore}.rb", config)
+      end
+    end
+  end
+end
@@ -0,0 +1,12 @@
+require "thor"
+require "daimon_skycrawlers/generator/crawler"
+require "daimon_skycrawlers/generator/processor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Generate < Thor
+      register(Crawler, "crawler", "crawler NAME", "Generate new crawler")
+      register(Processor, "processor", "processor NAME", "Generate new processor")
+    end
+  end
+end
@@ -22,16 +22,19 @@ module DaimonSkycrawlers
       ].each do |path|
         template("#{path}.erb", "#{name}/#{path}")
       end
+      migration_options = {
+        destination_root: File.join(destination_root, name),
+        timestamps: true
+      }
       invoke(MigrationGenerator, [
         "CreatePage",
         "url:string",
         "headers:text",
         "body:binary",
         "last_modified_at:datetime",
-        "etag:string",
-        "timestamps"
+        "etag:string"
       ],
-      { destination_root: File.join(destination_root, name) })
+      migration_options)
     end
 
     def copy_files
@@ -56,7 +59,7 @@ module DaimonSkycrawlers
       set_local_assigns!
       validate_file_name!
       dest = options[:destination_root]
-      migration_template @migration_template, "#{dest}/db/migrate/#{file_name}.rb"
+      migration_template(@migration_template, "#{dest}/db/migrate/#{file_name}.rb")
     end
   end
 end
@@ -0,0 +1,22 @@
+require "thor"
+
+module DaimonSkycrawlers
+  module Generator
+    class Processor < Thor::Group
+      include Thor::Actions
+
+      argument :name
+
+      def self.source_root
+        File.join(__dir__, "templates")
+      end
+
+      def create_files
+        config = {
+          class_name: name.classify,
+        }
+        template("processor.rb.erb", "app/processors/#{name.underscore}.rb", config)
+      end
+    end
+  end
+end
@@ -0,0 +1,13 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/crawler/base"
+
+class <%= config[:class_name] %> < DaimonSkycrawlers::Crawler::Base
+  def fetch(path, **kw)
+    # Implement your crawler here
+  end
+end
+
+base_url = ""
+crawler = <%= config[:class_name] %>.new(base_url)
+DaimonSkycrawlers.register_crawler(crawler)
@@ -0,0 +1,13 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/processor"
+require "daimon_skycrawlers/processor/base"
+
+class <%= config[:class_name] %> < DaimonSkycrawlers::Processor::Base
+  def call(message)
+    # Implement your processor here
+  end
+end
+
+base_url = ""
+processor = <%= config[:class_name] %>.new(base_url)
+DaimonSkycrawlers.register_processor(processor)
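
Tying the generator pieces together: the new `generate crawler NAME` and `generate processor NAME` subcommands render these templates, deriving class_name with classify and the file name with underscore. For example, generating a crawler named "blog" would plausibly produce app/crawlers/blog.rb along these lines (a sketch of the rendered output of crawler.rb.erb above; the name itself is made up):

    require "daimon_skycrawlers"
    require "daimon_skycrawlers/crawler"
    require "daimon_skycrawlers/crawler/base"

    class Blog < DaimonSkycrawlers::Crawler::Base
      def fetch(path, **kw)
        # Implement your crawler here
      end
    end

    base_url = ""
    crawler = Blog.new(base_url)
    DaimonSkycrawlers.register_crawler(crawler)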
@@ -7,6 +7,24 @@ module DaimonSkycrawlers
      include DaimonSkycrawlers::LoggerMixin
      include DaimonSkycrawlers::ConfigMixin
 
+      def initialize
+        super
+        @before_process_filters = []
+      end
+
+      def before_process(filter = nil, &block)
+        if block_given?
+          @before_process_filters << block
+        else
+          @before_process_filters << filter if filter.respond_to?(:call)
+        end
+      end
+
+      def process(message)
+        return unless apply_before_filters(message[:url])
+        call(message)
+      end
+
      def call(message)
        raise "Implement this method in subclass"
      end
@@ -14,6 +32,14 @@ module DaimonSkycrawlers
      def storage
        @storage ||= DaimonSkycrawlers::Storage::RDB.new
      end
+
+      private
+
+      def apply_before_filters(url)
+        @before_process_filters.all? do |filter|
+          filter.call(url)
+        end
+      end
    end
  end
 end
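
Processors gain a before_process hook that mirrors the crawler-side filters: the consumer now invokes process, which runs every registered filter against message[:url] and only then delegates to call. A sketch of registering filters (the require path for the default processor, the filter bodies, and the base_url are assumptions):

    require "daimon_skycrawlers/processor/default"      # assumed path for Processor::Default
    require "daimon_skycrawlers/filter/duplicate_checker"

    processor = DaimonSkycrawlers::Processor::Default.new

    # A block filter: skip messages whose URL looks like a PDF.
    processor.before_process do |url|
      !url.end_with?(".pdf")
    end

    # Or any object responding to #call, such as the gem's own filters.
    processor.before_process(
      DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://example.com/")
    )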
@@ -8,17 +8,17 @@ module DaimonSkycrawlers
 
      def initialize
        super
-        @filters = []
+        @link_filters = []
        @doc = nil
        @links = nil
        @enqueue = true
      end
 
-      def append_filter(filter = nil, &block)
+      def append_link_filter(filter = nil, &block)
        if block_given?
-          @filters << block
+          @link_filters << block
        else
-          @filters << filter
+          @link_filters << filter if filter.respond_to?(:call)
        end
      end
 
@@ -53,15 +53,15 @@ module DaimonSkycrawlers
          element["href"]
        end
        urls.uniq!
-        apply_filters(urls) || []
+        apply_link_filters(urls) || []
      end
 
-      def apply_filters(urls)
+      def apply_link_filters(urls)
        return if urls.nil?
        return if urls.empty?
        log.debug("Candidate URLs: #{urls.size}")
        urls = urls.select do |url|
-          @filters.inject(true) {|memo, filter| memo & filter.call(url) }
+          @link_filters.all? {|filter| filter.call(url) }
        end
        log.debug("Filtered URLs: #{urls.size}")
        urls
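
Switching the reduction from inject with & to all? also changes evaluation: link filters now short-circuit, so once one filter rejects a URL the remaining filters are not called for it. A small standalone illustration (the filters here are hypothetical):

    require "uri"

    host_filter  = ->(url) { URI(url).host.nil? || URI(url).host == "example.com" }
    depth_filter = ->(url) { url.count("/") < 5 }

    # With Enumerable#all?, depth_filter is never invoked for this URL because
    # host_filter already returned false.
    [host_filter, depth_filter].all? { |filter| filter.call("http://other.example.org/") }
    # => false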
@@ -1,3 +1,3 @@
 module DaimonSkycrawlers
-  VERSION = "0.4.0"
+  VERSION = "0.5.0"
 end
@@ -6,11 +6,11 @@ require "daimon_skycrawlers/filter/update_checker"
 default_processor = DaimonSkycrawlers::Processor::Default.new
 spider = DaimonSkycrawlers::Processor::Spider.new
 #spider.enqueue = false
-spider.append_filter do |url|
+spider.append_link_filter do |url|
   uri = URI(url)
   uri.host.nil? || uri.host == "www.clear-code.com"
 end
-spider.append_filter do |url|
+spider.append_link_filter do |url|
   case url
   when %r!\A(\.\./|/|#)!
     false
@@ -19,9 +19,9 @@ spider.append_filter do |url|
   end
 end
 duplicate_checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
-spider.append_filter(duplicate_checker)
+spider.append_link_filter(duplicate_checker)
 update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(base_url: "http://www.clear-code.com/blog/")
-spider.append_filter(update_checker)
+spider.append_link_filter(update_checker)
 
 DaimonSkycrawlers.register_processor(default_processor)
 DaimonSkycrawlers.register_processor(spider)
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: daimon_skycrawlers
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Ryunosuke SATO
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-04 00:00:00.000000000 Z
+date: 2016-10-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -324,7 +324,11 @@ files:
 - lib/daimon_skycrawlers/filter/duplicate_checker.rb
 - lib/daimon_skycrawlers/filter/robots_txt_checker.rb
 - lib/daimon_skycrawlers/filter/update_checker.rb
+- lib/daimon_skycrawlers/generator/crawler.rb
+- lib/daimon_skycrawlers/generator/generate.rb
 - lib/daimon_skycrawlers/generator/new.rb
+- lib/daimon_skycrawlers/generator/processor.rb
+- lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
 - lib/daimon_skycrawlers/generator/templates/new/Gemfile
 - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
 - lib/daimon_skycrawlers/generator/templates/new/Rakefile
@@ -332,6 +336,7 @@ files:
 - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
 - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
 - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
+- lib/daimon_skycrawlers/generator/templates/processor.rb.erb
 - lib/daimon_skycrawlers/logger.rb
 - lib/daimon_skycrawlers/processor.rb
 - lib/daimon_skycrawlers/processor/base.rb