daimon_skycrawlers 0.12.0 → 1.0.0.pre.rc1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea06b539d343f1a13e2eb22f3cd2e9662ce3d9da
4
- data.tar.gz: 39164028e5902f05a86a4c1e1f1890bbe8065dfa
3
+ metadata.gz: c9a394115d873b122e18a6970e62dfbc897dd690
4
+ data.tar.gz: 55fc07c13fa04c82c5e1c6a1075ffd97793b01f9
5
5
  SHA512:
6
- metadata.gz: 22472b47e7a5e87758c2bf8712bd76d227e7c6589afd641060e93e8265fdb628abf8a236dbcf1dab0dc6390f0fd6fb680e96ff4998c4ba479f4a5ff334b15a4f
7
- data.tar.gz: 8bdf2df5606221dcb984ab3850b8922f524dfbaf12d80cb38c11f65632327d0567b6c17aaff7a51da47e77e88fa27e360045dbdf38af2f3dc02cec3384f15090
6
+ metadata.gz: 5ae484de037763f04487ad8c1ea4047a78a726156781cffeebf0c4c14ae0dfb70029b5f834c1414af9ee43c2b67fb7b137e5d5e335c2dee01abe9aae6ced1099
7
+ data.tar.gz: 63c85408015248ca66448025b2eb8ff6b65080c3da375b2e5b9723e944e3836f73285cb083088a31c1c8d60fe6176f57c47bc2d894fc7618c5853b3aebc3d500
data/.travis.yml CHANGED
@@ -3,7 +3,8 @@ sudo: false
3
3
  cache: bundler
4
4
  rvm:
5
5
  - 2.2.5
6
- - 2.3.1
6
+ - 2.3.3
7
+ - 2.4.0
7
8
 
8
9
  before_install: gem update bundler
9
10
 
data/README.md CHANGED
@@ -1,5 +1,3 @@
1
- ## Caution!! This product is NOT production-ready.
2
-
3
1
  # DaimonSkycrawlers
4
2
 
5
3
  [![Gem Version](https://badge.fury.io/rb/daimon_skycrawlers.svg)](https://badge.fury.io/rb/daimon_skycrawlers)
data/Rakefile CHANGED
@@ -6,7 +6,9 @@ require "daimon_skycrawlers/tasks"
6
6
  Rake::TestTask.new(:test) do |t|
7
7
  t.libs << "test"
8
8
  t.libs << "lib"
9
- t.test_files = FileList["test/**/*_test.rb"]
9
+ t.test_files = FileList["test/**/test_*.rb"]
10
+ t.verbose = false
11
+ t.warning = false
10
12
  end
11
13
 
12
14
  task :default => [:test]
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_dependency "webrobots"
32
32
 
33
33
  spec.add_development_dependency "bundler", "~> 1.11"
34
- spec.add_development_dependency "rake", "~> 10.0"
34
+ spec.add_development_dependency "rake", "~> 12.0"
35
35
  spec.add_development_dependency "test-unit"
36
36
  spec.add_development_dependency "test-unit-rr"
37
37
  spec.add_development_dependency "test-unit-notify"
@@ -3,6 +3,7 @@ module DaimonSkycrawlers
3
3
  def initialize
4
4
  super
5
5
  @before_process_callbacks = []
6
+ @after_process_callbacks = []
6
7
  end
7
8
 
8
9
  def before_process(callback = nil, &block)
@@ -22,5 +23,23 @@ module DaimonSkycrawlers
22
23
  def clear_before_process_callbacks
23
24
  @before_process_callbacks = []
24
25
  end
26
+
27
+ def after_process(callback = nil, &block)
28
+ if block_given?
29
+ @after_process_callbacks << block
30
+ else
31
+ @after_process_callbacks << callback if callback.respond_to?(:call)
32
+ end
33
+ end
34
+
35
+ def run_after_callbacks(message)
36
+ @after_process_callbacks.each do |callback|
37
+ callback.call(message)
38
+ end
39
+ end
40
+
41
+ def clear_after_process_callbacks
42
+ @after_process_callbacks = []
43
+ end
25
44
  end
26
45
  end
@@ -3,6 +3,7 @@ require "daimon_skycrawlers/crawler"
3
3
  require "daimon_skycrawlers/processor"
4
4
  require "daimon_skycrawlers/sitemap_parser"
5
5
  require "daimon_skycrawlers/version"
6
+ require "thor"
6
7
  require "webrobots"
7
8
 
8
9
  module DaimonSkycrawlers
@@ -47,6 +48,45 @@ module DaimonSkycrawlers
47
48
  end
48
49
  end
49
50
 
51
+ desc "list PATH", "Enqueue URLs from PATH. PATH content includes a URL per line"
52
+ method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
53
+ def list(path)
54
+ load_init
55
+ File.open(path, "r") do |file|
56
+ file.each_line do |line|
57
+ line.chomp!
58
+ next if /\A#/ =~ line
59
+ case options["type"]
60
+ when "response"
61
+ DaimonSkycrawlers::Processor.enqueue_http_response(line)
62
+ when "url"
63
+ DaimonSkycrawlers::Crawler.enqueue_url(line)
64
+ else
65
+ raise ArgumentError, "Unknown type: #{options["type"]}"
66
+ end
67
+ end
68
+ end
69
+ end
70
+
71
+ desc "yaml PATH", "Enqueue URLs from PATH."
72
+ method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
73
+ def yaml(path)
74
+ load_init
75
+ YAML.load_file(path).each do |hash|
76
+ url = hash["url"]
77
+ message = hash["message"] || {}
78
+ raise "Could not find URL: #{hash}" unless url
79
+ case options["type"]
80
+ when "response"
81
+ DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
82
+ when "url"
83
+ DaimonSkycrawlers::Crawler.enqueue_url(url, message)
84
+ else
85
+ raise ArgumentError, "Unknown type: #{options["type"]}"
86
+ end
87
+ end
88
+ end
89
+
50
90
  private
51
91
 
52
92
  def load_init
@@ -11,7 +11,7 @@ module DaimonSkycrawlers
11
11
  def crawler
12
12
  load_init
13
13
  load_crawlers
14
- require(File.expand_path("app/crawler.rb", Dri.pwd))
14
+ require(File.expand_path("app/crawler.rb", Dir.pwd))
15
15
  DaimonSkycrawlers::Crawler.run
16
16
  rescue => ex
17
17
  puts ex.message
@@ -1,5 +1,6 @@
1
1
  require "uri"
2
2
  require "faraday"
3
+ require "typhoeus/adapters/faraday"
3
4
 
4
5
  require "daimon_skycrawlers/logger"
5
6
  require "daimon_skycrawlers/config"
@@ -10,6 +11,8 @@ require "daimon_skycrawlers/processor"
10
11
  require "daimon_skycrawlers/filter/update_checker"
11
12
  require "daimon_skycrawlers/filter/robots_txt_checker"
12
13
 
14
+ Faraday.default_adapter = :typhoeus
15
+
13
16
  module DaimonSkycrawlers
14
17
  module Crawler
15
18
  #
@@ -88,10 +91,10 @@ module DaimonSkycrawlers
88
91
  @n_processed_urls += 1
89
92
 
90
93
  setup_default_filters
94
+ setup_default_post_processes
91
95
 
92
96
  proceeding = run_before_callbacks(message)
93
97
  unless proceeding
94
- @skipped = true
95
98
  skip(message[:url])
96
99
  return
97
100
  end
@@ -101,7 +104,10 @@ module DaimonSkycrawlers
101
104
  url = (URI(connection.url_prefix) + url).to_s
102
105
 
103
106
  @prepare.call(connection)
104
- fetch(url, message, &block)
107
+ response = fetch(url, message, &block)
108
+ data = { url: url, message: message, response: response }
109
+ run_after_callbacks(data)
110
+ data
105
111
  end
106
112
 
107
113
  def fetch(path, message = {})
@@ -137,8 +143,17 @@ module DaimonSkycrawlers
137
143
  end
138
144
  end
139
145
 
146
+ def setup_default_post_processes
147
+ after_process do |data|
148
+ storage.save(data)
149
+ message = data[:message]
150
+ url = data[:url]
151
+ schedule_to_process(url, message)
152
+ end
153
+ end
154
+
140
155
  def skip(url)
141
- log.info("Skip #{url}")
156
+ log.info("Skipped '#{url}' by '#{self.class}'")
142
157
  @skipped = true
143
158
  schedule_to_process(url.to_s, heartbeat: true)
144
159
  end
@@ -9,13 +9,13 @@ module DaimonSkycrawlers
9
9
  #
10
10
  class Default < Base
11
11
  def fetch(url, message)
12
- response = get(url)
13
- data = [url.to_s, response.headers, response.body]
14
-
15
- yield(*data) if block_given?
16
-
17
- storage.save(*data)
18
- schedule_to_process(url.to_s, message)
12
+ params = message[:params] || {}
13
+ method = message[:method] || "GET"
14
+ if method == "POST"
15
+ post(url, params)
16
+ else
17
+ get(url, params)
18
+ end
19
19
  end
20
20
  end
21
21
  end
@@ -39,6 +39,7 @@ module DaimonSkycrawlers
39
39
  }
40
40
  invoke(MigrationGenerator, [
41
41
  "CreatePages",
42
+ "key:string",
42
43
  "url:string",
43
44
  "headers:text",
44
45
  "body:binary",
@@ -56,6 +57,8 @@ module DaimonSkycrawlers
56
57
  <<-CODE.chomp
57
58
  #{indent}t.timestamps
58
59
 
60
+ #{indent}t.index [:key]
61
+ #{indent}t.index [:key, :updated_at]
59
62
  #{indent}t.index [:url]
60
63
  #{indent}t.index [:url, :updated_at]
61
64
  CODE
@@ -95,7 +98,7 @@ module DaimonSkycrawlers
95
98
 
96
99
  def display_post_message
97
100
  puts <<MESSAGE
98
- Check .env and .env.db before run `docker-compose build` or `docker-compose up`.
101
+ Check .env and .env.db before running `docker-compose build` or `docker-compose up`.
99
102
  MESSAGE
100
103
  end
101
104
  end
@@ -24,7 +24,7 @@ module DaimonSkycrawlers
24
24
  You can register your processor in `app/processor.rb` to run your processor.
25
25
  Following code snippet is useful:
26
26
 
27
- processor = #{naem.classify}.new
27
+ processor = #{name.classify}.new
28
28
  DaimonSkycrawlers.register_processor(processor)
29
29
 
30
30
  MESSAGE
@@ -11,9 +11,23 @@ module DaimonSkycrawlers
11
11
  include DaimonSkycrawlers::Callbacks
12
12
  include DaimonSkycrawlers::Configurable
13
13
 
14
+ def initialize
15
+ super
16
+ @skipped = false
17
+ end
18
+
19
+ def skipped?
20
+ @skipped
21
+ end
22
+
14
23
  def process(message)
24
+ @skipped = false
25
+ setup_default_filters
15
26
  proceeding = run_before_callbacks(message)
16
- return unless proceeding
27
+ unless proceeding
28
+ skip(message[:url])
29
+ return
30
+ end
17
31
  call(message)
18
32
  end
19
33
 
@@ -24,6 +38,19 @@ module DaimonSkycrawlers
24
38
  def storage
25
39
  @storage ||= DaimonSkycrawlers::Storage::RDB.new
26
40
  end
41
+
42
+ private
43
+
44
+ def setup_default_filters
45
+ before_process do |m|
46
+ !m[:heartbeat]
47
+ end
48
+ end
49
+
50
+ def skip(url)
51
+ log.info("Skipped '#{url}' by '#{self.class}'")
52
+ @skipped = true
53
+ end
27
54
  end
28
55
  end
29
56
  end
@@ -5,7 +5,6 @@ module DaimonSkycrawlers
5
5
  module Processor
6
6
  class Default < Base
7
7
  def call(message)
8
- return if message[:heartbeat]
9
8
  url = message[:url]
10
9
  page = storage.find(url)
11
10
  headers = JSON.parse(page.headers)
@@ -101,7 +101,6 @@ module DaimonSkycrawlers
101
101
  def call(message)
102
102
  key_url = message[:url]
103
103
  depth = Integer(message[:depth] || 2)
104
- return if message[:heartbeat]
105
104
  return if depth <= 1
106
105
  page = storage.find(key_url)
107
106
  @doc = Nokogiri::HTML(page.body)
@@ -6,6 +6,9 @@ require "uri"
6
6
  module DaimonSkycrawlers
7
7
  # Based on https://github.com/benbalter/sitemap-parser
8
8
  class SitemapParser
9
+ class Error < StandardError
10
+ end
11
+
9
12
  def initialize(urls, options = {})
10
13
  @urls = urls
11
14
  end
@@ -14,7 +17,8 @@ module DaimonSkycrawlers
14
17
  hydra = Typhoeus::Hydra.new(max_concurrency: 1)
15
18
  sitemap_urls = []
16
19
  @urls.each do |url|
17
- if URI(url).scheme.start_with?("http")
20
+ uri = URI(url)
21
+ if uri.scheme && uri.scheme.start_with?("http")
18
22
  request = Typhoeus::Request.new(url, followlocation: true)
19
23
  request.on_complete do |response|
20
24
  sitemap_urls.concat(on_complete(response))
@@ -22,18 +26,21 @@ module DaimonSkycrawlers
22
26
  hydra.queue(request)
23
27
  else
24
28
  if File.exist?(url)
25
- extract_urls(File.read(url))
29
+ sitemap_urls.concat(extract_urls(File.read(url)))
26
30
  end
27
31
  end
28
32
  end
29
- hydra.run
33
+ loop do
34
+ hydra.run
35
+ break if hydra.queued_requests.empty?
36
+ end
30
37
  sitemap_urls
31
38
  end
32
39
 
33
40
  private
34
41
 
35
42
  def on_complete(response)
36
- raise "HTTP requset to #{response.effective_url} failed" unless response.success?
43
+ raise Error, "HTTP requset to #{response.effective_url} failed. status: #{response.code}" unless response.success?
37
44
  raw_sitemap = inflate_response(response)
38
45
  extract_urls(raw_sitemap)
39
46
  end
@@ -51,7 +58,7 @@ module DaimonSkycrawlers
51
58
  url.at("loc").content
52
59
  end
53
60
  else
54
- raise "Malformed sitemap.xml no <sitemapindex> or <urlset>"
61
+ raise Error, "Malformed sitemap.xml no <sitemapindex> or <urlset>"
55
62
  end
56
63
  end
57
64
 
@@ -72,7 +79,7 @@ module DaimonSkycrawlers
72
79
  when "deflate", "gzip", "x-gzip"
73
80
  true
74
81
  else
75
- signature = response.body[0, 2]
82
+ signature = response.body[0, 2].b
76
83
  signature == "\x1F\x8B".b
77
84
  end
78
85
  end
@@ -7,7 +7,7 @@ module DaimonSkycrawlers
7
7
  include DaimonSkycrawlers::LoggerMixin
8
8
  include DaimonSkycrawlers::ConfigMixin
9
9
 
10
- def save(url, headers, body)
10
+ def save(data)
11
11
  raise "Implement this in subclass"
12
12
  end
13
13
 
@@ -11,35 +11,46 @@ module DaimonSkycrawlers
11
11
  @base_dir = Pathname(base_dir)
12
12
  end
13
13
 
14
- def save(url, headers, body)
14
+ def save(data)
15
+ url = data[:url]
16
+ message = data[:message]
17
+ key = message[:key]
18
+ response = data[:response]
19
+ headers = response.headers
20
+ body = response.body
15
21
  @base_dir.mkpath
16
- body_path(url).dirname.mkpath
17
- body_path(url).open("wb+") do |file|
22
+ body_path(url, key).dirname.mkpath
23
+ body_path(url, key).open("wb+") do |file|
18
24
  file.write(body)
19
25
  end
20
- headers_path(url).open("wb+") do |file|
26
+ headers_path(url, key).open("wb+") do |file|
21
27
  file.write(JSON.generate(headers))
22
28
  end
23
29
  end
24
30
 
25
- def read(url)
26
- headers = JSON.parse(headers_path(url).read)
27
- body = body_path(url).read
28
- Page.new(url, headers, body, headers["last-modified"], headers["etag"])
31
+ def read(url, message)
32
+ key = message[:key]
33
+ headers = JSON.parse(headers_path(url, key).read)
34
+ body = body_path(url, key).read
35
+ Page.new(url, key, headers, body, headers["last-modified"], headers["etag"])
29
36
  end
30
37
 
31
- Page = Struct.new(:url, :headers, :body, :last_modified, :etag)
38
+ Page = Struct.new(:url, :key, :headers, :body, :last_modified, :etag)
32
39
 
33
40
  private
34
41
 
35
- def body_path(url)
42
+ def body_path(url, key)
36
43
  url = URI(url)
37
- @base_dir + ".#{url.path}"
44
+ if key
45
+ @base_dir + key
46
+ else
47
+ @base_dir + ".#{url.path}"
48
+ end
38
49
  end
39
50
 
40
- def headers_path(url)
51
+ def headers_path(url, key)
41
52
  url = URI(url)
42
- Pathname("#{body_path(url)}-headers.json")
53
+ Pathname("#{body_path(url, key)}-headers.json")
43
54
  end
44
55
  end
45
56
  end
@@ -10,7 +10,7 @@ module DaimonSkycrawlers
10
10
  #
11
11
  # Save nothing
12
12
  #
13
- def save(url, headers, body)
13
+ def save(data)
14
14
  end
15
15
 
16
16
  #
@@ -16,12 +16,20 @@ module DaimonSkycrawlers
16
16
  #
17
17
  # Save
18
18
  #
19
- # @param [String] url identity of the page
20
- # @param [Hash] header of URL
21
- # @param [String] body
19
+ # @param [Hash] data has following keys
20
+ # * :url: URL
21
+ # * :message: Given message
22
+ # * :response: HTTP response
22
23
  #
23
- def save(url, headers, body)
24
+ def save(data)
25
+ url = data[:url]
26
+ message = data[:message]
27
+ key = message[:key] || url
28
+ response = data[:response]
29
+ headers = response.headers
30
+ body = response.body
24
31
  Page.create(url: url,
32
+ key: key,
25
33
  headers: JSON.generate(headers),
26
34
  body: body,
27
35
  last_modified_at: headers["last-modified"],
@@ -33,8 +41,13 @@ module DaimonSkycrawlers
33
41
  #
34
42
  # @param [String] url identity of the page
35
43
  #
36
- def find(url)
37
- Page.where(url: url).order(updated_at: :desc).limit(1).first
44
+ def find(url, message = {})
45
+ key = message[:key]
46
+ if key
47
+ Page.where(key: key).order(updated_at: :desc).limit(1).first
48
+ else
49
+ Page.where(url: url).order(updated_at: :desc).limit(1).first
50
+ end
38
51
  end
39
52
 
40
53
  class Base < ActiveRecord::Base
@@ -8,7 +8,11 @@ module DaimonSkycrawlers
8
8
  def setup_shutdown_timer(queue_name_prefix, interval: 10)
9
9
  timers = Timers::Group.new
10
10
  timer = timers.after(interval) do
11
- Process.kill(:INT, 0)
11
+ if block_given?
12
+ yield
13
+ else
14
+ Process.kill(:INT, 0)
15
+ end
12
16
  end
13
17
  Thread.new(timers) do |t|
14
18
  loop { t.wait }
@@ -19,6 +23,7 @@ module DaimonSkycrawlers
19
23
  DaimonSkycrawlers.configuration.logger.debug("Reset timer: consume message #{queue_name}")
20
24
  timer.reset
21
25
  end
26
+ timers
22
27
  end
23
28
  end
24
29
  end
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.12.0"
2
+ VERSION = "1.0.0-rc1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.0
4
+ version: 1.0.0.pre.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-12-20 00:00:00.000000000 Z
11
+ date: 2017-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -184,14 +184,14 @@ dependencies:
184
184
  requirements:
185
185
  - - "~>"
186
186
  - !ruby/object:Gem::Version
187
- version: '10.0'
187
+ version: '12.0'
188
188
  type: :development
189
189
  prerelease: false
190
190
  version_requirements: !ruby/object:Gem::Requirement
191
191
  requirements:
192
192
  - - "~>"
193
193
  - !ruby/object:Gem::Version
194
- version: '10.0'
194
+ version: '12.0'
195
195
  - !ruby/object:Gem::Dependency
196
196
  name: test-unit
197
197
  requirement: !ruby/object:Gem::Requirement
@@ -442,12 +442,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
442
442
  version: '0'
443
443
  required_rubygems_version: !ruby/object:Gem::Requirement
444
444
  requirements:
445
- - - ">="
445
+ - - ">"
446
446
  - !ruby/object:Gem::Version
447
- version: '0'
447
+ version: 1.3.1
448
448
  requirements: []
449
449
  rubyforge_project:
450
- rubygems_version: 2.6.4
450
+ rubygems_version: 2.6.8
451
451
  signing_key:
452
452
  specification_version: 4
453
453
  summary: This is a crawler framework.