daimon_skycrawlers 0.12.0 → 1.0.0.pre.rc1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ea06b539d343f1a13e2eb22f3cd2e9662ce3d9da
- data.tar.gz: 39164028e5902f05a86a4c1e1f1890bbe8065dfa
+ metadata.gz: c9a394115d873b122e18a6970e62dfbc897dd690
+ data.tar.gz: 55fc07c13fa04c82c5e1c6a1075ffd97793b01f9
  SHA512:
- metadata.gz: 22472b47e7a5e87758c2bf8712bd76d227e7c6589afd641060e93e8265fdb628abf8a236dbcf1dab0dc6390f0fd6fb680e96ff4998c4ba479f4a5ff334b15a4f
- data.tar.gz: 8bdf2df5606221dcb984ab3850b8922f524dfbaf12d80cb38c11f65632327d0567b6c17aaff7a51da47e77e88fa27e360045dbdf38af2f3dc02cec3384f15090
+ metadata.gz: 5ae484de037763f04487ad8c1ea4047a78a726156781cffeebf0c4c14ae0dfb70029b5f834c1414af9ee43c2b67fb7b137e5d5e335c2dee01abe9aae6ced1099
+ data.tar.gz: 63c85408015248ca66448025b2eb8ff6b65080c3da375b2e5b9723e944e3836f73285cb083088a31c1c8d60fe6176f57c47bc2d894fc7618c5853b3aebc3d500
data/.travis.yml CHANGED
@@ -3,7 +3,8 @@ sudo: false
  cache: bundler
  rvm:
  - 2.2.5
- - 2.3.1
+ - 2.3.3
+ - 2.4.0
 
  before_install: gem update bundler
 
data/README.md CHANGED
@@ -1,5 +1,3 @@
- ## Caution!! This product is NOT production-ready.
-
  # DaimonSkycrawlers
 
  [![Gem Version](https://badge.fury.io/rb/daimon_skycrawlers.svg)](https://badge.fury.io/rb/daimon_skycrawlers)
data/Rakefile CHANGED
@@ -6,7 +6,9 @@ require "daimon_skycrawlers/tasks"
  Rake::TestTask.new(:test) do |t|
    t.libs << "test"
    t.libs << "lib"
-   t.test_files = FileList["test/**/*_test.rb"]
+   t.test_files = FileList["test/**/test_*.rb"]
+   t.verbose = false
+   t.warning = false
  end
 
  task :default => [:test]
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
  spec.add_dependency "webrobots"
 
  spec.add_development_dependency "bundler", "~> 1.11"
- spec.add_development_dependency "rake", "~> 10.0"
+ spec.add_development_dependency "rake", "~> 12.0"
  spec.add_development_dependency "test-unit"
  spec.add_development_dependency "test-unit-rr"
  spec.add_development_dependency "test-unit-notify"
@@ -3,6 +3,7 @@ module DaimonSkycrawlers
  def initialize
    super
    @before_process_callbacks = []
+   @after_process_callbacks = []
  end
 
  def before_process(callback = nil, &block)
@@ -22,5 +23,23 @@ module DaimonSkycrawlers
  def clear_before_process_callbacks
    @before_process_callbacks = []
  end
+
+ def after_process(callback = nil, &block)
+   if block_given?
+     @after_process_callbacks << block
+   else
+     @after_process_callbacks << callback if callback.respond_to?(:call)
+   end
+ end
+
+ def run_after_callbacks(message)
+   @after_process_callbacks.each do |callback|
+     callback.call(message)
+   end
+ end
+
+ def clear_after_process_callbacks
+   @after_process_callbacks = []
+ end
  end
  end
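The new `after_process` hooks mirror the existing `before_process` API: a callback can be registered either as a block or as any object responding to `#call`, on anything that includes DaimonSkycrawlers::Callbacks (shown here as `crawler`). A minimal, hedged sketch; `NotifySomewhere` is a hypothetical callable used only for illustration:

    # Hypothetical callable, only to show the object form of the API.
    class NotifySomewhere
      def call(message)
        puts "after process: #{message.inspect}"
      end
    end

    crawler.after_process { |message| puts "done: #{message[:url]}" } # block form
    crawler.after_process(NotifySomewhere.new)                        # callable form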
@@ -3,6 +3,7 @@ require "daimon_skycrawlers/crawler"
  require "daimon_skycrawlers/processor"
  require "daimon_skycrawlers/sitemap_parser"
  require "daimon_skycrawlers/version"
+ require "thor"
  require "webrobots"
 
  module DaimonSkycrawlers
@@ -47,6 +48,45 @@ module DaimonSkycrawlers
  end
  end
 
+ desc "list PATH", "Enqueue URLs from PATH. PATH content includes a URL per line"
+ method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
+ def list(path)
+   load_init
+   File.open(path, "r") do |file|
+     file.each_line do |line|
+       line.chomp!
+       next if /\A#/ =~ line
+       case options["type"]
+       when "response"
+         DaimonSkycrawlers::Processor.enqueue_http_response(line)
+       when "url"
+         DaimonSkycrawlers::Crawler.enqueue_url(line)
+       else
+         raise ArgumentError, "Unknown type: #{options["type"]}"
+       end
+     end
+   end
+ end
+
+ desc "yaml PATH", "Enqueue URLs from PATH."
+ method_option("type", aliases: ["-t"], type: :string, default: "url", desc: "Specify type for URLs")
+ def yaml(path)
+   load_init
+   YAML.load_file(path).each do |hash|
+     url = hash["url"]
+     message = hash["message"] || {}
+     raise "Could not find URL: #{hash}" unless url
+     case options["type"]
+     when "response"
+       DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+     when "url"
+       DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+     else
+       raise ArgumentError, "Unknown type: #{options["type"]}"
+     end
+   end
+ end
+
  private
 
  def load_init
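For reference, the two new enqueue commands expect inputs like the following; file names, URLs, and the message contents are illustrative only. `list` reads a plain-text file with one URL per line, skipping lines that start with `#`; `yaml` reads an array of hashes with a required `url` key and an optional `message` hash that is passed through to the queue:

    # urls.txt (for the list command); "#" lines are ignored
    https://example.com/
    https://example.com/news

    # urls.yml (for the yaml command)
    - url: https://example.com/
    - url: https://example.com/news
      message:
        depth: 3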
@@ -11,7 +11,7 @@ module DaimonSkycrawlers
  def crawler
    load_init
    load_crawlers
-   require(File.expand_path("app/crawler.rb", Dri.pwd))
+   require(File.expand_path("app/crawler.rb", Dir.pwd))
    DaimonSkycrawlers::Crawler.run
  rescue => ex
    puts ex.message
@@ -1,5 +1,6 @@
  require "uri"
  require "faraday"
+ require "typhoeus/adapters/faraday"
 
  require "daimon_skycrawlers/logger"
  require "daimon_skycrawlers/config"
@@ -10,6 +11,8 @@ require "daimon_skycrawlers/processor"
  require "daimon_skycrawlers/filter/update_checker"
  require "daimon_skycrawlers/filter/robots_txt_checker"
 
+ Faraday.default_adapter = :typhoeus
+
  module DaimonSkycrawlers
  module Crawler
  #
@@ -88,10 +91,10 @@ module DaimonSkycrawlers
  @n_processed_urls += 1
 
  setup_default_filters
+ setup_default_post_processes
 
  proceeding = run_before_callbacks(message)
  unless proceeding
-   @skipped = true
    skip(message[:url])
    return
  end
@@ -101,7 +104,10 @@ module DaimonSkycrawlers
  url = (URI(connection.url_prefix) + url).to_s
 
  @prepare.call(connection)
- fetch(url, message, &block)
+ response = fetch(url, message, &block)
+ data = { url: url, message: message, response: response }
+ run_after_callbacks(data)
+ data
  end
 
  def fetch(path, message = {})
@@ -137,8 +143,17 @@ module DaimonSkycrawlers
  end
  end
 
+ def setup_default_post_processes
+   after_process do |data|
+     storage.save(data)
+     message = data[:message]
+     url = data[:url]
+     schedule_to_process(url, message)
+   end
+ end
+
  def skip(url)
-   log.info("Skip #{url}")
+   log.info("Skipped '#{url}' by '#{self.class}'")
    @skipped = true
    schedule_to_process(url.to_s, heartbeat: true)
  end
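With this change `process` wraps each fetch result in a hash with `:url`, `:message`, and `:response` keys and runs it through the after-process callbacks; the default callback saves the page to storage and schedules it for processing. A hedged sketch of stacking a custom callback on top of that (the `crawler` variable stands for any crawler built on this base class and is illustrative):

    # Illustrative only: the data hash shape comes from the hunks above;
    # the response is a Faraday response object.
    crawler.after_process do |data|
      puts "#{data[:url]} -> HTTP #{data[:response].status} (#{data[:response].body.bytesize} bytes)"
    end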
@@ -9,13 +9,13 @@ module DaimonSkycrawlers
  #
  class Default < Base
    def fetch(url, message)
-     response = get(url)
-     data = [url.to_s, response.headers, response.body]
-
-     yield(*data) if block_given?
-
-     storage.save(*data)
-     schedule_to_process(url.to_s, message)
+     params = message[:params] || {}
+     method = message[:method] || "GET"
+     if method == "POST"
+       post(url, params)
+     else
+       get(url, params)
+     end
    end
  end
  end
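`Crawler::Default#fetch` now takes the HTTP verb and parameters from the queued message instead of always issuing a bare GET. A hedged example of enqueueing a POST; the URL and params are illustrative, and whether the message hash round-trips with symbol keys depends on the queue serialization:

    # Illustrative sketch: enqueue a POST with form params; a plain URL with no
    # message still defaults to GET with empty params.
    DaimonSkycrawlers::Crawler.enqueue_url("https://example.com/search",
                                           method: "POST",
                                           params: { q: "daimon" })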
@@ -39,6 +39,7 @@ module DaimonSkycrawlers
  }
  invoke(MigrationGenerator, [
    "CreatePages",
+   "key:string",
    "url:string",
    "headers:text",
    "body:binary",
@@ -56,6 +57,8 @@ module DaimonSkycrawlers
  <<-CODE.chomp
  #{indent}t.timestamps
 
+ #{indent}t.index [:key]
+ #{indent}t.index [:key, :updated_at]
  #{indent}t.index [:url]
  #{indent}t.index [:url, :updated_at]
  CODE
@@ -95,7 +98,7 @@ module DaimonSkycrawlers
 
  def display_post_message
    puts <<MESSAGE
- Check .env and .env.db before run `docker-compose build` or `docker-compose up`.
+ Check .env and .env.db before running `docker-compose build` or `docker-compose up`.
  MESSAGE
  end
  end
@@ -24,7 +24,7 @@ module DaimonSkycrawlers
  You can register your processor in `app/processor.rb` to run your processor.
  Following code snippet is useful:
 
- processor = #{naem.classify}.new
+ processor = #{name.classify}.new
  DaimonSkycrawlers.register_processor(processor)
 
  MESSAGE
@@ -11,9 +11,23 @@ module DaimonSkycrawlers
  include DaimonSkycrawlers::Callbacks
  include DaimonSkycrawlers::Configurable
 
+ def initialize
+   super
+   @skipped = false
+ end
+
+ def skipped?
+   @skipped
+ end
+
  def process(message)
+   @skipped = false
+   setup_default_filters
    proceeding = run_before_callbacks(message)
-   return unless proceeding
+   unless proceeding
+     skip(message[:url])
+     return
+   end
    call(message)
  end
 
@@ -24,6 +38,19 @@ module DaimonSkycrawlers
  def storage
    @storage ||= DaimonSkycrawlers::Storage::RDB.new
  end
+
+ private
+
+ def setup_default_filters
+   before_process do |m|
+     !m[:heartbeat]
+   end
+ end
+
+ def skip(url)
+   log.info("Skipped '#{url}' by '#{self.class}'")
+   @skipped = true
+ end
  end
  end
  end
@@ -5,7 +5,6 @@ module DaimonSkycrawlers
  module Processor
    class Default < Base
      def call(message)
-       return if message[:heartbeat]
        url = message[:url]
        page = storage.find(url)
        headers = JSON.parse(page.headers)
@@ -101,7 +101,6 @@ module DaimonSkycrawlers
  def call(message)
    key_url = message[:url]
    depth = Integer(message[:depth] || 2)
-   return if message[:heartbeat]
    return if depth <= 1
    page = storage.find(key_url)
    @doc = Nokogiri::HTML(page.body)
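The inline `return if message[:heartbeat]` guards could be dropped from both processors above because the processor base class now installs a default `before_process` filter that rejects heartbeat messages before `call` is ever reached (see the `setup_default_filters` hunk earlier). A hedged illustration, where `processor` stands for any processor built on that base class:

    # Illustrative only: a heartbeat message is filtered out and marked skipped
    # instead of reaching call(message).
    processor.process(url: "https://example.com/", heartbeat: true)
    processor.skipped? # => true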
@@ -6,6 +6,9 @@ require "uri"
  module DaimonSkycrawlers
    # Based on https://github.com/benbalter/sitemap-parser
    class SitemapParser
+     class Error < StandardError
+     end
+
      def initialize(urls, options = {})
        @urls = urls
      end
@@ -14,7 +17,8 @@ module DaimonSkycrawlers
  hydra = Typhoeus::Hydra.new(max_concurrency: 1)
  sitemap_urls = []
  @urls.each do |url|
-   if URI(url).scheme.start_with?("http")
+   uri = URI(url)
+   if uri.scheme && uri.scheme.start_with?("http")
      request = Typhoeus::Request.new(url, followlocation: true)
      request.on_complete do |response|
        sitemap_urls.concat(on_complete(response))
@@ -22,18 +26,21 @@ module DaimonSkycrawlers
      hydra.queue(request)
    else
      if File.exist?(url)
-       extract_urls(File.read(url))
+       sitemap_urls.concat(extract_urls(File.read(url)))
      end
    end
  end
- hydra.run
+ loop do
+   hydra.run
+   break if hydra.queued_requests.empty?
+ end
  sitemap_urls
  end
 
  private
 
  def on_complete(response)
-   raise "HTTP requset to #{response.effective_url} failed" unless response.success?
+   raise Error, "HTTP requset to #{response.effective_url} failed. status: #{response.code}" unless response.success?
    raw_sitemap = inflate_response(response)
    extract_urls(raw_sitemap)
  end
@@ -51,7 +58,7 @@ module DaimonSkycrawlers
      url.at("loc").content
    end
  else
-   raise "Malformed sitemap.xml no <sitemapindex> or <urlset>"
+   raise Error, "Malformed sitemap.xml no <sitemapindex> or <urlset>"
  end
  end
 
@@ -72,7 +79,7 @@ module DaimonSkycrawlers
  when "deflate", "gzip", "x-gzip"
    true
  else
-   signature = response.body[0, 2]
+   signature = response.body[0, 2].b
    signature == "\x1F\x8B".b
  end
  end
@@ -7,7 +7,7 @@ module DaimonSkycrawlers
  include DaimonSkycrawlers::LoggerMixin
  include DaimonSkycrawlers::ConfigMixin
 
- def save(url, headers, body)
+ def save(data)
    raise "Implement this in subclass"
  end
 
@@ -11,35 +11,46 @@ module DaimonSkycrawlers
    @base_dir = Pathname(base_dir)
  end
 
- def save(url, headers, body)
+ def save(data)
+   url = data[:url]
+   message = data[:message]
+   key = message[:key]
+   response = data[:response]
+   headers = response.headers
+   body = response.body
    @base_dir.mkpath
-   body_path(url).dirname.mkpath
-   body_path(url).open("wb+") do |file|
+   body_path(url, key).dirname.mkpath
+   body_path(url, key).open("wb+") do |file|
      file.write(body)
    end
-   headers_path(url).open("wb+") do |file|
+   headers_path(url, key).open("wb+") do |file|
      file.write(JSON.generate(headers))
    end
  end
 
- def read(url)
-   headers = JSON.parse(headers_path(url).read)
-   body = body_path(url).read
-   Page.new(url, headers, body, headers["last-modified"], headers["etag"])
+ def read(url, message)
+   key = message[:key]
+   headers = JSON.parse(headers_path(url, key).read)
+   body = body_path(url, key).read
+   Page.new(url, key, headers, body, headers["last-modified"], headers["etag"])
  end
 
- Page = Struct.new(:url, :headers, :body, :last_modified, :etag)
+ Page = Struct.new(:url, :key, :headers, :body, :last_modified, :etag)
 
  private
 
- def body_path(url)
+ def body_path(url, key)
    url = URI(url)
-   @base_dir + ".#{url.path}"
+   if key
+     @base_dir + key
+   else
+     @base_dir + ".#{url.path}"
+   end
  end
 
- def headers_path(url)
+ def headers_path(url, key)
    url = URI(url)
-   Pathname("#{body_path(url)}-headers.json")
+   Pathname("#{body_path(url, key)}-headers.json")
  end
  end
  end
@@ -10,7 +10,7 @@ module DaimonSkycrawlers
  #
  # Save nothing
  #
- def save(url, headers, body)
+ def save(data)
  end
 
  #
@@ -16,12 +16,20 @@ module DaimonSkycrawlers
  #
  # Save
  #
- # @param [String] url identity of the page
- # @param [Hash] header of URL
- # @param [String] body
+ # @param [Hash] data has following keys
+ #   * :url: URL
+ #   * :message: Given message
+ #   * :response: HTTP response
  #
- def save(url, headers, body)
+ def save(data)
+   url = data[:url]
+   message = data[:message]
+   key = message[:key] || url
+   response = data[:response]
+   headers = response.headers
+   body = response.body
    Page.create(url: url,
+               key: key,
                headers: JSON.generate(headers),
                body: body,
                last_modified_at: headers["last-modified"],
@@ -33,8 +41,13 @@ module DaimonSkycrawlers
  #
  # @param [String] url identity of the page
  #
- def find(url)
-   Page.where(url: url).order(updated_at: :desc).limit(1).first
+ def find(url, message = {})
+   key = message[:key]
+   if key
+     Page.where(key: key).order(updated_at: :desc).limit(1).first
+   else
+     Page.where(url: url).order(updated_at: :desc).limit(1).first
+   end
  end
  end
  class Base < ActiveRecord::Base
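Storage lookups can now be keyed by an explicit `message[:key]` instead of the URL, which matters when the same URL is fetched with different parameters. A hedged sketch against the RDB storage; the URL and key values are illustrative:

    storage = DaimonSkycrawlers::Storage::RDB.new
    # With no key, find falls back to matching on the URL column.
    latest_by_url = storage.find("https://example.com/news")
    # With a key, the key column is used and the URL argument is ignored.
    latest_by_key = storage.find("https://example.com/news", key: "example-news")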
@@ -8,7 +8,11 @@ module DaimonSkycrawlers
  def setup_shutdown_timer(queue_name_prefix, interval: 10)
    timers = Timers::Group.new
    timer = timers.after(interval) do
-     Process.kill(:INT, 0)
+     if block_given?
+       yield
+     else
+       Process.kill(:INT, 0)
+     end
    end
    Thread.new(timers) do |t|
      loop { t.wait }
@@ -19,6 +23,7 @@ module DaimonSkycrawlers
      DaimonSkycrawlers.configuration.logger.debug("Reset timer: consume message #{queue_name}")
      timer.reset
    end
+   timers
  end
  end
  end
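`setup_shutdown_timer` now yields to a caller-supplied block when one is given, falling back to the old `Process.kill(:INT, 0)` behaviour otherwise, and returns the `Timers::Group` so the caller keeps a handle on it. A hedged sketch, assuming it is called from a context where this helper is mixed in; the queue prefix and shutdown logic are illustrative:

    # Illustrative only: run custom shutdown logic after 30 idle seconds instead
    # of signalling the whole process group.
    timers = setup_shutdown_timer("daimon_skycrawlers", interval: 30) do
      DaimonSkycrawlers.configuration.logger.info("No messages for 30s, shutting down")
      # stop consumers / exit here
    end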
@@ -1,3 +1,3 @@
  module DaimonSkycrawlers
-   VERSION = "0.12.0"
+   VERSION = "1.0.0-rc1"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: daimon_skycrawlers
  version: !ruby/object:Gem::Version
- version: 0.12.0
+ version: 1.0.0.pre.rc1
  platform: ruby
  authors:
  - daimon developers
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-12-20 00:00:00.000000000 Z
+ date: 2017-01-31 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: thor
@@ -184,14 +184,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.0'
+ version: '12.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.0'
+ version: '12.0'
  - !ruby/object:Gem::Dependency
  name: test-unit
  requirement: !ruby/object:Gem::Requirement
@@ -442,12 +442,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - ">"
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.3.1
  requirements: []
  rubyforge_project:
- rubygems_version: 2.6.4
+ rubygems_version: 2.6.8
  signing_key:
  specification_version: 4
  summary: This is a crawler framework.