daimon_skycrawlers 0.10.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/daimon_skycrawlers.gemspec +1 -0
- data/lib/daimon_skycrawlers/callbacks.rb +26 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +28 -17
- data/lib/daimon_skycrawlers/filter/base.rb +7 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +1 -4
- data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +1 -4
- data/lib/daimon_skycrawlers/filter/update_checker.rb +1 -4
- data/lib/daimon_skycrawlers/generator/templates/new/Dockerfile +7 -2
- data/lib/daimon_skycrawlers/generator/templates/new/docker-compose.yml.erb +8 -0
- data/lib/daimon_skycrawlers/generator/templates/new/env.erb +4 -1
- data/lib/daimon_skycrawlers/generator/templates/new/services/common/docker-entrypoint.sh +9 -3
- data/lib/daimon_skycrawlers/generator/templates/processor.rb.erb +1 -2
- data/lib/daimon_skycrawlers/processor/base.rb +4 -22
- data/lib/daimon_skycrawlers/version.rb +1 -1
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c8534d5ada19111a31249998b4ff7c935dc44c4
|
4
|
+
data.tar.gz: 77a7c9d78db52e9d40e412c9ced08d024caa1499
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 205234df54f6c985aa14f97fa5e5a967e18c923d7bf0fc4b29fe3d3c40c85d88667987b6bc7ca9d389dd513b9cb235b699b4741f228ab1151aa7dcb1a5802ebb
|
7
|
+
data.tar.gz: e2090d4983e1ffb7434eef92b76c30795e495f79c89fe9fe3567ba4e7e8f8b650b33870b0ab718d5d1afd799de45d35d7791fbc29e43f4f76bfb23ce510b32e9
|
data/daimon_skycrawlers.gemspec
CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
|
|
37
37
|
spec.add_development_dependency "test-unit-notify"
|
38
38
|
spec.add_development_dependency "pry"
|
39
39
|
spec.add_development_dependency "tapp"
|
40
|
+
spec.add_development_dependency "simplecov"
|
40
41
|
spec.add_development_dependency "sqlite3"
|
41
42
|
spec.add_development_dependency "yard"
|
42
43
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module DaimonSkycrawlers
|
2
|
+
module Callbacks
|
3
|
+
def initialize
|
4
|
+
super
|
5
|
+
@before_process_callbacks = []
|
6
|
+
end
|
7
|
+
|
8
|
+
def before_process(callback = nil, &block)
|
9
|
+
if block_given?
|
10
|
+
@before_process_callbacks << block
|
11
|
+
else
|
12
|
+
@before_process_callbacks << callback if callback.respond_to?(:call)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def run_before_callbacks(message)
|
17
|
+
@before_process_callbacks.all? do |callback|
|
18
|
+
callback.call(message)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def clear_before_process_callbacks
|
23
|
+
@before_process_callbacks = []
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -3,6 +3,7 @@ require "faraday"
|
|
3
3
|
|
4
4
|
require "daimon_skycrawlers/logger"
|
5
5
|
require "daimon_skycrawlers/config"
|
6
|
+
require "daimon_skycrawlers/callbacks"
|
6
7
|
require "daimon_skycrawlers/storage"
|
7
8
|
require "daimon_skycrawlers/processor"
|
8
9
|
require "daimon_skycrawlers/filter/update_checker"
|
@@ -16,6 +17,7 @@ module DaimonSkycrawlers
|
|
16
17
|
class Base
|
17
18
|
include DaimonSkycrawlers::LoggerMixin
|
18
19
|
include DaimonSkycrawlers::ConfigMixin
|
20
|
+
include DaimonSkycrawlers::Callbacks
|
19
21
|
|
20
22
|
# @!attribute [w] storage
|
21
23
|
# Set storage to crawler instance.
|
@@ -80,19 +82,24 @@ module DaimonSkycrawlers
|
|
80
82
|
end
|
81
83
|
|
82
84
|
def process(message, &block)
|
83
|
-
url = message.delete(:url)
|
84
|
-
|
85
85
|
@skipped = false
|
86
86
|
@n_processed_urls += 1
|
87
|
-
# url can be a path
|
88
|
-
url = connection.url_prefix + url
|
89
87
|
|
90
|
-
|
88
|
+
setup_default_filters
|
91
89
|
|
92
|
-
|
93
|
-
|
94
|
-
|
90
|
+
proceeding = run_before_callbacks(message)
|
91
|
+
unless proceeding
|
92
|
+
@skipped = true
|
93
|
+
skip(message[:url])
|
94
|
+
return
|
95
95
|
end
|
96
|
+
|
97
|
+
# url can be a path
|
98
|
+
url = message.delete(:url)
|
99
|
+
url = (URI(connection.url_prefix) + url).to_s
|
100
|
+
|
101
|
+
@prepare.call(connection)
|
102
|
+
fetch(url, message, &block)
|
96
103
|
end
|
97
104
|
|
98
105
|
def fetch(path, message = {})
|
@@ -109,18 +116,22 @@ module DaimonSkycrawlers
|
|
109
116
|
|
110
117
|
private
|
111
118
|
|
112
|
-
def
|
119
|
+
def setup_default_filters
|
113
120
|
if @options[:obey_robots_txt]
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
121
|
+
before_process do |m|
|
122
|
+
robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
|
123
|
+
allowed = robots_txt_checker.allowed?(m)
|
124
|
+
log.debug("Not allowed: #{m[:url]}") unless allowed
|
125
|
+
allowed
|
118
126
|
end
|
119
127
|
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
128
|
+
before_process do |m|
|
129
|
+
update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
|
130
|
+
updated = update_checker.updated?(m, connection: connection)
|
131
|
+
unless updated
|
132
|
+
log.debug("Not updated: #{m[:url]}")
|
133
|
+
end
|
134
|
+
updated
|
124
135
|
end
|
125
136
|
end
|
126
137
|
|
@@ -27,6 +27,13 @@ module DaimonSkycrawlers
|
|
27
27
|
def call(message)
|
28
28
|
raise NotImplementedError, "Must implement this method in subclass"
|
29
29
|
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def normalize_url(url)
|
34
|
+
return url unless @base_url
|
35
|
+
(URI(@base_url) + url).to_s
|
36
|
+
end
|
30
37
|
end
|
31
38
|
end
|
32
39
|
end
|
@@ -21,10 +21,7 @@ module DaimonSkycrawlers
|
|
21
21
|
# @return [true|false] Return false when duplicated, otherwise return true.
|
22
22
|
#
|
23
23
|
def call(message)
|
24
|
-
url = message[:url]
|
25
|
-
unless URI(url).absolute?
|
26
|
-
url = (@base_url + url).to_s
|
27
|
-
end
|
24
|
+
url = normalize_url(message[:url])
|
28
25
|
return false if @urls.include?(url)
|
29
26
|
@urls << url
|
30
27
|
true
|
@@ -20,10 +20,7 @@ module DaimonSkycrawlers
|
|
20
20
|
# @return [true|false] Return true when web site allows to fetch the URL, otherwise return false
|
21
21
|
#
|
22
22
|
def call(message)
|
23
|
-
url = message[:url]
|
24
|
-
unless URI(url).absolute?
|
25
|
-
url = (@base_url + url).to_s
|
26
|
-
end
|
23
|
+
url = normalize_url(message[:url])
|
27
24
|
@webrobots.allowed?(url)
|
28
25
|
end
|
29
26
|
|
@@ -22,10 +22,7 @@ module DaimonSkycrawlers
|
|
22
22
|
# @return [true|false] Return true when need update, otherwise return false
|
23
23
|
#
|
24
24
|
def call(message, connection: nil)
|
25
|
-
url = message[:url]
|
26
|
-
unless URI(url).absolute?
|
27
|
-
url = (@base_url + url).to_s
|
28
|
-
end
|
25
|
+
url = normalize_url(message[:url])
|
29
26
|
page = storage.find(url)
|
30
27
|
return true unless page
|
31
28
|
if connection
|
@@ -1,6 +1,6 @@
|
|
1
1
|
FROM ruby:2.3.1-alpine
|
2
2
|
|
3
|
-
RUN apk --no-cache --update add build-base ruby-dev libxml2-dev postgresql-dev libcurl git
|
3
|
+
RUN apk --no-cache --update add build-base ruby-dev libxml2-dev postgresql-dev libcurl openssl git
|
4
4
|
|
5
5
|
RUN adduser -D -h /home/crawler -g "DaimonSkycrawlers user" -s /bin/sh crawler crawler
|
6
6
|
|
@@ -8,7 +8,12 @@ ARG SKYCRAWLERS_ENV=production
|
|
8
8
|
ARG SKYCRAWLERS_MAIN=crawler
|
9
9
|
ENV SKYCRAWLERS_ENV=$SKYCRAWLERS_ENV \
|
10
10
|
SKYCRAWLERS_MAIN=$SKYCRAWLERS_MAIN \
|
11
|
-
BUNDLE_JOBS=4
|
11
|
+
BUNDLE_JOBS=4 \
|
12
|
+
DOCKERIZE_VERSION=v0.3.0
|
13
|
+
|
14
|
+
RUN wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
|
15
|
+
&& tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
|
16
|
+
&& rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz
|
12
17
|
|
13
18
|
USER crawler
|
14
19
|
WORKDIR /home/crawler
|
@@ -2,11 +2,15 @@ version: "2"
|
|
2
2
|
services:
|
3
3
|
<%= name %>-rabbitmq:
|
4
4
|
image: rabbitmq
|
5
|
+
volumes:
|
6
|
+
- <%= name %>-rabbitmq-storage:/var/lib/rabbitmq
|
5
7
|
|
6
8
|
<%= name %>-db:
|
7
9
|
build:
|
8
10
|
context: .
|
9
11
|
dockerfile: Dockerfile.db
|
12
|
+
volumes:
|
13
|
+
- <%= name %>-db-storage:/var/lib/postgresql/data
|
10
14
|
env_file: .env.db
|
11
15
|
|
12
16
|
<%= name %>-common: &common
|
@@ -33,3 +37,7 @@ services:
|
|
33
37
|
depends_on:
|
34
38
|
- <%= name %>-crawler
|
35
39
|
command: processor
|
40
|
+
|
41
|
+
volumes:
|
42
|
+
<%= name %>-db-storage:
|
43
|
+
<%= name %>-rabbitmq-storage:
|
@@ -1,2 +1,5 @@
|
|
1
1
|
SKYCRAWLERS_RABBITMQ_HOST=<%= name %>-rabbitmq
|
2
|
-
|
2
|
+
SKYCRAWLERS_RABBITMQ_PORT=5672
|
3
|
+
DATABASE_HOST=<%= name %>-db
|
4
|
+
DATABASE_PORT=5432
|
5
|
+
DATABASE_URL=postgres://crawler:<%= config[:password] %>@<%= name %>-db/<%= name %>_development
|
@@ -3,10 +3,13 @@
|
|
3
3
|
set -x
|
4
4
|
|
5
5
|
MAIN=$1
|
6
|
+
dockerize -timeout 60s \
|
7
|
+
-wait tcp://${DATABASE_HOST}:${DATABASE_PORT} \
|
8
|
+
-wait tcp://${SKYCRAWLERS_RABBITMQ_HOST}:${SKYCRAWLERS_RABBITMQ_PORT}
|
6
9
|
case $MAIN in
|
7
10
|
crawler)
|
8
11
|
bundle check || bundle install --retry=3 --path=vendor/bundle \
|
9
|
-
&& bundle exec rake db:
|
12
|
+
&& bundle exec rake db:migrate
|
10
13
|
bundle exec daimon_skycrawlers exec $MAIN
|
11
14
|
;;
|
12
15
|
processor)
|
@@ -14,13 +17,16 @@ case $MAIN in
|
|
14
17
|
sleep 5
|
15
18
|
done
|
16
19
|
bundle check || bundle install --retry=3 --path=vendor/bundle \
|
17
|
-
&& bundle exec rake db:
|
20
|
+
&& bundle exec rake db:migrate
|
18
21
|
bundle exec daimon_skycrawlers exec $MAIN
|
19
22
|
;;
|
20
23
|
setup)
|
21
|
-
bundle install --path=vendor/bundle
|
24
|
+
bundle install --retry=3 --path=vendor/bundle
|
22
25
|
bundle exec rake db:schema:load
|
23
26
|
;;
|
27
|
+
migrate)
|
28
|
+
bundle exec rake db:migrate
|
29
|
+
;;
|
24
30
|
none)
|
25
31
|
echo NOP
|
26
32
|
;;
|
@@ -1,27 +1,17 @@
|
|
1
1
|
require "daimon_skycrawlers/logger"
|
2
2
|
require "daimon_skycrawlers/config"
|
3
|
+
require "daimon_skycrawlers/callbacks"
|
3
4
|
|
4
5
|
module DaimonSkycrawlers
|
5
6
|
module Processor
|
6
7
|
class Base
|
7
8
|
include DaimonSkycrawlers::LoggerMixin
|
8
9
|
include DaimonSkycrawlers::ConfigMixin
|
9
|
-
|
10
|
-
def initialize
|
11
|
-
super
|
12
|
-
@before_process_filters = []
|
13
|
-
end
|
14
|
-
|
15
|
-
def before_process(filter = nil, &block)
|
16
|
-
if block_given?
|
17
|
-
@before_process_filters << block
|
18
|
-
else
|
19
|
-
@before_process_filters << filter if filter.respond_to?(:call)
|
20
|
-
end
|
21
|
-
end
|
10
|
+
include DaimonSkycrawlers::Callbacks
|
22
11
|
|
23
12
|
def process(message)
|
24
|
-
|
13
|
+
proceeding = run_before_callbacks(message)
|
14
|
+
return unless proceeding
|
25
15
|
call(message)
|
26
16
|
end
|
27
17
|
|
@@ -32,14 +22,6 @@ module DaimonSkycrawlers
|
|
32
22
|
def storage
|
33
23
|
@storage ||= DaimonSkycrawlers::Storage::RDB.new
|
34
24
|
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def apply_before_filters(message)
|
39
|
-
@before_process_filters.all? do |filter|
|
40
|
-
filter.call(message)
|
41
|
-
end
|
42
|
-
end
|
43
25
|
end
|
44
26
|
end
|
45
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.0
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- daimon developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-12-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -262,6 +262,20 @@ dependencies:
|
|
262
262
|
- - ">="
|
263
263
|
- !ruby/object:Gem::Version
|
264
264
|
version: '0'
|
265
|
+
- !ruby/object:Gem::Dependency
|
266
|
+
name: simplecov
|
267
|
+
requirement: !ruby/object:Gem::Requirement
|
268
|
+
requirements:
|
269
|
+
- - ">="
|
270
|
+
- !ruby/object:Gem::Version
|
271
|
+
version: '0'
|
272
|
+
type: :development
|
273
|
+
prerelease: false
|
274
|
+
version_requirements: !ruby/object:Gem::Requirement
|
275
|
+
requirements:
|
276
|
+
- - ">="
|
277
|
+
- !ruby/object:Gem::Version
|
278
|
+
version: '0'
|
265
279
|
- !ruby/object:Gem::Dependency
|
266
280
|
name: sqlite3
|
267
281
|
requirement: !ruby/object:Gem::Requirement
|
@@ -307,6 +321,7 @@ files:
|
|
307
321
|
- daimon_skycrawlers.gemspec
|
308
322
|
- db/schema.rb
|
309
323
|
- lib/daimon_skycrawlers.rb
|
324
|
+
- lib/daimon_skycrawlers/callbacks.rb
|
310
325
|
- lib/daimon_skycrawlers/cli.rb
|
311
326
|
- lib/daimon_skycrawlers/commands/enqueue.rb
|
312
327
|
- lib/daimon_skycrawlers/commands/runner.rb
|
@@ -407,7 +422,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
407
422
|
version: '0'
|
408
423
|
requirements: []
|
409
424
|
rubyforge_project:
|
410
|
-
rubygems_version: 2.
|
425
|
+
rubygems_version: 2.6.4
|
411
426
|
signing_key:
|
412
427
|
specification_version: 4
|
413
428
|
summary: This is a crawler framework.
|