daimon_skycrawlers 0.10.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 58dd7f91f6e9da8f9388a3364731ab0a543c01cd
4
- data.tar.gz: b815a1cdad154eaf1b828568a697887df02bfcb6
3
+ metadata.gz: 7c8534d5ada19111a31249998b4ff7c935dc44c4
4
+ data.tar.gz: 77a7c9d78db52e9d40e412c9ced08d024caa1499
5
5
  SHA512:
6
- metadata.gz: 1ffbdd022a6e6a3a80d292bae8927a716f33f339e19e12f67da536812f9118615be0d8886fc143b529b7854f3b4ef0731962f3102aa6c759790d86dd2df54914
7
- data.tar.gz: 1beb4753e8602224c95081651a4da6276659bfe132cdb9dc5fe55e8ae9f2c524681bd9be8137d4e29d52380d0f137ea2630097337bc07aea31de55e3f2785d3c
6
+ metadata.gz: 205234df54f6c985aa14f97fa5e5a967e18c923d7bf0fc4b29fe3d3c40c85d88667987b6bc7ca9d389dd513b9cb235b699b4741f228ab1151aa7dcb1a5802ebb
7
+ data.tar.gz: e2090d4983e1ffb7434eef92b76c30795e495f79c89fe9fe3567ba4e7e8f8b650b33870b0ab718d5d1afd799de45d35d7791fbc29e43f4f76bfb23ce510b32e9
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
37
37
  spec.add_development_dependency "test-unit-notify"
38
38
  spec.add_development_dependency "pry"
39
39
  spec.add_development_dependency "tapp"
40
+ spec.add_development_dependency "simplecov"
40
41
  spec.add_development_dependency "sqlite3"
41
42
  spec.add_development_dependency "yard"
42
43
  end
@@ -0,0 +1,26 @@
1
+ module DaimonSkycrawlers
2
+ module Callbacks
3
+ def initialize
4
+ super
5
+ @before_process_callbacks = []
6
+ end
7
+
8
+ def before_process(callback = nil, &block)
9
+ if block_given?
10
+ @before_process_callbacks << block
11
+ else
12
+ @before_process_callbacks << callback if callback.respond_to?(:call)
13
+ end
14
+ end
15
+
16
+ def run_before_callbacks(message)
17
+ @before_process_callbacks.all? do |callback|
18
+ callback.call(message)
19
+ end
20
+ end
21
+
22
+ def clear_before_process_callbacks
23
+ @before_process_callbacks = []
24
+ end
25
+ end
26
+ end
@@ -3,6 +3,7 @@ require "faraday"
3
3
 
4
4
  require "daimon_skycrawlers/logger"
5
5
  require "daimon_skycrawlers/config"
6
+ require "daimon_skycrawlers/callbacks"
6
7
  require "daimon_skycrawlers/storage"
7
8
  require "daimon_skycrawlers/processor"
8
9
  require "daimon_skycrawlers/filter/update_checker"
@@ -16,6 +17,7 @@ module DaimonSkycrawlers
16
17
  class Base
17
18
  include DaimonSkycrawlers::LoggerMixin
18
19
  include DaimonSkycrawlers::ConfigMixin
20
+ include DaimonSkycrawlers::Callbacks
19
21
 
20
22
  # @!attribute [w] storage
21
23
  # Set storage to crawler instance.
@@ -80,19 +82,24 @@ module DaimonSkycrawlers
80
82
  end
81
83
 
82
84
  def process(message, &block)
83
- url = message.delete(:url)
84
-
85
85
  @skipped = false
86
86
  @n_processed_urls += 1
87
- # url can be a path
88
- url = connection.url_prefix + url
89
87
 
90
- apply_filters(url)
88
+ setup_default_filters
91
89
 
92
- unless skipped?
93
- @prepare.call(connection)
94
- fetch(url, message, &block)
90
+ proceeding = run_before_callbacks(message)
91
+ unless proceeding
92
+ @skipped = true
93
+ skip(message[:url])
94
+ return
95
95
  end
96
+
97
+ # url can be a path
98
+ url = message.delete(:url)
99
+ url = (URI(connection.url_prefix) + url).to_s
100
+
101
+ @prepare.call(connection)
102
+ fetch(url, message, &block)
96
103
  end
97
104
 
98
105
  def fetch(path, message = {})
@@ -109,18 +116,22 @@ module DaimonSkycrawlers
109
116
 
110
117
  private
111
118
 
112
- def apply_filters(url)
119
+ def setup_default_filters
113
120
  if @options[:obey_robots_txt]
114
- robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
115
- unless robots_txt_checker.allowed?({ url: url })
116
- skip(url)
117
- return
121
+ before_process do |m|
122
+ robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
123
+ allowed = robots_txt_checker.allowed?(m)
124
+ log.debug("Not allowed: #{m[:url]}") unless allowed
125
+ allowed
118
126
  end
119
127
  end
120
- update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
121
- unless update_checker.updated?({ url: url.to_s }, connection: connection)
122
- skip(url)
123
- return
128
+ before_process do |m|
129
+ update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
130
+ updated = update_checker.updated?(m, connection: connection)
131
+ unless updated
132
+ log.debug("Not updated: #{m[:url]}")
133
+ end
134
+ updated
124
135
  end
125
136
  end
126
137
 
@@ -27,6 +27,13 @@ module DaimonSkycrawlers
27
27
  def call(message)
28
28
  raise NotImplementedError, "Must implement this method in subclass"
29
29
  end
30
+
31
+ private
32
+
33
+ def normalize_url(url)
34
+ return url unless @base_url
35
+ (URI(@base_url) + url).to_s
36
+ end
30
37
  end
31
38
  end
32
39
  end
@@ -21,10 +21,7 @@ module DaimonSkycrawlers
21
21
  # @return [true|false] Return false when duplicated, otherwise return true.
22
22
  #
23
23
  def call(message)
24
- url = message[:url]
25
- unless URI(url).absolute?
26
- url = (@base_url + url).to_s
27
- end
24
+ url = normalize_url(message[:url])
28
25
  return false if @urls.include?(url)
29
26
  @urls << url
30
27
  true
@@ -20,10 +20,7 @@ module DaimonSkycrawlers
20
20
  # @return [true|false] Return true when web site allows to fetch the URL, otherwise return false
21
21
  #
22
22
  def call(message)
23
- url = message[:url]
24
- unless URI(url).absolute?
25
- url = (@base_url + url).to_s
26
- end
23
+ url = normalize_url(message[:url])
27
24
  @webrobots.allowed?(url)
28
25
  end
29
26
 
@@ -22,10 +22,7 @@ module DaimonSkycrawlers
22
22
  # @return [true|false] Return true when need update, otherwise return false
23
23
  #
24
24
  def call(message, connection: nil)
25
- url = message[:url]
26
- unless URI(url).absolute?
27
- url = (@base_url + url).to_s
28
- end
25
+ url = normalize_url(message[:url])
29
26
  page = storage.find(url)
30
27
  return true unless page
31
28
  if connection
@@ -1,6 +1,6 @@
1
1
  FROM ruby:2.3.1-alpine
2
2
 
3
- RUN apk --no-cache --update add build-base ruby-dev libxml2-dev postgresql-dev libcurl git
3
+ RUN apk --no-cache --update add build-base ruby-dev libxml2-dev postgresql-dev libcurl openssl git
4
4
 
5
5
  RUN adduser -D -h /home/crawler -g "DaimonSkycrawlers user" -s /bin/sh crawler crawler
6
6
 
@@ -8,7 +8,12 @@ ARG SKYCRAWLERS_ENV=production
8
8
  ARG SKYCRAWLERS_MAIN=crawler
9
9
  ENV SKYCRAWLERS_ENV=$SKYCRAWLERS_ENV \
10
10
  SKYCRAWLERS_MAIN=$SKYCRAWLERS_MAIN \
11
- BUNDLE_JOBS=4
11
+ BUNDLE_JOBS=4 \
12
+ DOCKERIZE_VERSION=v0.3.0
13
+
14
+ RUN wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
15
+ && tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
16
+ && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz
12
17
 
13
18
  USER crawler
14
19
  WORKDIR /home/crawler
@@ -2,11 +2,15 @@ version: "2"
2
2
  services:
3
3
  <%= name %>-rabbitmq:
4
4
  image: rabbitmq
5
+ volumes:
6
+ - <%= name %>-rabbitmq-storage:/var/lib/rabbitmq
5
7
 
6
8
  <%= name %>-db:
7
9
  build:
8
10
  context: .
9
11
  dockerfile: Dockerfile.db
12
+ volumes:
13
+ - <%= name %>-db-storage:/var/lib/postgresql/data
10
14
  env_file: .env.db
11
15
 
12
16
  <%= name %>-common: &common
@@ -33,3 +37,7 @@ services:
33
37
  depends_on:
34
38
  - <%= name %>-crawler
35
39
  command: processor
40
+
41
+ volumes:
42
+ <%= name %>-db-storage:
43
+ <%= name %>-rabbitmq-storage:
@@ -1,2 +1,5 @@
1
1
  SKYCRAWLERS_RABBITMQ_HOST=<%= name %>-rabbitmq
2
- DATABASE_URL=postgres://crawler:<%= config[:password] %>@<%= name %>-db/xxx_development
2
+ SKYCRAWLERS_RABBITMQ_PORT=5672
3
+ DATABASE_HOST=<%= name %>-db
4
+ DATABASE_PORT=5432
5
+ DATABASE_URL=postgres://crawler:<%= config[:password] %>@<%= name %>-db/<%= name %>_development
@@ -3,10 +3,13 @@
3
3
  set -x
4
4
 
5
5
  MAIN=$1
6
+ dockerize -timeout 60s \
7
+ -wait tcp://${DATABASE_HOST}:${DATABASE_PORT} \
8
+ -wait tcp://${SKYCRAWLERS_RABBITMQ_HOST}:${SKYCRAWLERS_RABBITMQ_PORT}
6
9
  case $MAIN in
7
10
  crawler)
8
11
  bundle check || bundle install --retry=3 --path=vendor/bundle \
9
- && bundle exec rake db:schema:load || bundle exec rake db:migrate
12
+ && bundle exec rake db:migrate
10
13
  bundle exec daimon_skycrawlers exec $MAIN
11
14
  ;;
12
15
  processor)
@@ -14,13 +17,16 @@ case $MAIN in
14
17
  sleep 5
15
18
  done
16
19
  bundle check || bundle install --retry=3 --path=vendor/bundle \
17
- && bundle exec rake db:schema:load || bundle exec rake db:migrate
20
+ && bundle exec rake db:migrate
18
21
  bundle exec daimon_skycrawlers exec $MAIN
19
22
  ;;
20
23
  setup)
21
- bundle install --path=vendor/bundle
24
+ bundle install --retry=3 --path=vendor/bundle
22
25
  bundle exec rake db:schema:load
23
26
  ;;
27
+ migrate)
28
+ bundle exec rake db:migrate
29
+ ;;
24
30
  none)
25
31
  echo NOP
26
32
  ;;
@@ -8,6 +8,5 @@ class <%= config[:class_name] %> < DaimonSkycrawlers::Processor::Base
8
8
  end
9
9
  end
10
10
 
11
- base_url = ""
12
- processor = <%= config[:class_name] %>.new(base_url)
11
+ processor = <%= config[:class_name] %>.new
13
12
  DaimonSkycrawlers.register_processor(processor)
@@ -1,27 +1,17 @@
1
1
  require "daimon_skycrawlers/logger"
2
2
  require "daimon_skycrawlers/config"
3
+ require "daimon_skycrawlers/callbacks"
3
4
 
4
5
  module DaimonSkycrawlers
5
6
  module Processor
6
7
  class Base
7
8
  include DaimonSkycrawlers::LoggerMixin
8
9
  include DaimonSkycrawlers::ConfigMixin
9
-
10
- def initialize
11
- super
12
- @before_process_filters = []
13
- end
14
-
15
- def before_process(filter = nil, &block)
16
- if block_given?
17
- @before_process_filters << block
18
- else
19
- @before_process_filters << filter if filter.respond_to?(:call)
20
- end
21
- end
10
+ include DaimonSkycrawlers::Callbacks
22
11
 
23
12
  def process(message)
24
- return unless apply_before_filters(message)
13
+ proceeding = run_before_callbacks(message)
14
+ return unless proceeding
25
15
  call(message)
26
16
  end
27
17
 
@@ -32,14 +22,6 @@ module DaimonSkycrawlers
32
22
  def storage
33
23
  @storage ||= DaimonSkycrawlers::Storage::RDB.new
34
24
  end
35
-
36
- private
37
-
38
- def apply_before_filters(message)
39
- @before_process_filters.all? do |filter|
40
- filter.call(message)
41
- end
42
- end
43
25
  end
44
26
  end
45
27
  end
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.10.0"
2
+ VERSION = "0.11.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-29 00:00:00.000000000 Z
11
+ date: 2016-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -262,6 +262,20 @@ dependencies:
262
262
  - - ">="
263
263
  - !ruby/object:Gem::Version
264
264
  version: '0'
265
+ - !ruby/object:Gem::Dependency
266
+ name: simplecov
267
+ requirement: !ruby/object:Gem::Requirement
268
+ requirements:
269
+ - - ">="
270
+ - !ruby/object:Gem::Version
271
+ version: '0'
272
+ type: :development
273
+ prerelease: false
274
+ version_requirements: !ruby/object:Gem::Requirement
275
+ requirements:
276
+ - - ">="
277
+ - !ruby/object:Gem::Version
278
+ version: '0'
265
279
  - !ruby/object:Gem::Dependency
266
280
  name: sqlite3
267
281
  requirement: !ruby/object:Gem::Requirement
@@ -307,6 +321,7 @@ files:
307
321
  - daimon_skycrawlers.gemspec
308
322
  - db/schema.rb
309
323
  - lib/daimon_skycrawlers.rb
324
+ - lib/daimon_skycrawlers/callbacks.rb
310
325
  - lib/daimon_skycrawlers/cli.rb
311
326
  - lib/daimon_skycrawlers/commands/enqueue.rb
312
327
  - lib/daimon_skycrawlers/commands/runner.rb
@@ -407,7 +422,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
407
422
  version: '0'
408
423
  requirements: []
409
424
  rubyforge_project:
410
- rubygems_version: 2.5.1
425
+ rubygems_version: 2.6.4
411
426
  signing_key:
412
427
  specification_version: 4
413
428
  summary: This is a crawler framework.