daimon_skycrawlers 1.0.0.pre.rc1 → 1.0.0.pre.rc2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +11 -0
- data/daimon_skycrawlers.gemspec +1 -0
- data/lib/daimon_skycrawlers.rb +32 -2
- data/lib/daimon_skycrawlers/callbacks.rb +32 -2
- data/lib/daimon_skycrawlers/cli.rb +4 -0
- data/lib/daimon_skycrawlers/commands/enqueue.rb +4 -1
- data/lib/daimon_skycrawlers/commands/runner.rb +2 -0
- data/lib/daimon_skycrawlers/config.rb +1 -0
- data/lib/daimon_skycrawlers/configurable.rb +6 -1
- data/lib/daimon_skycrawlers/consumer.rb +3 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +5 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +1 -1
- data/lib/daimon_skycrawlers/consumer/url.rb +1 -1
- data/lib/daimon_skycrawlers/crawler.rb +5 -2
- data/lib/daimon_skycrawlers/crawler/base.rb +56 -8
- data/lib/daimon_skycrawlers/crawler/default.rb +9 -1
- data/lib/daimon_skycrawlers/filter.rb +3 -0
- data/lib/daimon_skycrawlers/filter/base.rb +12 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +2 -2
- data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +1 -1
- data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -2
- data/lib/daimon_skycrawlers/generator/crawler.rb +4 -1
- data/lib/daimon_skycrawlers/generator/filter.rb +4 -1
- data/lib/daimon_skycrawlers/generator/generate.rb +3 -0
- data/lib/daimon_skycrawlers/generator/new.rb +5 -1
- data/lib/daimon_skycrawlers/generator/processor.rb +4 -1
- data/lib/daimon_skycrawlers/logger.rb +8 -0
- data/lib/daimon_skycrawlers/processor.rb +5 -2
- data/lib/daimon_skycrawlers/processor/base.rb +28 -2
- data/lib/daimon_skycrawlers/processor/default.rb +7 -1
- data/lib/daimon_skycrawlers/processor/proc.rb +6 -0
- data/lib/daimon_skycrawlers/processor/spider.rb +2 -2
- data/lib/daimon_skycrawlers/queue.rb +31 -0
- data/lib/daimon_skycrawlers/sitemap_parser.rb +23 -1
- data/lib/daimon_skycrawlers/storage.rb +3 -0
- data/lib/daimon_skycrawlers/storage/base.rb +21 -1
- data/lib/daimon_skycrawlers/storage/file.rb +16 -0
- data/lib/daimon_skycrawlers/storage/null.rb +2 -2
- data/lib/daimon_skycrawlers/storage/rdb.rb +25 -7
- data/lib/daimon_skycrawlers/timer.rb +9 -0
- data/lib/daimon_skycrawlers/version.rb +4 -1
- data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -1
- data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
- data/{lib/daimon_skycrawlers/generator/templates → templates}/crawler.rb.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/filter.rb.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile.db +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Gemfile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/README.md.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Rakefile +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/crawler.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/processor.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/database.yml.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/init.rb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/docker-compose.yml.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.db.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.erb +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/common/docker-entrypoint.sh +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/db/init-user-db.sh +0 -0
- data/{lib/daimon_skycrawlers/generator/templates → templates}/processor.rb.erb +0 -0
- metadata +34 -19
@@ -3,15 +3,35 @@ require "daimon_skycrawlers/config"
|
|
3
3
|
|
4
4
|
module DaimonSkycrawlers
|
5
5
|
module Storage
|
6
|
+
#
|
7
|
+
# Base class of storage implementation
|
8
|
+
#
|
6
9
|
class Base
|
7
10
|
include DaimonSkycrawlers::LoggerMixin
|
8
11
|
include DaimonSkycrawlers::ConfigMixin
|
9
12
|
|
13
|
+
#
|
14
|
+
# Save data to storage
|
15
|
+
#
|
16
|
+
# Override this method in subclass
|
17
|
+
#
|
18
|
+
# @param data [Hash] data has following keys
|
19
|
+
# * `:url`: URL
|
20
|
+
# * `:message`: Given message
|
21
|
+
# * `:response`: HTTP response
|
22
|
+
#
|
10
23
|
def save(data)
|
11
24
|
raise "Implement this in subclass"
|
12
25
|
end
|
13
26
|
|
14
|
-
|
27
|
+
#
|
28
|
+
# Fetch page identified by url
|
29
|
+
#
|
30
|
+
# Override this method in subclass
|
31
|
+
#
|
32
|
+
# @param url [String] the key to find data in storage
|
33
|
+
#
|
34
|
+
def read(url, message = {})
|
15
35
|
raise "Implement this in subclass"
|
16
36
|
end
|
17
37
|
end
|
@@ -11,6 +11,14 @@ module DaimonSkycrawlers
|
|
11
11
|
@base_dir = Pathname(base_dir)
|
12
12
|
end
|
13
13
|
|
14
|
+
#
|
15
|
+
# Save data to files under base directory
|
16
|
+
#
|
17
|
+
# @param data [Hash] data has following keys
|
18
|
+
# * `:url`: URL
|
19
|
+
# * `:message`: Given message
|
20
|
+
# * `:response`: HTTP response
|
21
|
+
#
|
14
22
|
def save(data)
|
15
23
|
url = data[:url]
|
16
24
|
message = data[:message]
|
@@ -28,6 +36,11 @@ module DaimonSkycrawlers
|
|
28
36
|
end
|
29
37
|
end
|
30
38
|
|
39
|
+
#
|
40
|
+
# Read data from files under base directory
|
41
|
+
#
|
42
|
+
# @return [DaimonSkycrawlers::Storage::File::Page]
|
43
|
+
#
|
31
44
|
def read(url, message)
|
32
45
|
key = message[:key]
|
33
46
|
headers = JSON.parse(headers_path(url, key).read)
|
@@ -35,6 +48,9 @@ module DaimonSkycrawlers
|
|
35
48
|
Page.new(url, key, headers, body, headers["last-modified"], headers["etag"])
|
36
49
|
end
|
37
50
|
|
51
|
+
#
|
52
|
+
# Page for file storage
|
53
|
+
#
|
38
54
|
Page = Struct.new(:url, :key, :headers, :body, :last_modified, :etag)
|
39
55
|
|
40
56
|
private
|
@@ -14,12 +14,12 @@ module DaimonSkycrawlers
|
|
14
14
|
end
|
15
15
|
|
16
16
|
#
|
17
|
-
# Save
|
17
|
+
# Save data to RDB
|
18
18
|
#
|
19
|
-
# @param [Hash] data has following keys
|
20
|
-
# *
|
21
|
-
# *
|
22
|
-
# *
|
19
|
+
# @param data [Hash] data has following keys
|
20
|
+
# * `:url`: URL
|
21
|
+
# * `:message`: Given message
|
22
|
+
# * `:response`: HTTP response
|
23
23
|
#
|
24
24
|
def save(data)
|
25
25
|
url = data[:url]
|
@@ -39,9 +39,10 @@ module DaimonSkycrawlers
|
|
39
39
|
#
|
40
40
|
# Fetch page identified by url
|
41
41
|
#
|
42
|
-
# @param [String]
|
42
|
+
# @param url [String] identity of the page
|
43
|
+
# @param message [Hash] this hash may include `:key` to find page
|
43
44
|
#
|
44
|
-
def
|
45
|
+
def read(url, message = {})
|
45
46
|
key = message[:key]
|
46
47
|
if key
|
47
48
|
Page.where(key: key).order(updated_at: :desc).limit(1).first
|
@@ -50,10 +51,27 @@ module DaimonSkycrawlers
|
|
50
51
|
end
|
51
52
|
end
|
52
53
|
|
54
|
+
# @private
|
53
55
|
class Base < ActiveRecord::Base
|
54
56
|
self.abstract_class = true
|
55
57
|
end
|
56
58
|
|
59
|
+
#
|
60
|
+
# Model represents page
|
61
|
+
#
|
62
|
+
# * key
|
63
|
+
# * The key to identify page
|
64
|
+
# * url
|
65
|
+
# * The URL of page
|
66
|
+
# * headers
|
67
|
+
# * HTTP response header
|
68
|
+
# * body
|
69
|
+
# * HTTP response body
|
70
|
+
# * last_modified_at
|
71
|
+
# * Last-Modified header
|
72
|
+
# * etag
|
73
|
+
# * ETag header
|
74
|
+
#
|
57
75
|
class Page < Base
|
58
76
|
self.table_name = "pages"
|
59
77
|
end
|
@@ -2,9 +2,18 @@ require "timers"
|
|
2
2
|
require "daimon_skycrawlers"
|
3
3
|
|
4
4
|
module DaimonSkycrawlers
|
5
|
+
#
|
6
|
+
# Name space for timer
|
7
|
+
#
|
5
8
|
module Timer
|
6
9
|
module_function
|
7
10
|
|
11
|
+
# Setup timer for shutdown
|
12
|
+
#
|
13
|
+
# @param queue_name_prefix [String] prefix of queue name
|
14
|
+
# @param interval [Numeric] shut down this many seconds after the queue becomes empty
|
15
|
+
# @return [Timers::Group] timers
|
16
|
+
#
|
8
17
|
def setup_shutdown_timer(queue_name_prefix, interval: 10)
|
9
18
|
timers = Timers::Group.new
|
10
19
|
timer = timers.after(interval) do
|
@@ -7,7 +7,7 @@ class AmazonRanking < DaimonSkycrawlers::Processor::Base
|
|
7
7
|
Item = Struct.new(:rank, :name, :url, :star, :review)
|
8
8
|
def call(message)
|
9
9
|
url = message[:url]
|
10
|
-
page = storage.
|
10
|
+
page = storage.read(url)
|
11
11
|
doc = Nokogiri::HTML(page.body)
|
12
12
|
ranking = []
|
13
13
|
doc.search(".zg_itemRow").each do |item|
|
@@ -7,7 +7,7 @@ require_relative "../models/itp_shop"
|
|
7
7
|
class ItpProcessor < DaimonSkycrawlers::Processor::Base
|
8
8
|
def call(message)
|
9
9
|
key_url = message[:url]
|
10
|
-
page = storage.
|
10
|
+
page = storage.read(key_url)
|
11
11
|
@doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
|
12
12
|
ItpShop.transaction do
|
13
13
|
prepare_shops do |shop|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/db/init-user-db.sh
RENAMED
File without changes
|
File without changes
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.pre.rc1
|
4
|
+
version: 1.0.0.pre.rc2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- daimon developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-02-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -248,6 +248,20 @@ dependencies:
|
|
248
248
|
- - ">="
|
249
249
|
- !ruby/object:Gem::Version
|
250
250
|
version: '0'
|
251
|
+
- !ruby/object:Gem::Dependency
|
252
|
+
name: redcarpet
|
253
|
+
requirement: !ruby/object:Gem::Requirement
|
254
|
+
requirements:
|
255
|
+
- - ">="
|
256
|
+
- !ruby/object:Gem::Version
|
257
|
+
version: '0'
|
258
|
+
type: :development
|
259
|
+
prerelease: false
|
260
|
+
version_requirements: !ruby/object:Gem::Requirement
|
261
|
+
requirements:
|
262
|
+
- - ">="
|
263
|
+
- !ruby/object:Gem::Version
|
264
|
+
version: '0'
|
251
265
|
- !ruby/object:Gem::Dependency
|
252
266
|
name: tapp
|
253
267
|
requirement: !ruby/object:Gem::Requirement
|
@@ -313,6 +327,7 @@ extra_rdoc_files: []
|
|
313
327
|
files:
|
314
328
|
- ".gitignore"
|
315
329
|
- ".travis.yml"
|
330
|
+
- ".yardopts"
|
316
331
|
- Gemfile
|
317
332
|
- LICENSE.txt
|
318
333
|
- README.md
|
@@ -344,23 +359,6 @@ files:
|
|
344
359
|
- lib/daimon_skycrawlers/generator/generate.rb
|
345
360
|
- lib/daimon_skycrawlers/generator/new.rb
|
346
361
|
- lib/daimon_skycrawlers/generator/processor.rb
|
347
|
-
- lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
|
348
|
-
- lib/daimon_skycrawlers/generator/templates/filter.rb.erb
|
349
|
-
- lib/daimon_skycrawlers/generator/templates/new/Dockerfile
|
350
|
-
- lib/daimon_skycrawlers/generator/templates/new/Dockerfile.db
|
351
|
-
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
352
|
-
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
353
|
-
- lib/daimon_skycrawlers/generator/templates/new/Rakefile
|
354
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb
|
355
|
-
- lib/daimon_skycrawlers/generator/templates/new/app/processor.rb
|
356
|
-
- lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
|
357
|
-
- lib/daimon_skycrawlers/generator/templates/new/config/init.rb
|
358
|
-
- lib/daimon_skycrawlers/generator/templates/new/docker-compose.yml.erb
|
359
|
-
- lib/daimon_skycrawlers/generator/templates/new/env.db.erb
|
360
|
-
- lib/daimon_skycrawlers/generator/templates/new/env.erb
|
361
|
-
- lib/daimon_skycrawlers/generator/templates/new/services/common/docker-entrypoint.sh
|
362
|
-
- lib/daimon_skycrawlers/generator/templates/new/services/db/init-user-db.sh
|
363
|
-
- lib/daimon_skycrawlers/generator/templates/processor.rb.erb
|
364
362
|
- lib/daimon_skycrawlers/logger.rb
|
365
363
|
- lib/daimon_skycrawlers/processor.rb
|
366
364
|
- lib/daimon_skycrawlers/processor/base.rb
|
@@ -427,6 +425,23 @@ files:
|
|
427
425
|
- sample/spider/config/init.rb
|
428
426
|
- sample/spider/db/migrate/20160830155803_create_pages.rb
|
429
427
|
- sample/spider/db/schema.rb
|
428
|
+
- templates/crawler.rb.erb
|
429
|
+
- templates/filter.rb.erb
|
430
|
+
- templates/new/Dockerfile
|
431
|
+
- templates/new/Dockerfile.db
|
432
|
+
- templates/new/Gemfile
|
433
|
+
- templates/new/README.md.erb
|
434
|
+
- templates/new/Rakefile
|
435
|
+
- templates/new/app/crawler.rb
|
436
|
+
- templates/new/app/processor.rb
|
437
|
+
- templates/new/config/database.yml.erb
|
438
|
+
- templates/new/config/init.rb
|
439
|
+
- templates/new/docker-compose.yml.erb
|
440
|
+
- templates/new/env.db.erb
|
441
|
+
- templates/new/env.erb
|
442
|
+
- templates/new/services/common/docker-entrypoint.sh
|
443
|
+
- templates/new/services/db/init-user-db.sh
|
444
|
+
- templates/processor.rb.erb
|
430
445
|
homepage: https://github.com/bm-sms/daimon_skycrawlers
|
431
446
|
licenses:
|
432
447
|
- MIT
|