daimon_skycrawlers 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +34 -26
- data/daimon_skycrawlers.gemspec +2 -0
- data/lib/daimon_skycrawlers/cli.rb +4 -0
- data/lib/daimon_skycrawlers/commands/enqueue.rb +58 -0
- data/lib/daimon_skycrawlers/commands/runner.rb +47 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +17 -3
- data/lib/daimon_skycrawlers/filter/base.rb +8 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +2 -0
- data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +29 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +3 -0
- data/lib/daimon_skycrawlers/generator/new.rb +0 -3
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +12 -0
- metadata +33 -8
- data/lib/daimon_skycrawlers/generator/templates/new/bin/crawler +0 -10
- data/lib/daimon_skycrawlers/generator/templates/new/bin/enqueue +0 -23
- data/lib/daimon_skycrawlers/generator/templates/new/bin/processor +0 -10
- data/sample/spider/bin/crawler +0 -10
- data/sample/spider/bin/enqueue +0 -23
- data/sample/spider/bin/processor +0 -11
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4e4bd1308a554f55bce802a5d0a038cbb0f0470e
+  data.tar.gz: a484737c74cc9ff3304a9fda0714282f9f6b0e61
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c9d4b01cb37808ce43a9786324e6e0d842d67bc44269c39964e4570c5dbab1f0b818449ac6a67864d6834e2b67a1a9f857252fd0d3cc91e87961ad252eccc785
+  data.tar.gz: c1babe4300e744678672482e3a82c8bc5fa37a5e52ebe8e634b3ae0cdaea17adc75c5132a5a8918c2b3dce76b47c783fedea8e911b18f2e6f0bd7adee0d0525b

data/README.md
CHANGED

@@ -33,52 +33,60 @@ Or install it yourself as:
 
 1. Create project
 
-
-
-
-
-
-
-
-
-
+```
+$ bundle exec daimon-skycrawlers new mycrawlers
+$ cd mycrawlers
+```
+or
+```
+$ daimon-skycrawlers new mycrawlers
+$ cd mycrawlers
+```
 
 2. Install dependencies
 
-
-
-
+```
+$ bundle install
+```
 
 3. Create database
 
-
-
-
-
+```
+$ bundle exec rake db:create
+$ bundle exec rake db:migrate
+```
 
 4. Open new terminal and run crawler/processor
 
-
-
-
-
+```
+$ daimon-skycrawlers exec crawler # on new terminal
+$ daimon-skycrawlers exec processor # on new terminal
+```
 
 NOTE: Execute step 5 as soon as possible. Because bin/crawler and
 bin/processor will stop after 10 seconds by default if their
 queues are empty.
 
+NOTE: You can change `shutdown_interval` using following code in config/init.rb:
+
+```ruby
+DaimonSkycrawlers.configure do |config|
+  config.shutdown_interval = 30
+end
+```
+
 5. Enqueue task
 
-
-
-
+```
+$ daimon-skycrawlers enqueue url http://example.com/
+```
 
 6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
 7. You can re-enqueue task for processor
 
-
-
-
+```
+$ daimon-skycrawlers enqueue response http://example.com/
+```
 
 Display `It works with 'http://example.com'` again on your terminal which runs your processor.
 

data/daimon_skycrawlers.gemspec
CHANGED

@@ -28,6 +28,8 @@ Gem::Specification.new do |spec|
   spec.add_dependency "railties"
   spec.add_dependency "pg"
   spec.add_dependency "timers"
+  spec.add_dependency "sitemap-parser"
+  spec.add_dependency "webrobots"
 
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "test-unit"

data/lib/daimon_skycrawlers/cli.rb
CHANGED

@@ -1,10 +1,14 @@
 require "thor"
 require "daimon_skycrawlers/generator/new"
+require "daimon_skycrawlers/commands/enqueue"
+require "daimon_skycrawlers/commands/runner"
 require "daimon_skycrawlers/version"
 
 module DaimonSkycrawlers
   class CLI < Thor
     register(Generator::New, "new", "new NAME", "Create new project")
+    register(Commands::Runner, "exec", "exec [COMMAND]", "Execute crawler/processor")
+    register(Commands::Enqueue, "enqueue", "enqueue [TYPE] URL [messages...]", "Enqueue URL")
 
     desc "version", "Show version"
     def version

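The new subcommands are mounted with Thor's `register`. A minimal standalone sketch of that pattern (the `Greet` and `CLI` names here are illustrative, not part of the gem):

```ruby
require "thor"

# Subcommand class: each public method becomes a task.
class Greet < Thor
  desc "hello NAME", "Say hello to NAME"
  def hello(name)
    puts "Hello, #{name}!"
  end
end

class CLI < Thor
  # register(klass, subcommand_name, usage, description) mounts the
  # class under the given name, e.g. `ruby cli.rb greet hello World`.
  register(Greet, "greet", "greet [COMMAND]", "Greeting commands")
end

CLI.start(ARGV)
```
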
data/lib/daimon_skycrawlers/commands/enqueue.rb
ADDED

@@ -0,0 +1,58 @@
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/crawler"
+require "daimon_skycrawlers/processor"
+require "daimon_skycrawlers/version"
+require "sitemap-parser"
+require "webrobots"
+
+module DaimonSkycrawlers
+  module Commands
+    class Enqueue < Thor
+      desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
+      def url(url, *rest)
+        load_init
+        message = rest.map {|arg| arg.split(":") }.to_h
+        log.debug("Enqueue URL for crawler: #{url} : #{message}")
+        DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+      end
+
+      desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
+      def response(url, *rest)
+        load_init
+        message = rest.map {|arg| arg.split(":") }.to_h
+        log.debug("Enqueue URL for processor: #{url} : #{message}")
+        DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+      end
+
+      desc "sitemap [OPTIONS] URL", "Enqueue URLs from simtemap.xml"
+      method_option("robots-txt", aliases: ["-r"], type: :boolean,
+                    desc: "URL for robots.txt. Detect robots.txt automatically if URL is not robots.txt")
+      def sitemap(url)
+        load_init
+        if options["robots-txt"]
+          webrobots = WebRobots.new("DaimonSkycrawlers/#{DaimonSkycrawlers::VERSION}")
+          sitemaps = webrobots.sitemaps(url).uniq
+        else
+          sitemaps = [url]
+        end
+        urls = sitemaps.flat_map do |sitemap|
+          sitemap_parser = SitemapParser.new(sitemap)
+          sitemap_parser.to_a
+        end
+        urls.each do |_url|
+          DaimonSkycrawlers::Crawler.enqueue_url(_url)
+        end
+      end
+
+      private
+
+      def load_init
+        DaimonSkycrawlers.load_init
+      end
+
+      def log
+        DaimonSkycrawlers.configuration.logger
+      end
+    end
+  end
+end

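The trailing `key1:value1` arguments accepted by `url` and `response` become the message hash via `split`/`to_h`; a quick sketch of that parsing in isolation:

```ruby
# Same transformation as in Enqueue#url / Enqueue#response above.
rest = ["priority:high", "depth:2"]
message = rest.map {|arg| arg.split(":") }.to_h
# message => {"priority"=>"high", "depth"=>"2"}

# Caveat: a value containing ":" (e.g. "ref:http://...") splits into more
# than two elements and makes `to_h` raise ArgumentError, so values are
# assumed to be colon-free.
```
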
data/lib/daimon_skycrawlers/commands/runner.rb
ADDED

@@ -0,0 +1,47 @@
+require "thor"
+require "daimon_skycrawlers"
+require "daimon_skycrawlers/crawler"
+
+module DaimonSkycrawlers
+  module Commands
+    class Runner < Thor
+      namespace "exec"
+
+      desc "crawler", "Execute crawler"
+      def crawler
+        load_init
+        Dir.glob("app/crawlers/**/*.rb") do |path|
+          require(File.expand_path(path, Dir.pwd))
+          log.info("Loaded crawler: #{path}")
+        end
+        DaimonSkycrawlers::Crawler.run
+      rescue => ex
+        puts ex.message
+        exit(false)
+      end
+
+      desc "processor", "Execute processor"
+      def processor
+        load_init
+        Dir.glob("app/processors/**/*.rb") do |path|
+          require(File.expand_path(path, Dir.pwd))
+          log.info("Loaded processor: #{path}")
+        end
+        DaimonSkycrawlers::Processor.run
+      rescue => ex
+        puts ex.message
+        exit(false)
+      end
+
+      private
+
+      def load_init
+        DaimonSkycrawlers.load_init
+      end
+
+      def log
+        DaimonSkycrawlers.configuration.logger
+      end
+    end
+  end
+end

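`exec crawler` and `exec processor` discover project code by globbing and requiring it before starting the run loop; a minimal sketch of that convention, assuming it runs from the project root:

```ruby
# Load every crawler definition under app/crawlers/, as the Runner does.
Dir.glob("app/crawlers/**/*.rb") do |path|
  # Expanding against Dir.pwd gives `require` an absolute path, so each
  # file is loaded exactly once regardless of the glob's relative form.
  require(File.expand_path(path, Dir.pwd))
end
```

Because both commands resolve paths against `Dir.pwd`, they are meant to be run from the generated project root.
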
data/lib/daimon_skycrawlers/crawler/default.rb
CHANGED

@@ -1,5 +1,6 @@
 require "daimon_skycrawlers/crawler/base"
 require "daimon_skycrawlers/filter/update_checker"
+require "daimon_skycrawlers/filter/robots_txt_checker"
 
 module DaimonSkycrawlers
   module Crawler

@@ -13,11 +14,16 @@ module DaimonSkycrawlers
         @n_processed_urls += 1
         @skipped = false
         url = connection.url_prefix + path
+        if @options[:obey_robots_txt]
+          robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
+          unless robots_txt_checker.call(url)
+            skip(url)
+            return
+          end
+        end
         update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
         unless update_checker.call(url.to_s, connection: connection)
-
-          @skipped = true
-          schedule_to_process(url.to_s, heartbeat: true)
+          skip(url)
           return
         end
         @prepare.call(connection)

@@ -33,6 +39,14 @@ module DaimonSkycrawlers
         message = message.merge(kw)
         schedule_to_process(url.to_s, message)
       end
+
+      private
+
+      def skip(url)
+        log.info("Skip #{url}")
+        @skipped = true
+        schedule_to_process(url.to_s, heartbeat: true)
+      end
     end
   end
 end

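The robots.txt check only fires when the crawler's options include `obey_robots_txt`. The constructor is not shown in this diff, so the following is a hypothetical sketch assuming keyword options passed to `Default.new` end up in `@options`:

```ruby
require "daimon_skycrawlers/crawler/default"

# Hypothetical: assumes Default.new stores keyword options in @options.
crawler = DaimonSkycrawlers::Crawler::Default.new(
  "http://example.com/",
  obey_robots_txt: true # fetch is skipped when robots.txt disallows a URL
)
```
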
data/lib/daimon_skycrawlers/filter/base.rb
CHANGED

@@ -3,6 +3,14 @@ require "daimon_skycrawlers/config"
 
 module DaimonSkycrawlers
   module Filter
+    #
+    # Base class of filters.
+    #
+    # You must implement `#call` in your filter and it must return
+    # true or false. If your filter returns true, processors can
+    # process given URL after your filter. Otherwise framework skips
+    # given URL to skip processors.
+    #
     class Base
       include DaimonSkycrawlers::LoggerMixin
       include DaimonSkycrawlers::ConfigMixin

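Per the new doc comment, a filter is any subclass of `Base` whose `#call` returns true (continue) or false (skip). A minimal custom filter under that contract (the class name is illustrative):

```ruby
require "uri"
require "daimon_skycrawlers/filter/base"

# Skip every non-HTTPS URL.
class HttpsOnlyFilter < DaimonSkycrawlers::Filter::Base
  def call(url)
    URI(url).scheme == "https" # true lets processors run; false skips the URL
  end
end
```
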
data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb
ADDED

@@ -0,0 +1,29 @@
+require "webrobots"
+require "daimon_skycrawlers/filter/base"
+require "daimon_skycrawlers/version"
+
+module DaimonSkycrawlers
+  module Filter
+    #
+    # This filter provides robots.txt checker for given URL.
+    # We want to obey robots.txt provided by a web site.
+    #
+    class RobotsTxtChecker < Base
+      def initialize(base_url: nil, user_agent: "DaimonSkycrawlers/#{DaimonSkycrawlers::VERSION}")
+        super()
+        @webrobots = WebRobots.new(user_agent)
+      end
+
+      #
+      # @param [String] url
+      # @return [true|false] Return true when web site allows to fetch the URL, otherwise return false
+      #
+      def call(url)
+        unless URI(url).absolute?
+          url = (@base_url + url).to_s
+        end
+        @webrobots.allowed?(url)
+      end
+    end
+  end
+end

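Standalone use of the new filter looks roughly like this (a sketch; `#call` consults the target site's robots.txt via the webrobots gem, so it needs network access):

```ruby
require "daimon_skycrawlers/filter/robots_txt_checker"

checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new
checker.call("http://example.com/")          # => true when robots.txt allows it
checker.call("http://example.com/private/")  # => false when disallowed
```
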
data/lib/daimon_skycrawlers/filter/update_checker.rb
CHANGED

@@ -6,6 +6,9 @@ module DaimonSkycrawlers
     #
     # This filter provides update checker for given URL.
     #
+    # Skip processing URLs that is latest (not updated since previous
+    # access).
+    #
     class UpdateChecker < Base
       def initialize(storage: nil, base_url: nil)
         super(storage: storage)

data/lib/daimon_skycrawlers/generator/new.rb
CHANGED

@@ -40,9 +40,6 @@ module DaimonSkycrawlers
         "Rakefile",
         "app/crawlers/sample_crawler.rb",
         "app/processors/sample_processor.rb",
-        "bin/crawler",
-        "bin/enqueue",
-        "bin/processor",
         "config/init.rb",
       ].each do |path|
         copy_file(path, "#{name}/#{path}", mode: :preserve)

data/lib/daimon_skycrawlers.rb
CHANGED

@@ -62,5 +62,17 @@ module DaimonSkycrawlers
     def configure
       yield configuration
     end
+
+    #
+    # Load "config/init.rb"
+    #
+    # @return [void]
+    #
+    def load_init
+      require(File.expand_path("config/init.rb", Dir.pwd))
+    rescue LoadError => ex
+      puts ex.message
+      exit(false)
+    end
   end
 end

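`load_init` requires `config/init.rb` from the current working directory, which is how every CLI subcommand picks up project configuration. A minimal `config/init.rb`, based on the configure block shown in the README diff above:

```ruby
require "daimon_skycrawlers"

DaimonSkycrawlers.configure do |config|
  config.shutdown_interval = 30 # stop idle consumers after 30 seconds
end
```
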
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: daimon_skycrawlers
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Ryunosuke SATO
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler

@@ -150,6 +150,34 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: sitemap-parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: webrobots
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement

@@ -281,6 +309,8 @@ files:
 - db/schema.rb
 - lib/daimon_skycrawlers.rb
 - lib/daimon_skycrawlers/cli.rb
+- lib/daimon_skycrawlers/commands/enqueue.rb
+- lib/daimon_skycrawlers/commands/runner.rb
 - lib/daimon_skycrawlers/config.rb
 - lib/daimon_skycrawlers/consumer.rb
 - lib/daimon_skycrawlers/consumer/base.rb

@@ -292,6 +322,7 @@ files:
 - lib/daimon_skycrawlers/filter.rb
 - lib/daimon_skycrawlers/filter/base.rb
 - lib/daimon_skycrawlers/filter/duplicate_checker.rb
+- lib/daimon_skycrawlers/filter/robots_txt_checker.rb
 - lib/daimon_skycrawlers/filter/update_checker.rb
 - lib/daimon_skycrawlers/generator/new.rb
 - lib/daimon_skycrawlers/generator/templates/new/Gemfile

@@ -299,9 +330,6 @@ files:
 - lib/daimon_skycrawlers/generator/templates/new/Rakefile
 - lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
 - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
-- lib/daimon_skycrawlers/generator/templates/new/bin/crawler
-- lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
-- lib/daimon_skycrawlers/generator/templates/new/bin/processor
 - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
 - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
 - lib/daimon_skycrawlers/logger.rb

@@ -324,9 +352,6 @@ files:
 - sample/spider/Rakefile
 - sample/spider/app/crawlers/blog_crawler.rb
 - sample/spider/app/processors/blog_spider.rb
-- sample/spider/bin/crawler
-- sample/spider/bin/enqueue
-- sample/spider/bin/processor
 - sample/spider/config/database.yml
 - sample/spider/config/init.rb
 - sample/spider/db/migrate/20160830155803_create_pages.rb

data/lib/daimon_skycrawlers/generator/templates/new/bin/crawler
DELETED

data/lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
DELETED

@@ -1,23 +0,0 @@
-#!/usr/bin/env ruby
-
-require "thor"
-
-require_relative "../config/init"
-require "daimon_skycrawlers/crawler"
-require "daimon_skycrawlers/processor"
-
-class Enqueue < Thor
-  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
-  def url(url, *rest)
-    message = rest.map {|arg| arg.split(":") }.to_h
-    DaimonSkycrawlers::Crawler.enqueue_url(url, message)
-  end
-
-  desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
-  def response(url, *rest)
-    message = rest.map {|arg| arg.split(":") }.to_h
-    DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
-  end
-end
-
-Enqueue.start(ARGV)

data/lib/daimon_skycrawlers/generator/templates/new/bin/processor
DELETED

data/sample/spider/bin/crawler
DELETED
data/sample/spider/bin/enqueue
DELETED

@@ -1,23 +0,0 @@
-#!/usr/bin/env ruby
-
-require "thor"
-
-require_relative "../config/init"
-require "daimon_skycrawlers/crawler"
-require "daimon_skycrawlers/processor"
-
-class Enqueue < Thor
-  desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
-  def url(url, *rest)
-    message = rest.map {|arg| arg.split(":") }.to_h
-    DaimonSkycrawlers::Crawler.enqueue_url(url, message)
-  end
-
-  desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
-  def response(url, *rest)
-    message = rest.map {|arg| arg.split(":") }.to_h
-    DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
-  end
-end
-
-Enqueue.start(ARGV)

data/sample/spider/bin/processor
DELETED