daimon_skycrawlers 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 411d6b35b2c909712f9f79cd5e45440226ebba28
4
- data.tar.gz: afa7bdc3cc98a28742c64ba46d7237fcce981b92
3
+ metadata.gz: 4e4bd1308a554f55bce802a5d0a038cbb0f0470e
4
+ data.tar.gz: a484737c74cc9ff3304a9fda0714282f9f6b0e61
5
5
  SHA512:
6
- metadata.gz: f8844e9ebd88e7be7344a607866c15ea74be64a8a405f05f196d54f0125bf66be6a81c4ff27ea3ee4480a82989abbedd79cd934c122c7ac1bee5d83bbfa1b6d3
7
- data.tar.gz: 37fe8376a4165aef7f29aa29af51bb637402c48d9d5327c44ea05c37dac2418c39b1f5e55303ec8c66e9cfe8eb3e77f0dc264648caa262abc551c72f06944b55
6
+ metadata.gz: c9d4b01cb37808ce43a9786324e6e0d842d67bc44269c39964e4570c5dbab1f0b818449ac6a67864d6834e2b67a1a9f857252fd0d3cc91e87961ad252eccc785
7
+ data.tar.gz: c1babe4300e744678672482e3a82c8bc5fa37a5e52ebe8e634b3ae0cdaea17adc75c5132a5a8918c2b3dce76b47c783fedea8e911b18f2e6f0bd7adee0d0525b
data/README.md CHANGED
@@ -33,52 +33,60 @@ Or install it yourself as:
33
33
 
34
34
  1. Create project
35
35
 
36
- ```
37
- $ bundle exec daimon-skycrawlers new mycrawlers
38
- $ cd mycrawlers
39
- ```
40
- or
41
- ```
42
- $ daimon-skycrawlers new mycrawlers
43
- $ cd mycrawlers
44
- ```
36
+ ```
37
+ $ bundle exec daimon-skycrawlers new mycrawlers
38
+ $ cd mycrawlers
39
+ ```
40
+ or
41
+ ```
42
+ $ daimon-skycrawlers new mycrawlers
43
+ $ cd mycrawlers
44
+ ```
45
45
 
46
46
  2. Install dependencies
47
47
 
48
- ```
49
- $ bundle install
50
- ```
48
+ ```
49
+ $ bundle install
50
+ ```
51
51
 
52
52
  3. Create database
53
53
 
54
- ```
55
- $ bundle exec rake db:create
56
- $ bundle exec rake db:migrate
57
- ```
54
+ ```
55
+ $ bundle exec rake db:create
56
+ $ bundle exec rake db:migrate
57
+ ```
58
58
 
59
59
  4. Open new terminal and run crawler/processor
60
60
 
61
- ```
62
- $ bin/crawler # on new terminal
63
- $ bin/processor # on new terminal
64
- ```
61
+ ```
62
+ $ daimon-skycrawlers exec crawler # on new terminal
63
+ $ daimon-skycrawlers exec processor # on new terminal
64
+ ```
65
65
 
66
66
  NOTE: Execute step 5 as soon as possible, because bin/crawler and
67
67
  bin/processor will stop after 10 seconds by default if their
68
68
  queues are empty.
69
69
 
70
+ NOTE: You can change `shutdown_interval` with the following code in config/init.rb:
71
+
72
+ ```ruby
73
+ DaimonSkycrawlers.configure do |config|
74
+ config.shutdown_interval = 30
75
+ end
76
+ ```
77
+
70
78
  5. Enqueue task
71
79
 
72
- ```
73
- $ bin/enqueue url http://example.com/
74
- ```
80
+ ```
81
+ $ daimon-skycrawlers enqueue url http://example.com/
82
+ ```
75
83
 
76
84
  6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
77
85
  7. You can re-enqueue task for processor
78
86
 
79
- ```
80
- $ bin/enqueue response http://example.com/
81
- ```
87
+ ```
88
+ $ daimon-skycrawlers enqueue response http://example.com/
89
+ ```
82
90
 
83
91
  `It works with 'http://example.com'` is displayed again on the terminal that runs your processor.
84
92
 
@@ -28,6 +28,8 @@ Gem::Specification.new do |spec|
28
28
  spec.add_dependency "railties"
29
29
  spec.add_dependency "pg"
30
30
  spec.add_dependency "timers"
31
+ spec.add_dependency "sitemap-parser"
32
+ spec.add_dependency "webrobots"
31
33
 
32
34
  spec.add_development_dependency "rake", "~> 10.0"
33
35
  spec.add_development_dependency "test-unit"
@@ -1,10 +1,14 @@
1
1
  require "thor"
2
2
  require "daimon_skycrawlers/generator/new"
3
+ require "daimon_skycrawlers/commands/enqueue"
4
+ require "daimon_skycrawlers/commands/runner"
3
5
  require "daimon_skycrawlers/version"
4
6
 
5
7
  module DaimonSkycrawlers
6
8
  class CLI < Thor
7
9
  register(Generator::New, "new", "new NAME", "Create new project")
10
+ register(Commands::Runner, "exec", "exec [COMMAND]", "Execute crawler/processor")
11
+ register(Commands::Enqueue, "enqueue", "enqueue [TYPE] URL [messages...]", "Enqueue URL")
8
12
 
9
13
  desc "version", "Show version"
10
14
  def version
@@ -0,0 +1,58 @@
1
+ require "daimon_skycrawlers"
2
+ require "daimon_skycrawlers/crawler"
3
+ require "daimon_skycrawlers/processor"
4
+ require "daimon_skycrawlers/version"
5
+ require "sitemap-parser"
6
+ require "webrobots"
7
+
8
+ module DaimonSkycrawlers
9
+ module Commands
10
+ class Enqueue < Thor
11
+ desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
12
+ def url(url, *rest)
13
+ load_init
14
+ message = rest.map {|arg| arg.split(":") }.to_h
15
+ log.debug("Enqueue URL for crawler: #{url} : #{message}")
16
+ DaimonSkycrawlers::Crawler.enqueue_url(url, message)
17
+ end
18
+
19
+ desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
20
+ def response(url, *rest)
21
+ load_init
22
+ message = rest.map {|arg| arg.split(":") }.to_h
23
+ log.debug("Enqueue URL for processor: #{url} : #{message}")
24
+ DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
25
+ end
26
+
27
+ desc "sitemap [OPTIONS] URL", "Enqueue URLs from simtemap.xml"
28
+ method_option("robots-txt", aliases: ["-r"], type: :boolean,
29
+ desc: "URL for robots.txt. Detect robots.txt automatically if URL is not robots.txt")
30
+ def sitemap(url)
31
+ load_init
32
+ if options["robots-txt"]
33
+ webrobots = WebRobots.new("DaimonSkycrawlers/#{DaimonSkycrawlers::VERSION}")
34
+ sitemaps = webrobots.sitemaps(url).uniq
35
+ else
36
+ sitemaps = [url]
37
+ end
38
+ urls = sitemaps.flat_map do |sitemap|
39
+ sitemap_parser = SitemapParser.new(sitemap)
40
+ sitemap_parser.to_a
41
+ end
42
+ urls.each do |_url|
43
+ DaimonSkycrawlers::Crawler.enqueue_url(_url)
44
+ end
45
+ end
46
+
47
+ private
48
+
49
+ def load_init
50
+ DaimonSkycrawlers.load_init
51
+ end
52
+
53
+ def log
54
+ DaimonSkycrawlers.configuration.logger
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,47 @@
1
+ require "thor"
2
+ require "daimon_skycrawlers"
3
+ require "daimon_skycrawlers/crawler"
4
+
5
+ module DaimonSkycrawlers
6
+ module Commands
7
+ class Runner < Thor
8
+ namespace "exec"
9
+
10
+ desc "crawler", "Execute crawler"
11
+ def crawler
12
+ load_init
13
+ Dir.glob("app/crawlers/**/*.rb") do |path|
14
+ require(File.expand_path(path, Dir.pwd))
15
+ log.info("Loaded crawler: #{path}")
16
+ end
17
+ DaimonSkycrawlers::Crawler.run
18
+ rescue => ex
19
+ puts ex.message
20
+ exit(false)
21
+ end
22
+
23
+ desc "processor", "Execute processor"
24
+ def processor
25
+ load_init
26
+ Dir.glob("app/processors/**/*.rb") do |path|
27
+ require(File.expand_path(path, Dir.pwd))
28
+ log.info("Loaded processor: #{path}")
29
+ end
30
+ DaimonSkycrawlers::Processor.run
31
+ rescue => ex
32
+ puts ex.message
33
+ exit(false)
34
+ end
35
+
36
+ private
37
+
38
+ def load_init
39
+ DaimonSkycrawlers.load_init
40
+ end
41
+
42
+ def log
43
+ DaimonSkycrawlers.configuration.logger
44
+ end
45
+ end
46
+ end
47
+ end
@@ -1,5 +1,6 @@
1
1
  require "daimon_skycrawlers/crawler/base"
2
2
  require "daimon_skycrawlers/filter/update_checker"
3
+ require "daimon_skycrawlers/filter/robots_txt_checker"
3
4
 
4
5
  module DaimonSkycrawlers
5
6
  module Crawler
@@ -13,11 +14,16 @@ module DaimonSkycrawlers
13
14
  @n_processed_urls += 1
14
15
  @skipped = false
15
16
  url = connection.url_prefix + path
17
+ if @options[:obey_robots_txt]
18
+ robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
19
+ unless robots_txt_checker.call(url)
20
+ skip(url)
21
+ return
22
+ end
23
+ end
16
24
  update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
17
25
  unless update_checker.call(url.to_s, connection: connection)
18
- log.info("Skip #{url}")
19
- @skipped = true
20
- schedule_to_process(url.to_s, heartbeat: true)
26
+ skip(url)
21
27
  return
22
28
  end
23
29
  @prepare.call(connection)
@@ -33,6 +39,14 @@ module DaimonSkycrawlers
33
39
  message = message.merge(kw)
34
40
  schedule_to_process(url.to_s, message)
35
41
  end
42
+
43
+ private
44
+
45
+ def skip(url)
46
+ log.info("Skip #{url}")
47
+ @skipped = true
48
+ schedule_to_process(url.to_s, heartbeat: true)
49
+ end
36
50
  end
37
51
  end
38
52
  end
@@ -3,6 +3,14 @@ require "daimon_skycrawlers/config"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Filter
6
+ #
7
+ # Base class of filters.
8
+ #
9
+ # You must implement `#call` in your filter and it must return
10
+ # true or false. If your filter returns true, processors can
11
+ # process the given URL after your filter. Otherwise the framework
12
+ # skips the given URL so that processors are skipped.
13
+ #
6
14
  class Base
7
15
  include DaimonSkycrawlers::LoggerMixin
8
16
  include DaimonSkycrawlers::ConfigMixin
@@ -6,6 +6,8 @@ module DaimonSkycrawlers
6
6
  #
7
7
  # This filter provides duplication checker for given URL.
8
8
  #
9
+ # Skip processing duplicated URLs.
10
+ #
9
11
  class DuplicateChecker < Base
10
12
  def initialize(base_url: nil)
11
13
  @base_url = nil
@@ -0,0 +1,29 @@
1
+ require "webrobots"
2
+ require "daimon_skycrawlers/filter/base"
3
+ require "daimon_skycrawlers/version"
4
+
5
+ module DaimonSkycrawlers
6
+ module Filter
7
+ #
8
+ # This filter provides robots.txt checker for given URL.
9
+ # We want to obey robots.txt provided by a web site.
10
+ #
11
+ class RobotsTxtChecker < Base
12
+ def initialize(base_url: nil, user_agent: "DaimonSkycrawlers/#{DaimonSkycrawlers::VERSION}")
13
+ super()
14
+ @webrobots = WebRobots.new(user_agent)
15
+ end
16
+
17
+ #
18
+ # @param [String] url
19
+ # @return [true|false] Return true when web site allows to fetch the URL, otherwise return false
20
+ #
21
+ def call(url)
22
+ unless URI(url).absolute?
23
+ url = (@base_url + url).to_s
24
+ end
25
+ @webrobots.allowed?(url)
26
+ end
27
+ end
28
+ end
29
+ end
@@ -6,6 +6,9 @@ module DaimonSkycrawlers
6
6
  #
7
7
  # This filter provides update checker for given URL.
8
8
  #
9
+ # Skip processing URLs that are up to date (not updated since the previous
10
+ # access).
11
+ #
9
12
  class UpdateChecker < Base
10
13
  def initialize(storage: nil, base_url: nil)
11
14
  super(storage: storage)
@@ -40,9 +40,6 @@ module DaimonSkycrawlers
40
40
  "Rakefile",
41
41
  "app/crawlers/sample_crawler.rb",
42
42
  "app/processors/sample_processor.rb",
43
- "bin/crawler",
44
- "bin/enqueue",
45
- "bin/processor",
46
43
  "config/init.rb",
47
44
  ].each do |path|
48
45
  copy_file(path, "#{name}/#{path}", mode: :preserve)
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -62,5 +62,17 @@ module DaimonSkycrawlers
62
62
  def configure
63
63
  yield configuration
64
64
  end
65
+
66
+ #
67
+ # Load "config/init.rb"
68
+ #
69
+ # @return [void]
70
+ #
71
+ def load_init
72
+ require(File.expand_path("config/init.rb", Dir.pwd))
73
+ rescue LoadError => ex
74
+ puts ex.message
75
+ exit(false)
76
+ end
65
77
  end
66
78
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryunosuke SATO
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-09-29 00:00:00.000000000 Z
11
+ date: 2016-10-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -150,6 +150,34 @@ dependencies:
150
150
  - - ">="
151
151
  - !ruby/object:Gem::Version
152
152
  version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: sitemap-parser
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: webrobots
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
153
181
  - !ruby/object:Gem::Dependency
154
182
  name: rake
155
183
  requirement: !ruby/object:Gem::Requirement
@@ -281,6 +309,8 @@ files:
281
309
  - db/schema.rb
282
310
  - lib/daimon_skycrawlers.rb
283
311
  - lib/daimon_skycrawlers/cli.rb
312
+ - lib/daimon_skycrawlers/commands/enqueue.rb
313
+ - lib/daimon_skycrawlers/commands/runner.rb
284
314
  - lib/daimon_skycrawlers/config.rb
285
315
  - lib/daimon_skycrawlers/consumer.rb
286
316
  - lib/daimon_skycrawlers/consumer/base.rb
@@ -292,6 +322,7 @@ files:
292
322
  - lib/daimon_skycrawlers/filter.rb
293
323
  - lib/daimon_skycrawlers/filter/base.rb
294
324
  - lib/daimon_skycrawlers/filter/duplicate_checker.rb
325
+ - lib/daimon_skycrawlers/filter/robots_txt_checker.rb
295
326
  - lib/daimon_skycrawlers/filter/update_checker.rb
296
327
  - lib/daimon_skycrawlers/generator/new.rb
297
328
  - lib/daimon_skycrawlers/generator/templates/new/Gemfile
@@ -299,9 +330,6 @@ files:
299
330
  - lib/daimon_skycrawlers/generator/templates/new/Rakefile
300
331
  - lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
301
332
  - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
302
- - lib/daimon_skycrawlers/generator/templates/new/bin/crawler
303
- - lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
304
- - lib/daimon_skycrawlers/generator/templates/new/bin/processor
305
333
  - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
306
334
  - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
307
335
  - lib/daimon_skycrawlers/logger.rb
@@ -324,9 +352,6 @@ files:
324
352
  - sample/spider/Rakefile
325
353
  - sample/spider/app/crawlers/blog_crawler.rb
326
354
  - sample/spider/app/processors/blog_spider.rb
327
- - sample/spider/bin/crawler
328
- - sample/spider/bin/enqueue
329
- - sample/spider/bin/processor
330
355
  - sample/spider/config/database.yml
331
356
  - sample/spider/config/init.rb
332
357
  - sample/spider/db/migrate/20160830155803_create_pages.rb
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative "../config/init"
4
- require "daimon_skycrawlers/crawler"
5
-
6
- Dir.glob("app/crawlers/**/*.rb") do |path|
7
- require(File.expand_path(path, File.dirname(__dir__)))
8
- end
9
-
10
- DaimonSkycrawlers::Crawler.run
@@ -1,23 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require "thor"
4
-
5
- require_relative "../config/init"
6
- require "daimon_skycrawlers/crawler"
7
- require "daimon_skycrawlers/processor"
8
-
9
- class Enqueue < Thor
10
- desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
11
- def url(url, *rest)
12
- message = rest.map {|arg| arg.split(":") }.to_h
13
- DaimonSkycrawlers::Crawler.enqueue_url(url, message)
14
- end
15
-
16
- desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
17
- def response(url, *rest)
18
- message = rest.map {|arg| arg.split(":") }.to_h
19
- DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
20
- end
21
- end
22
-
23
- Enqueue.start(ARGV)
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative "../config/init"
4
- require "daimon_skycrawlers/processor"
5
-
6
- Dir.glob("app/processors/**/*.rb") do |path|
7
- require(File.expand_path(path, File.dirname(__dir__)))
8
- end
9
-
10
- DaimonSkycrawlers::Processor.run
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative "../config/init"
4
- require "daimon_skycrawlers/crawler"
5
-
6
- Dir.glob("app/crawlers/**/*.rb") do |path|
7
- require(File.expand_path(path, File.dirname(__dir__)))
8
- end
9
-
10
- DaimonSkycrawlers::Crawler.run
@@ -1,23 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require "thor"
4
-
5
- require_relative "../config/init"
6
- require "daimon_skycrawlers/crawler"
7
- require "daimon_skycrawlers/processor"
8
-
9
- class Enqueue < Thor
10
- desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
11
- def url(url, *rest)
12
- message = rest.map {|arg| arg.split(":") }.to_h
13
- DaimonSkycrawlers::Crawler.enqueue_url(url, message)
14
- end
15
-
16
- desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
17
- def response(url, *rest)
18
- message = rest.map {|arg| arg.split(":") }.to_h
19
- DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
20
- end
21
- end
22
-
23
- Enqueue.start(ARGV)
@@ -1,11 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative "../config/init"
4
-
5
- require "daimon_skycrawlers/processor"
6
-
7
- Dir.glob("app/processors/**/*.rb") do |path|
8
- require(File.expand_path(path, File.dirname(__dir__)))
9
- end
10
-
11
- DaimonSkycrawlers::Processor.run