daimon_skycrawlers 0.3.0 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 411d6b35b2c909712f9f79cd5e45440226ebba28
-   data.tar.gz: afa7bdc3cc98a28742c64ba46d7237fcce981b92
+   metadata.gz: 4e4bd1308a554f55bce802a5d0a038cbb0f0470e
+   data.tar.gz: a484737c74cc9ff3304a9fda0714282f9f6b0e61
  SHA512:
-   metadata.gz: f8844e9ebd88e7be7344a607866c15ea74be64a8a405f05f196d54f0125bf66be6a81c4ff27ea3ee4480a82989abbedd79cd934c122c7ac1bee5d83bbfa1b6d3
-   data.tar.gz: 37fe8376a4165aef7f29aa29af51bb637402c48d9d5327c44ea05c37dac2418c39b1f5e55303ec8c66e9cfe8eb3e77f0dc264648caa262abc551c72f06944b55
+   metadata.gz: c9d4b01cb37808ce43a9786324e6e0d842d67bc44269c39964e4570c5dbab1f0b818449ac6a67864d6834e2b67a1a9f857252fd0d3cc91e87961ad252eccc785
+   data.tar.gz: c1babe4300e744678672482e3a82c8bc5fa37a5e52ebe8e634b3ae0cdaea17adc75c5132a5a8918c2b3dce76b47c783fedea8e911b18f2e6f0bd7adee0d0525b
data/README.md CHANGED
@@ -33,52 +33,60 @@ Or install it yourself as:
 
  1. Create project
 
- ```
- $ bundle exec daimon-skycrawlers new mycrawlers
- $ cd mycrawlers
- ```
- or
- ```
- $ daimon-skycrawlers new mycrawlers
- $ cd mycrawlers
- ```
+ ```
+ $ bundle exec daimon-skycrawlers new mycrawlers
+ $ cd mycrawlers
+ ```
+ or
+ ```
+ $ daimon-skycrawlers new mycrawlers
+ $ cd mycrawlers
+ ```
 
  2. Install dependencies
 
- ```
- $ bundle install
- ```
+ ```
+ $ bundle install
+ ```
 
  3. Create database
 
- ```
- $ bundle exec rake db:create
- $ bundle exec rake db:migrate
- ```
+ ```
+ $ bundle exec rake db:create
+ $ bundle exec rake db:migrate
+ ```
 
  4. Open new terminal and run crawler/processor
 
- ```
- $ bin/crawler # on new terminal
- $ bin/processor # on new terminal
- ```
+ ```
+ $ daimon-skycrawlers exec crawler # on new terminal
+ $ daimon-skycrawlers exec processor # on new terminal
+ ```
 
  NOTE: Execute step 5 as soon as possible. Because bin/crawler and
  bin/processor will stop after 10 seconds by default if their
  queues are empty.
 
+ NOTE: You can change `shutdown_interval` using the following code in config/init.rb:
+
+ ```ruby
+ DaimonSkycrawlers.configure do |config|
+   config.shutdown_interval = 30
+ end
+ ```
+
  5. Enqueue task
 
- ```
- $ bin/enqueue url http://example.com/
- ```
+ ```
+ $ daimon-skycrawlers enqueue url http://example.com/
+ ```
 
  6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
  7. You can re-enqueue task for processor
 
- ```
- $ bin/enqueue response http://example.com/
- ```
+ ```
+ $ daimon-skycrawlers enqueue response http://example.com/
+ ```
 
  Display `It works with 'http://example.com'` again on your terminal which runs your processor.
 
@@ -28,6 +28,8 @@ Gem::Specification.new do |spec|
    spec.add_dependency "railties"
    spec.add_dependency "pg"
    spec.add_dependency "timers"
+   spec.add_dependency "sitemap-parser"
+   spec.add_dependency "webrobots"
 
    spec.add_development_dependency "rake", "~> 10.0"
    spec.add_development_dependency "test-unit"
@@ -1,10 +1,14 @@
  require "thor"
  require "daimon_skycrawlers/generator/new"
+ require "daimon_skycrawlers/commands/enqueue"
+ require "daimon_skycrawlers/commands/runner"
  require "daimon_skycrawlers/version"
 
  module DaimonSkycrawlers
    class CLI < Thor
      register(Generator::New, "new", "new NAME", "Create new project")
+     register(Commands::Runner, "exec", "exec [COMMAND]", "Execute crawler/processor")
+     register(Commands::Enqueue, "enqueue", "enqueue [TYPE] URL [messages...]", "Enqueue URL")
 
      desc "version", "Show version"
      def version
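With these registrations the project's `daimon-skycrawlers` CLI exposes `exec` and `enqueue` subcommands in place of the generated `bin/*` scripts removed in this release. A usage sketch based on the README section above (project name and URL are illustrative):

```
$ daimon-skycrawlers exec crawler      # run crawlers from app/crawlers/
$ daimon-skycrawlers exec processor    # run processors from app/processors/
$ daimon-skycrawlers enqueue url http://example.com/ key1:value1
```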
@@ -0,0 +1,58 @@
+ require "daimon_skycrawlers"
+ require "daimon_skycrawlers/crawler"
+ require "daimon_skycrawlers/processor"
+ require "daimon_skycrawlers/version"
+ require "sitemap-parser"
+ require "webrobots"
+
+ module DaimonSkycrawlers
+   module Commands
+     class Enqueue < Thor
+       desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
+       def url(url, *rest)
+         load_init
+         message = rest.map {|arg| arg.split(":") }.to_h
+         log.debug("Enqueue URL for crawler: #{url} : #{message}")
+         DaimonSkycrawlers::Crawler.enqueue_url(url, message)
+       end
+
+       desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
+       def response(url, *rest)
+         load_init
+         message = rest.map {|arg| arg.split(":") }.to_h
+         log.debug("Enqueue URL for processor: #{url} : #{message}")
+         DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
+       end
+
+       desc "sitemap [OPTIONS] URL", "Enqueue URLs from sitemap.xml"
+       method_option("robots-txt", aliases: ["-r"], type: :boolean,
+                     desc: "URL for robots.txt. Detect robots.txt automatically if URL is not robots.txt")
+       def sitemap(url)
+         load_init
+         if options["robots-txt"]
+           webrobots = WebRobots.new("DaimonSkycrawlers/#{DaimonSkycrawlers::VERSION}")
+           sitemaps = webrobots.sitemaps(url).uniq
+         else
+           sitemaps = [url]
+         end
+         urls = sitemaps.flat_map do |sitemap|
+           sitemap_parser = SitemapParser.new(sitemap)
+           sitemap_parser.to_a
+         end
+         urls.each do |_url|
+           DaimonSkycrawlers::Crawler.enqueue_url(_url)
+         end
+       end
+
+       private
+
+       def load_init
+         DaimonSkycrawlers.load_init
+       end
+
+       def log
+         DaimonSkycrawlers.configuration.logger
+       end
+     end
+   end
+ end
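The `sitemap` subcommand is new in 0.4.0 and is not shown in the README; a usage sketch based on the command definition above (URLs are placeholders):

```
$ daimon-skycrawlers enqueue sitemap http://example.com/sitemap.xml
$ daimon-skycrawlers enqueue sitemap --robots-txt http://example.com/   # discover sitemaps via robots.txt first
```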
@@ -0,0 +1,47 @@
+ require "thor"
+ require "daimon_skycrawlers"
+ require "daimon_skycrawlers/crawler"
+
+ module DaimonSkycrawlers
+   module Commands
+     class Runner < Thor
+       namespace "exec"
+
+       desc "crawler", "Execute crawler"
+       def crawler
+         load_init
+         Dir.glob("app/crawlers/**/*.rb") do |path|
+           require(File.expand_path(path, Dir.pwd))
+           log.info("Loaded crawler: #{path}")
+         end
+         DaimonSkycrawlers::Crawler.run
+       rescue => ex
+         puts ex.message
+         exit(false)
+       end
+
+       desc "processor", "Execute processor"
+       def processor
+         load_init
+         Dir.glob("app/processors/**/*.rb") do |path|
+           require(File.expand_path(path, Dir.pwd))
+           log.info("Loaded processor: #{path}")
+         end
+         DaimonSkycrawlers::Processor.run
+       rescue => ex
+         puts ex.message
+         exit(false)
+       end
+
+       private
+
+       def load_init
+         DaimonSkycrawlers.load_init
+       end
+
+       def log
+         DaimonSkycrawlers.configuration.logger
+       end
+     end
+   end
+ end
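Both `exec` commands call `load_init` and then require every Ruby file under the matching `app/` subdirectory, so they expect to run from a project root laid out roughly as below (a sketch inferred from the glob patterns and the generator template list later in this diff):

```
mycrawlers/
  config/init.rb            # loaded via DaimonSkycrawlers.load_init
  app/crawlers/
    sample_crawler.rb       # required by `daimon-skycrawlers exec crawler`
  app/processors/
    sample_processor.rb     # required by `daimon-skycrawlers exec processor`
```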
@@ -1,5 +1,6 @@
  require "daimon_skycrawlers/crawler/base"
  require "daimon_skycrawlers/filter/update_checker"
+ require "daimon_skycrawlers/filter/robots_txt_checker"
 
  module DaimonSkycrawlers
    module Crawler
@@ -13,11 +14,16 @@ module DaimonSkycrawlers
          @n_processed_urls += 1
          @skipped = false
          url = connection.url_prefix + path
+         if @options[:obey_robots_txt]
+           robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
+           unless robots_txt_checker.call(url)
+             skip(url)
+             return
+           end
+         end
          update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
          unless update_checker.call(url.to_s, connection: connection)
-           log.info("Skip #{url}")
-           @skipped = true
-           schedule_to_process(url.to_s, heartbeat: true)
+           skip(url)
            return
          end
          @prepare.call(connection)
@@ -33,6 +39,14 @@ module DaimonSkycrawlers
          message = message.merge(kw)
          schedule_to_process(url.to_s, message)
        end
+
+       private
+
+       def skip(url)
+         log.info("Skip #{url}")
+         @skipped = true
+         schedule_to_process(url.to_s, heartbeat: true)
+       end
      end
    end
  end
@@ -3,6 +3,14 @@ require "daimon_skycrawlers/config"
 
  module DaimonSkycrawlers
    module Filter
+     #
+     # Base class of filters.
+     #
+     # You must implement `#call` in your filter and it must return
+     # true or false. If your filter returns true, processors can
+     # process the given URL after your filter. Otherwise the framework
+     # skips the given URL and its processors are not run.
+     #
      class Base
        include DaimonSkycrawlers::LoggerMixin
        include DaimonSkycrawlers::ConfigMixin
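A minimal sketch of a custom filter following the contract documented above; the class name and the extension check are hypothetical, only the "subclass `Filter::Base` and implement `#call`" contract comes from the code:

```ruby
require "daimon_skycrawlers/filter/base"

module MyCrawlers
  # Returns true for URLs that later processors should handle,
  # false for URLs the framework should skip.
  class HtmlOnlyFilter < DaimonSkycrawlers::Filter::Base
    def call(url)
      !url.to_s.end_with?(".jpg", ".png", ".css", ".js")
    end
  end
end
```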
@@ -6,6 +6,8 @@ module DaimonSkycrawlers
    #
    # This filter provides duplication checker for given URL.
    #
+   # Skip processing duplicated URLs.
+   #
    class DuplicateChecker < Base
      def initialize(base_url: nil)
        @base_url = nil
@@ -0,0 +1,29 @@
+ require "webrobots"
+ require "daimon_skycrawlers/filter/base"
+ require "daimon_skycrawlers/version"
+
+ module DaimonSkycrawlers
+   module Filter
+     #
+     # This filter provides robots.txt checker for given URL.
+     # We want to obey robots.txt provided by a web site.
+     #
+     class RobotsTxtChecker < Base
+       def initialize(base_url: nil, user_agent: "DaimonSkycrawlers/#{DaimonSkycrawlers::VERSION}")
+         super()
+         @webrobots = WebRobots.new(user_agent)
+       end
+
+       #
+       # @param [String] url
+       # @return [true|false] Return true when web site allows to fetch the URL, otherwise return false
+       #
+       def call(url)
+         unless URI(url).absolute?
+           url = (@base_url + url).to_s
+         end
+         @webrobots.allowed?(url)
+       end
+     end
+   end
+ end
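A minimal sketch of using the new filter on its own, assuming an absolute URL (the URL is illustrative):

```ruby
require "daimon_skycrawlers/filter/robots_txt_checker"

checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new
# true if the site's robots.txt allows fetching the URL, false otherwise
checker.call("http://example.com/some/page.html")
```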
@@ -6,6 +6,9 @@ module DaimonSkycrawlers
    #
    # This filter provides update checker for given URL.
    #
+   # Skip processing URLs that have not been updated since the
+   # previous access.
+   #
    class UpdateChecker < Base
      def initialize(storage: nil, base_url: nil)
        super(storage: storage)
@@ -40,9 +40,6 @@ module DaimonSkycrawlers
          "Rakefile",
          "app/crawlers/sample_crawler.rb",
          "app/processors/sample_processor.rb",
-         "bin/crawler",
-         "bin/enqueue",
-         "bin/processor",
          "config/init.rb",
        ].each do |path|
          copy_file(path, "#{name}/#{path}", mode: :preserve)
@@ -1,3 +1,3 @@
  module DaimonSkycrawlers
-   VERSION = "0.3.0"
+   VERSION = "0.4.0"
  end
@@ -62,5 +62,17 @@ module DaimonSkycrawlers
      def configure
        yield configuration
      end
+
+     #
+     # Load "config/init.rb"
+     #
+     # @return [void]
+     #
+     def load_init
+       require(File.expand_path("config/init.rb", Dir.pwd))
+     rescue LoadError => ex
+       puts ex.message
+       exit(false)
+     end
    end
  end
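`load_init` simply requires `config/init.rb` from the current project root; a minimal sketch of such a file, using only the `shutdown_interval` setting shown in the README section above (anything else is project-specific):

```ruby
# config/init.rb
require "daimon_skycrawlers"

DaimonSkycrawlers.configure do |config|
  config.shutdown_interval = 30
end
```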
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: daimon_skycrawlers
  version: !ruby/object:Gem::Version
-   version: 0.3.0
+   version: 0.4.0
  platform: ruby
  authors:
  - Ryunosuke SATO
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-09-29 00:00:00.000000000 Z
+ date: 2016-10-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -150,6 +150,34 @@ dependencies:
      - - ">="
        - !ruby/object:Gem::Version
          version: '0'
+ - !ruby/object:Gem::Dependency
+   name: sitemap-parser
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: webrobots
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
  - !ruby/object:Gem::Dependency
    name: rake
    requirement: !ruby/object:Gem::Requirement
@@ -281,6 +309,8 @@ files:
  - db/schema.rb
  - lib/daimon_skycrawlers.rb
  - lib/daimon_skycrawlers/cli.rb
+ - lib/daimon_skycrawlers/commands/enqueue.rb
+ - lib/daimon_skycrawlers/commands/runner.rb
  - lib/daimon_skycrawlers/config.rb
  - lib/daimon_skycrawlers/consumer.rb
  - lib/daimon_skycrawlers/consumer/base.rb
@@ -292,6 +322,7 @@ files:
  - lib/daimon_skycrawlers/filter.rb
  - lib/daimon_skycrawlers/filter/base.rb
  - lib/daimon_skycrawlers/filter/duplicate_checker.rb
+ - lib/daimon_skycrawlers/filter/robots_txt_checker.rb
  - lib/daimon_skycrawlers/filter/update_checker.rb
  - lib/daimon_skycrawlers/generator/new.rb
  - lib/daimon_skycrawlers/generator/templates/new/Gemfile
@@ -299,9 +330,6 @@ files:
  - lib/daimon_skycrawlers/generator/templates/new/Rakefile
  - lib/daimon_skycrawlers/generator/templates/new/app/crawlers/sample_crawler.rb
  - lib/daimon_skycrawlers/generator/templates/new/app/processors/sample_processor.rb
- - lib/daimon_skycrawlers/generator/templates/new/bin/crawler
- - lib/daimon_skycrawlers/generator/templates/new/bin/enqueue
- - lib/daimon_skycrawlers/generator/templates/new/bin/processor
  - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
  - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
  - lib/daimon_skycrawlers/logger.rb
@@ -324,9 +352,6 @@ files:
  - sample/spider/Rakefile
  - sample/spider/app/crawlers/blog_crawler.rb
  - sample/spider/app/processors/blog_spider.rb
- - sample/spider/bin/crawler
- - sample/spider/bin/enqueue
- - sample/spider/bin/processor
  - sample/spider/config/database.yml
  - sample/spider/config/init.rb
  - sample/spider/db/migrate/20160830155803_create_pages.rb
@@ -1,10 +0,0 @@
- #!/usr/bin/env ruby
-
- require_relative "../config/init"
- require "daimon_skycrawlers/crawler"
-
- Dir.glob("app/crawlers/**/*.rb") do |path|
-   require(File.expand_path(path, File.dirname(__dir__)))
- end
-
- DaimonSkycrawlers::Crawler.run
@@ -1,23 +0,0 @@
- #!/usr/bin/env ruby
-
- require "thor"
-
- require_relative "../config/init"
- require "daimon_skycrawlers/crawler"
- require "daimon_skycrawlers/processor"
-
- class Enqueue < Thor
-   desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
-   def url(url, *rest)
-     message = rest.map {|arg| arg.split(":") }.to_h
-     DaimonSkycrawlers::Crawler.enqueue_url(url, message)
-   end
-
-   desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
-   def response(url, *rest)
-     message = rest.map {|arg| arg.split(":") }.to_h
-     DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
-   end
- end
-
- Enqueue.start(ARGV)
@@ -1,10 +0,0 @@
- #!/usr/bin/env ruby
-
- require_relative "../config/init"
- require "daimon_skycrawlers/processor"
-
- Dir.glob("app/processors/**/*.rb") do |path|
-   require(File.expand_path(path, File.dirname(__dir__)))
- end
-
- DaimonSkycrawlers::Processor.run
@@ -1,10 +0,0 @@
- #!/usr/bin/env ruby
-
- require_relative "../config/init"
- require "daimon_skycrawlers/crawler"
-
- Dir.glob("app/crawlers/**/*.rb") do |path|
-   require(File.expand_path(path, File.dirname(__dir__)))
- end
-
- DaimonSkycrawlers::Crawler.run
@@ -1,23 +0,0 @@
- #!/usr/bin/env ruby
-
- require "thor"
-
- require_relative "../config/init"
- require "daimon_skycrawlers/crawler"
- require "daimon_skycrawlers/processor"
-
- class Enqueue < Thor
-   desc "url URL [key1:value1 key2:value2...]", "Enqueue URL for URL consumer"
-   def url(url, *rest)
-     message = rest.map {|arg| arg.split(":") }.to_h
-     DaimonSkycrawlers::Crawler.enqueue_url(url, message)
-   end
-
-   desc "response URL [key1:value1 key2:value2...]", "Enqueue URL for HTTP response consumer"
-   def response(url, *rest)
-     message = rest.map {|arg| arg.split(":") }.to_h
-     DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
-   end
- end
-
- Enqueue.start(ARGV)
@@ -1,11 +0,0 @@
- #!/usr/bin/env ruby
-
- require_relative "../config/init"
-
- require "daimon_skycrawlers/processor"
-
- Dir.glob("app/processors/**/*.rb") do |path|
-   require(File.expand_path(path, File.dirname(__dir__)))
- end
-
- DaimonSkycrawlers::Processor.run