daimon_skycrawlers 1.0.0.pre.rc1 → 1.0.0.pre.rc2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +11 -0
  3. data/daimon_skycrawlers.gemspec +1 -0
  4. data/lib/daimon_skycrawlers.rb +32 -2
  5. data/lib/daimon_skycrawlers/callbacks.rb +32 -2
  6. data/lib/daimon_skycrawlers/cli.rb +4 -0
  7. data/lib/daimon_skycrawlers/commands/enqueue.rb +4 -1
  8. data/lib/daimon_skycrawlers/commands/runner.rb +2 -0
  9. data/lib/daimon_skycrawlers/config.rb +1 -0
  10. data/lib/daimon_skycrawlers/configurable.rb +6 -1
  11. data/lib/daimon_skycrawlers/consumer.rb +3 -0
  12. data/lib/daimon_skycrawlers/consumer/base.rb +5 -0
  13. data/lib/daimon_skycrawlers/consumer/http_response.rb +1 -1
  14. data/lib/daimon_skycrawlers/consumer/url.rb +1 -1
  15. data/lib/daimon_skycrawlers/crawler.rb +5 -2
  16. data/lib/daimon_skycrawlers/crawler/base.rb +56 -8
  17. data/lib/daimon_skycrawlers/crawler/default.rb +9 -1
  18. data/lib/daimon_skycrawlers/filter.rb +3 -0
  19. data/lib/daimon_skycrawlers/filter/base.rb +12 -0
  20. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +2 -2
  21. data/lib/daimon_skycrawlers/filter/robots_txt_checker.rb +1 -1
  22. data/lib/daimon_skycrawlers/filter/update_checker.rb +2 -2
  23. data/lib/daimon_skycrawlers/generator/crawler.rb +4 -1
  24. data/lib/daimon_skycrawlers/generator/filter.rb +4 -1
  25. data/lib/daimon_skycrawlers/generator/generate.rb +3 -0
  26. data/lib/daimon_skycrawlers/generator/new.rb +5 -1
  27. data/lib/daimon_skycrawlers/generator/processor.rb +4 -1
  28. data/lib/daimon_skycrawlers/logger.rb +8 -0
  29. data/lib/daimon_skycrawlers/processor.rb +5 -2
  30. data/lib/daimon_skycrawlers/processor/base.rb +28 -2
  31. data/lib/daimon_skycrawlers/processor/default.rb +7 -1
  32. data/lib/daimon_skycrawlers/processor/proc.rb +6 -0
  33. data/lib/daimon_skycrawlers/processor/spider.rb +2 -2
  34. data/lib/daimon_skycrawlers/queue.rb +31 -0
  35. data/lib/daimon_skycrawlers/sitemap_parser.rb +23 -1
  36. data/lib/daimon_skycrawlers/storage.rb +3 -0
  37. data/lib/daimon_skycrawlers/storage/base.rb +21 -1
  38. data/lib/daimon_skycrawlers/storage/file.rb +16 -0
  39. data/lib/daimon_skycrawlers/storage/null.rb +2 -2
  40. data/lib/daimon_skycrawlers/storage/rdb.rb +25 -7
  41. data/lib/daimon_skycrawlers/timer.rb +9 -0
  42. data/lib/daimon_skycrawlers/version.rb +4 -1
  43. data/sample/amazon-ranking/app/processors/amazon_ranking.rb +1 -1
  44. data/sample/itp-crawler/app/processors/itp_processor.rb +1 -1
  45. data/{lib/daimon_skycrawlers/generator/templates → templates}/crawler.rb.erb +0 -0
  46. data/{lib/daimon_skycrawlers/generator/templates → templates}/filter.rb.erb +0 -0
  47. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile +0 -0
  48. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Dockerfile.db +0 -0
  49. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Gemfile +0 -0
  50. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/README.md.erb +0 -0
  51. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/Rakefile +0 -0
  52. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/crawler.rb +0 -0
  53. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/app/processor.rb +0 -0
  54. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/database.yml.erb +0 -0
  55. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/config/init.rb +0 -0
  56. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/docker-compose.yml.erb +0 -0
  57. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.db.erb +0 -0
  58. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/env.erb +0 -0
  59. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/common/docker-entrypoint.sh +0 -0
  60. data/{lib/daimon_skycrawlers/generator/templates → templates}/new/services/db/init-user-db.sh +0 -0
  61. data/{lib/daimon_skycrawlers/generator/templates → templates}/processor.rb.erb +0 -0
  62. metadata +34 -19
@@ -1,4 +1,7 @@
1
1
  module DaimonSkycrawlers
2
+ #
3
+ # Name space for storage
4
+ #
2
5
  module Storage
3
6
  end
4
7
  end
@@ -3,15 +3,35 @@ require "daimon_skycrawlers/config"
3
3
 
4
4
  module DaimonSkycrawlers
5
5
  module Storage
6
+ #
7
+ # Base class of storage implementation
8
+ #
6
9
  class Base
7
10
  include DaimonSkycrawlers::LoggerMixin
8
11
  include DaimonSkycrawlers::ConfigMixin
9
12
 
13
+ #
14
+ # Save data to storage
15
+ #
16
+ # Override this method in subclass
17
+ #
18
+ # @param data [Hash] data has following keys
19
+ # * `:url`: URL
20
+ # * `:message`: Given message
21
+ # * `:response`: HTTP response
22
+ #
10
23
  def save(data)
11
24
  raise "Implement this in subclass"
12
25
  end
13
26
 
14
- def read(url)
27
+ #
28
+ # Fetch page identified by url
29
+ #
30
+ # Override this method in subclass
31
+ #
32
+ # @param url [String] the key to find data in storage
33
+ #
34
+ def read(url, message = {})
15
35
  raise "Implement this in subclass"
16
36
  end
17
37
  end
@@ -11,6 +11,14 @@ module DaimonSkycrawlers
11
11
  @base_dir = Pathname(base_dir)
12
12
  end
13
13
 
14
+ #
15
+ # Save data to files under base directory
16
+ #
17
+ # @param data [Hash] data has following keys
18
+ # * `:url`: URL
19
+ # * `:message`: Given message
20
+ # * `:response`: HTTP response
21
+ #
14
22
  def save(data)
15
23
  url = data[:url]
16
24
  message = data[:message]
@@ -28,6 +36,11 @@ module DaimonSkycrawlers
28
36
  end
29
37
  end
30
38
 
39
+ #
40
+ # Read data from files under base directory
41
+ #
42
+ # @return [DaimonSkycrawlers::Storage::File::Page]
43
+ #
31
44
  def read(url, message)
32
45
  key = message[:key]
33
46
  headers = JSON.parse(headers_path(url, key).read)
@@ -35,6 +48,9 @@ module DaimonSkycrawlers
35
48
  Page.new(url, key, headers, body, headers["last-modified"], headers["etag"])
36
49
  end
37
50
 
51
+ #
52
+ # Page for file storage
53
+ #
38
54
  Page = Struct.new(:url, :key, :headers, :body, :last_modified, :etag)
39
55
 
40
56
  private
@@ -14,9 +14,9 @@ module DaimonSkycrawlers
14
14
  end
15
15
 
16
16
  #
17
- # Find nothing
17
+ # Read nothing
18
18
  #
19
- def find(url)
19
+ def read(url, message = {})
20
20
  end
21
21
  end
22
22
  end
@@ -14,12 +14,12 @@ module DaimonSkycrawlers
14
14
  end
15
15
 
16
16
  #
17
- # Save
17
+ # Save data to RDB
18
18
  #
19
- # @param [Hash] data has following keys
20
- # * :url: URL
21
- # * :message: Given message
22
- # * :response: HTTP response
19
+ # @param data [Hash] data has following keys
20
+ # * `:url`: URL
21
+ # * `:message`: Given message
22
+ # * `:response`: HTTP response
23
23
  #
24
24
  def save(data)
25
25
  url = data[:url]
@@ -39,9 +39,10 @@ module DaimonSkycrawlers
39
39
  #
40
40
  # Fetch page identified by url
41
41
  #
42
- # @param [String] url identity of the page
42
+ # @param url [String] identity of the page
43
+ # @param message [Hash] this hash may include `:key` to find page
43
44
  #
44
- def find(url, message = {})
45
+ def read(url, message = {})
45
46
  key = message[:key]
46
47
  if key
47
48
  Page.where(key: key).order(updated_at: :desc).limit(1).first
@@ -50,10 +51,27 @@ module DaimonSkycrawlers
50
51
  end
51
52
  end
52
53
 
54
+ # @private
53
55
  class Base < ActiveRecord::Base
54
56
  self.abstract_class = true
55
57
  end
56
58
 
59
+ #
60
+ # Model that represents a page
61
+ #
62
+ # * key
63
+ # * The key to identify page
64
+ # * url
65
+ # * The URL of page
66
+ # * headers
67
+ # * HTTP response header
68
+ # * body
69
+ # * HTTP response body
70
+ # * last_modified_at
71
+ # * Last-Modified header
72
+ # * etag
73
+ # * ETag header
74
+ #
57
75
  class Page < Base
58
76
  self.table_name = "pages"
59
77
  end
@@ -2,9 +2,18 @@ require "timers"
2
2
  require "daimon_skycrawlers"
3
3
 
4
4
  module DaimonSkycrawlers
5
+ #
6
+ # Name space for timer
7
+ #
5
8
  module Timer
6
9
  module_function
7
10
 
11
+ # Setup timer for shutdown
12
+ #
13
+ # @param queue_name_prefix [String] prefix of queue name
14
+ # @param interval [Numeric] shut down this long after the queue becomes empty
15
+ # @return [Timers::Group] timers
16
+ #
8
17
  def setup_shutdown_timer(queue_name_prefix, interval: 10)
9
18
  timers = Timers::Group.new
10
19
  timer = timers.after(interval) do
@@ -1,3 +1,6 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "1.0.0-rc1"
2
+ #
3
+ # Version of this library
4
+ #
5
+ VERSION = "1.0.0-rc2"
3
6
  end
@@ -7,7 +7,7 @@ class AmazonRanking < DaimonSkycrawlers::Processor::Base
7
7
  Item = Struct.new(:rank, :name, :url, :star, :review)
8
8
  def call(message)
9
9
  url = message[:url]
10
- page = storage.find(url)
10
+ page = storage.read(url)
11
11
  doc = Nokogiri::HTML(page.body)
12
12
  ranking = []
13
13
  doc.search(".zg_itemRow").each do |item|
@@ -7,7 +7,7 @@ require_relative "../models/itp_shop"
7
7
  class ItpProcessor < DaimonSkycrawlers::Processor::Base
8
8
  def call(message)
9
9
  key_url = message[:url]
10
- page = storage.find(key_url)
10
+ page = storage.read(key_url)
11
11
  @doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
12
12
  ItpShop.transaction do
13
13
  prepare_shops do |shop|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.pre.rc1
4
+ version: 1.0.0.pre.rc2
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-31 00:00:00.000000000 Z
11
+ date: 2017-02-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -248,6 +248,20 @@ dependencies:
248
248
  - - ">="
249
249
  - !ruby/object:Gem::Version
250
250
  version: '0'
251
+ - !ruby/object:Gem::Dependency
252
+ name: redcarpet
253
+ requirement: !ruby/object:Gem::Requirement
254
+ requirements:
255
+ - - ">="
256
+ - !ruby/object:Gem::Version
257
+ version: '0'
258
+ type: :development
259
+ prerelease: false
260
+ version_requirements: !ruby/object:Gem::Requirement
261
+ requirements:
262
+ - - ">="
263
+ - !ruby/object:Gem::Version
264
+ version: '0'
251
265
  - !ruby/object:Gem::Dependency
252
266
  name: tapp
253
267
  requirement: !ruby/object:Gem::Requirement
@@ -313,6 +327,7 @@ extra_rdoc_files: []
313
327
  files:
314
328
  - ".gitignore"
315
329
  - ".travis.yml"
330
+ - ".yardopts"
316
331
  - Gemfile
317
332
  - LICENSE.txt
318
333
  - README.md
@@ -344,23 +359,6 @@ files:
344
359
  - lib/daimon_skycrawlers/generator/generate.rb
345
360
  - lib/daimon_skycrawlers/generator/new.rb
346
361
  - lib/daimon_skycrawlers/generator/processor.rb
347
- - lib/daimon_skycrawlers/generator/templates/crawler.rb.erb
348
- - lib/daimon_skycrawlers/generator/templates/filter.rb.erb
349
- - lib/daimon_skycrawlers/generator/templates/new/Dockerfile
350
- - lib/daimon_skycrawlers/generator/templates/new/Dockerfile.db
351
- - lib/daimon_skycrawlers/generator/templates/new/Gemfile
352
- - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
353
- - lib/daimon_skycrawlers/generator/templates/new/Rakefile
354
- - lib/daimon_skycrawlers/generator/templates/new/app/crawler.rb
355
- - lib/daimon_skycrawlers/generator/templates/new/app/processor.rb
356
- - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
357
- - lib/daimon_skycrawlers/generator/templates/new/config/init.rb
358
- - lib/daimon_skycrawlers/generator/templates/new/docker-compose.yml.erb
359
- - lib/daimon_skycrawlers/generator/templates/new/env.db.erb
360
- - lib/daimon_skycrawlers/generator/templates/new/env.erb
361
- - lib/daimon_skycrawlers/generator/templates/new/services/common/docker-entrypoint.sh
362
- - lib/daimon_skycrawlers/generator/templates/new/services/db/init-user-db.sh
363
- - lib/daimon_skycrawlers/generator/templates/processor.rb.erb
364
362
  - lib/daimon_skycrawlers/logger.rb
365
363
  - lib/daimon_skycrawlers/processor.rb
366
364
  - lib/daimon_skycrawlers/processor/base.rb
@@ -427,6 +425,23 @@ files:
427
425
  - sample/spider/config/init.rb
428
426
  - sample/spider/db/migrate/20160830155803_create_pages.rb
429
427
  - sample/spider/db/schema.rb
428
+ - templates/crawler.rb.erb
429
+ - templates/filter.rb.erb
430
+ - templates/new/Dockerfile
431
+ - templates/new/Dockerfile.db
432
+ - templates/new/Gemfile
433
+ - templates/new/README.md.erb
434
+ - templates/new/Rakefile
435
+ - templates/new/app/crawler.rb
436
+ - templates/new/app/processor.rb
437
+ - templates/new/config/database.yml.erb
438
+ - templates/new/config/init.rb
439
+ - templates/new/docker-compose.yml.erb
440
+ - templates/new/env.db.erb
441
+ - templates/new/env.erb
442
+ - templates/new/services/common/docker-entrypoint.sh
443
+ - templates/new/services/db/init-user-db.sh
444
+ - templates/processor.rb.erb
430
445
  homepage: https://github.com/bm-sms/daimon_skycrawlers
431
446
  licenses:
432
447
  - MIT