crawl_station 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +24 -0
  3. data/.gitignore +10 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +98 -0
  6. data/.travis.yml +9 -0
  7. data/CODE_OF_CONDUCT.md +74 -0
  8. data/Gemfile +14 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +43 -0
  11. data/Rakefile +6 -0
  12. data/benchmarks/.gitkeep +0 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/crawl_station.gemspec +32 -0
  16. data/examples/.gitkeep +0 -0
  17. data/exe/station +3 -0
  18. data/lib/crawl_station/application_record.rb +17 -0
  19. data/lib/crawl_station/cache.rb +13 -0
  20. data/lib/crawl_station/cache_adapters/abstract_adapter.rb +6 -0
  21. data/lib/crawl_station/cache_adapters/db_adapter.rb +6 -0
  22. data/lib/crawl_station/cache_adapters/memory_adapter.rb +17 -0
  23. data/lib/crawl_station/cli.rb +26 -0
  24. data/lib/crawl_station/command/create.rb +20 -0
  25. data/lib/crawl_station/command/generate.rb +88 -0
  26. data/lib/crawl_station/command.rb +26 -0
  27. data/lib/crawl_station/concerns/adapter_concern.rb +26 -0
  28. data/lib/crawl_station/configuration.rb +7 -0
  29. data/lib/crawl_station/fundation/parse_struct.rb +45 -0
  30. data/lib/crawl_station/launcher.rb +28 -0
  31. data/lib/crawl_station/logger.rb +12 -0
  32. data/lib/crawl_station/producer.rb +64 -0
  33. data/lib/crawl_station/ruby_version_check.rb +9 -0
  34. data/lib/crawl_station/schedule.rb +13 -0
  35. data/lib/crawl_station/schedule_adapters/abstract_adapter.rb +18 -0
  36. data/lib/crawl_station/schedule_adapters/db_adapter.rb +18 -0
  37. data/lib/crawl_station/schedule_adapters/memory_adapter.rb +21 -0
  38. data/lib/crawl_station/tasks/db.rake +33 -0
  39. data/lib/crawl_station/tasks/launcher.rake +6 -0
  40. data/lib/crawl_station/tasks/module.rake +6 -0
  41. data/lib/crawl_station/tasks.rb +3 -0
  42. data/lib/crawl_station/templates/create/Gemfile +11 -0
  43. data/lib/crawl_station/templates/create/Rakefile +2 -0
  44. data/lib/crawl_station/templates/create/config/boot.rb +6 -0
  45. data/lib/crawl_station/templates/create/config/initializers/station_config.rb +4 -0
  46. data/lib/crawl_station/templates/create/lib/tasks/.gitkeep +0 -0
  47. data/lib/crawl_station/templates/create/module/.gitkeep +0 -0
  48. data/lib/crawl_station/templates/generate/item.erb +9 -0
  49. data/lib/crawl_station/templates/generate/migration.erb +4 -0
  50. data/lib/crawl_station/templates/generate/module/config/.gitkeep +0 -0
  51. data/lib/crawl_station/templates/generate/module/db/migrate/.gitkeep +0 -0
  52. data/lib/crawl_station/templates/generate/module/item/.gitkeep +0 -0
  53. data/lib/crawl_station/templates/generate/module/parser/.gitkeep +0 -0
  54. data/lib/crawl_station/templates/generate/module/tasks/.gitkeep +0 -0
  55. data/lib/crawl_station/templates/generate/parser.erb +10 -0
  56. data/lib/crawl_station/utils.rb +36 -0
  57. data/lib/crawl_station/version.rb +3 -0
  58. data/lib/crawl_station.rb +152 -0
  59. metadata +199 -0
@@ -0,0 +1,45 @@
1
+ require 'ostruct'
2
module CrawlStation
  # Hash-backed description of a single crawl job.
  #
  # The underlying hash is kept in +parse+ with symbol keys. The four
  # well-known fields (+namespace+, +parser+, +item+, +link+) get reader and
  # writer methods; any key can also be reached through +[]+ / +[]=+ using a
  # String or a Symbol.
  class ParseStruct
    extend Forwardable
    attr_accessor :parse

    %w(namespace parser item link).each do |method_name|
      define_method(method_name) { get_value(method_name) }
      define_method("#{method_name}=") { |v| set_value(method_name, v) }
    end

    # @param opts [Hash, nil] job attributes with String or Symbol keys.
    #
    # A symbolized *copy* is stored, so the hash handed in by the caller
    # (for example an entry of the configured parser list) is never modified.
    # +nil+ is treated like an empty hash.
    def initialize(opts = {})
      @parse = (opts || {}).deep_symbolize_keys
    end

    # Resolves the constant named by "<namespace>/parser/<parser>".
    #
    # @return [Class] result of camelizing and constantizing that path
    # @raise [NameError] when no such constant is defined
    def parser_class
      "#{namespace}/parser/#{parser}".camelize.constantize
    end

    # Resolves the constant named by "<namespace>/item/<parser>".
    #
    # NOTE(review): the path is built from +parser+, not +item+; presumably
    # items share their parser's name -- confirm before changing.
    #
    # @return [Class] result of camelizing and constantizing that path
    # @raise [NameError] when no such constant is defined
    def item_class
      "#{namespace}/item/#{parser}".camelize.constantize
    end

    # Reads an arbitrary key (String or Symbol).
    def [](item)
      get_value(item)
    end

    # Writes an arbitrary key (String or Symbol).
    def []=(item, value)
      set_value(item, value)
    end

    private

    def get_value(item)
      @parse[item.to_sym]
    end

    def set_value(item, value)
      @parse[item.to_sym] = value
    end
  end
end
@@ -0,0 +1,28 @@
1
module CrawlStation
  # Actor that starts the producer pools and then idles until it is told to
  # stop through an 'exit_launcher' notification.
  class Launcher
    include Celluloid
    include Celluloid::Notifications

    def initialize
      @exit_sign = false
      subscribe 'exit_launcher', :exit_message
    end

    # Starts CS.concurrent_count producer pools, then polls the exit flag
    # every 10 seconds and returns once it is truthy.
    def start
      CS.logger.debug('station starting ... ')

      CS.concurrent_count.times { launch_pool }
      idle_until_exit
    end

    # Notification callback: the published payload becomes the exit flag.
    def exit_message(_topic, data)
      @exit_sign = data
    end

    private

    # Builds one producer pool bound to the shared schedule and cache and
    # kicks it off asynchronously.
    def launch_pool
      pool = CS::Producer.pool args: [CS.schedule, CS.cache]
      pool.async.start
    end

    # Sleeps first, checks afterwards, so at least one sleep always happens.
    def idle_until_exit
      loop do
        sleep(10)
        return if @exit_sign
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module CrawlStation
  # Module-level facade in front of an assignable logger object.
  #
  # Any method the wrapped logger publicly responds to can be called on this
  # module directly (e.g. +Logger.debug+). While no logger is assigned every
  # such call is a no-op that returns +false+.
  module Logger
    class << self
      # The wrapped logger; +nil+ until one is assigned.
      attr_accessor :logger

      # Forwards unknown calls to the wrapped logger.
      #
      # @return [false] when no logger has been assigned
      # @raise [NoMethodError] when the logger does not respond to the method
      def method_missing(method_name, *args, &block)
        return false if logger.nil?
        return super unless logger.respond_to?(method_name)
        logger.public_send(method_name, *args, &block)
      end

      # Keeps +respond_to?+ / +method+ truthful about what method_missing
      # will actually forward to the wrapped logger.
      def respond_to_missing?(method_name, include_private = false)
        (!logger.nil? && logger.respond_to?(method_name, include_private)) || super
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
module CrawlStation
  # Worker actor: takes items off the schedule, runs the item's parser on its
  # link, queues any follow-up links found in the result and hands the
  # remaining data to the item class for saving.
  class Producer
    include Celluloid

    attr_accessor :schedule, :proxies, :proxy
    # The reader half lives in #cache below, which doubles as the
    # "mark and compute" helper.
    attr_writer :cache

    # @param schedule [#empty?, #pop, #push, #done, #failed] work queue
    # @param cache [#include?, #[]=] link => result store
    # @param proxies [Object, nil] stored as-is; not used in this class
    def initialize(schedule, cache, proxies = nil)
      @schedule = schedule
      @cache = cache
      @proxies = proxies
    end

    # Keeps calling #loop_parser until it returns a falsy value.
    def start
      loop { break unless loop_parser }
      Logger.debug "#{self} done"
    end

    # Processes at most one scheduled item.
    #
    # @return [Object] truthy in every path, so #start keeps looping
    def loop_parser
      return idle if @schedule.empty?
      item = @schedule.pop
      item = CS::ParseStruct.new(item) if item.is_a?(Hash)
      return idle if parsed?(item)
      Logger.debug "start parse #{item.link}"
      data = parse_item(item)
      return true if data.nil? || data.empty?
      data = parse_links(data, item.namespace)
      return true if data.empty?
      item.item_class.new.save(item.link, data)
      true
    end

    # Crawls the item's link through its parser class, recording the result
    # in the cache and reporting done/failed to the schedule.
    #
    # Only StandardError is rescued: signals, SystemExit and similar
    # non-standard exceptions must not be swallowed by a worker.
    #
    # @return [Object, nil] crawl result, or nil when crawling raised
    def parse_item(item)
      data = cache(item) { item.parser_class.new.crawl(item.link) }
      @schedule.done(item)
      data
    rescue StandardError => e
      trace = Array(e.backtrace).first(11).join("\n")
      Logger.error(format("%s: %s\n%s", item.link, e.message, trace))
      @schedule.failed(item)
      nil
    end

    # Removes the 'pages' and 'details' entries from +data+ and schedules
    # every entry in them that carries an unseen 'link'.
    #
    # @return [Object] +data+ without those two keys
    def parse_links(data, namespace)
      %w(pages details).each do |field|
        data.delete(field)&.each { |entry| enqueue_link(entry, namespace) }
      end
      data
    end

    # @return [Boolean] true for nil or for data whose 'link' is cached
    def parsed?(data)
      data.nil? || @cache.include?(data['link'])
    end

    # Without arguments: returns the cache store (plain reader).
    #
    # With an item: stores +data+ (default 'parsing') under the item's link,
    # replaces it with the block's result when a block is given, and returns
    # the stored value. If the block raises, the placeholder stays cached.
    def cache(item = nil, data = 'parsing')
      return @cache if item.nil?
      @cache[item['link']] = data
      data = yield if block_given?
      @cache[item['link']] = data
      data
    end

    private

    # Nothing to do right now: back off briefly and keep the loop alive.
    def idle
      sleep(0.2)
      true
    end

    # Pushes one follow-up job unless it has no link or was already seen.
    def enqueue_link(entry, namespace)
      return if entry['link'].blank? || parsed?(entry)
      @schedule.push ParseStruct.new(parser: entry['parser'], link: entry['link'], namespace: namespace)
    end
  end
end
@@ -0,0 +1,9 @@
1
# Refuse to load on interpreters older than 2.3.0.
#
# The interpreter's own RUBY_VERSION constant is compared as a version, not
# as a string. ENV['RUBY_VERSION'] is not set by Ruby itself, and plain
# string comparison would rank "2.10.0" below "2.3.0".
if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.3.0')
  abort <<-end_message
    CrawlStation require ruby 2.3.0 or newer.
    your are running
    #{RUBY_VERSION}

    please upgrade to Ruby 2.3.0 or newer to continue.
  end_message
end
@@ -0,0 +1,13 @@
1
module CrawlStation
  # Module-level entry point to the schedule adapter.
  #
  # push / pop / empty? / include? are delegated to whatever object is held
  # in @adapter. Selecting an adapter goes through custom_adapter, which is
  # not defined in this module (presumably supplied by
  # Concerns::AdapterConcern -- confirm there).
  module Schedule
    extend self
    extend Forwardable
    include Concerns::AdapterConcern

    def_delegators :@adapter, :push, :pop, :empty?, :include?

    # @param adapter_name [Object] adapter identifier handed to
    #   custom_adapter together with the ScheduleAdapters namespace
    def adapter=(adapter_name)
      custom_adapter(adapter_name, CrawlStation::ScheduleAdapters)
    end
  end
end
@@ -0,0 +1,18 @@
1
module CrawlStation
  module ScheduleAdapters
    # Base class listing the schedule adapter operations. Every method here
    # has an empty body and returns nil; subclasses supply real behaviour.
    class AbstractAdapter
      # Add an item to the schedule.
      def push(_item); end

      # Take the next item from the schedule.
      def pop; end

      # Whether the schedule has no pending items.
      def empty?; end

      # Hook called with an item whose processing raised.
      def failed(_item); end

      # Hook called with an item that was processed.
      def done(_item); end
    end
  end
end
@@ -0,0 +1,18 @@
1
module CrawlStation
  module ScheduleAdapters
    # Database-backed schedule adapter. Every operation is currently an empty
    # stub that returns nil.
    class DbAdapter < AbstractAdapter
      def push(_item); end

      def pop; end

      def empty?; end

      def failed(_item); end

      def done(_item); end
    end
  end
end
@@ -0,0 +1,21 @@
1
module CrawlStation
  module ScheduleAdapters
    # In-process schedule backed by a Queue.
    #
    # Queue already synchronizes its own operations, so calls are forwarded
    # without an extra lock. Wrapping them in a shared monitor (as was done
    # before) made a blocking +pop+ on an empty queue hold the lock that
    # every +push+ needed, so the waiting consumer could never be woken.
    class MemoryAdapter < AbstractAdapter
      def initialize
        @queue = Queue.new
      end

      # Appends an item; arguments are passed straight to Queue#push.
      def push(*args)
        @queue.push(*args)
      end

      # Removes and returns the next item; arguments are passed straight to
      # Queue#pop, so without arguments this blocks while the queue is empty.
      def pop(*args)
        @queue.pop(*args)
      end

      # @return [Boolean] whether the queue currently holds no items
      def empty?(*args)
        @queue.empty?(*args)
      end

      # Logs the failed item's link at debug level.
      def failed(item)
        CS.logger.debug "#{item.link} failed"
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
namespace :db do
  # Runs db:<operator> once for 'crawl_station' and once for every entry
  # under <root>/module, passing the name as the task argument.
  #
  # A rake task only runs once per process unless it is re-enabled, and this
  # helper is itself called from inside the task it re-invokes, so both the
  # target task and its :environment prerequisite are re-enabled before each
  # invocation.
  def db_operation(operator)
    task_name = "db:#{operator}"
    module_names = Dir["#{CrawlStation.root}/module/*"].map { |dir| dir.split('/').last }
    (['crawl_station'] + module_names).each do |name|
      Rake::Task['db:environment'].reenable
      Rake::Task[task_name].reenable
      Rake::Task[task_name].invoke(name)
    end
  end

  desc 'db:create[module_name] if module_name is nil, create all module databases'
  task :create, [:module_name] => :environment do |_t, args|
    m_name = args[:module_name]
    # `next`, not `return`: a task body is a block, and `return` inside it
    # raises LocalJumpError.
    next db_operation(:create) if m_name.nil?
    m_name = nil if m_name == 'crawl_station'
    # NOTE(review): create_database is not among the Utils methods visible
    # alongside this task -- confirm it is defined elsewhere.
    CrawlStation::Utils.create_database(m_name)
  end

  desc 'db:migrate[module_name] if module_name is nil, migrate all module migrations'
  task :migrate, [:module_name] => :environment do |_t, args|
    version = ENV['VERSION']
    module_name = args[:module_name]
    next db_operation(:migrate) if module_name.nil?
    path = "#{CrawlStation::Utils.module_path(module_name)}/db/migrate"
    # Utils exposes gem_path (singular).
    path = "#{CrawlStation::Utils.gem_path}/db/migrate" if module_name == 'crawl_station'
    ActiveRecord::Migrator.migrate(path, version ? version.to_i : nil)
  end

  # Connects ActiveRecord using the database config resolved for the given
  # module name (Utils.database_config decides which file is read).
  task :environment, [:module_name] do |_t, args|
    config = CrawlStation::Utils.database_config args[:module_name]
    ActiveRecord::Base.logger = CrawlStation.logger
    ActiveRecord::Base.establish_connection config
  end
end
@@ -0,0 +1,6 @@
1
# `rake launch` builds a Launcher and calls #start on it; running rake with
# no task name does the same through the default task.
desc 'launch station system'
task(:launch) { CrawlStation::Launcher.new.start }

task default: %w(launch)
@@ -0,0 +1,6 @@
1
+ require 'rake'
2
+
3
# Prints the last path segment of every entry matching module/* (relative to
# the current working directory), one per line.
desc 'show module list'
task :module_list do
  Dir['module/*'].each do |entry|
    puts entry.split('/').last
  end
end
@@ -0,0 +1,3 @@
1
+ require 'rake'
2
+
3
+ Dir['tasks/*.rake'].each { |f| load f }
@@ -0,0 +1,11 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'rake'
4
+ gem 'thor'
5
+ gem 'celluloid', github: 'celluloid/celluloid', submodules: true
6
+ gem 'wombat', github: 'watsy0007/wombat'
7
+ gem 'crawl_station'
8
+
9
+ group :development do
10
+ gem 'pry'
11
+ end
@@ -0,0 +1,2 @@
1
+ require_relative './config/boot'
2
+ CrawlStation.load_tasks
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ Bundler.setup(:default)
4
+ require 'wombat'
5
+ require 'crawl_station'
6
+ CrawlStation.boot
@@ -0,0 +1,4 @@
1
+ CrawlStation.config do |config|
2
+ config.adapter = :memory
3
+ config.parsers = []
4
+ end
@@ -0,0 +1,9 @@
1
+ module <%= module_class_name %>
2
+ module Item
3
+ class <%= class_name %>
4
+ def save(link, data)
5
+ puts "#{link}\n#{data}"
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,4 @@
1
+ class <%= class_name %> < ActiveRecord::Migration[5.0]
2
+ def change
3
+ end
4
+ end
@@ -0,0 +1,10 @@
1
+ module <%= module_class_name %>
2
+ module Parser
3
+ class <%= class_name %>
4
+ include Wombat::Crawler
5
+
6
+ # title xpath: "//div[@class='title']"
7
+ # sub_title css: 'div.sub_title'
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,36 @@
1
+ require 'yaml'
2
+ require 'active_support/core_ext'
3
module CrawlStation
  # Path and configuration helpers shared by the CLI, rake tasks and
  # generators.
  module Utils
    class << self
      # @return [String] "<CS.root>/module/<module_name>"
      def module_path(module_name)
        "#{CS.root}/module/#{module_name}"
      end

      # Picks the database.yml to use: the module's own
      # config/database.yml when a module name is given and that file
      # exists, otherwise the project-wide "<CS.root>/config/database.yml".
      #
      # @return [String] path (the global file is not checked for existence)
      def database_path(module_name = nil)
        global_db_path = "#{CS.root}/config/database.yml"
        return global_db_path if module_name.nil?
        module_db_path = "#{module_path(module_name)}/config/database.yml"
        File.exist?(module_db_path) ? module_db_path : global_db_path
      end

      # Renders the selected database.yml through ERB, parses it as YAML and
      # returns the section for the current CS.env.
      #
      # File.read is used rather than IO.read, which treats a leading "|" in
      # its argument as a command to run. An empty YAML document parses to a
      # falsy value and is treated as an empty hash.
      #
      # NOTE(review): YAML.load is kept; the file is local project
      # configuration, not external input.
      #
      # @return [Hash, nil] symbol-keyed settings, nil when the env is absent
      def database_config(module_name = nil)
        rendered = ERB.new(File.read(database_path(module_name))).result
        (::YAML.load(rendered) || {}).deep_symbolize_keys[CS.env.to_sym]
      end

      # @return [String] "<gem_path>/crawl_station/templates"
      def templates_path
        "#{gem_path}/crawl_station/templates"
      end

      # @return [String] +path+ resolved below #templates_path
      def template_filepath(path)
        "#{templates_path}/#{path}"
      end

      # @return [String] the directory two levels above this file
      def gem_path
        File.expand_path('../../', __FILE__)
      end

      # The db:migrate rake task calls this helper under the plural name.
      alias gems_path gem_path
    end
  end
end
@@ -0,0 +1,3 @@
1
module CrawlStation
  # Release number of the gem.
  VERSION = '0.1.1'
end
@@ -0,0 +1,152 @@
1
+ require 'crawl_station/version'
2
+ require 'pathname'
3
+ require 'active_support'
4
+ require 'active_record'
5
+ require 'active_support/dependencies/autoload'
6
+ require 'logger'
7
+ require 'thor'
8
+ require 'celluloid/debug'
9
+ require 'celluloid/current'
10
module CrawlStation # :nodoc:
  extend ActiveSupport::Autoload

  autoload :Configuration
  autoload :Logger
  autoload :Utils
  autoload :ApplicationRecord
  autoload :Producer
  autoload :Launcher
  autoload :Cache
  autoload :Schedule
  autoload :ParseStruct, 'crawl_station/fundation/parse_struct'
  autoload :Command

  module ScheduleAdapters
    extend ActiveSupport::Autoload

    autoload :AbstractAdapter
    autoload :MemoryAdapter
    autoload :DbAdapter
  end

  module Command
    extend ActiveSupport::Autoload

    autoload :Create
    autoload :Generate
  end

  module CacheAdapters
    extend ActiveSupport::Autoload

    autoload :AbstractAdapter
    autoload :MemoryAdapter
    autoload :DbAdapter
  end

  module Concerns
    extend ActiveSupport::Autoload

    autoload :AdapterConcern
  end

  class << self
    # Yields the configuration object when a block is given.
    #
    # @return [Object] the configuration object (CrawlStation::Configuration
    #   unless something else was stored earlier), with or without a block
    def config
      @config ||= CrawlStation::Configuration
      yield @config if block_given?
      @config
    end

    # Current environment name, taken from CRAWL_STATION_ENV and defaulting
    # to 'development'.
    def env
      @_env ||= ActiveSupport::StringInquirer.new(ENV['CRAWL_STATION_ENV'] || 'development')
    end

    def env=(environment)
      @_env = ActiveSupport::StringInquirer.new(environment)
    end

    # @return [Pathname] the current working directory, expanded
    def root
      Pathname.new(File.expand_path('.'))
    end

    # The logger shared through CrawlStation::Logger. When none has been
    # assigned yet, a STDERR logger at DEBUG level is created and installed.
    def logger
      @_logger ||= (CrawlStation::Logger.logger ||= default_logger)
    end

    def logger=(logger)
      @_logger = CrawlStation::Logger.logger = logger
    end

    # Number of producer pools the launcher starts, taken from
    # CRAWL_STATION_CONCURRENT_COUNT and defaulting to 1.
    #
    # Environment values are Strings; the value is converted so callers can
    # use Integer methods such as #times on the result.
    #
    # @return [Integer]
    # @raise [ArgumentError] when the environment value is not an integer
    def concurrent_count
      @_concurrent_count ||= Integer(ENV['CRAWL_STATION_CONCURRENT_COUNT'] || 1)
    end

    def concurrent_count=(count)
      @_concurrent_count = count
    end

    def schedule
      Schedule.adapter
    end

    # Selects the schedule adapter through Schedule.adapter=.
    def schedule=(item)
      Schedule.adapter = item
    end

    def cache
      Cache.adapter
    end

    # Selects the cache adapter.
    # NOTE(review): assumes Cache exposes adapter= the same way Schedule
    # does -- Cache is not visible here; confirm.
    def cache=(item)
      Cache.adapter = item
    end

    # Proxy support is a placeholder: both methods accept their arguments
    # and do nothing. The reader tolerates an argument because the previous
    # definition required one.
    def proxies(_item = nil)
    end

    def proxies=(_item)
    end

    # Loads the rake files shipped next to this file plus every rake file
    # under <root>/lib/tasks.
    def load_tasks
      task_path = File.expand_path('../', __FILE__)
      [
        "#{task_path}/crawl_station/tasks/*.rake",
        "#{CrawlStation.root}/lib/tasks/**/*.rake"
      ].each { |path| Dir[path].each { |f| load f } }
    end

    # Requires every Ruby file under item/, parser/ and config/ of each
    # directory in <root>/module.
    def load_modules
      %w(item parser config).each do |path|
        Dir["#{CS.root}/module/*/#{path}/**/*.rb"].each { |f| require f }
      end
    end

    # Makes sure a configuration object exists, then requires the project's
    # initializers.
    def init_application
      @config ||= CrawlStation::Configuration
      Dir["#{CS.root}/config/initializers/**/*.rb"].each { |f| require f }
    end

    # Applies the configured adapter name (default 'memory') to both the
    # schedule and the cache.
    #
    # The explicit `self.` receiver is required: a bare `schedule = ...`
    # only creates a local variable and never reaches the setter.
    def config_adapter
      adapter = @config.adapter || 'memory'
      self.schedule = adapter
      self.cache = adapter
    end

    # Pushes every configured parser entry onto the schedule.
    def config_parsers
      parsers = @config.parsers || []
      parsers.each { |p| schedule.push p }
    end

    # Full start-up sequence: configuration and initializers, module code,
    # adapters, then the initial parser entries.
    def boot
      init_application
      load_modules
      config_adapter
      config_parsers
    end

    private

    # Logger.new takes no block, so the level is set on the new instance.
    def default_logger
      ::Logger.new(STDERR).tap { |log| log.level = ::Logger::DEBUG }
    end
  end
end
150
+
151
+ CS = CrawlStation
152
+ Celluloid.boot