crawl_station 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +24 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +98 -0
- data/.travis.yml +9 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +6 -0
- data/benchmarks/.gitkeep +0 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/crawl_station.gemspec +32 -0
- data/examples/.gitkeep +0 -0
- data/exe/station +3 -0
- data/lib/crawl_station/application_record.rb +17 -0
- data/lib/crawl_station/cache.rb +13 -0
- data/lib/crawl_station/cache_adapters/abstract_adapter.rb +6 -0
- data/lib/crawl_station/cache_adapters/db_adapter.rb +6 -0
- data/lib/crawl_station/cache_adapters/memory_adapter.rb +17 -0
- data/lib/crawl_station/cli.rb +26 -0
- data/lib/crawl_station/command/create.rb +20 -0
- data/lib/crawl_station/command/generate.rb +88 -0
- data/lib/crawl_station/command.rb +26 -0
- data/lib/crawl_station/concerns/adapter_concern.rb +26 -0
- data/lib/crawl_station/configuration.rb +7 -0
- data/lib/crawl_station/fundation/parse_struct.rb +45 -0
- data/lib/crawl_station/launcher.rb +28 -0
- data/lib/crawl_station/logger.rb +12 -0
- data/lib/crawl_station/producer.rb +64 -0
- data/lib/crawl_station/ruby_version_check.rb +9 -0
- data/lib/crawl_station/schedule.rb +13 -0
- data/lib/crawl_station/schedule_adapters/abstract_adapter.rb +18 -0
- data/lib/crawl_station/schedule_adapters/db_adapter.rb +18 -0
- data/lib/crawl_station/schedule_adapters/memory_adapter.rb +21 -0
- data/lib/crawl_station/tasks/db.rake +33 -0
- data/lib/crawl_station/tasks/launcher.rake +6 -0
- data/lib/crawl_station/tasks/module.rake +6 -0
- data/lib/crawl_station/tasks.rb +3 -0
- data/lib/crawl_station/templates/create/Gemfile +11 -0
- data/lib/crawl_station/templates/create/Rakefile +2 -0
- data/lib/crawl_station/templates/create/config/boot.rb +6 -0
- data/lib/crawl_station/templates/create/config/initializers/station_config.rb +4 -0
- data/lib/crawl_station/templates/create/lib/tasks/.gitkeep +0 -0
- data/lib/crawl_station/templates/create/module/.gitkeep +0 -0
- data/lib/crawl_station/templates/generate/item.erb +9 -0
- data/lib/crawl_station/templates/generate/migration.erb +4 -0
- data/lib/crawl_station/templates/generate/module/config/.gitkeep +0 -0
- data/lib/crawl_station/templates/generate/module/db/migrate/.gitkeep +0 -0
- data/lib/crawl_station/templates/generate/module/item/.gitkeep +0 -0
- data/lib/crawl_station/templates/generate/module/parser/.gitkeep +0 -0
- data/lib/crawl_station/templates/generate/module/tasks/.gitkeep +0 -0
- data/lib/crawl_station/templates/generate/parser.erb +10 -0
- data/lib/crawl_station/utils.rb +36 -0
- data/lib/crawl_station/version.rb +3 -0
- data/lib/crawl_station.rb +152 -0
- metadata +199 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
require 'ostruct'
|
|
2
|
+
module CrawlStation
  # Hash-backed description of one crawl job.
  #
  # The underlying hash is exposed as +parse+; the keys +namespace+, +parser+,
  # +item+ and +link+ additionally get dedicated reader/writer methods.
  # All keys are stored as symbols, and both string and symbol keys are
  # accepted by #[] / #[]=.
  class ParseStruct
    extend Forwardable
    attr_accessor :parse

    # Generate `name` / `name=` accessors that read and write the hash.
    %w(namespace parser item link).each do |method_name|
      define_method(method_name) { get_value(method_name) }
      define_method("#{method_name}=") { |v| set_value(method_name, v) }
    end

    # @param opts [Hash, nil] initial attributes (string or symbol keys).
    #   The hash is copied with symbolized keys, so the caller's object is
    #   never modified; +nil+ is treated as an empty hash.
    def initialize(opts = {})
      @parse = (opts || {}).deep_symbolize_keys
    end

    # Resolves the constant named by "<namespace>/parser/<parser>" after
    # camelizing the path (ActiveSupport String#camelize / #constantize).
    #
    # @return [Class] the parser class for this job
    # @raise [NameError] when no such constant exists
    def parser_class
      path = "#{namespace}/parser/#{parser}"
      path.camelize.constantize
    end

    # Resolves the constant named by "<namespace>/item/<parser>".
    #
    # NOTE(review): the path is built from +parser+, not from +item+, even
    # though an +item+ attribute exists - confirm whether that is intended.
    #
    # @return [Class] the item class for this job
    # @raise [NameError] when no such constant exists
    def item_class
      path = "#{namespace}/item/#{parser}"
      path.camelize.constantize
    end

    # Reads an attribute by string or symbol key.
    def [](item)
      get_value(item)
    end

    # Writes an attribute by string or symbol key.
    def []=(item, value)
      set_value(item, value)
    end

    private

    # Keys are always looked up as symbols.
    def get_value(item)
      @parse[item.to_sym]
    end

    def set_value(item, value)
      @parse[item.to_sym] = value
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
module CrawlStation
  # Celluloid actor that starts the producers and then idles until an
  # 'exit_launcher' notification delivers a truthy payload.
  class Launcher
    include Celluloid
    include Celluloid::Notifications

    # Registers #exit_message as the handler for the 'exit_launcher' topic.
    def initialize
      @exit_sign = false
      subscribe 'exit_launcher', :exit_message
    end

    # Creates one Producer pool per configured concurrency slot, starts each
    # asynchronously, then polls the exit flag every 10 seconds.
    #
    # NOTE(review): every iteration builds its own pool (not a single pool
    # sized by concurrent_count) - confirm this is intended.
    def start
      CS.logger.debug('station starting ... ')

      CS.concurrent_count.times do
        CS::Producer.pool(args: [CS.schedule, CS.cache]).async.start
      end

      loop do
        sleep(10)
        break if @exit_sign
      end
    end

    # Notification callback: stores the published payload as the exit flag.
    def exit_message(_topic, data)
      @exit_sign = data
    end
  end
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
module CrawlStation
  # Facade that forwards any logging call (debug, info, error, ...) to the
  # object stored in +Logger.logger+.
  module Logger
    class << self
      # The backing logger; +nil+ until one is assigned.
      attr_accessor :logger

      # Forwards unknown methods to the backing logger.
      #
      # @return [false] when no logger has been configured (call is dropped)
      # @return [Object] otherwise whatever the backing logger returns
      # @raise [NoMethodError] when the backing logger does not respond to
      #   +method_name+
      def method_missing(method_name, *args, &block)
        return false if logger.nil?
        return super unless logger.respond_to?(method_name)
        logger.public_send(method_name, *args, &block)
      end

      # Keeps #respond_to? in step with #method_missing: a method is
      # advertised only when a logger is configured and implements it.
      def respond_to_missing?(method_name, include_private = false)
        (!logger.nil? && logger.respond_to?(method_name)) || super
      end
    end
  end
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
module CrawlStation
  # Celluloid actor that keeps taking items off the schedule, crawls each one
  # with the item's parser class, queues the links found in the result and
  # saves the remaining data through the item's item class.
  class Producer
    include Celluloid

    # NOTE: the reader generated for :cache is replaced by the
    # #cache(item, data) helper defined further down; only `cache=` remains.
    attr_accessor :schedule, :cache, :proxies, :proxy

    # @param schedule [#empty?, #pop, #push, #done, #failed] work queue
    # @param cache [#include?, #[]=] store keyed by link
    # @param proxies [Object, nil] stored but not used in this class
    def initialize(schedule, cache, proxies = nil)
      @schedule = schedule
      @cache = cache
      @proxies = proxies
    end

    # Runs #loop_parser until it returns a falsy value, then logs completion.
    def start
      loop { break unless loop_parser }
      Logger.debug "#{self} done"
    end

    # Processes at most one scheduled item.
    #
    # Sleeps briefly when the schedule is empty or the popped item is already
    # cached. Every visible code path returns a truthy value.
    def loop_parser
      return sleep(0.2) || true if @schedule.empty?
      item = @schedule.pop
      item = CS::ParseStruct.new(item) if item.is_a?(Hash)
      return sleep(0.2) || true if parsed?(item)
      Logger.debug "start parse #{item.link}"
      data = parse_item(item)
      return true if data.nil? || data.empty?
      data = parse_links(data, item.namespace)
      return true if data.empty?
      item.item_class.new.save(item.link, data)
      true
    end

    # Crawls one item and reports the outcome to the schedule.
    #
    # Only StandardError is rescued, so signals, exit requests and other
    # non-StandardError exceptions propagate instead of being swallowed.
    #
    # @return [Object, nil] the crawl result, or nil when crawling failed
    def parse_item(item)
      data = cache(item) { item.parser_class.new.crawl(item.link) }
      @schedule.done(item)
      data
    rescue StandardError => e
      # backtrace is nil for exceptions that were never raised
      trace = Array(e.backtrace).first(11).join("\n")
      Logger.error(format("%s: %s\n%s", item.link, e.message, trace))
      @schedule.failed(item)
      nil
    end

    # Removes the 'pages' and 'details' entries from +data+ and pushes every
    # entry that has a not-yet-cached 'link' onto the schedule.
    #
    # @return [Hash] +data+ without the 'pages' / 'details' keys
    def parse_links(data, namespace)
      enqueue = lambda do |entry|
        next if entry['link'].blank? || parsed?(entry)
        @schedule.push ParseStruct.new(parser: entry['parser'], link: entry['link'], namespace: namespace)
      end
      %w[pages details].each do |field|
        data.delete(field)&.each { |entry| enqueue.call(entry) }
      end
      data
    end

    # True when +data+ is nil or its 'link' is already present in the cache.
    def parsed?(data)
      data.nil? || @cache.include?(data['link'])
    end

    # Stores +data+ (by default the placeholder 'parsing') under the item's
    # link, then, when a block is given, replaces it with the block's result.
    #
    # NOTE(review): if the block raises, the placeholder stays in the cache,
    # so #parsed? keeps reporting the link as handled.
    def cache(item, data = 'parsing')
      @cache[item['link']] = data
      data = yield if block_given?
      @cache[item['link']] = data
      data
    end
  end
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
module CrawlStation
  # Module-level facade over the schedule adapter held in @adapter.
  module Schedule
    extend self
    extend Forwardable
    include Concerns::AdapterConcern

    # Queue operations are forwarded to the current adapter.
    def_delegator :@adapter, :push
    def_delegator :@adapter, :pop
    def_delegator :@adapter, :empty?
    def_delegator :@adapter, :include?

    # Selects the adapter by name; the lookup itself is done by
    # custom_adapter within the CrawlStation::ScheduleAdapters namespace.
    def adapter=(adapter_name)
      custom_adapter(adapter_name, CrawlStation::ScheduleAdapters)
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module CrawlStation
  module ScheduleAdapters
    # In-process schedule backed by a Ruby Queue.
    #
    # Queue is already safe for concurrent producers and consumers, so calls
    # are forwarded directly. Wrapping them in an extra monitor made a
    # blocking #pop on an empty queue hold the lock, which prevented any
    # #push from ever running.
    class MemoryAdapter < AbstractAdapter
      def initialize
        @queue = Queue.new
      end

      # Appends an item to the queue.
      def push(*args)
        @queue.push(*args)
      end

      # Removes and returns the next item; arguments are passed straight to
      # Queue#pop (which blocks on an empty queue unless told otherwise).
      def pop(*args)
        @queue.pop(*args)
      end

      # True when nothing is queued.
      def empty?(*args)
        @queue.empty?(*args)
      end

      # Called when crawling +item+ failed; only logs the link.
      def failed(item)
        CS.logger.debug "#{item.link} failed"
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
namespace :db do
  # Runs db:<operator> for the station itself ('crawl_station') and then for
  # every entry found under <root>/module.
  #
  # A Rake task runs only once per process unless it is re-enabled, so both
  # the task and its db:environment prerequisite are re-enabled before each
  # invocation; otherwise the nested invocations would be silently skipped.
  def db_operation(operator)
    task = Rake::Task["db:#{operator}"]
    environment = Rake::Task['db:environment']
    module_names = Dir["#{CrawlStation.root}/module/*"].map { |dir| File.basename(dir) }

    ['crawl_station', *module_names].each do |name|
      environment.reenable
      task.reenable
      task.invoke(name)
    end
  end

  desc 'db:create[module_name] if module_name is nil, create all module databases'
  task :create, [:module_name] => :environment do |_t, args|
    m_name = args[:module_name]
    # `next` (not `return`) leaves a task block without a LocalJumpError.
    next db_operation(:create) if m_name.nil?
    m_name = nil if m_name == 'crawl_station'
    # NOTE(review): Utils.create_database is not defined in the visible
    # utils.rb - confirm it exists before relying on this task.
    CrawlStation::Utils.create_database(m_name)
  end

  desc 'db:migrate[module_name] if module_name is nil, migrate all module migrations'
  task :migrate, [:module_name] => :environment do |_t, args|
    version = ENV['VERSION']
    module_name = args[:module_name]
    next db_operation(:migrate) if module_name.nil?
    path = "#{CrawlStation::Utils.module_path(module_name)}/db/migrate"
    # The station's own migrations live under the gem path.
    path = "#{CrawlStation::Utils.gem_path}/db/migrate" if module_name == 'crawl_station'
    ActiveRecord::Migrator.migrate(path, version ? version.to_i : nil)
  end

  # Connects ActiveRecord using the database config of the given module
  # (Utils.database_config falls back to the global config).
  task :environment, [:module_name] do |_t, args|
    config = CrawlStation::Utils.database_config args[:module_name]
    ActiveRecord::Base.logger = CrawlStation.logger
    ActiveRecord::Base.establish_connection config
  end
end
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
require 'yaml'
|
|
2
|
+
require 'active_support/core_ext'
|
|
3
|
+
module CrawlStation
  # Path and configuration helpers shared by the station and its rake tasks.
  module Utils
    class << self
      # @return [String] directory of the given module under the station root
      def module_path(module_name)
        "#{CS.root}/module/#{module_name}"
      end

      # Picks the database.yml to use: the module's own file when a module
      # name is given and the file exists, otherwise the global one.
      #
      # @return [String] path to a database.yml
      def database_path(module_name = nil)
        global_db_path = "#{CS.root}/config/database.yml"
        return global_db_path if module_name.nil?
        path = "#{module_path(module_name)}/config/database.yml"
        File.exist?(path) ? path : global_db_path
      end

      # Renders the selected database.yml through ERB, parses it as YAML and
      # returns the section for the current environment with symbol keys.
      #
      # @return [Hash, nil] nil when the file is empty or has no section for
      #   the current environment
      def database_config(module_name = nil)
        rendered = ERB.new(File.read(database_path(module_name))).result
        settings = load_yaml(rendered) || {}
        settings.deep_symbolize_keys[CS.env.to_sym]
      end

      # @return [String] directory holding the bundled templates
      def templates_path
        "#{gem_path}/crawl_station/templates"
      end

      # @return [String] absolute path of a template given relative to
      #   #templates_path
      def template_filepath(path)
        "#{templates_path}/#{path}"
      end

      # @return [String] the directory two levels above this file
      def gem_path
        File.expand_path('../../', __FILE__)
      end

      private

      # Parses a local configuration document. The file is developer-owned
      # (it is already evaluated as ERB), so YAML aliases such as
      # `<<: *default` must keep working; newer Psych versions reject them in
      # plain `load`, hence `unsafe_load` when it is available.
      def load_yaml(text)
        ::YAML.respond_to?(:unsafe_load) ? ::YAML.unsafe_load(text) : ::YAML.load(text)
      end
    end
  end
end
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
require 'crawl_station/version'
|
|
2
|
+
require 'pathname'
|
|
3
|
+
require 'active_support'
|
|
4
|
+
require 'active_record'
|
|
5
|
+
require 'active_support/dependencies/autoload'
|
|
6
|
+
require 'logger'
|
|
7
|
+
require 'thor'
|
|
8
|
+
require 'celluloid/debug'
|
|
9
|
+
require 'celluloid/current'
|
|
10
|
+
module CrawlStation # :nodoc:
|
|
11
|
+
extend ActiveSupport::Autoload
|
|
12
|
+
|
|
13
|
+
autoload :Configuration
|
|
14
|
+
autoload :Logger
|
|
15
|
+
autoload :Utils
|
|
16
|
+
autoload :ApplicationRecord
|
|
17
|
+
autoload :Producer
|
|
18
|
+
autoload :Launcher
|
|
19
|
+
autoload :Cache
|
|
20
|
+
autoload :Schedule
|
|
21
|
+
autoload :ParseStruct, 'crawl_station/fundation/parse_struct'
|
|
22
|
+
autoload :Command
|
|
23
|
+
|
|
24
|
+
module ScheduleAdapters
|
|
25
|
+
extend ActiveSupport::Autoload
|
|
26
|
+
|
|
27
|
+
autoload :AbstractAdapter
|
|
28
|
+
autoload :MemoryAdapter
|
|
29
|
+
autoload :DbAdapter
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
module Command
|
|
33
|
+
extend ActiveSupport::Autoload
|
|
34
|
+
|
|
35
|
+
autoload :Create
|
|
36
|
+
autoload :Generate
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
module CacheAdapters
|
|
40
|
+
extend ActiveSupport::Autoload
|
|
41
|
+
|
|
42
|
+
autoload :AbstractAdapter
|
|
43
|
+
autoload :MemoryAdapter
|
|
44
|
+
autoload :DbAdapter
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
module Concerns
|
|
48
|
+
extend ActiveSupport::Autoload
|
|
49
|
+
|
|
50
|
+
autoload :AdapterConcern
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
class << self
  # Returns the configuration object, creating the reference on first use.
  # With a block, yields the configuration and returns the block's value.
  def config
    @config ||= CrawlStation::Configuration
    return @config unless block_given?
    yield @config
  end

  # Current environment name, taken from CRAWL_STATION_ENV and defaulting
  # to 'development'.
  def env
    @_env ||= ActiveSupport::StringInquirer.new(ENV['CRAWL_STATION_ENV'] || 'development')
  end

  def env=(environment)
    @_env = ActiveSupport::StringInquirer.new(environment)
  end

  # The station root is the current working directory.
  def root
    Pathname.new(File.expand_path('.'))
  end

  # Logger shared with CrawlStation::Logger; defaults to a DEBUG-level
  # logger on STDERR. (::Logger.new ignores a block, so the level is set
  # through #tap.)
  def logger
    @_logger ||= begin
      CrawlStation::Logger.logger ||= ::Logger.new(STDERR).tap do |log|
        log.level = ::Logger::DEBUG
      end
    end
  end

  def logger=(logger)
    @_logger = CrawlStation::Logger.logger = logger
  end

  # Number of producer pools to start. Read from
  # CRAWL_STATION_CONCURRENT_COUNT (parsed as a base-10 integer so that
  # callers can use Integer#times) and defaulting to 1.
  #
  # @raise [ArgumentError] when the environment value is not an integer
  def concurrent_count
    @_concurrent_count ||= Integer(ENV.fetch('CRAWL_STATION_CONCURRENT_COUNT', '1'), 10)
  end

  def concurrent_count=(count)
    @_concurrent_count = count
  end

  # The active schedule adapter.
  def schedule
    Schedule.adapter
  end

  # Selects the schedule adapter by name.
  def schedule=(item)
    Schedule.adapter = item
  end

  # The active cache adapter.
  def cache
    Cache.adapter
  end

  # Selects the cache adapter by name.
  # NOTE(review): assumes Cache exposes `adapter=` like Schedule does.
  def cache=(item)
    Cache.adapter = item
  end

  # Configured proxies (nil until assigned). The optional argument is
  # ignored; it is accepted only so that the former one-argument call form
  # keeps working.
  def proxies(_item = nil)
    @_proxies
  end

  def proxies=(item)
    @_proxies = item
  end

  # Loads the rake files bundled with the gem and those under
  # <root>/lib/tasks.
  def load_tasks
    task_path = File.expand_path('../', __FILE__)
    [
      "#{task_path}/crawl_station/tasks/*.rake",
      "#{CrawlStation.root}/lib/tasks/**/*.rb".sub(/\.rb\z/, '.rake')
    ].each { |path| Dir[path].each { |f| load f } }
  end

  # Requires every item, parser and config file of every module.
  def load_modules
    %w(item parser config).each do |path|
      Dir["#{CS.root}/module/*/#{path}/**/*.rb"].each { |f| require f }
    end
  end

  # Sets up the configuration reference and runs the initializers.
  def init_application
    @config ||= CrawlStation::Configuration
    Dir["#{CS.root}/config/initializers/**/*.rb"].each { |f| require f }
  end

  # Applies the configured adapter name (default 'memory') to both the
  # schedule and the cache. The explicit `self.` receiver is required:
  # without it Ruby assigns local variables instead of calling the setters.
  def config_adapter
    adapter = config.adapter || 'memory'
    self.schedule = adapter
    self.cache = adapter
  end

  # Pushes every configured start item onto the schedule.
  def config_parsers
    Array(config.parsers).each { |entry| schedule.push entry }
  end

  # Full start-up sequence.
  def boot
    init_application
    load_modules
    config_adapter
    config_parsers
  end
end
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
CS = CrawlStation
|
|
152
|
+
Celluloid.boot
|