spider_framework 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/spider.rb +37 -0
- data/lib/enhanced_logger.rb +37 -0
- data/lib/scheduler.rb +53 -0
- data/lib/spider_framework.rb +4 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 78c21ffecbe0e94d8abd0900f5f150e01406b0f6e3259d21d182b0decfdd7fcc
|
4
|
+
data.tar.gz: 5fa7e8c00333f1980ba7429116c8ed45d7458cc5f761c6f918db646024e2dcad
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4f36eb2f29f6072915025106b6da1536209d08b135f5a8366299afffd0a9f43e00122c34116a0d6d5e3e93b04243f843e3f845dee2ac4405dc22d6567625e93c
|
7
|
+
data.tar.gz: efd5784653d9e802afae75947646aef98dc8bac385fbaf66a849d1804752c7eb67a0c9f0b80fc5ad5e6f077f58e2f7e3005949fe0209bb829da2f15621912480
|
data/bin/spider.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: parses logging options, configures
# SpiderFramework::EnhancedLogger, then loads the spider file named by the
# first remaining argument.

require 'optparse'
require 'logger'
require_relative '../lib/spider_framework'

# Defaults: log everything (DEBUG) and do not restrict output by logger name.
options = {
  log_level: Logger::DEBUG,
  log_only: false
}

# With no arguments at all, behave as if help had been requested.
ARGV.unshift('-h') if ARGV.empty?

parser = OptionParser.new do |opts|
  opts.banner = <<~BANNER
    执行一个爬虫文件
    用法:spider [options] file

  BANNER

  opts.on("-l", "--log-level=LEVEL", "设置日志级别,可选值包括 FATAL、ERROR、WARN、INFO、DEBUG(默认)") do |level|
    # Only the five standard severity names are accepted; anything else is
    # reported as an invalid argument for this option.
    raise OptionParser::InvalidArgument, level unless %w[FATAL ERROR WARN INFO DEBUG].include?(level)

    options[:log_level] = Logger.const_get(level)
  end

  opts.on("-n", "--log-only=A,B,C", Array, "只允许设置的日志器名称在控制台显示") do |names|
    options[:log_only] = names
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  # Report bad options together with the usage text instead of a backtrace.
  abort "#{e.message}\n\n#{parser.help}"
end

# The logger settings must be in place before the spider file is loaded,
# because loggers are built when a class includes EnhancedLogger.
SpiderFramework::EnhancedLogger.level = options[:log_level]
SpiderFramework::EnhancedLogger.names_only = options[:log_only]

program_name = ARGV.shift
# Options without a file argument would otherwise end in `load nil` (TypeError).
abort parser.help if program_name.nil?

load program_name
|
data/lib/enhanced_logger.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Include this module in a class to give it a class-level and an
# instance-level `logger` method; both return the logger bound to that class.
require 'logger'

module SpiderFramework::EnhancedLogger

  # Default severity threshold applied to loggers built by `included`.
  @level = ::Logger::DEBUG

  class << self
    # level:      severity threshold given to every console logger built here.
    # names_only: when truthy, a collection of class names; only classes whose
    #             name it includes log to STDOUT, all others are silenced.
    attr_accessor :level, :names_only

    # Hook run when a class includes this module. Stores a logger in the
    # including class's own @logger and exposes it through a class-level
    # `logger` accessor. The logger is built once, at include time, so
    # `level` and `names_only` must be configured beforehand.
    #
    # Only `base` itself receives @logger; a subclass of `base` inherits the
    # accessor but not the instance variable.
    def included(base)
      # Settings are read through `self` (this module). The module is opened
      # with the compact `SpiderFramework::EnhancedLogger` form, so the bare
      # constant `EnhancedLogger` is not resolvable from this scope.
      base.instance_variable_set(:@logger, build_logger(base.name))

      base.singleton_class.class_eval do
        attr_accessor :logger
      end
    end

    private

    # Builds the logger for the class called `name`: a STDOUT logger with the
    # configured level and a "[time] SEVERITY -- Name: message" line format,
    # or a logger writing to the null device when `names_only` excludes `name`.
    def build_logger(name)
      return ::Logger.new(IO::NULL) if names_only && !names_only.include?(name)

      console = ::Logger.new(STDOUT)
      console.level = level
      console.formatter = proc { |severity, datetime, _progname, msg|
        "[#{datetime.strftime('%F %T')}] #{severity} -- #{name}: #{msg}\n"
      }
      console
    end
  end

  # Instance-level shortcut to the logger bound to the receiver's class.
  def logger
    self.class.logger
  end
end
|
data/lib/scheduler.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# General-purpose scheduler.
#
# Accepts objects that respond to `call` or `start` and runs them in
# first-in, first-out order.

class SpiderFramework::Scheduler
  def initialize
    @item_handlers = {}
    @tasks = []
  end

  # Runs queued tasks from the front of the queue until it is empty. Tasks
  # pushed while the loop is running are picked up as well. A task is invoked
  # through `start` when it responds to it, otherwise through `call`; the
  # block given at push time (if any) is forwarded.
  def loop
    while (entry = @tasks.shift)
      target = entry[:callable]
      entry_point = target.respond_to?(:start) ? :start : :call
      # Forwarding `&nil` passes no block, so one call covers both cases.
      target.send(entry_point, &entry[:block])
    end
  end

  # Appends a task to the queue.
  #
  # callable - object responding to `call` or `start`.
  # block    - optional block handed to the task when it runs.
  #
  # Raises NoMethodError when `callable` responds to neither method.
  # Returns the internal task queue.
  def push(callable, &block)
    runnable = callable.respond_to?(:call) || callable.respond_to?(:start)
    unless runnable
      raise NoMethodError, "undefined method `call' or `start' for #{callable}"
    end

    @tasks << { callable: callable, block: block }
  end

  alias :<< :push

  # Queues the processing of `item` by the handler registered for `type`.
  # The handler is resolved now and invoked later, from `loop`, through
  # `handle` when it responds to it, otherwise through `call`.
  #
  # Raises RuntimeError when no handler is registered for `type`, and
  # NoMethodError when the handler responds to neither `handle` nor `call`.
  def add_item(type, item, &block)
    handler = @item_handlers[type.to_s.to_sym]
    raise "No handler found for type #{type.inspect}" unless handler

    entry_point =
      if handler.respond_to?(:handle)
        :handle
      elsif handler.respond_to?(:call)
        :call
      else
        raise NoMethodError, "undefined method `call' or `handle' for #{handler}"
      end

    deferred = proc { handler.send(entry_point, item, &block) }
    push(deferred)
  end

  # Registers `handler` for items of `type` (normalised to a symbol).
  # Raises RuntimeError when a handler is already registered for that type.
  def add_item_handler(type, handler)
    key = type.to_s.to_sym
    raise 'Handler already set' if @item_handlers.key?(key)

    @item_handlers.store(key, handler)
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spider_framework
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- run27017
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-08-13 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A lightweight ruby spider framework.
|
14
|
+
email: run27017@126.com
|
15
|
+
executables:
|
16
|
+
- spider.rb
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/spider.rb
|
21
|
+
- lib/enhanced_logger.rb
|
22
|
+
- lib/scheduler.rb
|
23
|
+
- lib/spider_framework.rb
|
24
|
+
homepage: https://gitee.com/run27017/spider_framework
|
25
|
+
licenses:
|
26
|
+
- LGPL-2.0
|
27
|
+
metadata: {}
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ">="
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '0'
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubygems_version: 3.0.3
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: Spider Framework
|
47
|
+
test_files: []
|