spider_framework 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/spider.rb +37 -0
- data/lib/enhanced_logger.rb +37 -0
- data/lib/scheduler.rb +53 -0
- data/lib/spider_framework.rb +4 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 78c21ffecbe0e94d8abd0900f5f150e01406b0f6e3259d21d182b0decfdd7fcc
|
4
|
+
data.tar.gz: 5fa7e8c00333f1980ba7429116c8ed45d7458cc5f761c6f918db646024e2dcad
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4f36eb2f29f6072915025106b6da1536209d08b135f5a8366299afffd0a9f43e00122c34116a0d6d5e3e93b04243f843e3f845dee2ac4405dc22d6567625e93c
|
7
|
+
data.tar.gz: efd5784653d9e802afae75947646aef98dc8bac385fbaf66a849d1804752c7eb67a0c9f0b80fc5ad5e6f077f58e2f7e3005949fe0209bb829da2f15621912480
|
data/bin/spider.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line launcher for a spider file.
#
# Parses the logging options, applies them to SpiderFramework::EnhancedLogger,
# then loads the spider file named by the first remaining argument.
require 'optparse'
require 'logger'
require_relative '../lib/spider_framework'

options = {
  log_level: Logger::DEBUG,
  log_only: false
}

# With no arguments at all, behave as if help had been requested.
ARGV.unshift('-h') if ARGV.empty?

# Keep a reference to the parser so that usage text can be shown on errors.
parser = OptionParser.new do |opts|
  opts.banner = <<~BANNER
    执行一个爬虫文件
    用法:spider [options] file

  BANNER

  opts.on("-l", "--log-level=LEVEL", "设置日志级别,可选值包括 FATAL、ERROR、WARN、INFO、DEBUG(默认)") do |level|
    # Only the five standard severities are accepted; anything else is
    # reported as an invalid argument for this option.
    raise OptionParser::InvalidArgument, level unless %w[FATAL ERROR WARN INFO DEBUG].include?(level)

    options[:log_level] = Logger.const_get(level)
  end

  opts.on("-n", "--log-only=A,B,C", Array, "只允许设置的日志器名称在控制台显示") do |names|
    options[:log_only] = names
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  # Report bad options together with the usage text instead of a backtrace.
  abort("#{e.message}\n\n#{parser.help}")
end

SpiderFramework::EnhancedLogger.level = options[:log_level]
SpiderFramework::EnhancedLogger.names_only = options[:log_only]

program_name = ARGV.shift
# Options were given but no spider file: `load nil` would raise a TypeError,
# so show the usage text and exit with a failure status instead.
abort(parser.help) unless program_name

load program_name
|
data/lib/enhanced_logger.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# 将本模块加入到目标类里,可生成类方法和实例方法 logger,它返回绑定到目标类的日志。
|
2
|
+
require 'logger'
|
3
|
+
|
4
|
+
# Mixin that equips the including class with a class-level +logger+ accessor
# and an instance-level +logger+ method, both returning the Logger that was
# built for that class at include time.
#
# Module-level settings (read once, when the module is included):
# * +level+      - severity threshold applied to console loggers
#                  (initially Logger::DEBUG).
# * +names_only+ - when truthy, a collection of class names; only classes
#                  whose name is in it log to STDOUT, all others get a logger
#                  that writes to IO::NULL.
module SpiderFramework::EnhancedLogger
  @level = ::Logger::DEBUG

  class << self
    attr_accessor :level, :names_only

    # Hook invoked by Ruby when this module is included into +base+.
    # Stores a logger in +base+'s class-level @logger and exposes it through
    # a +logger+ accessor on +base+ itself.
    #
    # @param base [Module] the class or module including this mixin
    def included(base)
      base.instance_variable_set(:@logger, build_logger(base.name))

      base.singleton_class.class_eval do
        attr_accessor :logger
      end
    end

    private

    # Builds the logger bound to the class called +name+.
    #
    # The settings are read through +self+ (this module) rather than through
    # the bare constant +EnhancedLogger+: with the compact
    # `module SpiderFramework::EnhancedLogger` form, +SpiderFramework+ is not
    # part of the lexical scope, so the bare constant would not resolve.
    #
    # @param name [String, nil] name of the including class
    # @return [Logger]
    def build_logger(name)
      allowed = names_only
      # A class filtered out by names_only gets a logger that discards output.
      return Logger.new(IO::NULL) unless !allowed || allowed.include?(name)

      console = Logger.new(STDOUT)
      console.level = level
      console.formatter = proc { |severity, datetime, _progname, msg|
        "[#{datetime.strftime('%F %T')}] #{severity} -- #{name}: #{msg}\n"
      }
      console
    end
  end

  # Instance-level access to the logger stored on the object's class.
  #
  # @return [Logger, nil] whatever +self.class.logger+ returns
  def logger
    self.class.logger
  end
end
|
data/lib/scheduler.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
##通用调度器
|
2
|
+
#
|
3
|
+
# 调度器接受具有 call 或 start 方法的对象,并按照先入先出的顺序执行。
|
4
|
+
|
5
|
+
# General-purpose scheduler.
#
# Accepts objects that respond to +call+ or +start+ and runs them in
# first-in, first-out order. Items can also be routed to handlers that were
# registered per item type.
class SpiderFramework::Scheduler
  def initialize
    @tasks = []          # pending tasks, oldest first
    @item_handlers = {}  # item type (Symbol) => handler
  end

  # Runs queued tasks until the queue is empty. Tasks pushed while the loop
  # is running are picked up as well, because the queue is re-checked before
  # every iteration.
  def loop
    until @tasks.empty?
      callable, block = @tasks.shift.values_at(:callable, :block)
      # +start+ takes precedence when the object responds to both methods.
      entry = callable.respond_to?(:start) ? :start : :call
      block ? callable.send(entry, &block) : callable.send(entry)
    end
  end

  # Appends a task to the queue.
  #
  # @param callable [#call, #start] object to run
  # @param block [Proc, nil] optional block forwarded to the task when it runs
  # @return [self] the scheduler, so that `scheduler << a << b` queues both
  #   tasks instead of appending to the internal array directly
  # @raise [NoMethodError] if +callable+ responds to neither +call+ nor +start+
  def push(callable, &block)
    raise NoMethodError, "undefined method `call' or `start' for #{callable}" \
      unless callable.respond_to?(:call) || callable.respond_to?(:start)

    @tasks.push(callable: callable, block: block)
    self
  end

  alias :<< :push

  # Queues +item+ for processing by the handler registered for +type+.
  #
  # @param type [#to_s] item type; looked up as a Symbol
  # @param item [Object] value passed to the handler
  # @param block [Proc, nil] optional block forwarded to the handler
  # @return [self]
  # @raise [RuntimeError] if no handler is registered for +type+
  # @raise [NoMethodError] if the handler responds to neither +handle+ nor +call+
  def add_item(type, item, &block)
    handler = @item_handlers[type.to_s.to_sym]
    raise "No handler found for type #{type.inspect}" unless handler

    raise NoMethodError, "undefined method `call' or `handle' for #{handler}" \
      unless handler.respond_to?(:handle) || handler.respond_to?(:call)
    # +handle+ takes precedence when the handler responds to both methods.
    entry = handler.respond_to?(:handle) ? :handle : :call

    # Wrap the handler invocation so that it runs later, in queue order.
    task = proc do
      block ? handler.send(entry, item, &block) : handler.send(entry, item)
    end

    push(task)
  end

  # Registers the handler for an item type. Each type can be set only once.
  #
  # @param type [#to_s] item type; stored as a Symbol
  # @param handler [#handle, #call] checked only when an item is added
  # @raise [RuntimeError] if a handler is already registered for +type+
  def add_item_handler(type, handler)
    type = type.to_s.to_sym
    raise 'Handler already set' if @item_handlers.key?(type)

    @item_handlers[type] = handler
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spider_framework
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- run27017
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-08-13 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A lightweight ruby spider framework.
|
14
|
+
email: run27017@126.com
|
15
|
+
executables:
|
16
|
+
- spider.rb
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/spider.rb
|
21
|
+
- lib/enhanced_logger.rb
|
22
|
+
- lib/scheduler.rb
|
23
|
+
- lib/spider_framework.rb
|
24
|
+
homepage: https://gitee.com/run27017/spider_framework
|
25
|
+
licenses:
|
26
|
+
- LGPL-2.0
|
27
|
+
metadata: {}
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ">="
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '0'
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubygems_version: 3.0.3
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: Spider Framework
|
47
|
+
test_files: []
|