spider_framework 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 78c21ffecbe0e94d8abd0900f5f150e01406b0f6e3259d21d182b0decfdd7fcc
4
+ data.tar.gz: 5fa7e8c00333f1980ba7429116c8ed45d7458cc5f761c6f918db646024e2dcad
5
+ SHA512:
6
+ metadata.gz: 4f36eb2f29f6072915025106b6da1536209d08b135f5a8366299afffd0a9f43e00122c34116a0d6d5e3e93b04243f843e3f845dee2ac4405dc22d6567625e93c
7
+ data.tar.gz: efd5784653d9e802afae75947646aef98dc8bac385fbaf66a849d1804752c7eb67a0c9f0b80fc5ad5e6f077f58e2f7e3005949fe0209bb829da2f15621912480
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'logger'
5
+ require_relative '../lib/spider_framework'
6
+
7
# Command-line entry point: parses logging options, configures
# SpiderFramework::EnhancedLogger, then loads the spider file named by the
# first non-option argument.

# Defaults used when the corresponding switch is not given.
options = {
  log_level: Logger::DEBUG,
  log_only: false
}

# Names accepted by --log-level; each one is a constant on ::Logger.
log_level_names = %w[FATAL ERROR WARN INFO DEBUG]

# With no arguments at all, behave as if help had been requested.
ARGV.unshift('-h') if ARGV.empty?

parser = OptionParser.new do |opts|
  opts.banner = <<~BANNER
    执行一个爬虫文件
    用法:spider [options] file

  BANNER

  opts.on("-l", "--log-level=LEVEL", "设置日志级别,可选值包括 FATAL、ERROR、WARN、INFO、DEBUG(默认)") do |level|
    # Accept any letter case ("info", "Info", "INFO").
    level = level.upcase
    raise OptionParser::InvalidArgument, level unless log_level_names.include?(level)

    options[:log_level] = Logger.const_get(level)
  end

  opts.on("-n", "--log-only=A,B,C", Array, "只允许设置的日志器名称在控制台显示") do |names|
    options[:log_only] = names
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  # Report bad switches as a usage error instead of an uncaught backtrace.
  abort("#{e.message}\n\n#{parser.help}")
end

# parse! strips the switches it recognised; what remains first is the file.
program_name = ARGV.shift
# Only switches were given (e.g. `spider -l INFO`): there is nothing to load.
abort(parser.help) unless program_name

# Loggers read this configuration when a class includes EnhancedLogger, so it
# has to be in place before the spider file is loaded.
SpiderFramework::EnhancedLogger.level = options[:log_level]
SpiderFramework::EnhancedLogger.names_only = options[:log_only]

load program_name
@@ -0,0 +1,37 @@
1
# Including this module in a target class gives that class a class-level and
# an instance-level `logger` method, both returning a logger bound to the
# target class.
require 'logger'

module SpiderFramework
  # Nested (rather than `module SpiderFramework::EnhancedLogger`) so that the
  # bare `EnhancedLogger` references below resolve through the lexical scope.
  module EnhancedLogger
    # Severity threshold applied when no other level has been assigned.
    @level = ::Logger::DEBUG

    class << self
      # level      - severity threshold given to console loggers built later.
      # names_only - when truthy, a collection of class names; only classes
      #              whose name is included get a logger that writes to STDOUT.
      attr_accessor :level, :names_only

      # Hook run by Ruby when a class includes this module. Builds the logger
      # bound to +base+, stores it in the class-level @logger of +base+, and
      # defines a class-level `logger` accessor on it.
      #
      # `level` and `names_only` are read here, so they must be configured
      # before the class includes the module.
      def included(base)
        class_name = base.name
        allowed = EnhancedLogger.names_only

        bound_logger =
          if !allowed || allowed.include?(class_name)
            build_console_logger(class_name)
          else
            # Filtered out by names_only: swallow everything this class logs.
            ::Logger.new(IO::NULL)
          end

        base.instance_variable_set(:@logger, bound_logger)
        base.singleton_class.class_eval { attr_accessor :logger }
      end

      private

      # Builds a STDOUT logger whose lines look like
      # "[2020-08-13 01:02:03] INFO -- ClassName: message".
      def build_console_logger(class_name)
        console = ::Logger.new(STDOUT)
        console.level = EnhancedLogger.level
        console.formatter = proc { |severity, datetime, _progname, msg|
          "[#{datetime.strftime('%F %T')}] #{severity} -- #{class_name}: #{msg}\n"
        }
        console
      end
    end

    # Instance-level access to the logger stored on the object's class.
    def logger
      self.class.logger
    end
  end
end
@@ -0,0 +1,53 @@
1
## General-purpose scheduler
#
# The scheduler accepts objects that respond to `call` or `start` and runs
# them in first-in, first-out order.

module SpiderFramework
  # Defined in nested form so the file can be loaded on its own, without the
  # SpiderFramework namespace having been declared beforehand.
  class Scheduler
    def initialize
      @tasks = []          # FIFO queue of { callable:, block: } hashes
      @item_handlers = {}  # item type (Symbol) => handler object
    end

    # Runs queued tasks until the queue is empty. Tasks pushed while the loop
    # is running are appended to the same queue and are executed as well.
    # A task is invoked through `start` when it responds to it, otherwise
    # through `call`; the block given to #push (if any) is passed along.
    def loop
      until @tasks.empty?
        callable, block = @tasks.shift.values_at(:callable, :block)
        entry_point = callable.respond_to?(:start) ? :start : :call
        # `&block` with a nil block passes no block at all.
        callable.send(entry_point, &block)
      end
    end

    # Appends a task to the queue.
    #
    # callable - object responding to `call` or `start`.
    # block    - optional block handed to the task when it is run.
    #
    # Returns the scheduler itself so calls can be chained
    # (`scheduler << a << b`).
    # Raises NoMethodError when +callable+ responds to neither method.
    def push(callable, &block)
      unless callable.respond_to?(:call) || callable.respond_to?(:start)
        raise NoMethodError, "undefined method `call' or `start' for #{callable}"
      end

      @tasks.push(callable: callable, block: block)
      self
    end

    alias :<< :push

    # Queues +item+ for processing by the handler registered for +type+.
    # The handler is invoked through `handle` when it responds to it,
    # otherwise through `call`, receiving +item+ and the optional block.
    #
    # Returns the scheduler itself.
    # Raises RuntimeError when no handler is registered for +type+, and
    # NoMethodError when the handler responds to neither method.
    def add_item(type, item, &block)
      handler = @item_handlers[type.to_s.to_sym]
      raise "No handler found for type #{type.inspect}" unless handler

      unless handler.respond_to?(:handle) || handler.respond_to?(:call)
        raise NoMethodError, "undefined method `call' or `handle' for #{handler}"
      end

      entry_point = handler.respond_to?(:handle) ? :handle : :call
      push(-> { handler.send(entry_point, item, &block) })
    end

    # Registers +handler+ for items of +type+ (a String or Symbol; both forms
    # name the same type). Returns the handler.
    # Raises RuntimeError when a handler is already registered for the type.
    def add_item_handler(type, handler)
      key = type.to_s.to_sym
      raise 'Handler already set' if @item_handlers.key?(key)

      @item_handlers[key] = handler
    end
  end
end
@@ -0,0 +1,4 @@
1
# Root namespace of the gem. It is declared before the component files are
# loaded because they define their constants underneath it.
module SpiderFramework; end

require_relative 'scheduler'
require_relative 'enhanced_logger'
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spider_framework
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - run27017
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-08-13 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A lightweight ruby spider framework.
14
+ email: run27017@126.com
15
+ executables:
16
+ - spider.rb
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/spider.rb
21
+ - lib/enhanced_logger.rb
22
+ - lib/scheduler.rb
23
+ - lib/spider_framework.rb
24
+ homepage: https://gitee.com/run27017/spider_framework
25
+ licenses:
26
+ - LGPL-2.0
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubygems_version: 3.0.3
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: Spider Framework
47
+ test_files: []