spider_framework 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 78c21ffecbe0e94d8abd0900f5f150e01406b0f6e3259d21d182b0decfdd7fcc
4
+ data.tar.gz: 5fa7e8c00333f1980ba7429116c8ed45d7458cc5f761c6f918db646024e2dcad
5
+ SHA512:
6
+ metadata.gz: 4f36eb2f29f6072915025106b6da1536209d08b135f5a8366299afffd0a9f43e00122c34116a0d6d5e3e93b04243f843e3f845dee2ac4405dc22d6567625e93c
7
+ data.tar.gz: efd5784653d9e802afae75947646aef98dc8bac385fbaf66a849d1804752c7eb67a0c9f0b80fc5ad5e6f077f58e2f7e3005949fe0209bb829da2f15621912480
@@ -0,0 +1,37 @@
#!/usr/bin/env ruby

# Command-line entry point: parses the logging options, applies them to
# SpiderFramework::EnhancedLogger, then loads the spider file named by the
# first positional argument.

require 'optparse'
require 'logger'
require_relative '../lib/spider_framework'

options = {
  log_level: Logger::DEBUG,
  log_only: false
}

# With no arguments at all, behave as if -h had been passed.
ARGV.unshift('-h') if ARGV.empty?

parser = OptionParser.new do |opts|
  opts.banner = <<~BANNER
    执行一个爬虫文件
    用法:spider [options] file

  BANNER

  opts.on("-l", "--log-level=LEVEL", "设置日志级别,可选值包括 FATAL、ERROR、WARN、INFO、DEBUG(默认)") do |level|
    # Only the listed names are accepted; each one is a constant on Logger.
    raise OptionParser::InvalidArgument, level unless %w[FATAL ERROR WARN INFO DEBUG].include?(level)

    options[:log_level] = Logger.const_get(level)
  end

  opts.on("-n", "--log-only=A,B,C", Array, "只允许设置的日志器名称在控制台显示") do |names|
    options[:log_only] = names
  end
end

# Report bad options as a short message plus usage instead of a backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  abort "#{e.message}\n\n#{parser.help}"
end

SpiderFramework::EnhancedLogger.level = options[:log_level]
SpiderFramework::EnhancedLogger.names_only = options[:log_only]

# parse! strips the options it recognised, so the spider file is whatever is
# left first in ARGV. Only options may have been given; `load nil` would
# raise TypeError, so stop with the usage text instead.
program_name = ARGV.shift
abort parser.help unless program_name

load program_name
@@ -0,0 +1,37 @@
# Include this module in a class to give it a class-level and an
# instance-level `logger` method; both return the logger bound to that class.
require 'logger'

module SpiderFramework::EnhancedLogger

  # Severity applied to console loggers unless a caller assigns another one.
  @level = ::Logger::DEBUG

  class << self
    # level:      severity given to each console logger when it is created.
    # names_only: nil/false to let every logger print, otherwise a collection
    #             of class names whose loggers may write to STDOUT.
    attr_accessor :level, :names_only

    # Hook invoked when the module is included: defines a `logger` accessor
    # on the including class and stores that class's logger in it.
    #
    # The settings are read from this module directly (self), not through the
    # bare constant `EnhancedLogger`: the compact
    # `module SpiderFramework::EnhancedLogger` form does not put
    # SpiderFramework in the lexical scope, so that constant is not
    # resolvable from here.
    def included(base)
      base.singleton_class.class_eval { attr_accessor :logger }
      base.logger = logger_for(base.name)
    end

    private

    # Builds the logger for the class called `name`: a STDOUT logger tagged
    # with the class name, or one writing to the null device when names_only
    # is set and does not list `name`.
    def logger_for(name)
      allowed = names_only
      return ::Logger.new(IO::NULL) if allowed && !allowed.include?(name)

      console = ::Logger.new(STDOUT)
      console.level = level
      console.formatter = proc do |severity, datetime, _progname, msg|
        "[#{datetime.strftime('%F %T')}] #{severity} -- #{name}: #{msg}\n"
      end
      console
    end
  end

  # Instance-level access to the logger stored on the object's class.
  def logger
    self.class.logger
  end
end
@@ -0,0 +1,53 @@
## General-purpose scheduler
#
# The scheduler accepts objects that respond to `call` or `start` and runs
# them in first-in, first-out order.

class SpiderFramework::Scheduler
  def initialize
    @tasks = []
    @item_handlers = {}
  end

  # Runs queued tasks from the front of the queue until none are left.
  # Tasks pushed while another task is running are picked up as well.
  def loop
    while (entry = @tasks.shift)
      runnable = entry[:callable]
      attached = entry[:block]

      # `start` takes precedence over `call` when both are available.
      entry_point = runnable.respond_to?(:start) ? :start : :call
      if attached
        runnable.send(entry_point, &attached)
      else
        runnable.send(entry_point)
      end

      # Nothing else happens after a task has run.
    end
  end

  # Appends a task to the queue. The optional block is stored and handed to
  # the task when it runs.
  #
  # Raises NoMethodError when the object responds to neither `call` nor
  # `start`.
  def push(callable, &block)
    if %i[call start].none? { |name| callable.respond_to?(name) }
      raise NoMethodError, "undefined method `call' or `start' for #{callable}"
    end

    @tasks << { callable: callable, block: block }
  end

  alias_method :<<, :push

  # Queues `item` for the handler registered under `type`. The handler's
  # `handle` method is preferred over `call`; the optional block is forwarded
  # to the handler when the queued task runs.
  #
  # Raises RuntimeError when no handler is registered for `type`, and
  # NoMethodError when the handler responds to neither `handle` nor `call`.
  def add_item(type, item, &block)
    handler = @item_handlers[type.to_s.to_sym]
    raise "No handler found for type #{type.inspect}" unless handler

    entry_point = %i[handle call].find { |name| handler.respond_to?(name) }
    unless entry_point
      raise NoMethodError, "undefined method `call' or `handle' for #{handler}"
    end

    deferred = proc do
      if block
        handler.send(entry_point, item, &block)
      else
        handler.send(entry_point, item)
      end
    end

    push(deferred)
  end

  # Registers `handler` under `type`, normalised to a symbol.
  #
  # Raises RuntimeError when that type already has a handler.
  def add_item_handler(type, handler)
    key = type.to_s.to_sym
    raise 'Handler already set' if @item_handlers.key?(key)

    @item_handlers[key] = handler
  end
end
@@ -0,0 +1,4 @@
# Top-level namespace of the gem. It is declared before the component files
# are loaded because they define their constants with the compact
# `SpiderFramework::Name` form, which needs the module to exist already.
module SpiderFramework; end

require_relative 'scheduler'
require_relative 'enhanced_logger'
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spider_framework
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - run27017
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-08-13 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A lightweight ruby spider framework.
14
+ email: run27017@126.com
15
+ executables:
16
+ - spider.rb
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/spider.rb
21
+ - lib/enhanced_logger.rb
22
+ - lib/scheduler.rb
23
+ - lib/spider_framework.rb
24
+ homepage: https://gitee.com/run27017/spider_framework
25
+ licenses:
26
+ - LGPL-2.0
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubygems_version: 3.0.3
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: Spider Framework
47
+ test_files: []