seiya 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +1 -0
- data/bin/seiya +58 -0
- data/lib/seiya/contrib/pipelines.rb +11 -0
- data/lib/seiya/contrib.rb +1 -0
- data/lib/seiya/item.rb +20 -0
- data/lib/seiya/pipeline.rb +7 -0
- data/lib/seiya/request.rb +35 -0
- data/lib/seiya/response.rb +45 -0
- data/lib/seiya/scheduler.rb +83 -0
- data/lib/seiya/settings.rb +7 -0
- data/lib/seiya/task.rb +35 -0
- data/lib/seiya/util.rb +41 -0
- data/lib/seiya/version.rb +3 -0
- data/lib/seiya.rb +47 -0
- data/sample/test/aa/items.rb +9 -0
- data/sample/test/aa/pipelines/t.rb +18 -0
- data/sample/test/aa/settings.rb +6 -0
- data/sample/test/aa/tasks/test.rb +26 -0
- data/sample/test/aa/tasks.rb +1 -0
- data/sample/test/seiya.ini +2 -0
- metadata +64 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: f514e3349566ec80c0e81e09852949553b1f2ffc
|
|
4
|
+
data.tar.gz: 6eb2642bd8c0b5c1687a9d8fd24fdad9e019c17b
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 10b5496b4f8c5b725ba5a143dedec113984edf3903ee90997398269bc225d4e20e717c82ddd5ba7944264ed7119688a9d477550e4c8dd7450003eb1c7fa846f5
|
|
7
|
+
data.tar.gz: ebaf58dbff82691e387d6bfea6294f04a4eafe28c219b962a70b97d30ccfcd8dd43a94f7875b5ace3b43ba4b64ebd3984e8e15d2057ca9b0d9ac90129f971148
|
data/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
A Ruby web-spider framework modeled on Python's Scrapy.
|
data/bin/seiya
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.expand_path '.'
|
|
4
|
+
$:.unshift File.expand_path '../../lib', __FILE__
|
|
5
|
+
require 'optparse'
|
|
6
|
+
|
|
7
|
+
# Switches gathered from ARGV:
#   :version - set when -v/--version is given
#   :task    - the value passed to -t/--task
#   :args    - every value passed to -a/--argument, in the order given
options = {}

parser = OptionParser.new
parser.banner = 'Usage: seiya [options]'

parser.on('-v', '--version', 'seiya version') { |flag| options[:version] = flag }
parser.on('-tTask', '--task=Task', 'seiya task') { |name| options[:task] = name }
parser.on('-aArg', '--argument=Arg', 'send argument to seiya task') do |value|
  # -a may be repeated; the list is created on first use.
  (options[:args] ||= []) << value
end

# parse! consumes the recognised switches from ARGV.
parser.parse!
|
|
25
|
+
|
|
26
|
+
# Recursively scans +path+ and prepends to the load path every directory that
# directly contains a file named tasks.rb.
#
# Sub-directories are descended into as they are encountered; once tasks.rb is
# seen in a directory, the remaining entries of that directory are not visited.
def extend_load_path(path)
  Dir.foreach(path) do |entry|
    next if entry == '.' || entry == '..'

    candidate = File.join(path, entry)
    if File.directory?(candidate)
      extend_load_path(candidate)
    elsif entry == 'tasks.rb'
      $:.unshift(path)
      break
    end
  end
end
|
|
39
|
+
# Make every directory below the working directory that holds a tasks.rb
# requirable, then load the project's task definitions.
pwd = Dir.pwd
extend_load_path pwd

begin
  require 'tasks'
rescue LoadError => e
  # LoadError is a ScriptError, not a StandardError, so it has to be rescued
  # explicitly for the friendly message to appear at all. Only the absence of
  # tasks.rb itself means "no spiders": a LoadError for some other file
  # (e.g. a gem a task depends on) is re-raised untouched, and other errors
  # raised while loading task files now propagate with their real message.
  raise unless e.path == 'tasks'
  raise 'No spiders!'
end

# Reads the project configuration and instantiates the configured pipelines.
Seiya.setup

# Run the task named with -t/--task, if any.
if options.key? :task
  task_name = options[:task]
  task = Seiya.get_task task_name
  task.run
end

# Print the gem version when -v/--version was given.
if options.key? :version
  p Seiya::VERSION
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require 'seiya/contrib/pipelines'
|
data/lib/seiya/item.rb
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
require 'httpclient'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
require 'singleton'
|
|
4
|
+
require 'seiya/response'
|
|
5
|
+
|
|
6
|
+
module Seiya
  # One HTTP request together with the callback that will consume its response.
  class Request
    # url    - passed unchanged to the HTTP client
    # args   - extra positional arguments forwarded to the HTTP client call
    # method - name of the HTTP verb; stored upper-cased
    def initialize(url, *args, method: 'get')
      @url = url
      @args = args
      @method = method.upcase
      @httpclient = HTTPClient.new
    end

    # Performs the request and wraps the client's result in a Response.
    def get_response
      verb = @method.downcase
      raw = @httpclient.send(verb, @url, *@args)
      Response.new(raw)
    end

    # Stores +block+ as this request's handler. Nothing runs yet: firing the
    # request builds an Enumerator, and the HTTP call happens only when that
    # Enumerator is iterated, at which point +block+ receives the response and
    # the enumerator's yielder.
    def register(&block)
      @handler = proc do
        Enumerator.new { |yielder| block.call(get_response, yielder) }
      end
    end

    # True once a handler has been registered.
    def registered?
      @handler ? true : false
    end

    # Invokes the registered handler and returns the Enumerator it builds.
    def fire
      @handler.call
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
require 'json'

module Seiya
  # Wraps the raw HTTP client response and exposes lazily parsed views of its
  # body (HTML document and JSON).
  class Response
    # resp - the object returned by the HTTP client; it must respond to #body,
    #        and to #http_header for #url.
    def initialize(resp)
      @resp = resp
    end

    # The request URI recorded in the response's HTTP header, as a String.
    def url
      @resp.http_header.request_uri.to_s
    end

    # The raw response body.
    def body
      @resp.body
    end

    # Parses the body as HTML with Nokogiri and memoises the outcome.
    #
    # Returns the parsed document on every call, including the first, or nil
    # when parsing raised. A failed parse is remembered and not retried.
    def doc
      return @doc unless @has_doc.nil?

      @doc = Nokogiri::HTML body
      @has_doc = true
      @doc
    rescue StandardError
      @has_doc = false
      nil
    end

    # Parses the body as JSON and memoises the outcome.
    #
    # Returns the parsed value on every call, including the first, or nil
    # when the body is not valid JSON. A failed parse is remembered and not
    # retried.
    def json
      return @json unless @has_json.nil?

      @json = JSON.parse body
      @has_json = true
      @json
    rescue StandardError
      @has_json = false
      nil
    end

    # True when the body could be parsed as HTML (triggers the parse).
    def has_doc?
      doc
      !!@has_doc
    end

    # True when the body could be parsed as JSON (triggers the parse).
    def has_json?
      json
      !!@has_json
    end
  end
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
require 'singleton'
|
|
2
|
+
|
|
3
|
+
module Seiya
  # Process-wide request scheduler. Registered requests are queued and then
  # drained in batches; each batch is split across worker threads.
  class Scheduler
    include Singleton

    def initialize
      @request_q = Queue.new
      @run = false
    end

    # Queues every request that has a handler registered and, unless a drain
    # is already in progress, drains the queue before returning.
    #
    # Handlers may yield further requests while a drain is running; those are
    # queued here and picked up by the drain loop already in progress.
    def add_requests(requests)
      requests.each do |request|
        @request_q << request if request.registered?
      end
      run unless @run
    end

    private

    # Drains the queue batch by batch until no requests remain.
    #
    # Written as a loop so a long crawl does not grow the call stack, and
    # guarded by +ensure+ so @run is reset even when a worker raises;
    # otherwise the scheduler would ignore every later add_requests call.
    def run
      @run = true
      until @request_q.empty?
        batch = []
        batch << @request_q.pop until @request_q.empty?
        multi_run batch
      end
    ensure
      stop
    end

    # Processor count reported by Util, looked up once.
    def num_processors
      @num_processors ||= Util.num_processors
    end

    # Splits +requests+ into at most num_processors slices and processes each
    # slice on its own thread, waiting for all of them to finish.
    def multi_run(requests)
      return if requests.empty?

      workers = [num_processors, 1].max
      # Ceiling division: every worker gets a slice when there is enough work.
      per_thread = (requests.size + workers - 1) / workers
      threads = requests.each_slice(per_thread).map do |slice|
        Thread.new { process_requests slice }
      end
      threads.each(&:join)
    end

    # Fires each request and walks whatever its handler yields.
    def process_requests(requests)
      requests.each do |request|
        process_gen request.fire
      end
    end

    # Dispatches every element yielded by +gen+:
    #   Array / Enumerator - walked recursively, so arrays of items or requests
    #                        are dispatched element by element
    #   Seiya::Item        - sent through the pipelines
    #   Request            - queued for a later batch
    # Anything else is ignored.
    def process_gen(gen)
      gen.each do |e|
        case e
        when Array then process_gen e
        when Seiya::Item then Seiya.process_item e
        when Request then add_requests [e]
        when Enumerator then process_gen e
        end
      end
    end

    # Marks the scheduler idle so the next add_requests starts a new drain.
    def stop
      @run = false
    end
  end
end
|
data/lib/seiya/task.rb
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
require 'seiya/request'
|
|
2
|
+
require 'seiya/item'
|
|
3
|
+
require 'seiya/pipeline'
|
|
4
|
+
require 'seiya/util'
|
|
5
|
+
require 'seiya/scheduler'
|
|
6
|
+
|
|
7
|
+
# Walks +gen+ and hands every Seiya::Item it yields to Seiya.process_item.
# Nested Enumerators are walked recursively; any other element is ignored.
def process_gen(gen)
  gen.each do |element|
    case element
    when Seiya::Item then Seiya.process_item(element)
    when Enumerator then process_gen(element)
    end
  end
end
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
module Seiya
  # Base class for crawl tasks. #run looks up a #parse method on the task and
  # registers it as the handler for a request to each start URL.
  class Task
    def initialize
      @start_urls = []
    end

    # Builds one request per start URL, each handled by #parse, and hands the
    # whole list to the scheduler. Does nothing when @start_urls is not an
    # Array.
    def run
      return unless @start_urls.is_a?(Array)

      requests = @start_urls.map do |url|
        # #parse is looked up per URL, so an empty list never needs it.
        Request.new(url).tap { |request| request.register(&method(:parse)) }
      end
      Scheduler.instance.add_requests(requests)
    end
  end
end
|
data/lib/seiya/util.rb
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
require 'etc'

module Seiya
  # Stateless helpers: constant lookup and rough CPU accounting.
  module Util
    extend self

    # Resolves a '::'-separated constant path (e.g. 'Tasks::Test') starting at
    # Object. Empty segments are skipped, so a leading '::' is accepted.
    #
    # Raises NameError when a segment does not exist.
    def get_const(str)
      str.split('::').reject(&:empty?).inject(Object) do |scope, name|
        scope.const_get name
      end
    end

    # Number of distinct CPUs that currently have a running ('R') process,
    # according to /proc/<pid>/stat. Returns 0 where /proc is unavailable.
    def processors_in_use
      cpus = []
      Dir.glob('/proc/*/stat') do |filename|
        next if File.directory?(filename)

        begin
          line = File.open(filename) { |file| file.gets }
        rescue SystemCallError
          # The process exited (or became unreadable) between glob and open.
          next
        end
        next if line.nil?

        # Field 2 (comm) is parenthesised and may contain spaces, which would
        # shift every later column; split only what follows the last ')'.
        # After it, index 0 is the state (field 3) and index 36 the CPU last
        # run on (field 39).
        close = line.rindex(')')
        state, cpu =
          if close
            line[(close + 1)..-1].split.values_at(0, 36)
          else
            line.split.values_at(2, 38)
          end
        cpus << cpu.to_i if state == 'R'
      end
      cpus.uniq.length
    end

    # Number of processors. Counts the 'processor' entries in /proc/cpuinfo;
    # when that file is missing or lists none, falls back to Etc.nprocessors
    # (where available) and finally to 1, so the result is always >= 1.
    def num_processors
      count = 0
      if File.readable?('/proc/cpuinfo')
        # Anchor at line start so lines that merely mention the word are not
        # counted.
        count = IO.readlines('/proc/cpuinfo').count { |line| line =~ /\Aprocessor\b/ }
      end
      return count if count > 0

      Etc.respond_to?(:nprocessors) ? Etc.nprocessors : 1
    end

    # Processors without a running process right now (may be negative if the
    # two measurements disagree).
    def num_free_processors
      num_processors - processors_in_use
    end

    # Samples num_free_processors +count+ times, sleeping +wait_time+ seconds
    # after each sample, and returns the rounded mean. Returns 0 when no
    # sample was taken (count <= 0) instead of rounding NaN.
    def estimate_free_cpus(count, wait_time)
      samples = []
      count.times do
        samples << num_free_processors
        sleep(wait_time)
      end
      return 0 if samples.empty?

      (samples.inject(0, :+).to_f / samples.length).round
    end
  end
end
|
data/lib/seiya.rb
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
require 'seiya/version'
|
|
2
|
+
require 'seiya/request'
|
|
3
|
+
require 'seiya/task'
|
|
4
|
+
require 'seiya/item'
|
|
5
|
+
require 'seiya/pipeline'
|
|
6
|
+
require 'seiya/settings'
|
|
7
|
+
|
|
8
|
+
module Seiya
  extend self

  # Passes +item+ through every configured pipeline in order, feeding each
  # pipeline the value returned by the previous one.
  #
  # Returns the value produced by the last pipeline, or +item+ itself when no
  # pipelines are configured (including when #setup has not been called).
  def process_item(item)
    Array(@pipelines).inject(item) do |current, pipeline|
      pipeline.process_item current
    end
  end

  # Loads the project configuration and instantiates the pipelines.
  #
  # conf_file - path of the ini file; its [global] 'settings' entry names the
  #             settings file to require (default 'settings').
  #
  # Pipelines are taken from ::Settings::PIPELINES (when defined) combined
  # with Settings::PIPELINES as resolved from inside this module; on duplicate
  # keys the latter wins. Each key has the form 'require_path|Class::Name' and
  # each value is used as the sort key that fixes the pipeline order.
  def setup(conf_file: 'seiya.ini')
    require 'inifile'
    require 'seiya/util'
    conf = IniFile.load conf_file
    settings_file = conf.to_h.fetch('global', {}).fetch('settings', 'settings')
    require settings_file

    begin
      user_pipelines = ::Settings::PIPELINES
    rescue NameError
      user_pipelines = {}
    end

    # merge (not merge!) so the project's PIPELINES constant is left untouched
    # and may even be frozen.
    pipelines = user_pipelines.merge Settings::PIPELINES

    @pipelines = pipelines.sort_by { |_, priority| priority }.map do |key, _|
      require_str, class_name = key.split('|')
      require require_str
      Util.get_const(class_name).new
    end
  end

  # Instantiates the task class named +task_name+. A name without '::' is
  # looked up under the Tasks namespace.
  def get_task(task_name)
    task_name = "Tasks::#{task_name}" unless task_name.include? '::'
    Util.get_const(task_name).new
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
require 'seiya'
|
|
2
|
+
|
|
3
|
+
# Sample pipelines for the demo project.
module Pipelines
  # Announces itself and tags the item with the name of this pipeline.
  class A < Seiya::Pipeline
    def process_item(item)
      p 'I am A Pipeline'
      item.tap { |tagged| tagged[:pipeline] = 'A' }
    end
  end

  # Announces itself, prints the item and passes it on unchanged.
  class B < Seiya::Pipeline
    def process_item(item)
      p 'I am B Pipeline'
      # Kernel#p returns its single argument, so the item is also the result.
      p item
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require 'seiya'
|
|
2
|
+
require_relative '../items'
|
|
3
|
+
|
|
4
|
+
# Sample task for the demo project.
module Tasks
  class Test < Seiya::Task
    # Seeds four start URLs that differ only in the ?key= letter (a..d).
    def initialize
      @start_urls = ('a'..'d').map { |letter| "http://www.baidu.com/?key=#{letter}" }
    end

    # Handler for the start URLs: yields an item carrying the response URL,
    # then a follow-up request that is handled by #other_parse.
    def parse(response, enum)
      item = Items::Test.new
      item[:url] = response.url
      enum.yield item

      follow_up = Seiya::Request.new('http://www.collipa.com')
      follow_up.register(&method(:other_parse))
      enum.yield follow_up
    end

    # Handler for the follow-up request: yields a single named item.
    def other_parse(response, enum)
      enum.yield Items::Test.new(name: 'yetone', url: response.url)
    end
  end
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require 'tasks/test'
|
metadata
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: seiya
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- yetone
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2015-02-05 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description:
|
|
14
|
+
email: i@yetone.net
|
|
15
|
+
executables:
|
|
16
|
+
- seiya
|
|
17
|
+
extensions: []
|
|
18
|
+
extra_rdoc_files: []
|
|
19
|
+
files:
|
|
20
|
+
- README.md
|
|
21
|
+
- bin/seiya
|
|
22
|
+
- lib/seiya.rb
|
|
23
|
+
- lib/seiya/contrib.rb
|
|
24
|
+
- lib/seiya/contrib/pipelines.rb
|
|
25
|
+
- lib/seiya/item.rb
|
|
26
|
+
- lib/seiya/pipeline.rb
|
|
27
|
+
- lib/seiya/request.rb
|
|
28
|
+
- lib/seiya/response.rb
|
|
29
|
+
- lib/seiya/scheduler.rb
|
|
30
|
+
- lib/seiya/settings.rb
|
|
31
|
+
- lib/seiya/task.rb
|
|
32
|
+
- lib/seiya/util.rb
|
|
33
|
+
- lib/seiya/version.rb
|
|
34
|
+
- sample/test/aa/items.rb
|
|
35
|
+
- sample/test/aa/pipelines/t.rb
|
|
36
|
+
- sample/test/aa/settings.rb
|
|
37
|
+
- sample/test/aa/tasks.rb
|
|
38
|
+
- sample/test/aa/tasks/test.rb
|
|
39
|
+
- sample/test/seiya.ini
|
|
40
|
+
homepage: https://github.com/yetone/seiya
|
|
41
|
+
licenses:
|
|
42
|
+
- WTFPL
|
|
43
|
+
metadata: {}
|
|
44
|
+
post_install_message:
|
|
45
|
+
rdoc_options: []
|
|
46
|
+
require_paths:
|
|
47
|
+
- lib
|
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
49
|
+
requirements:
|
|
50
|
+
- - ">="
|
|
51
|
+
- !ruby/object:Gem::Version
|
|
52
|
+
version: '0'
|
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
|
+
requirements:
|
|
55
|
+
- - ">="
|
|
56
|
+
- !ruby/object:Gem::Version
|
|
57
|
+
version: '0'
|
|
58
|
+
requirements: []
|
|
59
|
+
rubyforge_project:
|
|
60
|
+
rubygems_version: 2.4.5
|
|
61
|
+
signing_key:
|
|
62
|
+
specification_version: 4
|
|
63
|
+
summary: A ruby spider like scrapy-python
|
|
64
|
+
test_files: []
|