seiya 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f514e3349566ec80c0e81e09852949553b1f2ffc
4
+ data.tar.gz: 6eb2642bd8c0b5c1687a9d8fd24fdad9e019c17b
5
+ SHA512:
6
+ metadata.gz: 10b5496b4f8c5b725ba5a143dedec113984edf3903ee90997398269bc225d4e20e717c82ddd5ba7944264ed7119688a9d477550e4c8dd7450003eb1c7fa846f5
7
+ data.tar.gz: ebaf58dbff82691e387d6bfea6294f04a4eafe28c219b962a70b97d30ccfcd8dd43a94f7875b5ace3b43ba4b64ebd3984e8e15d2057ca9b0d9ac90129f971148
data/README.md ADDED
@@ -0,0 +1 @@
1
+ A Ruby web spider framework modeled on Python's Scrapy.
data/bin/seiya ADDED
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Make the current directory and the gem's own lib/ directory requirable
# before anything else is loaded.
$LOAD_PATH.unshift File.expand_path('.')
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
require 'optparse'

# Parsed command-line switches, keyed by :version, :task and :args.
options = {}

parser = OptionParser.new do |opts|
  opts.banner = 'Usage: seiya [options]'

  opts.on('-v', '--version', 'seiya version') { |flag| options[:version] = flag }
  opts.on('-tTask', '--task=Task', 'seiya task') { |name| options[:task] = name }

  # -a may be given several times; each value is appended in order.
  opts.on('-aArg', '--argument=Arg', 'send argument to seiya task') do |arg|
    (options[:args] ||= []) << arg
  end
end
parser.parse!
25
+
26
# Recursively walks +path+ and prepends to the load path ($:) every directory
# that directly contains a `tasks.rb` file, so that `require 'tasks'` can
# resolve it.
#
# Entries are visited in Dir.foreach order; once `tasks.rb` is seen in a
# directory the remaining entries of that directory are not examined.
#
# Directories are tracked by resolved real path so a symlink cycle is walked
# only once, and a directory that cannot be read (or disappears mid-scan) is
# skipped instead of aborting the whole scan.
#
# @param path [String] directory to scan
# @param visited [Hash] real paths already scanned (used by the recursion)
# @return [nil]
def extend_load_path(path, visited = {})
  real = File.realpath path
  return if visited[real]
  visited[real] = true

  Dir.foreach path do |entry|
    next if %w(. ..).include? entry
    child = File.join path, entry
    if File.directory? child
      extend_load_path child, visited
    elsif entry == 'tasks.rb'
      $:.unshift path
      break
    end
  end
rescue SystemCallError
  # Unreadable or vanished directory: nothing to add from here.
  nil
end
39
# Scan the working directory tree for a `tasks.rb` and make it requirable.
pwd = Dir.pwd
extend_load_path pwd

begin
  require 'tasks'
rescue LoadError => e
  # A bare `rescue` does not catch LoadError (it is a ScriptError), so the
  # missing-file case has to be named explicitly. Only a missing tasks.rb
  # means "no spiders"; a LoadError raised from inside tasks.rb (for example
  # a missing gem) is re-raised unchanged so its real cause stays visible.
  raise unless e.path == 'tasks'
  raise 'No spiders!'
rescue StandardError
  raise 'No spiders!'
end

Seiya.setup

# Run the requested task, if any. Values collected through -a/--argument
# (options[:args]) are not consumed here.
if options.key? :task
  task_name = options[:task]
  task = Seiya.get_task task_name
  task.run
end

if options.key? :version
  p Seiya::VERSION
end
@@ -0,0 +1,11 @@
1
+ require 'seiya/pipeline'
2
+
3
# Namespace for the components bundled with the gem.
module Contrib
  module Pipelines
    # Pass-through pipeline: hands every item on unchanged.
    class BasePipeline < Seiya::Pipeline
      # @param item [Object] item travelling through the pipeline chain
      # @return [Object] the very same item
      def process_item(item)
        item
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'seiya/contrib/pipelines'
data/lib/seiya/item.rb ADDED
@@ -0,0 +1,20 @@
1
module Seiya
  # A scraped record: a thin wrapper around a Hash of field => value pairs.
  class Item
    # @param h [Hash] initial fields; they are copied, the caller's hash is
    #   not retained
    def initialize(h = {})
      @data = {}.merge(h)
    end

    # Reads the value stored under +key+ (nil when absent).
    def [](key)
      @data[key]
    end

    # Stores +value+ under +key+.
    def []=(key, value)
      @data.store(key, value)
    end

    # Renders the item exactly like its underlying Hash.
    def inspect
      @data.inspect
    end
  end
end
@@ -0,0 +1,7 @@
1
module Seiya
  # Base class for item pipelines. Subclasses override #process_item and
  # return the (possibly modified) item.
  class Pipeline
    # Default implementation: a no-op that returns +item+ as received.
    #
    # @param item [Object] item to process
    # @return [Object] the same item
    def process_item(item)
      item
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require 'httpclient'
2
+ require 'nokogiri'
3
+ require 'singleton'
4
+ require 'seiya/response'
5
+
6
module Seiya
  # A single HTTP request together with the callback that will consume its
  # response.
  class Request
    # HTTPClient verb methods a request is allowed to dispatch to. Restricting
    # the set keeps a caller-supplied +method+ from invoking arbitrary
    # methods on the client.
    HTTP_METHODS = %w(get head post put patch delete options propfind proppatch trace).freeze

    # @param url [String] address to fetch
    # @param args [Array] extra positional arguments forwarded verbatim to
    #   the HTTPClient verb method
    # @param method [String, Symbol] HTTP verb, case-insensitive
    # @raise [ArgumentError] if +method+ is not one of HTTP_METHODS
    def initialize(url, *args, method: 'get')
      verb = method.to_s.downcase
      unless HTTP_METHODS.include? verb
        raise ArgumentError, "unsupported HTTP method: #{method.inspect}"
      end

      @url = url
      @method = verb.upcase
      @args = args
      @httpclient = HTTPClient.new
    end

    # Performs the HTTP call and wraps the raw result.
    #
    # @return [Seiya::Response]
    def get_response
      Response.new @httpclient.public_send(@method.downcase, @url, *@args)
    end

    # Stores +block+ as the response handler. Nothing is fetched yet: the
    # handler built here only performs the request when the Enumerator
    # returned by #fire is iterated, calling +block+ with the response and
    # the Enumerator's yielder.
    #
    # @raise [ArgumentError] if no block is given
    def register(&block)
      raise ArgumentError, 'a handler block is required' unless block

      @handler = proc do
        Enumerator.new do |enum|
          block.call(get_response, enum)
        end
      end
    end

    # @return [Boolean] whether a handler has been registered
    def registered?
      !@handler.nil?
    end

    # Builds the lazy Enumerator for this request.
    #
    # @return [Enumerator] yields whatever the handler pushes to its yielder
    # @raise [RuntimeError] if no handler has been registered
    def fire
      raise 'no handler registered for this request' unless registered?

      @handler.call
    end
  end
end
@@ -0,0 +1,45 @@
1
# JSON.parse is used below; without this require the constant lookup fails
# and the failure is swallowed as "no JSON".
require 'json'

module Seiya
  # Wraps a raw HTTP response and offers lazily parsed views of its body.
  class Response
    # @param resp [Object] raw response; must respond to #body and
    #   #http_header (presumably an HTTPClient message, see Seiya::Request)
    def initialize(resp)
      @resp = resp
    end

    # @return [String] the request URI recorded in the response header
    def url
      @resp.http_header.request_uri.to_s
    end

    # @return [Object] the raw response body
    def body
      @resp.body
    end

    # Parses the body as HTML and memoizes the result.
    #
    # Nokogiri is not required here; it is expected to have been loaded
    # already. Any StandardError while parsing (including a missing Nokogiri
    # constant) is treated as "no document".
    #
    # @return [Object, nil] the parsed document, or nil when parsing failed
    def doc
      return @doc if @has_doc

      @doc = Nokogiri::HTML body
      @has_doc = true
      @doc
    rescue StandardError
      @has_doc = false
      nil
    end

    # Parses the body as JSON and memoizes the result.
    #
    # @return [Object, nil] the parsed value, or nil when the body is not
    #   valid JSON
    def json
      return @json if @has_json

      @json = JSON.parse body
      @has_json = true
      @json
    rescue StandardError
      @has_json = false
      nil
    end

    # @return [Boolean] whether the body could be parsed as HTML
    def has_doc?
      doc
      !!@has_doc
    end

    # @return [Boolean] whether the body could be parsed as JSON
    def has_json?
      json
      !!@has_json
    end
  end
end
@@ -0,0 +1,83 @@
1
+ require 'singleton'
2
+
3
module Seiya
  # Process-wide request queue. Requests are drained in batches, each batch
  # is split across worker threads, and everything the request handlers
  # yield is routed to the item pipelines or back into the queue.
  class Scheduler
    include Singleton

    def initialize
      @request_q = Queue.new
      @run = false
    end

    # Queues every registered request and starts processing unless a run is
    # already in progress. When called from a handler during a run, the
    # requests are only queued; the active run picks them up.
    #
    # @param requests [Enumerable] objects responding to #registered? and #fire
    def add_requests(requests)
      requests.each do |request|
        @request_q << request if request.registered?
      end
      run unless @run
    end

    private

    # Processes batches until the queue stays empty. Written as a loop so
    # that stack depth does not grow with the number of batches, and the
    # running flag is always cleared, even when a handler raises.
    def run
      @run = true
      multi_run drain_queue until @request_q.empty?
    ensure
      stop
    end

    # Removes and returns everything currently queued.
    def drain_queue
      batch = []
      batch << @request_q.pop until @request_q.empty?
      batch
    end

    def num_processors
      return @num_processors unless @num_processors.nil?
      @num_processors = Util.num_processors
    end

    # Splits +requests+ into roughly one slice per processor and handles each
    # slice in its own thread, waiting for all of them to finish.
    def multi_run(requests)
      # Guard against a processor count of zero, which would divide by zero.
      workers = [num_processors, 1].max
      count = requests.count / workers + 1
      threads = requests.each_slice(count).map do |slice|
        Thread.new { process_requests slice }
      end
      threads.each(&:join)
    end

    def process_requests(requests)
      requests.each do |request|
        process_gen request.fire
      end
    end

    # Consumes everything +gen+ yields.
    def process_gen(gen)
      gen.each { |e| dispatch e }
    end

    # Routes one yielded value: Arrays and Enumerators are unpacked element
    # by element (so an Array may hold Items and Requests directly), Items go
    # to the pipelines, Requests are queued. Anything else is ignored.
    def dispatch(e)
      case e
      when Array, Enumerator
        e.each { |inner| dispatch inner }
      when Seiya::Item
        Seiya.process_item e
      when Request
        add_requests [e]
      end
    end

    def stop
      @run = false
    end
  end
end
@@ -0,0 +1,7 @@
1
module Seiya
  # Built-in default settings.
  module Settings
    # Default pipelines. Each key is "<require path>|<class name>" and each
    # value is a priority: Seiya.setup sorts by it in ascending order before
    # instantiating the classes. Frozen so the shared default cannot be
    # modified by accident.
    PIPELINES = {
      'seiya/contrib|Contrib::Pipelines::BasePipeline' => 2
    }.freeze
  end
end
data/lib/seiya/task.rb ADDED
@@ -0,0 +1,35 @@
1
+ require 'seiya/request'
2
+ require 'seiya/item'
3
+ require 'seiya/pipeline'
4
+ require 'seiya/util'
5
+ require 'seiya/scheduler'
6
+
7
# Walks +gen+ and feeds every Seiya::Item it yields to Seiya.process_item,
# descending into nested Enumerators. Any other value is ignored.
#
# @param gen [#each] source of items and nested enumerators
def process_gen(gen)
  gen.each do |element|
    case element
    when Seiya::Item then Seiya.process_item(element)
    when Enumerator then process_gen(element)
    end
  end
end
16
+
17
+
18
module Seiya
  # Base class for crawl tasks. A subclass sets @start_urls and defines
  # #parse(response, enum), which is registered as the handler of every
  # start request.
  class Task
    def initialize
      @start_urls = []
    end

    # Builds one request per start URL, wires #parse to each of them and
    # hands the whole batch to the scheduler. Does nothing when @start_urls
    # is not an Array.
    def run
      return unless @start_urls.is_a? Array

      batch = @start_urls.map do |url|
        Request.new(url).tap { |req| req.register(&method(:parse)) }
      end
      Scheduler.instance.add_requests batch
    end
  end
end
data/lib/seiya/util.rb ADDED
@@ -0,0 +1,41 @@
1
# Etc.nprocessors gives a portable processor count where available.
require 'etc'

module Seiya
  # Stateless helpers: constant lookup and rough CPU-usage estimates.
  module Util
    extend self

    # Resolves a constant path such as "Foo::Bar" starting from Object.
    # A leading "::" is accepted.
    #
    # @param str [String] constant path
    # @return [Object] the resolved constant
    # @raise [NameError] if any segment is undefined
    def get_const(str)
      str.split('::').reject(&:empty?).inject(Object) do |scope, name|
        scope.const_get name
      end
    end

    # Counts the distinct processors that currently have a process in the
    # running ('R') state, according to /proc/<pid>/stat. Returns 0 where
    # /proc is not available.
    #
    # @return [Integer]
    def processors_in_use
      cpus = []
      Dir.glob('/proc/*/stat') do |filename|
        next if File.directory?(filename)
        state, cpu = read_stat(filename)
        cpus << cpu.to_i if state == 'R'
      end
      cpus.uniq.length
    end

    # @return [Integer] number of processors; taken from Etc.nprocessors when
    #   the running Ruby provides it, otherwise from /proc/cpuinfo
    def num_processors
      return Etc.nprocessors if Etc.respond_to? :nprocessors

      IO.readlines('/proc/cpuinfo').count { |line| line.start_with? 'processor' }
    end

    # @return [Integer] processors minus those currently in use
    def num_free_processors
      num_processors - processors_in_use
    end

    # Samples num_free_processors +count+ times, sleeping +wait_time+ after
    # each sample, and returns the rounded mean.
    #
    # @param count [Integer] number of samples, must be positive
    # @param wait_time [Numeric] seconds to sleep after each sample
    # @return [Integer]
    # @raise [ArgumentError] if +count+ is not positive
    def estimate_free_cpus(count, wait_time)
      raise ArgumentError, 'count must be positive' unless count > 0

      samples = Array.new(count) do
        free = num_free_processors
        sleep(wait_time)
        free
      end
      (samples.inject(0, :+).to_f / samples.length).round
    end

    private

    # Reads the state and processor fields from one /proc/<pid>/stat file.
    # The command name (field 2) is parenthesised and may contain spaces, so
    # the line is split only after the last ')'. Counting from there, state
    # is index 0 and the processor field (39th overall) is index 36.
    # Returns [] when the file cannot be read or parsed, e.g. because the
    # process exited in the meantime.
    def read_stat(filename)
      line = File.open(filename, &:gets)
      return [] unless line
      close = line.rindex(')')
      return [] unless close

      line[(close + 1)..-1].split.values_at(0, 36)
    rescue SystemCallError
      []
    end
  end
end
@@ -0,0 +1,3 @@
1
module Seiya
  # Gem version string; frozen so it cannot be modified in place.
  VERSION = '0.0.1'.freeze
end
data/lib/seiya.rb ADDED
@@ -0,0 +1,47 @@
1
+ require 'seiya/version'
2
+ require 'seiya/request'
3
+ require 'seiya/task'
4
+ require 'seiya/item'
5
+ require 'seiya/pipeline'
6
+ require 'seiya/settings'
7
+
8
module Seiya
  extend self

  # Runs +item+ through every configured pipeline in order, feeding each
  # pipeline the result of the previous one.
  #
  # @param item [Object] item to process
  # @return [Object] the item as returned by the last pipeline; +item+
  #   itself when no pipelines are configured (e.g. before #setup)
  def process_item(item)
    (@pipelines || []).inject(item) do |current, pipeline|
      pipeline.process_item current
    end
  end

  # Loads the project configuration and instantiates the pipelines.
  #
  # The INI file's [global] `settings` entry names a file to require
  # (default 'settings'). That file may define ::Settings::PIPELINES; it is
  # combined with the built-in Seiya::Settings::PIPELINES, the built-in
  # entry winning when both define the same key. Pipelines are then sorted
  # by ascending priority value, required and instantiated.
  #
  # @param conf_file [String] path of the INI file
  def setup(conf_file: 'seiya.ini')
    require 'inifile'
    require 'seiya/util'
    conf = IniFile.load conf_file
    settings_file = conf.to_h.fetch('global', {}).fetch('settings', 'settings')
    require settings_file

    user_pipelines = begin
      ::Settings::PIPELINES
    rescue NameError
      {}
    end

    # Non-destructive merge: the project's own constant must not be modified
    # (and may legitimately be frozen). Inside this module `Settings`
    # resolves to Seiya::Settings.
    pipelines = user_pipelines.merge Settings::PIPELINES
    ordered = pipelines.sort_by { |_, priority| priority }

    @pipelines = ordered.map do |spec, _|
      require_str, class_name = spec.split('|')
      require require_str
      Util.get_const(class_name).new
    end
  end

  # Instantiates a task class by name. Names without '::' are looked up in
  # the Tasks namespace.
  #
  # @param task_name [String] e.g. "Test" or "My::Tasks::Test"
  # @return [Object] a new instance of the task class
  def get_task(task_name)
    qualified = task_name.include?('::') ? task_name : "Tasks::#{task_name}"
    Util.get_const(qualified).new
  end
end
@@ -0,0 +1,9 @@
1
+ require 'seiya'
2
+
3
# Item classes of the sample project.
module Items
  # Sample item with a fixed string representation.
  class Test < Seiya::Item
    # @return [String] a constant placeholder string
    def to_s
      'fuck'
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'seiya'
2
+
3
# Pipelines of the sample project.
module Pipelines
  # Announces itself and tags the item with the pipeline name.
  class A < Seiya::Pipeline
    def process_item(item)
      p 'I am A Pipeline'
      item.tap { |i| i[:pipeline] = 'A' }
    end
  end

  # Announces itself and prints the item it receives.
  class B < Seiya::Pipeline
    def process_item(item)
      p 'I am B Pipeline'
      # Kernel#p returns its single argument, so the item is passed on.
      p item
    end
  end
end
@@ -0,0 +1,6 @@
1
# Settings of the sample project.
module Settings
  # "<require path>|<class name>" => priority. Seiya.setup sorts the entries
  # by ascending priority and merges them in place with its defaults, so
  # this hash is left unfrozen.
  PIPELINES = {
    'aa/pipelines/t|Pipelines::A' => 1,
    'aa/pipelines/t|Pipelines::B' => 2
  }
end
@@ -0,0 +1,26 @@
1
+ require 'seiya'
2
+ require_relative '../items'
3
+
4
# Tasks of the sample project.
module Tasks
  # Sample crawl: four start URLs, each followed by one extra request.
  class Test < Seiya::Task
    def initialize
      @start_urls = %w(a b c d).map { |w| "http://www.baidu.com/?key=#{w}" }
    end

    # Emits an item carrying the response URL, then a follow-up request
    # whose response is handled by #other_parse.
    def parse(response, enum)
      enum.yield Items::Test.new(url: response.url)

      follow_up = Seiya::Request.new 'http://www.collipa.com'
      follow_up.register(&method(:other_parse))
      enum.yield follow_up
    end

    # Emits a single item describing the follow-up response.
    def other_parse(response, enum)
      record = Items::Test.new
      record[:name] = 'yetone'
      record[:url] = response.url
      enum.yield record
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'tasks/test'
@@ -0,0 +1,2 @@
1
+ [global]
2
+ settings = aa/settings
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: seiya
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - yetone
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: i@yetone.net
15
+ executables:
16
+ - seiya
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - bin/seiya
22
+ - lib/seiya.rb
23
+ - lib/seiya/contrib.rb
24
+ - lib/seiya/contrib/pipelines.rb
25
+ - lib/seiya/item.rb
26
+ - lib/seiya/pipeline.rb
27
+ - lib/seiya/request.rb
28
+ - lib/seiya/response.rb
29
+ - lib/seiya/scheduler.rb
30
+ - lib/seiya/settings.rb
31
+ - lib/seiya/task.rb
32
+ - lib/seiya/util.rb
33
+ - lib/seiya/version.rb
34
+ - sample/test/aa/items.rb
35
+ - sample/test/aa/pipelines/t.rb
36
+ - sample/test/aa/settings.rb
37
+ - sample/test/aa/tasks.rb
38
+ - sample/test/aa/tasks/test.rb
39
+ - sample/test/seiya.ini
40
+ homepage: https://github.com/yetone/seiya
41
+ licenses:
42
+ - WTFPL
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubyforge_project:
60
+ rubygems_version: 2.4.5
61
+ signing_key:
62
+ specification_version: 4
63
+ summary: A ruby spider like scrapy-python
64
+ test_files: []