jobmaster 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/job_master/gen_methods.rb +68 -0
- data/lib/job_master/stealth.rb +13 -0
- data/lib/job_master.rb +138 -0
- metadata +73 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2c5405222718b1846cc2b84ce1b83d99307fba52
|
4
|
+
data.tar.gz: 838630db1843875775075cc9d9321a5eb0173a02
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 327dc4714bb69f279dec5ef74ec7287cb4600bd543fb1183c6729a861f7f36b4d49a6c4992e4ef3ccc599868d1ae1727d3a00308b5b225862f941b19ec260fd2
|
7
|
+
data.tar.gz: 516b2ec4d1a7e75e2ae907e453c5a7e42357f3a2ebb344841bc9e015b72885c02237799d6a9a4e6513984255aacce5c21cee4f1da67c50225f83aceff54ae5d8
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'flex_core'
|
2
|
+
|
3
|
+
# Core extension: gives +nil+ a +to_hash+ method so that calling +to_hash+
# on a missing value yields an empty Hash instead of raising NoMethodError.
#
# NOTE(review): +to_hash+ is Ruby's implicit-conversion hook, so this patch
# also makes +nil+ acceptable wherever Ruby implicitly converts an argument
# to a Hash. Confirm that is intended before relying on it elsewhere.
class NilClass
  # @return [Hash] a new, empty Hash on every call
  def to_hash
    Hash.new
  end
end
|
8
|
+
|
9
|
+
# Unit generators for JobMaster. Each method returns a list of work units.
class JobMaster
  # Builds a randomly ordered list of Julian day numbers covering a date range.
  #
  # @param args [Hash] +:earliest+ is a date string understood by Date.parse;
  #   +:latest+ is either such a date string or the literal 'none', meaning today.
  # @return [Array<Integer>] every Julian day number from earliest to latest
  #   (inclusive), shuffled.
  def date_gen(args)
    first_day = Date.parse(args[:earliest]).jd
    last_day = args[:latest] == 'none' ? Date.today.jd : Date.parse(args[:latest]).jd
    (first_day..last_day).to_a.shuffle
  end

  # Builds a randomly ordered list of units from the rows of a database table.
  #
  # @param args [Hash] +:table_name+ (String or Symbol), +:columns+ (optional
  #   list of columns to keep) and +:filters+ (optional Hash of column => value
  #   that a row must match on every entry to be kept).
  # @return [Array<String>] the +inspect+ form of each surviving row, shuffled.
  def from_table(args)
    table_name = args[:table_name]
    columns = args[:columns]
    filters = args[:filters]

    table_name = table_name.to_sym if table_name.is_a?(String)
    # SECURITY NOTE(review): String values are run through eval, which executes
    # arbitrary Ruby. This is only acceptable if :columns / :filters always come
    # from a trusted source -- confirm, or replace with a real parser.
    columns = eval(columns) if columns.is_a?(String)
    filters = eval(filters) if filters.is_a?(String)

    rows = database[table_name].all
    # `keep` is not a core Hash method; presumably supplied by flex_core -- verify.
    rows.map! { |row| row.keep(*columns) } unless columns.nil?

    unless filters.nil?
      # Drop a row as soon as any filter column differs from the wanted value.
      rows.reject! { |row| filters.any? { |key, value| row[key] != value } }
    end

    rows.map!(&:inspect)
    rows.shuffle!
  end

  # Reads a CSV file and returns the requested columns of every data row.
  #
  # Header cells are stripped and downcased before matching. Everything up to
  # and including the last empty header cell is ignored, and the data rows are
  # indexed with the same offset so values stay aligned with their headers.
  # Labels are matched after the same strip/downcase normalisation; the
  # returned hashes are keyed by the labels exactly as passed in.
  #
  # @param file [String] path of the CSV file.
  # @param labels [Array<String>] header labels of the columns to extract.
  # @return [Array<Hash>] one Hash (label => cell) per data row, in file order.
  # @raise [ArgumentError] if the file has no header row, a remaining header
  #   cell is empty, header labels are not unique, or a label is not found.
  def from_csv(file, *labels)
    # File.read closes the handle; the old File.open(...).read leaked it.
    text = File.read(file).delete("\r")
    rows = CSV.parse(text)

    header_row = rows.shift
    raise ArgumentError, 'CSV file has no header row' if header_row.nil?

    headers = header_row.map { |cell| cell && cell.strip.downcase }

    # Array#rindex returns nil (not -1) when there is no empty cell.
    last_blank = headers.rindex(nil)
    offset = last_blank.nil? ? 0 : last_blank + 1
    headers = headers.drop(offset)

    raise ArgumentError.new('Header cannot contain empty cells') if headers.any? { |h| h.nil? || h.empty? }
    raise ArgumentError.new('Header labels must be unique.') if headers.size != headers.uniq.size

    indexes = labels.map do |label|
      position = headers.index(label.to_s.strip.downcase)
      raise ArgumentError, "Unknown header label: #{label.inspect}" if position.nil?

      # Shift back into the coordinates of the untrimmed data rows.
      position + offset
    end

    rows.map { |row| labels.zip(row.values_at(*indexes)).to_h }
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'tunnel_blick'
|
2
|
+
|
3
|
+
class JobMaster
  # Creates a TunnelBlick for this job master's database, connects it for the
  # given websites and registers those websites on it.
  #
  # The domain of each website is taken as the second slash-separated segment
  # of its +home_url+ (the host in "scheme://host/path") and looked up in the
  # +:websites+ table by +domain_url+ to obtain the row id.
  #
  # @param max_requests passed through to TunnelBlick#connect_smart.
  # @param args passed through to TunnelBlick#connect_smart.
  # @param websites objects responding to +home_url+.
  # @return [TunnelBlick] the connected tunnel (also stored in @tunnel).
  # @raise [ArgumentError] if a website has no matching row in +:websites+.
  def stealth(max_requests, args, *websites)
    @tunnel = TunnelBlick.new(database)
    sites_table = database[:websites]

    dom_ids = websites.map do |ws|
      domain = ws.home_url.scan(%r{[^/]+})[1]
      record = sites_table[domain_url: domain]
      # Without this guard a missing row surfaced as NoMethodError on nil,
      # giving no hint about which website was unregistered.
      if record.nil?
        raise ArgumentError,
              "No websites row with domain_url #{domain.inspect} (home_url: #{ws.home_url.inspect})"
      end

      record[:id]
    end

    @tunnel.connect_smart(max_requests, args, dom_ids)
    @tunnel.add_websites(*websites)
    @tunnel
  end
end
|
data/lib/job_master.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'job_master/gen_methods'
|
2
|
+
require 'job_master/stealth'
|
3
|
+
require 'flex_core'
|
4
|
+
require 'flex_pg'
|
5
|
+
|
6
|
+
# Hands out the work units of a program's job to machines, one unit at a
# time, and records how long each machine spent. Job state lives in the
# +:job_log+ table; program configuration in +:program_constraints+.
class JobMaster
  attr_reader :database, :programs_db, :jobs_db, :websites

  # @param database an object answering +[]+ with a table name; expected to be
  #   a Sequel database given the dataset calls below.
  def initialize(database)
    # Machine identity: the current OS user name.
    @myself = ENV['USER']
    @database = database
    @programs_db = database[:program_constraints]
    @jobs_db = database[:job_log]
  end

  # Replaces the list of websites units are dispatched to.
  #
  # @param websites the website objects; only the first one is used by
  #   #start_single and #step_single.
  def add_websites(*websites)
    @websites = websites
  end

  class << self
    # Generator name => argument names.
    # NOTE(review): defined inside `class << self`, so the constant is scoped
    # to the singleton class; and :date_range_gen does not match the instance
    # generator named date_gen -- confirm which name is intended.
    LEGEND = {
      date_range_gen: [:earliest, :latest],
      from_csv: ['file', '*labels']
    }

    # Placeholder; has no behaviour yet.
    def date_range_gen
    end
  end

  # Processes units of +prog_name+ one after another until the time budget is
  # spent or no unit is left.
  #
  # @param prog_name [String, Symbol] program name; also the method invoked on
  #   the first website with each unit.
  # @param minutes [Numeric] time budget in minutes, checked before each unit.
  # @param sleep [Numeric] pause in seconds after each unit.
  # @return [nil]
  def start_single(prog_name, minutes = 10, sleep = 3)
    @start_time = Time.now
    set_up_job(prog_name)

    while (Time.now - @start_time) / 60 < minutes
      unit_start = Time.now
      unit = next_unit

      # next_unit signals exhaustion with false (not nil), so test truthiness.
      break unless unit

      @websites.first.send(prog_name.to_sym, unit)

      unit_time = (Time.now - unit_start) / 60
      puts unit_time
      Kernel.sleep(sleep)

      # Record the unit right away so the last unit before the time limit is
      # not left checked out. 60.0 avoids integer division dropping the pause.
      update_job(unit_time + sleep / 60.0)
    end
  end

  # Processes exactly one unit of +prog_name+.
  #
  # @param prog_name [String, Symbol] program name; also the method invoked on
  #   the first website with the unit.
  # @return [false, Object] false when no unit is left, otherwise the result
  #   of #update_job.
  def step_single(prog_name)
    @start_time = Time.now
    set_up_job(prog_name)
    unit = next_unit
    return false unless unit

    @websites.first.send(prog_name.to_sym, unit)
    unit_time = (Time.now - @start_time) / 60
    puts unit_time
    update_job(unit_time)
  end

  # Finds the newest constraints row for +program+ that lists this machine,
  # then finds the newest job for it, creating that job (by running the row's
  # generator method) if none exists. Sets @program_id and @job_id.
  #
  # @param program [String] program name.
  # @raise [ArgumentError] if no constraints row matches program and machine.
  def set_up_job(program)
    last_matching = programs_db.where(program: program, @myself => Sequel.pg_array_op(:machines).any).order(:id).last
    if last_matching.nil?
      raise ArgumentError, "No program constraints found for #{program.inspect} on machine #{@myself.inspect}"
    end

    @program_id = last_matching[:id]
    job_lookup = jobs_db.where(program: program, constraints: @program_id).order(:id)
    the_job = job_lookup.last

    if the_job.nil?
      units = send(last_matching[:method].to_sym, last_matching[:constraints].merge(program: program))
      insert_job_details(program, units, last_matching[:machines])
      # Re-read the row just inserted; previously the_job stayed nil here and
      # the assignment below raised NoMethodError on every first run.
      the_job = job_lookup.last
    end

    @job_id = the_job[:id]
  end

  # Inserts a fresh job row for +program+.
  #
  # @param program [String] program name.
  # @param units [Array] all work units of the job.
  # @param machines [Enumerable] machine names; each starts with 0 minutes.
  # @return whatever +jobs_db.insert+ returns.
  def insert_job_details(program, units, machines)
    time_share = machines.each_with_object({}) { |machine, shares| shares[machine] = 0 }

    details = {
      program: program,
      created: Time.now,
      constraints: @program_id,
      units: Sequel.pg_array(units),
      size: units.size,
      total_minutes: 0.0,
      time_share: Sequel.hstore(time_share),
      # NOTE(review): placeholder entry kept from the original; presumably a
      # stand-in for an empty hstore -- confirm whether {} works instead.
      unit_checkout: Sequel.hstore({:apples => :bananas})
    }
    jobs_db.insert(details)
  end

  # Returns the unit this machine should work on: the one it already has
  # checked out, or else the last remaining unit, which is then removed from
  # the job's unit list and recorded as checked out by this machine.
  #
  # @return [Object, false] the unit, or false when no units remain.
  def next_unit
    job = jobs_db[id: @job_id]
    checkout = job[:unit_checkout].to_hash
    return checkout[@myself] if checkout.key?(@myself)

    units = job[:units]
    return false if units.empty?

    unit = units.pop
    jobs_db.where(id: @job_id).update(
      :units => units,
      :unit_checkout => Sequel.hstore(Sequel.expr(:unit_checkout)).merge(@myself => unit))
    unit
  end

  # Adds +unit_time+ to this machine's share and to the job total, and clears
  # this machine's checked-out unit.
  #
  # @param unit_time [Numeric] minutes spent on the finished unit.
  def update_job(unit_time)
    job = jobs_db[id: @job_id]

    # find_add! and delete! are not core Hash methods; presumably supplied by
    # flex_core -- verify.
    time_share = job[:time_share].to_hash
    time_share.find_add!(@myself, unit_time)

    unit_checkout = job[:unit_checkout].to_hash
    unit_checkout.delete!(@myself)

    jobs_db.where(id: @job_id).update(
      :total_minutes => Sequel.expr(:total_minutes) + unit_time,
      :time_share => Sequel.hstore(time_share),
      :unit_checkout => Sequel.hstore(unit_checkout)
    )
  end
end
|
metadata
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jobmaster
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Eugene Lai
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-03-05 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: flex_core
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: flex_pg
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: To do
|
42
|
+
email: ejt.lai@gmail.com
|
43
|
+
executables: []
|
44
|
+
extensions: []
|
45
|
+
extra_rdoc_files: []
|
46
|
+
files:
|
47
|
+
- lib/job_master.rb
|
48
|
+
- lib/job_master/gen_methods.rb
|
49
|
+
- lib/job_master/stealth.rb
|
50
|
+
homepage:
|
51
|
+
licenses: []
|
52
|
+
metadata: {}
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
requirements: []
|
68
|
+
rubyforge_project:
|
69
|
+
rubygems_version: 2.5.2
|
70
|
+
signing_key:
|
71
|
+
specification_version: 4
|
72
|
+
summary: Job manager for multi-website automated scraping
|
73
|
+
test_files: []
|