jobmaster 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2c5405222718b1846cc2b84ce1b83d99307fba52
4
+ data.tar.gz: 838630db1843875775075cc9d9321a5eb0173a02
5
+ SHA512:
6
+ metadata.gz: 327dc4714bb69f279dec5ef74ec7287cb4600bd543fb1183c6729a861f7f36b4d49a6c4992e4ef3ccc599868d1ae1727d3a00308b5b225862f941b19ec260fd2
7
+ data.tar.gz: 516b2ec4d1a7e75e2ae907e453c5a7e42357f3a2ebb344841bc9e015b72885c02237799d6a9a4e6513984255aacce5c21cee4f1da67c50225f83aceff54ae5d8
@@ -0,0 +1,68 @@
1
+ require 'flex_core'
2
+
3
# Reopens NilClass so that +nil+ answers +to_hash+ with an empty Hash.
#
# NOTE(review): +to_hash+ is Ruby's implicit-conversion hook, so with this
# patch +nil+ is accepted wherever Ruby implicitly converts an object to a
# Hash. The built-in +nil.to_h+ is the explicit equivalent; confirm callers
# really need +to_hash+ before relying on this.
class NilClass
  # @return [Hash] a new, empty Hash on every call
  def to_hash
    to_h
  end
end
8
+
9
# Unit generators for JobMaster. Each method builds the list of work units
# that a job will hand out.
class JobMaster
  # Builds a shuffled list of Julian Day Numbers covering an inclusive date range.
  #
  # @param args [Hash] +:earliest+ is a date string accepted by Date.parse;
  #   +:latest+ is either such a string or the literal 'none', meaning today
  # @return [Array<Integer>] every Julian Day Number from earliest to latest,
  #   in random order
  def date_gen(args)
    first_day = Date.parse(args[:earliest]).jd
    last_day = args[:latest] == 'none' ? Date.today.jd : Date.parse(args[:latest]).jd
    (first_day..last_day).to_a.shuffle
  end

  # Reads every row of a table, optionally narrows it to some columns and to
  # rows matching all filters, and returns the rows as shuffled strings.
  #
  # @param args [Hash] +:table_name+ (String or Symbol), +:columns+ (list of
  #   column keys, or a String holding Ruby source for one), +:filters+ (Hash
  #   of column => required value, or a String holding Ruby source for one)
  # @return [Array<String>] +inspect+ output of each surviving row, shuffled
  def from_table(args)
    table_name = args[:table_name]
    columns = args[:columns]
    filters = args[:filters]

    table_name = table_name.to_sym if table_name.is_a?(String)
    # SECURITY: eval runs arbitrary Ruby. This is only acceptable while the
    # strings come from a trusted source; left in place because the stored
    # format of these values is not visible here.
    columns = eval(columns) if columns.is_a?(String)
    filters = eval(filters) if filters.is_a?(String)

    rows = database[table_name].all
    # Hash#keep is not a core method; presumably provided by flex_core and
    # expected to retain only the listed keys - TODO confirm.
    rows.map! { |row| row.keep(*columns) } unless columns.nil?

    # NOTE(review): filtering runs after the column projection, so a filter on
    # a column that was not kept compares against nil.
    rows.reject! { |row| filters.any? { |key, value| row[key] != value } } unless filters.nil?

    rows.map!(&:inspect)
    rows.shuffle!
  end

  # Reads a CSV file and returns, for every data row, a Hash holding only the
  # requested columns.
  #
  # Header cells are compared after downcasing and stripping whitespace, and
  # the requested labels are normalised the same way for the lookup. Trailing
  # empty header cells (e.g. from trailing commas) are ignored.
  #
  # @param file [String] path of the CSV file
  # @param labels [Array<String>] header labels of the columns to extract
  # @return [Array<Hash>] one Hash per data row, keyed by the labels as given
  # @raise [ArgumentError] if the header has an empty cell between labels,
  #   has duplicate labels, or lacks one of the requested labels
  def from_csv(file, *labels)
    # File.read closes the handle, unlike File.open(...).read.
    rows = CSV.parse(File.read(file).delete("\r"))

    headers = (rows.shift || []).map do |cell|
      text = cell.to_s.downcase.strip
      text.empty? ? nil : text
    end
    # Drop trailing empty cells only; removing leading ones would shift the
    # header indexes away from the data columns.
    headers.pop while !headers.empty? && headers.last.nil?

    raise ArgumentError, 'Header cannot contain empty cells' if headers.include?(nil)
    raise ArgumentError, 'Header labels must be unique.' if headers.size != headers.uniq.size

    indexes = labels.map do |label|
      headers.index(label.to_s.downcase.strip) ||
        raise(ArgumentError, "Header label not found: #{label}")
    end

    rows.map { |row| labels.zip(row.values_at(*indexes)).to_h }
  end
end
@@ -0,0 +1,13 @@
1
+ require 'tunnel_blick'
2
+
3
# Tunnel support for JobMaster.
class JobMaster
  # Creates a TunnelBlick on this job master's database, connects it for the
  # given websites and registers them on it. The tunnel is also kept in @tunnel.
  #
  # @param max_requests forwarded unchanged to TunnelBlick#connect_smart
  # @param args forwarded unchanged to TunnelBlick#connect_smart
  # @param websites objects responding to +home_url+; each one's id is looked
  #   up in the :websites table by +domain_url+
  # @return the TunnelBlick instance
  def stealth(max_requests, args, *websites)
    @tunnel = TunnelBlick.new(database)

    domain_ids = websites.map do |site|
      # Second slash-free segment of the URL: the host when home_url looks
      # like "scheme://host/...".
      host = site.home_url.scan(/[^\/]+/)[1]
      database[:websites][domain_url: host][:id]
    end

    @tunnel.connect_smart(max_requests, args, domain_ids)
    @tunnel.add_websites(*websites)
    @tunnel
  end
end
data/lib/job_master.rb ADDED
@@ -0,0 +1,138 @@
1
+ require 'job_master/gen_methods'
2
+ require 'job_master/stealth'
3
+ require 'flex_core'
4
+ require 'flex_pg'
5
+
6
# Hands out units of work for a named program and records progress in the
# database. It reads the :program_constraints table (how a program's units are
# generated) and reads/writes the :job_log table (remaining units, per-machine
# time share and which unit each machine currently holds).
class JobMaster
  attr_reader :database, :programs_db, :jobs_db, :websites

  # @param database [#[]] object indexed by table name; must answer
  #   +:program_constraints+ and +:job_log+
  def initialize(database)
    # The current OS user name identifies this machine in the job tables.
    @myself = ENV['USER']
    @database = database
    @programs_db = database[:program_constraints]
    @jobs_db = database[:job_log]
  end

  # Stores the website objects that programs will be run against.
  #
  # @param websites objects responding to the program names passed to
  #   #start_single / #step_single
  # @return [Array] the stored websites
  def add_websites(*websites)
    @websites = websites
  end

  class << self
    # NOTE(review): this constant lives on the singleton class, so it is not
    # reachable as JobMaster::LEGEND, and its :date_range_gen key matches the
    # empty class method below rather than the instance method date_gen.
    LEGEND = {
      date_range_gen: [:earliest, :latest],
      from_csv: ['file', '*labels']
    }

    # Placeholder; currently does nothing and returns nil.
    def date_range_gen
    end
  end

  # Processes units of +prog_name+ on the first website until the job runs
  # out of units or +minutes+ have elapsed.
  #
  # @param prog_name [String, Symbol] program name; also the method invoked on
  #   the first website with each unit
  # @param minutes [Numeric] time budget in minutes, checked before each unit
  # @param sleep [Numeric] seconds to pause after each unit
  # @return [nil]
  def start_single(prog_name, minutes = 10, sleep = 3)
    @start_time = Time.now
    set_up_job(prog_name)

    while (Time.now - @start_time) / 60 < minutes
      unit_start = Time.now
      unit = next_unit
      # next_unit answers false (not nil) when no units are left.
      break unless unit

      @websites.first.send(prog_name.to_sym, unit)

      puts unit_time = (Time.now - unit_start) / 60
      # Kernel.sleep spelled out because the parameter shadows the method name.
      Kernel.sleep(sleep)
      # Float division: with Integer arguments `sleep / 60` truncates to 0.
      unit_time += sleep / 60.0

      # Record right away so the last unit before the time limit is also
      # logged and checked in instead of being handed out again next run.
      update_job(unit_time)
    end
  end

  # Processes exactly one unit of +prog_name+ on the first website.
  #
  # @param prog_name [String, Symbol] program name; also the method invoked on
  #   the first website with the unit
  # @return [false, Object] false when no unit is available, otherwise the
  #   result of #update_job
  def step_single(prog_name)
    @start_time = Time.now
    set_up_job(prog_name)
    unit = next_unit
    # next_unit answers false (not nil) when no units are left.
    return false unless unit

    @websites.first.send(prog_name.to_sym, unit)
    puts unit_time = (Time.now - @start_time) / 60
    update_job(unit_time)
  end

  # Finds the newest constraints row for +program+ that lists this machine,
  # then finds the newest job built from it or creates one. Sets @program_id
  # and @job_id.
  #
  # @param program [String, Symbol] program name
  # @return the job id
  # @raise [ArgumentError] if no constraints row matches
  def set_up_job(program)
    last_matching = programs_db
                    .where(program: program, @myself => Sequel.pg_array_op(:machines).any)
                    .order(:id)
                    .last
    if last_matching.nil?
      raise ArgumentError, "No program constraints found for #{program.inspect} on machine #{@myself.inspect}"
    end

    @program_id = last_matching[:id]
    the_job = jobs_db.where(program: program, constraints: @program_id).order(:id).last

    @job_id =
      if the_job.nil?
        # The constraints row names the generator method that builds the units.
        units = send(last_matching[:method].to_sym, last_matching[:constraints].merge(program: program))
        # Relies on the dataset's insert returning the new row's primary key.
        insert_job_details(program, units, last_matching[:machines])
      else
        the_job[:id]
      end
  end

  # Inserts a new job row with a zeroed time share for every machine.
  #
  # @param program [String, Symbol] program name
  # @param units [Array] work units for the job
  # @param machines [Array] machine names allowed to work on the job
  # @return whatever +jobs_db.insert+ returns
  def insert_job_details(program, units, machines)
    time_share = machines.each_with_object({}) { |machine, shares| shares[machine] = 0 }

    details = {
      program: program,
      created: Time.now,
      constraints: @program_id,
      units: Sequel.pg_array(units),
      size: units.size,
      total_minutes: 0.0,
      time_share: Sequel.hstore(time_share),
      # NOTE(review): looks like placeholder data; kept unchanged because its
      # purpose is not visible here - TODO confirm whether an empty hstore works.
      unit_checkout: Sequel.hstore({:apples => :bananas})
    }
    jobs_db.insert(details)
  end

  # Returns the unit this machine should work on: the one it already holds,
  # or else a unit popped from the job's list and recorded as checked out.
  #
  # @return [Object, false] the unit, or false when the job has no units left
  def next_unit
    job = jobs_db[id: @job_id]
    # `|| {}` keeps this working without the NilClass#to_hash patch.
    checkout = (job[:unit_checkout] || {}).to_hash
    return checkout[@myself] if checkout.key?(@myself)

    units = job[:units]
    return false if units.empty?

    unit = units.pop
    jobs_db.where(id: @job_id).update(
      :units => units,
      :unit_checkout => Sequel.hstore(Sequel.expr(:unit_checkout)).merge(@myself => unit)
    )
    unit
  end

  # Adds +unit_time+ to the job's total and to this machine's time share, and
  # removes this machine's entry from the unit checkout.
  #
  # @param unit_time [Numeric] minutes spent on the finished unit
  # @return whatever the dataset's +update+ returns
  def update_job(unit_time)
    job = jobs_db[id: @job_id]

    time_share = (job[:time_share] || {}).to_hash
    # find_add! is not a core Hash method; presumably provided by flex_core
    # to add unit_time to this machine's entry - TODO confirm.
    time_share.find_add!(@myself, unit_time)

    unit_checkout = (job[:unit_checkout] || {}).to_hash
    unit_checkout.delete(@myself)

    jobs_db.where(id: @job_id).update(
      :total_minutes => Sequel.expr(:total_minutes) + unit_time,
      :time_share => Sequel.hstore(time_share),
      :unit_checkout => Sequel.hstore(unit_checkout)
    )
  end
end
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jobmaster
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Eugene Lai
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: flex_core
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: flex_pg
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: To do
42
+ email: ejt.lai@gmail.com
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files: []
46
+ files:
47
+ - lib/job_master.rb
48
+ - lib/job_master/gen_methods.rb
49
+ - lib/job_master/stealth.rb
50
+ homepage:
51
+ licenses: []
52
+ metadata: {}
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ requirements: []
68
+ rubyforge_project:
69
+ rubygems_version: 2.5.2
70
+ signing_key:
71
+ specification_version: 4
72
+ summary: Job manager for multi-website automated scraping
73
+ test_files: []