jobmaster 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2c5405222718b1846cc2b84ce1b83d99307fba52
4
+ data.tar.gz: 838630db1843875775075cc9d9321a5eb0173a02
5
+ SHA512:
6
+ metadata.gz: 327dc4714bb69f279dec5ef74ec7287cb4600bd543fb1183c6729a861f7f36b4d49a6c4992e4ef3ccc599868d1ae1727d3a00308b5b225862f941b19ec260fd2
7
+ data.tar.gz: 516b2ec4d1a7e75e2ae907e453c5a7e42357f3a2ebb344841bc9e015b72885c02237799d6a9a4e6513984255aacce5c21cee4f1da67c50225f83aceff54ae5d8
@@ -0,0 +1,68 @@
1
+ require 'flex_core'
2
+
3
# Core extension: gives nil a #to_hash so that a nil value can be used
# where a Hash is expected.
class NilClass
  # @return [Hash] a new, empty Hash
  def to_hash
    to_h
  end
end
8
+
9
# Unit generators for JobMaster. Each method produces the list of work
# "units" for a job: a range of dates, rows of a database table, or rows
# of a CSV file.
class JobMaster
  # Lists every day between two dates (inclusive) as Julian Day Numbers,
  # in random order.
  #
  # @param args [Hash] options:
  #   :earliest - date string understood by Date.parse (required)
  #   :latest   - date string, or 'none' / nil to use today's date
  # @return [Array<Integer>] shuffled Julian Day Numbers
  def date_gen(args)
    latest = args[:latest]
    min = Date.parse(args[:earliest]).jd
    max = (latest.nil? || latest == 'none') ? Date.today.jd : Date.parse(latest).jd
    (min..max).to_a.shuffle
  end

  # Reads all rows of a table, optionally narrows the columns and filters
  # the rows, and returns the survivors as inspected strings in random order.
  #
  # @param args [Hash] options:
  #   :table_name - Symbol or String naming the table
  #   :columns    - Array of column keys to keep, a String of Ruby source
  #                 evaluating to one, or nil to keep every column
  #   :filters    - Hash of column => required value, a String of Ruby
  #                 source evaluating to one, or nil for no filtering
  # @return [Array<String>] `inspect` output of each matching row, shuffled
  def from_table(args)
    table_name = args[:table_name]
    columns = args[:columns]
    filters = args[:filters]

    table_name = table_name.to_sym if table_name.is_a?(String)
    # SECURITY(review): eval executes arbitrary Ruby. This is only safe while
    # :columns / :filters strings come from a trusted source; confirm where
    # these values originate before exposing this method to outside input.
    columns = eval(columns) if columns.is_a?(String)
    filters = eval(filters) if filters.is_a?(String)

    rows = database[table_name].all
    # NOTE(review): Hash#keep is not core Ruby; presumably supplied by
    # flex_core and keeps only the listed keys -- confirm.
    rows.map! { |row| row.keep(*columns) } unless columns.nil?

    unless filters.nil?
      # Drop a row as soon as any filter column differs from its required value.
      rows.reject! { |row| filters.any? { |key, value| row[key] != value } }
    end

    rows.map(&:inspect).shuffle
  end

  # Reads a CSV file and extracts the requested columns from every data row.
  #
  # The first row is the header. Header cells are lower-cased and stripped
  # before matching, so labels must be given in that normalized form. Every
  # header cell up to and including the last empty one is discarded, and the
  # matching leading columns of each data row are skipped accordingly.
  #
  # @param file [String] path of the CSV file
  # @param labels [Array<String>] normalized header labels to extract
  # @return [Array<Hash>] one Hash per data row, mapping each label to its cell
  # @raise [ArgumentError] if the file has no header row, header labels are
  #   not unique, or a requested label is not present in the header
  def from_csv(file, *labels)
    # File.read closes the handle; carriage returns are removed so that
    # Windows line endings parse the same way as Unix ones.
    content = File.read(file).delete("\r")
    rows = CSV.parse(content)

    header_row = rows.shift
    raise ArgumentError, 'CSV file has no header row' if header_row.nil?

    headers = header_row.map { |cell| cell.nil? ? nil : cell.downcase.strip }

    # Array#rindex returns nil (not -1) when there is no empty header cell.
    last_blank = headers.rindex(nil)
    offset = last_blank.nil? ? 0 : last_blank + 1
    headers = headers.drop(offset)

    # Defensive: cannot trigger after the drop above, kept to state the invariant.
    raise ArgumentError, 'Header cannot contain empty cells' if headers.include?(nil)
    raise ArgumentError, 'Header labels must be unique.' if headers.size != headers.uniq.size

    indexes = labels.map do |label|
      position = headers.index(label)
      raise ArgumentError, "Unknown header label: #{label.inspect}" if position.nil?

      # Shift back into the coordinates of the untrimmed data rows.
      position + offset
    end

    rows.map { |row| Hash[labels.zip(row.values_at(*indexes))] }
  end
end
@@ -0,0 +1,13 @@
1
+ require 'tunnel_blick'
2
+
3
# Reopens JobMaster to add the tunnelled ("stealth") entry point.
class JobMaster
  # Creates a TunnelBlick over this job master's database, connects it for
  # the given websites and registers those websites with it.
  #
  # @param max_requests forwarded unchanged to TunnelBlick#connect_smart
  # @param args forwarded unchanged to TunnelBlick#connect_smart
  # @param websites objects responding to #home_url
  # @return [TunnelBlick] the tunnel, also stored in @tunnel
  def stealth(max_requests, args, *websites)
    @tunnel = TunnelBlick.new(database)

    dom_ids = websites.map do |site|
      # Second slash-separated segment of the URL, e.g. the host in
      # "http://host/path".
      domain = site.home_url.scan(/[^\/]+/)[1]
      database[:websites][domain_url: domain][:id]
    end

    @tunnel.connect_smart(max_requests, args, dom_ids)
    @tunnel.add_websites(*websites)
    @tunnel
  end
end
data/lib/job_master.rb ADDED
@@ -0,0 +1,138 @@
1
+ require 'job_master/gen_methods'
2
+ require 'job_master/stealth'
3
+ require 'flex_core'
4
+ require 'flex_pg'
5
+
6
# Hands out units of work for a named program to the current machine and
# records progress in the database.
#
# The database handle is indexed for two tables: :program_constraints
# (which program runs on which machines, and how its units are generated)
# and :job_log (remaining units, per-machine checkout and time accounting).
class JobMaster
  attr_reader :database, :programs_db, :jobs_db, :websites

  # @param database an object indexable by table name (e.g. a Sequel
  #   database); only #[] is called on it here
  def initialize(database)
    # The machine identity used for checkout and time accounting.
    @myself = ENV['USER']
    @database = database
    @programs_db = database[:program_constraints]
    @jobs_db = database[:job_log]
  end

  # Registers the website objects programs are run against.
  #
  # @param websites the website objects; the first one receives the program calls
  # @return [Array] the registered websites
  def add_websites(*websites)
    @websites = websites
  end

  class << self
    # Argument names expected by each unit-generator method.
    LEGEND = {
      date_range_gen: [:earliest, :latest],
      from_csv: ['file', '*labels']
    }

    # Placeholder with no behaviour yet.
    def date_range_gen
    end
  end

  # Processes units of a program one after another until the time budget is
  # used up or no units remain.
  #
  # @param prog_name [String, Symbol] method to call on the first website
  # @param minutes [Numeric] time budget in minutes
  # @param sleep [Numeric] pause in seconds after each unit
  # @return [nil]
  def start_single(prog_name, minutes = 10, sleep = 3)
    @start_time = Time.now
    set_up_job(prog_name)

    while (Time.now - @start_time) / 60 < minutes
      unit_start = Time.now
      unit = next_unit

      # next_unit returns false (not nil) once the queue is empty.
      break unless unit

      @websites.first.send(prog_name.to_sym, unit)

      unit_time = (Time.now - unit_start) / 60
      puts unit_time
      # The parameter shadows Kernel#sleep; the parenthesised form still
      # calls the method with the parameter as its argument.
      sleep(sleep)

      # Record the unit right away so the last unit before the time limit is
      # also checked in. 60.0 keeps the pause from truncating to 0 minutes.
      update_job(unit_time + sleep / 60.0)
    end
  end

  # Processes exactly one unit of a program.
  #
  # @param prog_name [String, Symbol] method to call on the first website
  # @return [false, Object] false when no unit is available, otherwise the
  #   result of #update_job
  def step_single(prog_name)
    @start_time = Time.now
    set_up_job(prog_name)
    unit = next_unit
    # next_unit returns false (not nil) once the queue is empty.
    return false unless unit

    @websites.first.send(prog_name.to_sym, unit)
    unit_time = (Time.now - @start_time) / 60
    puts unit_time
    update_job(unit_time)
  end

  # Finds the newest constraint record of a program that lists this machine,
  # then selects the newest job for it, creating that job when none exists.
  # Sets @program_id and @job_id.
  #
  # @param program [String, Symbol] program name
  # @return the id of the selected or newly inserted job
  # @raise [ArgumentError] if no constraint record matches
  def set_up_job(program)
    last_matching = programs_db
                    .where(program: program, @myself => Sequel.pg_array_op(:machines).any)
                    .order(:id)
                    .last
    if last_matching.nil?
      raise ArgumentError,
            "No program constraints found for #{program.inspect} on machine #{@myself.inspect}"
    end

    @program_id = last_matching[:id]
    the_job = jobs_db.where(program: program, constraints: @program_id).order(:id).last

    @job_id =
      if the_job.nil?
        # The constraint record names the generator method and its arguments.
        units = send(last_matching[:method].to_sym, last_matching[:constraints].merge(program: program))
        # Dataset#insert returns the primary key of the new row.
        insert_job_details(program, units, last_matching[:machines])
      else
        the_job[:id]
      end
  end

  # Inserts a new job row with the full unit list and zeroed accounting.
  #
  # @param program [String, Symbol] program name
  # @param units [Array] the work units of the job
  # @param machines [Array] machine names sharing the job
  # @return the value returned by the dataset's #insert
  def insert_job_details(program, units, machines)
    time_share = machines.each_with_object({}) { |machine, acc| acc[machine] = 0 }

    details = {
      program: program,
      created: Time.now,
      constraints: @program_id,
      units: Sequel.pg_array(units),
      size: units.size,
      total_minutes: 0.0,
      time_share: Sequel.hstore(time_share),
      # NOTE(review): placeholder entry; presumably seeds the hstore column
      # so that it is never empty -- confirm before changing.
      unit_checkout: Sequel.hstore({:apples => :bananas})
    }
    jobs_db.insert(details)
  end

  # Returns the unit this machine should work on: the unit it already has
  # checked out, or else the last remaining unit, which is then checked out
  # to this machine.
  #
  # @return the unit, or false when no units remain
  def next_unit
    job = jobs_db[id: @job_id]
    checkout = job[:unit_checkout].to_hash

    return checkout[@myself] if checkout.key?(@myself)

    units = job[:units]
    return false if units.empty?

    unit = units.pop
    jobs_db.where(id: @job_id).update(
      :units => units,
      :unit_checkout => Sequel.hstore(Sequel.expr(:unit_checkout)).merge(@myself => unit)
    )
    unit
  end

  # Books the time spent on a finished unit and releases this machine's
  # checkout.
  #
  # @param unit_time [Numeric] minutes spent on the unit
  # @return the value returned by the dataset's #update
  def update_job(unit_time)
    job = jobs_db[id: @job_id]

    time_share = job[:time_share].to_hash
    # NOTE(review): Hash#find_add! is not core Ruby; presumably supplied by
    # flex_core and adds unit_time to this machine's entry -- confirm.
    time_share.find_add!(@myself, unit_time)

    unit_checkout = job[:unit_checkout].to_hash
    # NOTE(review): Hash#delete! is not core Ruby; presumably supplied by
    # flex_core and removes the key in place -- confirm.
    unit_checkout.delete!(@myself)

    jobs_db.where(id: @job_id).update(
      :total_minutes => Sequel.expr(:total_minutes) + unit_time,
      :time_share => Sequel.hstore(time_share),
      :unit_checkout => Sequel.hstore(unit_checkout)
    )
  end
end
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jobmaster
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Eugene Lai
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: flex_core
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: flex_pg
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: To do
42
+ email: ejt.lai@gmail.com
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files: []
46
+ files:
47
+ - lib/job_master.rb
48
+ - lib/job_master/gen_methods.rb
49
+ - lib/job_master/stealth.rb
50
+ homepage:
51
+ licenses: []
52
+ metadata: {}
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ requirements: []
68
+ rubyforge_project:
69
+ rubygems_version: 2.5.2
70
+ signing_key:
71
+ specification_version: 4
72
+ summary: Job manager for multi-website automated scraping
73
+ test_files: []