workflow-archiver 1.2.2
- checksums.yaml +15 -0
- data/VERSION +2 -0
- data/lib/dor/archiver_version.rb +11 -0
- data/lib/dor/workflow_archiver.rb +274 -0
- metadata +130 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
---
!binary "U0hBMQ==":
  metadata.gz: !binary |-
    OGY0MWY1NjlmNmVlMzI3ZmRiMzg5ZThkNzhjMzFlYzQyMzdlZTVmOQ==
  data.tar.gz: !binary |-
    MGNlY2I2OTlhNGYzNWRhODQ4N2I2YmEwMzc4Y2NjMTkzMzViNzg4Zg==
SHA512:
  metadata.gz: !binary |-
    NDkyYjQ5ZmFmOTA2ZDEyMmNjOTU4Njk0ZTJkYTkxNzllMjIxMmIxNmUxZjY4
    NmI5YTQ0OTQ2ZWZhMjhkY2MxZTZiMWYwZjRlYzEzYTBmOWEyNzY5OWE3N2My
    NjQ4ZTI4ODllODM4OWQ4YWM5NDk4OGVhOGJkZDljMDliZGNlMzM=
  data.tar.gz: !binary |-
    ZDkzMjY1YzdmNjE4NDAwMTQ1MGE1NzU0NGMxOGQ2NWJmOTA1MGI5MDhiYjRh
    MzY4YTlmZTRmOTBkMTQ1YjI4OTcxZTA4NzA5NjVjMzU1ZWJiNGQ3ODk0ZTM1
    Mzk3OTIxMzA0ZTJiNWExYzkyODgwMGQ1YjViMmYzNGJjNGVmNzY=
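The keys and digests above are base64; decoded, they are the SHA1 and SHA512 hex digests of the metadata.gz and data.tar.gz entries inside the .gem file. A minimal verification sketch in Ruby (the fetch/unpack commands and filenames are assumptions about the usual .gem layout, and Zlib.gunzip needs Ruby 2.4 or newer):

# Assumes the gem has been unpacked first, e.g.
#   gem fetch workflow-archiver -v 1.2.2 && tar -xf workflow-archiver-1.2.2.gem
# so checksums.yaml.gz, metadata.gz and data.tar.gz sit in the current directory.
require 'yaml'
require 'zlib'
require 'digest'

# !binary keys/values decode to plain strings, e.g. "SHA1" => {"metadata.gz" => "8f41f5..."}
checksums = YAML.load(Zlib.gunzip(File.binread('checksums.yaml.gz')))

checksums.each do |algorithm, files|
  digest_class = Digest.const_get(algorithm)   # Digest::SHA1, Digest::SHA512
  files.each do |file, expected_hexdigest|
    actual = digest_class.file(file).hexdigest
    puts "#{algorithm} #{file}: #{actual == expected_hexdigest ? 'OK' : 'MISMATCH'}"
  end
end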
data/lib/dor/workflow_archiver.rb
ADDED
@@ -0,0 +1,274 @@
require 'rest_client'
require 'oci8'

module Dor

  # Holds the parameters about the workflow rows that need to be deleted
  ArchiveCriteria = Struct.new(:repository, :druid, :datastream, :version) do
    # @param [Hash] row_hash a row returned from {WorkflowArchiver#find_completed_objects}. It expects the following keys in the hash:
    #   "REPOSITORY", "DRUID", "DATASTREAM". Note they are all-caps strings, not symbols
    def setup_from_query(row_hash)
      self.repository = row_hash["REPOSITORY"]
      self.druid = row_hash["DRUID"]
      self.datastream = row_hash["DATASTREAM"]
      set_current_version
      self
    end

    # Removes version from the list of members, then picks out non-nil members and builds a hash of column_name => column_value
    # @return [Hash] Maps column names (in ALL caps) to non-nil column values
    def to_bind_hash
      h = {}
      members.reject{|mem| mem =~ /version/}.each do |m|
        h[m.swapcase] = self.send(m) if(self.send(m))
      end
      h
    end

    def set_current_version
      begin
        self.version = RestClient.get WorkflowArchiver.config.dor_service_uri + "/dor/v1/objects/#{self.druid}/versions/current"
      rescue RestClient::InternalServerError => ise
        raise unless(ise.inspect =~ /Unable to find.*in fedora/)
        LyberCore::Log.warn "#{ise.inspect}"
        LyberCore::Log.warn "Moving workflow rows with version set to '1'"
        self.version = '1'
      end
    end
  end

  class WorkflowArchiver
    WF_COLUMNS = %w(ID DRUID DATASTREAM PROCESS STATUS ERROR_MSG ERROR_TXT DATETIME ATTEMPTS LIFECYCLE ELAPSED REPOSITORY NOTE PRIORITY)

    # These attributes are mostly used for testing
    attr_reader :conn, :errors

    def WorkflowArchiver.config
      @@conf ||= Confstruct::Configuration.new
    end

    # Sets up logging and connects to the database. By default it reads values from constants:
    # WORKFLOW_DB_LOGIN, WORKFLOW_DB_PASSWORD, WORKFLOW_DB_URI, DOR_SERVICE_URI but can be overridden with the opts Hash
    # @param [Hash] opts Options to override database parameters
    # @option opts [String] :login ('WORKFLOW_DB_LOGIN') Database login id
    # @option opts [String] :password ('WORKFLOW_DB_PASSWORD') Database password
    # @option opts [String] :db_uri ('WORKFLOW_DB_URI') Database uri
    # @option opts [String] :wf_table ('workflow') Name of the active workflow table
    # @option opts [String] :wfa_table ('workflow_archive') Name of the workflow archive table
    # @option opts [String] :dor_service_uri ('DOR_SERVICE_URI') URI of the DOR REST service
    # @option opts [Integer] :retry_delay (5) Number of seconds to sleep between retries of database operations
    def initialize(opts={})
      @login = (opts.include?(:login) ? opts[:login] : WorkflowArchiver.config.db_login)
      @password = (opts.include?(:password) ? opts[:password] : WorkflowArchiver.config.db_password)
      @db_uri = (opts.include?(:db_uri) ? opts[:db_uri] : WorkflowArchiver.config.db_uri)
      @dor_service_uri = (opts.include?(:dor_service_uri) ? opts[:dor_service_uri] : WorkflowArchiver.config.dor_service_uri)
      @workflow_table = (opts.include?(:wf_table) ? opts[:wf_table] : "workflow")
      @workflow_archive_table = (opts.include?(:wfa_table) ? opts[:wfa_table] : "workflow_archive")
      @retry_delay = (opts.include?(:retry_delay) ? opts[:retry_delay] : 5)

      # initialize some counters
      @errors = 0
      @archived = 0
    end

    def connect_to_db
      $odb_pool ||= OCI8::ConnectionPool.new(1, 5, 2, @login, @password, @db_uri)
      @conn = OCI8.new(@login, @password, $odb_pool)
      @conn.autocommit = false
    end

    def destroy_pool
      $odb_pool.destroy if($odb_pool)
    end

    def bind_and_exec_sql(sql, workflow_info)
      # LyberCore::Log.debug("Executing: #{sql}")
      cursor = @conn.parse(sql)

      workflow_info.to_bind_hash.each do |k, v|
        param = ":#{k}"
        # LyberCore::Log.debug("Setting: #{param} #{v}")
        cursor.bind_param(param, v)
      end

      num_rows = cursor.exec
      unless num_rows > 0
        raise "Expected more than 0 rows to be updated"
      end
    ensure
      cursor.close
    end

    # @return [String] The columns appended with comma and newline
    def wf_column_string
      WF_COLUMNS.inject('') { |str, col| str << col << ",\n" }
    end

    # @return [String] The columns prepended with 'w.' and appended with comma and newline
    def wf_archive_column_string
      WF_COLUMNS.inject('') { |str, col| str << 'w.' << col << ",\n" }
    end

    # Use this as a one-shot method to archive all the steps of an object's particular datastream.
    # It will connect to the database, archive the rows, then log off. Assumes the caller will set version (like the DOR REST service).
    # @note The caller of this method must handle destroying the connection pool
    # @param [String] repository
    # @param [String] druid
    # @param [String] datastream
    # @param [String] version
    def archive_one_datastream(repository, druid, datastream, version)
      criteria = [ArchiveCriteria.new(repository, druid, datastream, version)]
      connect_to_db
      archive_rows criteria
    ensure
      @conn.logoff if(@conn)
    end

    # Copies rows from the workflow table to the workflow_archive table, then deletes the rows from workflow.
    # Both operations must complete, or they get rolled back.
    # @param [Array<ArchiveCriteria>] objs List of objects returned from {#find_completed_objects} and mapped to an array of ArchiveCriteria objects
    def archive_rows(objs)
      Array(objs).each do |obj|
        tries = 0
        begin
          tries += 1
          do_one_archive(obj)
          @archived += 1
        rescue => e
          LyberCore::Log.error "Rolling back transaction due to: #{e.inspect}\n" << e.backtrace.join("\n") << "\n!!!!!!!!!!!!!!!!!!"
          @conn.rollback

          # Retry this druid up to 3 times
          if tries < 3
            LyberCore::Log.error " Retrying archive operation in #{@retry_delay.to_s} seconds..."
            sleep @retry_delay
            retry
          end
          LyberCore::Log.error " Too many retries. Giving up on #{obj.inspect}"

          @errors += 1
          if @errors >= 3
            LyberCore::Log.fatal("Too many errors. Archiving halted")
            break
          end
        end

      end # druids.each
    end

    # @param [ArchiveCriteria] workflow_info contains parameters on the workflow rows to archive
    def do_one_archive(workflow_info)
      LyberCore::Log.info "Archiving #{workflow_info.inspect}"

      copy_sql = <<-EOSQL
        insert into #{@workflow_archive_table} (
          #{wf_column_string}
          VERSION
        )
        select
          #{wf_archive_column_string}
          #{workflow_info.version} as VERSION
        from #{@workflow_table} w
        where w.druid = :DRUID
        and w.datastream = :DATASTREAM
      EOSQL

      delete_sql = "delete #{@workflow_table} where druid = :DRUID and datastream = :DATASTREAM "

      if(workflow_info.repository)
        copy_sql << "and w.repository = :REPOSITORY"
        delete_sql << "and repository = :REPOSITORY"
      else
        copy_sql << "and w.repository IS NULL"
        delete_sql << "and repository IS NULL"
      end

      bind_and_exec_sql(copy_sql, workflow_info)

      LyberCore::Log.debug " Removing old workflow rows"
      bind_and_exec_sql(delete_sql, workflow_info)

      @conn.commit
    end

    # Finds objects where all workflow steps are complete.
    # Returns an array of hashes, each hash having the following keys:
    #   {"REPOSITORY"=>"dor", "DRUID"=>"druid:345", "DATASTREAM"=>"googleScannedBookWF"}
    def find_completed_objects
      completed_query = <<-EOSQL
        select distinct repository, datastream, druid
        from workflow w1
        where w1.status in ('completed', 'skipped')
        and not exists
        (
          select *
          from workflow w2
          where w1.repository = w2.repository
          and w1.datastream = w2.datastream
          and w1.druid = w2.druid
          and w2.status not in ('completed', 'skipped')
        )
      EOSQL

      rows = []
      cursor = @conn.exec(completed_query)
      while r = cursor.fetch_hash
        rows << r
      end
      rows
    end

    # @param [Array<Hash>] rows result from #find_completed_objects
    # @return [Array<ArchiveCriteria>] each result mapped to an ArchiveCriteria object
    def map_result_to_criteria(rows)
      criteria = rows.map do |r|
        begin
          ArchiveCriteria.new.setup_from_query(r)
        rescue => e
          LyberCore::Log.error("Skipping archiving of #{r['DRUID']}")
          LyberCore::Log.error("#{e.inspect}\n" + e.backtrace.join("\n"))
          nil
        end
      end
      criteria.reject {|c| c.nil?}
    end

    def simple_sql_exec(sql)
      @conn.exec(sql)
    rescue Exception => e
      LyberCore::Log.warn "Ignoring error: #{e.message}\n while trying to execute: " << sql
    end

    def with_indexing_disabled(&block)
      simple_sql_exec("drop index ds_wf_ar_bitmap_idx")
      simple_sql_exec("drop index repo_wf_ar_bitmap_idx")
      yield
    ensure
      simple_sql_exec("create bitmap index ds_wf_ar_bitmap_idx on workflow_archive (datastream)")
      simple_sql_exec("create bitmap index repo_wf_ar_bitmap_idx on workflow_archive (repository)")
    end

    # Does the work of finding completed objects and archiving the rows
    def archive
      objs = find_completed_objects

      if objs.size == 0
        LyberCore::Log.info "Nothing to archive"
        exit true
      end

      LyberCore::Log.info "Found #{objs.size.to_s} completed workflows"

      archiving_criteria = map_result_to_criteria(objs)
      with_indexing_disabled { archive_rows(archiving_criteria) }

      LyberCore::Log.info "DONE! Processed #{@archived.to_s} objects with #{@errors.to_s} errors" if(@errors < 3)
    ensure
      @conn.logoff
      destroy_pool
    end

  end

end
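A short usage sketch follows. It is not shipped with the gem; the connection settings, DOR service URI, druid, workflow name, and version are placeholders, and the gem's dependencies (lyber-core for LyberCore::Log, confstruct, rest-client, ruby-oci8) are assumed to be loaded.

require 'dor/workflow_archiver'

# Placeholder configuration values, read later via WorkflowArchiver.config
Dor::WorkflowArchiver.config.configure(
  db_login:        'workflow_user',
  db_password:     'secret',
  db_uri:          '//db.example.edu:1521/workflowdb',
  dor_service_uri: 'https://dor-service.example.edu'
)

# Sweep everything whose workflow steps are all completed or skipped
archiver = Dor::WorkflowArchiver.new
archiver.connect_to_db
archiver.archive   # logs off and destroys the pool in its ensure block

# Or archive a single object/datastream when the version is already known.
# archive_one_datastream connects and logs off itself, but per the @note above
# the caller must destroy the connection pool.
one_shot = Dor::WorkflowArchiver.new
one_shot.archive_one_datastream('dor', 'druid:ab123cd4567', 'accessionWF', '2')
one_shot.destroy_pool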
metadata
ADDED
@@ -0,0 +1,130 @@
--- !ruby/object:Gem::Specification
name: workflow-archiver
version: !ruby/object:Gem::Version
  version: 1.2.2
platform: ruby
authors:
- Willy Mene
autorequire:
bindir: bin
cert_chain: []
date: 2014-07-23 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: lyber-core
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: rest-client
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: ruby-oci8
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: active-fedora
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
description: Can be used standalone or used as a library
email:
- wmene@stanford.edu
executables: []
extensions: []
extra_rdoc_files: []
files:
- VERSION
- lib/dor/archiver_version.rb
- lib/dor/workflow_archiver.rb
homepage:
licenses: []
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: 1.3.6
requirements: []
rubyforge_project:
rubygems_version: 2.2.2
signing_key:
specification_version: 4
summary: Enables archiving of DOR workflows
test_files: []
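Per the gemspec above, installing this release pulls in lyber-core, rest-client, and ruby-oci8 as runtime dependencies. In a Bundler project that is the usual one-line declaration:

# Gemfile
gem 'workflow-archiver', '1.2.2'

or, from the command line, gem install workflow-archiver -v 1.2.2.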