data-validation-engine 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_validation_engine-0.6.2.dist-info/METADATA +104 -0
- data_validation_engine-0.6.2.dist-info/RECORD +105 -0
- data_validation_engine-0.6.2.dist-info/WHEEL +4 -0
- data_validation_engine-0.6.2.dist-info/licenses/LICENSE +21 -0
- dve/__init__.py +0 -0
- dve/common/__init__.py +0 -0
- dve/common/error_utils.py +189 -0
- dve/core_engine/__init__.py +0 -0
- dve/core_engine/backends/__init__.py +1 -0
- dve/core_engine/backends/base/__init__.py +1 -0
- dve/core_engine/backends/base/auditing.py +618 -0
- dve/core_engine/backends/base/backend.py +240 -0
- dve/core_engine/backends/base/contract.py +454 -0
- dve/core_engine/backends/base/core.py +124 -0
- dve/core_engine/backends/base/reader.py +176 -0
- dve/core_engine/backends/base/reference_data.py +217 -0
- dve/core_engine/backends/base/rules.py +685 -0
- dve/core_engine/backends/base/utilities.py +146 -0
- dve/core_engine/backends/exceptions.py +311 -0
- dve/core_engine/backends/implementations/__init__.py +1 -0
- dve/core_engine/backends/implementations/duckdb/__init__.py +26 -0
- dve/core_engine/backends/implementations/duckdb/auditing.py +234 -0
- dve/core_engine/backends/implementations/duckdb/contract.py +213 -0
- dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +288 -0
- dve/core_engine/backends/implementations/duckdb/readers/__init__.py +13 -0
- dve/core_engine/backends/implementations/duckdb/readers/csv.py +222 -0
- dve/core_engine/backends/implementations/duckdb/readers/json.py +50 -0
- dve/core_engine/backends/implementations/duckdb/readers/xml.py +45 -0
- dve/core_engine/backends/implementations/duckdb/reference_data.py +49 -0
- dve/core_engine/backends/implementations/duckdb/rules.py +534 -0
- dve/core_engine/backends/implementations/duckdb/types.py +47 -0
- dve/core_engine/backends/implementations/duckdb/utilities.py +41 -0
- dve/core_engine/backends/implementations/spark/__init__.py +22 -0
- dve/core_engine/backends/implementations/spark/auditing.py +230 -0
- dve/core_engine/backends/implementations/spark/backend.py +78 -0
- dve/core_engine/backends/implementations/spark/contract.py +241 -0
- dve/core_engine/backends/implementations/spark/readers/__init__.py +15 -0
- dve/core_engine/backends/implementations/spark/readers/csv.py +77 -0
- dve/core_engine/backends/implementations/spark/readers/json.py +66 -0
- dve/core_engine/backends/implementations/spark/readers/xml.py +202 -0
- dve/core_engine/backends/implementations/spark/reference_data.py +42 -0
- dve/core_engine/backends/implementations/spark/rules.py +430 -0
- dve/core_engine/backends/implementations/spark/spark_helpers.py +412 -0
- dve/core_engine/backends/implementations/spark/types.py +21 -0
- dve/core_engine/backends/implementations/spark/utilities.py +144 -0
- dve/core_engine/backends/metadata/__init__.py +47 -0
- dve/core_engine/backends/metadata/contract.py +80 -0
- dve/core_engine/backends/metadata/reporting.py +374 -0
- dve/core_engine/backends/metadata/rules.py +737 -0
- dve/core_engine/backends/readers/__init__.py +41 -0
- dve/core_engine/backends/readers/csv.py +232 -0
- dve/core_engine/backends/readers/utilities.py +21 -0
- dve/core_engine/backends/readers/xml.py +432 -0
- dve/core_engine/backends/readers/xml_linting.py +142 -0
- dve/core_engine/backends/types.py +26 -0
- dve/core_engine/backends/utilities.py +177 -0
- dve/core_engine/configuration/__init__.py +1 -0
- dve/core_engine/configuration/base.py +56 -0
- dve/core_engine/configuration/v1/__init__.py +351 -0
- dve/core_engine/configuration/v1/filters.py +60 -0
- dve/core_engine/configuration/v1/rule_stores/__init__.py +1 -0
- dve/core_engine/configuration/v1/rule_stores/models.py +57 -0
- dve/core_engine/configuration/v1/steps.py +365 -0
- dve/core_engine/constants.py +8 -0
- dve/core_engine/engine.py +265 -0
- dve/core_engine/exceptions.py +29 -0
- dve/core_engine/functions/__init__.py +6 -0
- dve/core_engine/functions/implementations.py +200 -0
- dve/core_engine/loggers.py +57 -0
- dve/core_engine/message.py +512 -0
- dve/core_engine/models.py +196 -0
- dve/core_engine/templating.py +114 -0
- dve/core_engine/type_hints.py +255 -0
- dve/core_engine/validation.py +160 -0
- dve/metadata_parser/__init__.py +2 -0
- dve/metadata_parser/domain_types.py +682 -0
- dve/metadata_parser/exc.py +44 -0
- dve/metadata_parser/function_library.py +64 -0
- dve/metadata_parser/function_wrapper.py +201 -0
- dve/metadata_parser/model_generator.py +119 -0
- dve/metadata_parser/models.py +410 -0
- dve/metadata_parser/utilities.py +54 -0
- dve/parser/__init__.py +1 -0
- dve/parser/exceptions.py +50 -0
- dve/parser/file_handling/__init__.py +31 -0
- dve/parser/file_handling/helpers.py +29 -0
- dve/parser/file_handling/implementations/__init__.py +7 -0
- dve/parser/file_handling/implementations/base.py +97 -0
- dve/parser/file_handling/implementations/dbfs.py +81 -0
- dve/parser/file_handling/implementations/file.py +203 -0
- dve/parser/file_handling/implementations/s3.py +371 -0
- dve/parser/file_handling/log_handler.py +215 -0
- dve/parser/file_handling/service.py +441 -0
- dve/parser/file_handling/utilities.py +53 -0
- dve/parser/type_hints.py +46 -0
- dve/parser/utilities.py +113 -0
- dve/pipeline/__init__.py +0 -0
- dve/pipeline/duckdb_pipeline.py +56 -0
- dve/pipeline/foundry_ddb_pipeline.py +171 -0
- dve/pipeline/pipeline.py +935 -0
- dve/pipeline/spark_pipeline.py +69 -0
- dve/pipeline/utils.py +96 -0
- dve/reporting/__init__.py +1 -0
- dve/reporting/error_report.py +153 -0
- dve/reporting/excel_report.py +319 -0
|
@@ -0,0 +1,618 @@
|
|
|
1
|
+
# pylint: disable=C0209
|
|
2
|
+
"""Base auditing objects and managers for use of DVE services"""
|
|
3
|
+
import multiprocessing
|
|
4
|
+
import operator
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
from abc import abstractmethod
|
|
8
|
+
from collections.abc import Callable, Iterable
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import date, datetime, timedelta
|
|
12
|
+
from multiprocessing import Queue as ProcessQueue
|
|
13
|
+
from queue import Queue as ThreadQueue
|
|
14
|
+
from types import TracebackType
|
|
15
|
+
from typing import Any, ClassVar, Generic, Optional, TypeVar, Union
|
|
16
|
+
|
|
17
|
+
from pydantic import ValidationError, validate_arguments
|
|
18
|
+
from typing_extensions import Literal, get_origin
|
|
19
|
+
|
|
20
|
+
from dve.core_engine.models import (
|
|
21
|
+
AuditRecord,
|
|
22
|
+
ProcessingStatusRecord,
|
|
23
|
+
SubmissionInfo,
|
|
24
|
+
SubmissionStatisticsRecord,
|
|
25
|
+
TransferRecord,
|
|
26
|
+
)
|
|
27
|
+
from dve.core_engine.type_hints import (
|
|
28
|
+
BinaryComparator,
|
|
29
|
+
ExecutorType,
|
|
30
|
+
ProcessingStatus,
|
|
31
|
+
QueueType,
|
|
32
|
+
SubmissionResult,
|
|
33
|
+
)
|
|
34
|
+
from dve.pipeline.utils import SubmissionStatus
|
|
35
|
+
|
|
36
|
+
# Type variable for the implementation-specific object an auditor returns from its queries.
AuditReturnType = TypeVar("AuditReturnType") # pylint: disable=invalid-name
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class FilterCriteria:
    """Stores information about a filter criteria to be applied to audit records"""

    # Name of the audit record field the filter applies to.
    field: str
    # Value, or collection of values, the field is compared against.
    comparison_value: Any
    # Binary comparator taken from the ``operator`` module; equality by default.
    operator_: Callable = operator.eq
    # SQL token emitted for each supported comparator.
    operator_mapping: ClassVar[dict[BinaryComparator, str]] = {
        operator.eq: "=",
        operator.ne: "!=",
        operator.lt: "<",
        operator.le: "<=",
        operator.gt: ">",
        operator.ge: ">=",
        operator.contains: "in",
    }

    @staticmethod
    def _quote_comp_val(val: Any) -> str:
        """Render a comparison value as the right-hand side of a sql expression.

        Collections become a parenthesised, comma separated list of the ``repr`` of
        each member; anything else is wrapped in single quotes.
        """
        if isinstance(val, (list, tuple, set, frozenset)):
            # Join the members by hand: str(tuple(val)) leaves a trailing comma on a
            # one-element collection ("('x',)"), which is not a valid sql list.
            return "({})".format(", ".join(repr(member) for member in val))
        # NOTE(review): embedded single quotes are not escaped here - confirm callers
        # never pass untrusted text.
        return f"'{val}'"

    def to_sql(self) -> str:
        """Create sql expression from filter criteria

        Raises KeyError when ``operator_`` has no entry in ``operator_mapping``.
        """
        return "{} {} {}".format(
            self.field,
            self.operator_mapping[self.operator_],
            self._quote_comp_val(self.comparison_value),
        )

    def __str__(self) -> str:
        return self.to_sql()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class OrderCriteria:
    """Holds one ordering term (a field and a direction) to be applied to audit records"""

    field_name: str
    descending: bool = False

    def to_sql(self) -> str:
        """Render the ordering term as sql, suffixing " DESC" for descending order"""
        if self.descending:
            return self.field_name + " DESC"
        return self.field_name + ""

    def __str__(self) -> str:
        return self.to_sql()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class BaseAuditor(Generic[AuditReturnType]):
    """Interface for a single audit store - implementations fill in the methods
    below so the store can be driven by an auditing manager"""

    def __init__(self, name: str, record_type: type[AuditRecord]):
        """Keep the auditor name and the record model its entries conform to"""
        self._name = name
        self._record_type = record_type

    @property
    def schema(self) -> dict[str, type]:
        """Python type of every field on the record model (Literal fields map to str)"""
        python_schema: dict[str, type] = {}
        for field_name, model_field in self._record_type.__fields__.items():
            declared = model_field.type_
            python_schema[field_name] = str if get_origin(declared) == Literal else declared
        return python_schema

    @staticmethod
    def normalise_filter(filter_condition: FilterCriteria):
        """Translate a filter criteria into the implementation specific format"""
        raise NotImplementedError()

    @staticmethod
    def normalise_order(order_condition: OrderCriteria):
        """Translate an order criteria into the implementation specific format"""
        raise NotImplementedError()

    @staticmethod
    def normalise_field(field: str):
        """Translate a field name into the implementation specific format"""
        raise NotImplementedError()

    @abstractmethod
    def conv_to_records(self, recs: AuditReturnType) -> Iterable[AuditRecord]:
        """Turn the implementation's AuditReturnType into an iterable of pydantic models"""
        raise NotImplementedError()

    @abstractmethod
    def conv_to_entity(self, recs: list[AuditRecord]) -> AuditReturnType:
        """Turn a list of pydantic models into an entity for use in pipelines"""
        raise NotImplementedError()

    @abstractmethod
    def add_records(self, records: Iterable[dict[str, Any]]):
        """Write the supplied audit records to the Auditor"""
        raise NotImplementedError()

    @abstractmethod
    def retrieve_records(
        self, filter_criteria: list[FilterCriteria], data: Optional[AuditReturnType] = None
    ) -> AuditReturnType:
        """Read audit records from the Auditor that satisfy the filter criteria"""
        raise NotImplementedError()

    def get_most_recent_records(
        self,
        order_criteria: list[OrderCriteria],
        partition_fields: Optional[list[str]] = None,
        pre_filter_criteria: Optional[list[FilterCriteria]] = None,
    ) -> AuditReturnType:
        """Read the most recent records, as ranked by the ordering criteria,
        for each combination of the partition fields"""
        raise NotImplementedError()
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Type variable for a concrete auditor implementation (bound to BaseAuditor).
AuditorType = TypeVar("AuditorType", bound=BaseAuditor) # pylint: disable=C0103
|
|
154
|
+
# Type variable for submission metadata models (bound to SubmissionInfo).
SubmissionMetadata = TypeVar("SubmissionMetadata", bound=SubmissionInfo)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class BaseAuditingManager(
|
|
158
|
+
Generic[AuditorType, AuditReturnType]
|
|
159
|
+
): # pylint: disable=too-many-public-methods
|
|
160
|
+
"""Manager of auditors - controls adding records to and querying from the
|
|
161
|
+
audit objects supplied"""
|
|
162
|
+
|
|
163
|
+
def __init__(
    self,
    processing_status: AuditorType,
    submission_info: AuditorType,
    submission_statistics: AuditorType,
    transfers: AuditorType,
    pool: Optional[ExecutorType] = None,
):
    """Audit manager to handle writing of audit information to auditors.

    When a pool is supplied, writes are queued and applied by a background worker
    submitted to that pool; without a pool, writes go straight to the auditors.
    """
    self._processing_status = processing_status
    self._submission_info = submission_info
    self._submission_statistics = submission_statistics
    self._transfers = transfers
    self.pool = pool
    if self.pool is not None:
        thread = isinstance(self.pool, ThreadPoolExecutor)
        self.queue: QueueType = ThreadQueue() if thread else ProcessQueue()
        self.clear_down = False
        # Create the lock before the worker is submitted: the worker reads
        # self.processing_lock, so it must exist by the time the worker can run.
        self.processing_lock = threading.Lock() if thread else multiprocessing.Lock()
        self.pool_result = self.pool.submit(self._process_queue)
|
|
183
|
+
|
|
184
|
+
@abstractmethod
def combine_auditor_information(
    self,
    left: Union[AuditorType, AuditReturnType],
    right: Union[AuditorType, AuditReturnType],
) -> AuditReturnType:
    """Join the audit information held by two auditors (or query results) on submission_id"""
    raise NotImplementedError()
|
|
192
|
+
|
|
193
|
+
@staticmethod
def conv_to_iterable(recs: Union[AuditorType, AuditReturnType]) -> Iterable[dict[str, Any]]:
    """Turn an AuditReturnType into an iterable of dictionaries"""
    raise NotImplementedError()
|
|
197
|
+
|
|
198
|
+
@validate_arguments
def add_processing_records(self, processing_records: list[ProcessingStatusRecord]):
    """Write entries to the processing_status auditor (queued when a pool is in use)."""
    payload = [dict(rec) for rec in processing_records]
    if not self.pool:
        return self._processing_status.add_records(records=payload)
    return self._submit(audit_object=self._processing_status, records=payload)
|
|
209
|
+
|
|
210
|
+
@validate_arguments
def add_submission_statistics_records(self, sub_stats: list[SubmissionStatisticsRecord]):
    """Write entries to the submission statistics auditor (queued when a pool is in use)."""
    payload = [dict(rec) for rec in sub_stats]
    if not self.pool:
        return self._submission_statistics.add_records(records=payload)
    return self._submit(audit_object=self._submission_statistics, records=payload)
|
|
219
|
+
|
|
220
|
+
@validate_arguments
def add_transfer_records(self, transfer_records: list[TransferRecord]):
    """Write entries to the transfers auditor (queued when a pool is in use)"""
    payload = [dict(rec) for rec in transfer_records]
    if not self.pool:
        return self._transfers.add_records(records=payload)
    return self._submit(audit_object=self._transfers, records=payload)
|
|
228
|
+
|
|
229
|
+
@validate_arguments
def add_new_submissions(
    self,
    submissions: list[SubmissionMetadata],
    job_run_id: Optional[int] = None,
):
    """Register new submissions: a "received" processing_status entry and a
    submission_info entry are written for each one, and the submissions are returned."""
    # A single timestamp (and its date) is shared by every row so that the
    # submission_info and processing_status tables stay aligned.
    stamp: datetime = datetime.now()
    audit_times = {"time_updated": stamp, "date_updated": stamp.date()}

    status_rows: list[dict[str, Any]] = []
    info_rows: list[dict[str, Any]] = []

    for submission in submissions:
        received = ProcessingStatusRecord(
            submission_id=submission.submission_id,
            processing_status="received",
            job_run_id=job_run_id,
            **audit_times,
        )
        status_rows.append({**received.dict()})
        if submission:
            info_rows.append({**dict(submission), **audit_times})

    if not self.pool:
        self._processing_status.add_records(records=status_rows)
        self._submission_info.add_records(records=info_rows)
    else:
        self._submit(audit_object=self._processing_status, records=status_rows)
        self._submit(audit_object=self._submission_info, records=info_rows)
    return submissions
|
|
266
|
+
|
|
267
|
+
def _submit(self, **kwargs):
    """Place one write request (the keyword arguments, as a dict) on the queue"""
    request = kwargs
    self.queue.put(request)
|
|
269
|
+
|
|
270
|
+
def _process_queue(self):
    """Worker loop: apply queued write requests to their auditors until a None
    sentinel is received.

    Each request is a dict holding the target "audit_object" and the "records" to
    add. A failing request is logged and skipped so later requests still run.
    """
    import logging  # pylint: disable=import-outside-toplevel

    logger = logging.getLogger(__name__)
    while True:
        # get() blocks until an item arrives, so no empty()/sleep polling is needed.
        item = self.queue.get()
        if item is None:
            break
        try:
            # The lock is held for the duration of the write; is_writing inspects it.
            with self.processing_lock:
                item.get("audit_object").add_records(item.get("records"))
        except Exception:  # pylint: disable=broad-except
            logger.exception("Failed to write queued audit records")
|
|
283
|
+
|
|
284
|
+
def is_writing(self) -> bool:
    """Check if the audit manager is currently writing data to auditors

    Returns False when no pool is configured; otherwise True when the queue still
    holds items or the processing lock is currently held.
    """
    if self.pool is None:
        return False
    if isinstance(self.processing_lock, type(threading.Lock())):
        locked = self.processing_lock.locked()
    else:
        # Process locks don't have a locked method, so try to acquire with a low
        # timeout. acquire() reports the outcome through its return value (it does
        # not raise on timeout): True means the lock was free and is now ours.
        acquired = self.processing_lock.acquire(timeout=0.001)  # type: ignore
        if acquired:
            # Only release what we actually acquired - releasing after a failed
            # acquire would free the lock out from under the worker holding it.
            self.processing_lock.release()  # type: ignore
        locked = not acquired

    return not self.queue.empty() or locked
|
|
301
|
+
|
|
302
|
+
def mark_transform(self, submission_ids: list[str], **kwargs):
    """Record a file_transformation processing_status for each submission id.

    Extra keyword arguments are passed on to every ProcessingStatusRecord.
    """
    return self.add_processing_records(
        [
            ProcessingStatusRecord(
                submission_id=sub_id, processing_status="file_transformation", **kwargs
            )
            for sub_id in submission_ids
        ]
    )
|
|
313
|
+
|
|
314
|
+
def mark_data_contract(self, submission_ids: list[str], **kwargs):
    """Record a data_contract processing_status for each submission id.

    Extra keyword arguments are passed on to every ProcessingStatusRecord.
    """
    return self.add_processing_records(
        [
            ProcessingStatusRecord(
                submission_id=sub_id, processing_status="data_contract", **kwargs
            )
            for sub_id in submission_ids
        ]
    )
|
|
325
|
+
|
|
326
|
+
def mark_business_rules(self, submissions: list[tuple[str, bool]], **kwargs):
    """Record a business_rules processing_status for each (submission_id, failed) pair.

    The submission_result is "validation_failed" when the flag is truthy, else None.
    """
    status_records = []
    for sub_id, has_failed in submissions:
        outcome = "validation_failed" if has_failed else None
        status_records.append(
            ProcessingStatusRecord(
                submission_id=sub_id,
                processing_status="business_rules",
                submission_result=outcome,
                **kwargs,
            )
        )
    return self.add_processing_records(status_records)
|
|
340
|
+
|
|
341
|
+
def mark_error_report(
    self,
    submissions: list[tuple[str, SubmissionResult]],
    job_run_id: Optional[int] = None,
):
    """Record an error_report processing_status for each (submission_id, result) pair"""
    processing_recs: list[ProcessingStatusRecord] = [
        ProcessingStatusRecord(
            submission_id=sub_id,
            processing_status="error_report",
            submission_result=sub_result,
            job_run_id=job_run_id,
        )
        for sub_id, sub_result in submissions
    ]
    return self.add_processing_records(processing_recs)
|
|
363
|
+
|
|
364
|
+
def mark_finished(self, submissions: list[tuple[str, SubmissionResult]], **kwargs):
    """Record a "success" processing_status, with its submission result, for each
    (submission_id, result) pair."""
    return self.add_processing_records(
        [
            ProcessingStatusRecord(
                submission_id=submission_id,
                processing_status="success",
                submission_result=result,
                **kwargs,
            )
            for submission_id, result in submissions
        ]
    )
|
|
378
|
+
|
|
379
|
+
def mark_failed(self, submissions: list[str], **kwargs):
    """Record a failed processing_status (result "processing_failed") for each submission id."""
    return self.add_processing_records(
        [
            ProcessingStatusRecord(
                submission_id=sub_id,
                processing_status="failed",
                submission_result="processing_failed",
                **kwargs,
            )
            for sub_id in submissions
        ]
    )
|
|
392
|
+
|
|
393
|
+
def mark_archived(self, submissions: list[str], **kwargs):
    """Record an archived processing_status for each submission id."""
    return self.add_processing_records(
        [
            ProcessingStatusRecord(
                submission_id=sub_id, processing_status="archived", **kwargs
            )
            for sub_id in submissions
        ]
    )
|
|
403
|
+
|
|
404
|
+
def add_feedback_transfer_ids(self, submissions: list[tuple[str, str]], **kwargs):
    """Record the error report transfer_id for each (submission_id, transfer_id) pair"""
    return self.add_transfer_records(
        [
            TransferRecord(
                submission_id=sub_id,
                report_name="error_report",
                transfer_id=feedback_transfer_id,
                **kwargs,
            )
            for sub_id, feedback_transfer_id in submissions
        ]
    )
|
|
417
|
+
|
|
418
|
+
def get_latest_processing_records(
    self, filter_criteria: Optional[list[FilterCriteria]] = None
) -> AuditReturnType:
    """Fetch, per submission_id, the processing_status record with the latest
    time_updated, optionally pre-filtering the records considered"""
    newest_first = [OrderCriteria("time_updated", True)]
    per_submission = ["submission_id"]
    return self._processing_status.get_most_recent_records(
        order_criteria=newest_first,
        partition_fields=per_submission,
        pre_filter_criteria=filter_criteria,
    )
|
|
428
|
+
|
|
429
|
+
def downstream_pending(
    self,
    status: ProcessingStatus,
    max_concurrency: int = 1,
    run_number: int = 0,
    max_days_old: int = 3,
    statuses_to_include: Optional[list[ProcessingStatus]] = None,
) -> bool:
    """Report whether any submission assigned to this run is pending.

    The statuses considered are ``status`` plus ``statuses_to_include`` when given,
    otherwise every step up to and including ``status``. Only the latest record per
    submission, updated after today minus ``max_days_old`` days, is looked at. A
    submission belongs to this run when its id, read as a hexadecimal integer,
    modulo ``max_concurrency`` equals ``run_number``.
    """
    step_order: list[ProcessingStatus] = [
        "received",
        "file_transformation",
        "data_contract",
        "business_rules",
        "error_report",
    ]

    if statuses_to_include:
        wanted: set[ProcessingStatus] = {status, *statuses_to_include}
    else:
        wanted = set(step_order[: step_order.index(status) + 1])

    cutoff = str(date.today() - timedelta(days=max_days_old))
    criteria = [
        FilterCriteria("date_updated", cutoff, operator.gt),
        FilterCriteria("processing_status", wanted, operator.contains),
    ]
    latest = self.get_latest_processing_records()
    matching = self._processing_status.retrieve_records(filter_criteria=criteria, data=latest)
    pending_ids = [rec.submission_id for rec in self._processing_status.conv_to_records(matching)]

    for sub_id in pending_ids:
        if int(sub_id, 16) % max_concurrency == run_number:  # type: ignore
            return True
    return False
|
|
474
|
+
|
|
475
|
+
def get_submission_info(self, submission_id: str) -> Optional[SubmissionInfo]:
    """Get all stored info for a submission

    Returns the first submission_info record matching the submission_id, or None
    when there is no such record.
    """
    matches = self._submission_info.retrieve_records(
        filter_criteria=[FilterCriteria("submission_id", submission_id)]
    )
    # conv_to_records is only declared to return an Iterable, so wrap it in iter()
    # before next(); a plain list result would otherwise raise TypeError.
    return next(iter(self._submission_info.conv_to_records(matches)), None)  # type: ignore
|
|
487
|
+
|
|
488
|
+
def get_submission_statistics(self, submission_id: str) -> Optional[SubmissionStatisticsRecord]:
    """Get submission statistics record for submission if one exists

    Returns the first submission statistics record matching the submission_id, or
    None when there is no such record.
    """
    matches = self._submission_statistics.retrieve_records(
        filter_criteria=[FilterCriteria("submission_id", submission_id)]
    )
    # conv_to_records is only declared to return an Iterable, so wrap it in iter()
    # before next(); a plain list result would otherwise raise TypeError.
    return next(  # type: ignore
        iter(self._submission_statistics.conv_to_records(matches)), None
    )
|
|
500
|
+
|
|
501
|
+
def get_submission_status(self, submission_id: str) -> Optional[SubmissionStatus]:
    """Get the latest submission status for a submission

    Returns None when the submission has no processing_status record. Otherwise a
    SubmissionStatus is built from the most recently updated processing record
    (failure flags) and, when present, the submission statistics (record count).
    """
    latest = self._processing_status.get_most_recent_records(
        order_criteria=[OrderCriteria("time_updated", True)],
        pre_filter_criteria=[FilterCriteria("submission_id", submission_id)],
    )
    # conv_to_records is only declared to return an Iterable, so wrap it in iter()
    # before next(); a plain list result would otherwise raise TypeError.
    processing_rec: Optional[ProcessingStatusRecord] = next(  # type: ignore
        iter(self._processing_status.conv_to_records(latest)), None
    )
    if processing_rec is None:
        return None

    sub_status = SubmissionStatus()
    sub_stats_rec: Optional[SubmissionStatisticsRecord] = self.get_submission_statistics(
        submission_id
    )
    if processing_rec.submission_result == "processing_failed":
        sub_status.processing_failed = True
    if processing_rec.submission_result == "validation_failed":
        sub_status.validation_failed = True
    if sub_stats_rec:
        sub_status.number_of_records = sub_stats_rec.record_count

    return sub_status
|
|
527
|
+
|
|
528
|
+
def __enter__(self):
    """Use audit table as context manager

    When a pool is configured and the previous worker has finished, a fresh queue,
    lock and worker are created. Returns the manager itself.
    """
    if self.pool and self.pool_result.done():
        thread = isinstance(self.pool, ThreadPoolExecutor)
        self.queue: QueueType = ThreadQueue() if thread else ProcessQueue()
        self.clear_down = False
        # Replace the lock before the worker is submitted, so the worker and
        # is_writing always agree on which lock object is in use.
        self.processing_lock = threading.Lock() if thread else multiprocessing.Lock()
        self.pool_result = self.pool.submit(self._process_queue)
    return self
|
|
537
|
+
|
|
538
|
+
def __exit__(
    self,
    exc_type: Optional[type[Exception]],
    exc_value: Optional[Exception],
    traceback: Optional[TracebackType],
) -> None:
    """Leave the context: with a pool configured, send the None sentinel to the
    worker, wait for it to finish and then wait until the queue is empty"""
    if not self.pool:
        return
    self.queue.put(None)
    # NOTE(review): the worker's result is printed as-is - confirm this output is wanted.
    worker_outcome = self.pool_result.result()
    print(worker_outcome)
    while True:
        if self.queue.empty():
            break
        time.sleep(1)
|
|
550
|
+
|
|
551
|
+
def _get_status(
    self,
    status: Union[ProcessingStatus, set[ProcessingStatus], list[ProcessingStatus]],
    max_days_old: int,
) -> AuditReturnType:
    """Retrieve processing_status records updated after today minus max_days_old
    days whose status equals ``status`` (or is one of them, for a set or list)"""
    cutoff = str(date.today() - timedelta(days=max_days_old))
    if isinstance(status, (set, list)):
        status_filter = FilterCriteria("processing_status", status, operator.contains)
    else:
        status_filter = FilterCriteria("processing_status", status)
    return self._processing_status.retrieve_records(
        filter_criteria=[FilterCriteria("date_updated", cutoff, operator.gt), status_filter]
    )
|
|
572
|
+
|
|
573
|
+
def get_all_file_transformation_submissions(self, max_days_old: int = 3) -> AuditReturnType:
    """Retrieve the recent processing records whose status is file_transformation"""
    return self._get_status(status="file_transformation", max_days_old=max_days_old)
|
|
576
|
+
|
|
577
|
+
def get_all_data_contract_submissions(self, max_days_old: int = 3) -> AuditReturnType:
    """Retrieve the recent processing records whose status is data_contract"""
    return self._get_status(status="data_contract", max_days_old=max_days_old)
|
|
580
|
+
|
|
581
|
+
def get_all_business_rule_submissions(self, max_days_old: int = 3) -> AuditReturnType:
    """Retrieve the recent processing records whose status is business_rules"""
    return self._get_status(status="business_rules", max_days_old=max_days_old)
|
|
584
|
+
|
|
585
|
+
def get_all_error_report_submissions(self, max_days_old: int = 3):
    """Collect the submissions whose status is error_report, joined with their
    submission info.

    Returns a pair: the SubmissionInfo models that validated, and a list of
    (raw row, submission_id) tuples for rows that failed validation.
    """
    ready = self._get_status("error_report", max_days_old)
    joined = self.combine_auditor_information(ready, self._submission_info)

    valid: list[SubmissionInfo] = []
    invalid: list[tuple[dict, str]] = []

    for raw in self.conv_to_iterable(joined):
        try:
            parsed = SubmissionInfo(**raw)
        except ValidationError:
            invalid.append((raw, raw["submission_id"]))
        else:
            valid.append(parsed)

    return valid, invalid
|
|
603
|
+
|
|
604
|
+
def get_current_processing_info(self, submission_id: str) -> Optional[ProcessingStatusRecord]:
    """Return the most recently updated processing record for the submission_id,
    or None when the submission has no processing record"""
    try:
        latest = self._processing_status.get_most_recent_records(
            pre_filter_criteria=[FilterCriteria("submission_id", submission_id)],
            order_criteria=[OrderCriteria("time_updated", True)],
        )
        records = iter(self._processing_status.conv_to_records(latest))
        return next(records)  # type: ignore
    except StopIteration:
        return None
|