documente_shared 0.1.145__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- documente_shared/__init__.py +0 -0
- documente_shared/application/__init__.py +0 -0
- documente_shared/application/dates.py +7 -0
- documente_shared/application/digest.py +7 -0
- documente_shared/application/exceptions.py +23 -0
- documente_shared/application/files.py +27 -0
- documente_shared/application/json.py +45 -0
- documente_shared/application/numbers.py +7 -0
- documente_shared/application/payloads.py +29 -0
- documente_shared/application/query_params.py +133 -0
- documente_shared/application/retry_utils.py +69 -0
- documente_shared/application/time_utils.py +13 -0
- documente_shared/application/timezone.py +7 -0
- documente_shared/domain/__init__.py +0 -0
- documente_shared/domain/base_enum.py +54 -0
- documente_shared/domain/constants.py +8 -0
- documente_shared/domain/entities/__init__.py +0 -0
- documente_shared/domain/entities/document.py +410 -0
- documente_shared/domain/entities/document_metadata.py +64 -0
- documente_shared/domain/entities/in_memory_document.py +75 -0
- documente_shared/domain/entities/processing_case.py +215 -0
- documente_shared/domain/entities/processing_case_filters.py +51 -0
- documente_shared/domain/entities/processing_case_item.py +300 -0
- documente_shared/domain/entities/processing_case_item_filters.py +54 -0
- documente_shared/domain/entities/processing_documents.py +11 -0
- documente_shared/domain/entities/processing_event.py +71 -0
- documente_shared/domain/entities/scaling.py +31 -0
- documente_shared/domain/enums/__init__.py +0 -0
- documente_shared/domain/enums/circular_oficio.py +29 -0
- documente_shared/domain/enums/common.py +133 -0
- documente_shared/domain/enums/document.py +124 -0
- documente_shared/domain/enums/document_type_record.py +13 -0
- documente_shared/domain/enums/processing_case.py +66 -0
- documente_shared/domain/exceptions.py +5 -0
- documente_shared/domain/interfaces/__init__.py +0 -0
- documente_shared/domain/interfaces/scaling.py +10 -0
- documente_shared/domain/repositories/__init__.py +0 -0
- documente_shared/domain/repositories/document.py +24 -0
- documente_shared/domain/repositories/processing_case.py +36 -0
- documente_shared/domain/repositories/processing_case_item.py +49 -0
- documente_shared/infrastructure/__init__.py +0 -0
- documente_shared/infrastructure/documente_client.py +27 -0
- documente_shared/infrastructure/dynamo_table.py +75 -0
- documente_shared/infrastructure/lambdas.py +14 -0
- documente_shared/infrastructure/repositories/__init__.py +0 -0
- documente_shared/infrastructure/repositories/dynamo_document.py +43 -0
- documente_shared/infrastructure/repositories/dynamo_processing_case.py +55 -0
- documente_shared/infrastructure/repositories/dynamo_processing_case_item.py +70 -0
- documente_shared/infrastructure/repositories/http_document.py +66 -0
- documente_shared/infrastructure/repositories/http_processing_case.py +82 -0
- documente_shared/infrastructure/repositories/http_processing_case_item.py +118 -0
- documente_shared/infrastructure/repositories/mem_document.py +46 -0
- documente_shared/infrastructure/repositories/mem_processing_case.py +44 -0
- documente_shared/infrastructure/repositories/mem_processing_case_item.py +52 -0
- documente_shared/infrastructure/s3_bucket.py +58 -0
- documente_shared/infrastructure/services/__init__.py +0 -0
- documente_shared/infrastructure/services/http_scaling.py +25 -0
- documente_shared/infrastructure/sqs_queue.py +48 -0
- documente_shared/presentation/__init__.py +0 -0
- documente_shared/presentation/presenters.py +16 -0
- documente_shared-0.1.145.dist-info/METADATA +39 -0
- documente_shared-0.1.145.dist-info/RECORD +63 -0
- documente_shared-0.1.145.dist-info/WHEEL +4 -0
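The package follows a layered layout (application, domain, infrastructure, presentation). As a quick orientation, a minimal sketch of how the entities shown in the hunks below would be imported, assuming the wheel is installed in the current environment:

from documente_shared.domain.entities.document import DocumentProcessing
from documente_shared.domain.entities.document_metadata import DocumentProcessingMetadata
from documente_shared.domain.entities.in_memory_document import InMemoryDocument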
documente_shared/domain/entities/document.py
@@ -0,0 +1,410 @@
import json
from dataclasses import dataclass
from datetime import datetime, tzinfo
from decimal import Decimal
from typing import Optional, List

from documente_shared.application.files import (
    remove_slash_from_path,
    get_filename_from_path,
    remove_extension,
)
from documente_shared.application.numbers import normalize_number
from documente_shared.application.time_utils import get_datetime_from_data
from documente_shared.domain.constants import la_paz_tz
from documente_shared.domain.entities.document_metadata import DocumentProcessingMetadata
from documente_shared.domain.enums.document import (
    DocumentProcessingStatus,
    DocumentProcessingCategory,
    DocumentProcessingSubCategory,
    DocumentProcessingSource,
)


@dataclass
class DocumentProcessing:
    digest: str
    status: DocumentProcessingStatus
    category: DocumentProcessingCategory
    file_path: Optional[str] = None
    file_auxiliar_path: Optional[str] = None
    file_bytes: Optional[bytes] = None
    sub_category: Optional[DocumentProcessingSubCategory] = None
    uploaded_from: Optional[DocumentProcessingSource] = None
    locked_by_admin: Optional[bool] = False
    processed_csv_path: Optional[str] = None
    processed_csv_bytes: Optional[bytes] = None
    processed_xlsx_path: Optional[str] = None
    processed_xlsx_bytes: Optional[bytes] = None
    processed_json_path: Optional[str] = None
    processed_json_bytes: Optional[bytes] = None
    processed_metadata_path: Optional[str] = None
    processing_time: Optional[Decimal] = None
    processing_accuracy: Optional[Decimal] = None
    issued_at: Optional[datetime] = None
    uploaded_at: Optional[datetime] = None
    enqueued_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    failed_at: Optional[datetime] = None
    failed_reason: Optional[str] = None
    feedback: Optional[list | dict] = None
    completed_at: Optional[datetime] = None
    metadata: Optional[dict] = None
    document_size: Optional[Decimal] = None
    document_pages: Optional[int] = None
    metadata_items: Optional[List[DocumentProcessingMetadata]] = None

    def __post_init__(self):
        self.metadata_items = self.metadata_items or []

    @property
    def strategy_id(self) -> str:
        return str(self.category)

    @property
    def is_pending(self) -> bool:
        return self.status == DocumentProcessingStatus.PENDING

    @property
    def is_enqueued(self) -> bool:
        return self.status == DocumentProcessingStatus.ENQUEUED

    @property
    def is_processing(self) -> bool:
        return self.status == DocumentProcessingStatus.PROCESSING

    @property
    def is_completed(self) -> bool:
        return self.status == DocumentProcessingStatus.COMPLETED

    @property
    def is_incomplete(self) -> bool:
        return self.status == DocumentProcessingStatus.INCOMPLETE

    @property
    def is_failed(self) -> bool:
        return self.status == DocumentProcessingStatus.FAILED

    @property
    def is_inreview(self) -> bool:
        return self.status == DocumentProcessingStatus.IN_REVIEW

    @property
    def is_circular(self) -> bool:
        return bool(self.category and self.category.is_circular)

    @property
    def is_desgravamen(self) -> bool:
        return bool(self.category and self.category.is_desgravamen)

    @property
    def is_valid(self) -> bool:
        return all([
            self.digest,
            self.status,
            self.file_path,
        ])

    @property
    def is_finished(self) -> bool:
        return self.status in [
            DocumentProcessingStatus.COMPLETED,
            DocumentProcessingStatus.FAILED,
        ]

    @property
    def file_key(self) -> str:
        return remove_slash_from_path(self.file_path)

    @property
    def processed_csv_key(self) -> str:
        return remove_slash_from_path(self.processed_csv_path)

    @property
    def processed_xlsx_key(self) -> str:
        return remove_slash_from_path(self.processed_xlsx_path)

    @property
    def processed_json_key(self) -> str:
        return remove_slash_from_path(self.processed_json_path)

    @property
    def processed_csv_filename(self) -> str:
        return get_filename_from_path(self.processed_csv_path)

    @property
    def processed_xlsx_filename(self) -> str:
        return get_filename_from_path(self.processed_xlsx_path)

    @property
    def processed_json_filename(self) -> str:
        return get_filename_from_path(self.processed_json_path)

    @property
    def processed_metadata_key(self) -> str:
        return remove_slash_from_path(self.processed_metadata_path)

    @property
    def extended_filename(self) -> str:
        if not self.file_path:
            return ''
        return self.file_path.split('/')[-1]

    @property
    def raw_file_name(self) -> str:
        return remove_extension(self.extended_filename)

    @property
    def filename(self) -> str:
        filename_with_extension = self.extended_filename
        return filename_with_extension.split('.')[0]

    @property
    def metadata_items_bytes(self) -> bytes:
        metadata_items = [
            metadata_item.to_dict
            for metadata_item in self.metadata_items
        ]
        return json.dumps(metadata_items).encode('utf-8')

    @property
    def has_original_file(self) -> bool:
        return bool(self.file_path and self.file_bytes)

    @property
    def has_processed_csv(self) -> bool:
        return bool(self.processed_csv_path and self.processed_csv_bytes)

    @property
    def has_processed_xlsx(self) -> bool:
        return bool(self.processed_xlsx_path and self.processed_xlsx_bytes)

    @property
    def has_processed_json(self) -> bool:
        return bool(self.processed_json_path and self.processed_json_bytes)

    @property
    def has_processed_metadata(self) -> bool:
        return bool(self.processed_metadata_path and self.metadata_items)

    def pending(self, timezone: tzinfo = la_paz_tz):
        self.status = DocumentProcessingStatus.PENDING
        self.started_at = None
        self.uploaded_at = datetime.now(tz=timezone)

    def enqueue(self, timezone: tzinfo = la_paz_tz):
        self.status = DocumentProcessingStatus.ENQUEUED
        self.enqueued_at = datetime.now(tz=timezone)

    def processing(self, timezone: tzinfo = la_paz_tz):
        self.status = DocumentProcessingStatus.PROCESSING
        self.started_at = datetime.now(tz=timezone)

    def failed(
        self,
        error_message: Optional[str] = None,
        timezone: tzinfo = la_paz_tz,
    ):
        self.failed_reason = error_message
        self.status = DocumentProcessingStatus.FAILED
        self.failed_at = datetime.now(tz=timezone)

    def completed(self, timezone: tzinfo = la_paz_tz):
        self.status = DocumentProcessingStatus.COMPLETED
        self.completed_at = datetime.now(tz=timezone)
        self.failed_reason = None

    def incomplete(self, timezone: tzinfo = la_paz_tz):
        self.status = DocumentProcessingStatus.INCOMPLETE
        self.completed_at = datetime.now(tz=timezone)

    def deleted(self):
        self.status = DocumentProcessingStatus.DELETED

    def in_review(self):
        self.status = DocumentProcessingStatus.IN_REVIEW

    def has_pages_mt(self, pages: int = 100) -> bool:
        return bool(self.document_pages and self.document_pages >= pages)

    def has_pages_lt(self, pages: int = 100) -> bool:
        return bool(self.document_pages and self.document_pages <= pages)

    def is_size_gt(self, size: Decimal = Decimal(10)) -> bool:
        return bool(self.document_size and self.document_size >= size)

    def is_size_lt(self, size: Decimal = Decimal(10)) -> bool:
        return bool(self.document_size and self.document_size <= size)

    def __eq__(self, other: 'DocumentProcessing') -> bool:
        if not other:
            return False

        return (
            self.digest == other.digest
            and self.status == other.status
            and self.file_path == other.file_path
            and self.file_auxiliar_path == other.file_auxiliar_path
            and self.issued_at == other.issued_at
            and self.uploaded_at == other.uploaded_at
            and self.enqueued_at == other.enqueued_at
            and self.started_at == other.started_at
            and self.failed_at == other.failed_at
            and self.completed_at == other.completed_at
        )

    @property
    def to_dict(self) -> dict:
        return {
            'digest': self.digest,
            'status': str(self.status),
            'file_path': self.file_path,
            'file_auxiliar_path': self.file_auxiliar_path,
            'category': str(self.category) if self.category else None,
            'sub_category': str(self.sub_category) if self.sub_category else None,
            'uploaded_from': str(self.uploaded_from) if self.uploaded_from else None,
            'locked_by_admin': self.locked_by_admin,
            'processed_csv_path': self.processed_csv_path,
            'processed_xlsx_path': self.processed_xlsx_path,
            'processed_json_path': self.processed_json_path,
            'processed_metadata_path': self.processed_metadata_path,
            'processing_time': (
                normalize_number(self.processing_time)
                if self.processing_time else None
            ),
            'processing_accuracy': (
                normalize_number(self.processing_accuracy)
                if self.processing_accuracy else None
            ),
            'issued_at': self.issued_at.isoformat() if self.issued_at else None,
            'uploaded_at': self.uploaded_at.isoformat() if self.uploaded_at else None,
            'enqueued_at': self.enqueued_at.isoformat() if self.enqueued_at else None,
            'started_at': self.started_at.isoformat() if self.started_at else None,
            'failed_at': self.failed_at.isoformat() if self.failed_at else None,
            'failed_reason': self.failed_reason,
            'feedback': self.feedback,
            'metadata': self.metadata,
            'completed_at': self.completed_at.isoformat() if self.completed_at else None,
            'metadata_items': [metadata.to_dict for metadata in self.metadata_items],
            'document_size': (
                normalize_number(self.document_size)
                if self.document_size else None
            ),
            'document_pages': self.document_pages,
        }

    @property
    def to_simple_dict(self) -> dict:
        simple_dict = self.to_dict.copy()
        simple_dict.pop('metadata_items')
        return simple_dict

    @property
    def to_queue_dict(self) -> dict:
        return self.to_dict

    def overload(
        self,
        new_instance: 'DocumentProcessing',
        properties: List[str] = None,
    ):
        instance_properties = properties or [
            'status',
            'metadata',
            'file_path',
            'file_auxiliar_path',
            'file_bytes',
            'category',
            'sub_category',
            'uploaded_from',
            'locked_by_admin',
            'processed_csv_path',
            'processed_csv_bytes',
            'processed_xlsx_path',
            'processed_xlsx_bytes',
            'processed_json_path',
            'processed_json_bytes',
            'processed_metadata_path',
            'processed_metadata_bytes',
            'processing_time',
            'processing_accuracy',
            'issued_at',
            'uploaded_at',
            'enqueued_at',
            'started_at',
            'failed_at',
            'failed_reason',
            'feedback',
            'document_size',
            'document_pages',
            'completed_at',
        ]
        for _property in instance_properties:
            # Guard both sides before copying, so names that are missing on
            # either instance are skipped instead of raising AttributeError.
            if not hasattr(self, _property) or not hasattr(new_instance, _property):
                continue
            setattr(self, _property, getattr(new_instance, _property))
        return self

    @classmethod
    def from_dict(cls, data: dict) -> 'DocumentProcessing':
        return cls(
            digest=data.get('digest'),
            status=DocumentProcessingStatus.from_value(data.get('status')),
            file_path=data.get('file_path'),
            file_auxiliar_path=data.get('file_auxiliar_path'),
            category=(
                DocumentProcessingCategory.from_value(data.get('category'))
                if data.get('category') else None
            ),
            sub_category=(
                DocumentProcessingSubCategory.from_value(data.get('sub_category'))
                if data.get('sub_category') else None
            ),
            uploaded_from=(
                DocumentProcessingSource.from_value(data.get('uploaded_from'))
                if data.get('uploaded_from') else None
            ),
            locked_by_admin=data.get('locked_by_admin'),
            processed_csv_path=data.get('processed_csv_path'),
            processed_xlsx_path=data.get('processed_xlsx_path'),
            processed_json_path=data.get('processed_json_path'),
            processed_metadata_path=data.get('processed_metadata_path'),
            processing_time=(
                Decimal(data.get('processing_time'))
                if data.get('processing_time') else None
            ),
            processing_accuracy=(
                Decimal(data.get('processing_accuracy'))
                if data.get('processing_accuracy') else None
            ),
            issued_at=get_datetime_from_data(input_datetime=data.get('issued_at')),
            uploaded_at=get_datetime_from_data(input_datetime=data.get('uploaded_at')),
            enqueued_at=get_datetime_from_data(input_datetime=data.get('enqueued_at')),
            started_at=get_datetime_from_data(input_datetime=data.get('started_at')),
            failed_at=get_datetime_from_data(input_datetime=data.get('failed_at')),
            failed_reason=data.get('failed_reason'),
            feedback=data.get('feedback'),
            metadata=data.get('metadata', {}),
            completed_at=get_datetime_from_data(input_datetime=data.get('completed_at')),
            metadata_items=[
                DocumentProcessingMetadata.from_dict(metadata)
                for metadata in data.get('metadata_items', [])
            ],
            document_size=(
                Decimal(data.get('document_size'))
                if data.get('document_size') else None
            ),
            document_pages=data.get('document_pages'),
        )
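A short usage sketch of the lifecycle helpers above. The payload values are hypothetical, and 'pending' is assumed to be a value accepted by DocumentProcessingStatus.from_value, since that enum is not part of this diff:

from documente_shared.domain.entities.document import DocumentProcessing

payload = {
    'digest': 'abc123',                     # hypothetical digest
    'status': 'pending',                    # assumed valid status value
    'file_path': 'inbox/circular-001.pdf',  # hypothetical path
}
doc = DocumentProcessing.from_dict(payload)
assert doc.is_valid           # digest, status, and file_path are all set
doc.processing()              # stamps started_at with the La Paz timezone default
doc.completed()               # stamps completed_at and clears failed_reason
print(doc.to_dict['status'])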
documente_shared/domain/entities/document_metadata.py
@@ -0,0 +1,64 @@
from dataclasses import dataclass
from datetime import datetime
from typing import Optional


@dataclass
class DocumentProcessingMetadata:
    publication_date: Optional[datetime] = None
    num_circular: Optional[str] = None
    asfi_identifier: Optional[str] = None
    contains_tables: Optional[bool] = None
    text_content: Optional[str] = None
    case_name: Optional[str] = None
    starting_office: Optional[str] = None
    output_json: Optional[dict] = None
    processing_time: Optional[float] = None
    llm_model: Optional[str] = None
    num_pages: Optional[float] = None
    num_tokens: Optional[float] = None
    citcular_type: Optional[str] = None  # spelling as published

    @property
    def to_dict(self):
        return {
            'publication_date': (
                self.publication_date.isoformat()
                if self.publication_date
                else None
            ),
            'num_circular': self.num_circular,
            'asfi_identifier': self.asfi_identifier,
            'contains_tables': self.contains_tables,
            'text_content': self.text_content,
            'case_name': self.case_name,
            'starting_office': self.starting_office,
            'output_json': self.output_json,
            'processing_time': self.processing_time,
            'llm_model': self.llm_model,
            'num_pages': self.num_pages,
            'num_tokens': self.num_tokens,
            'citcular_type': self.citcular_type,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            publication_date=(
                datetime.fromisoformat(data.get('publication_date'))
                if data.get('publication_date')
                else None
            ),
            num_circular=data.get('num_circular'),
            asfi_identifier=data.get('asfi_identifier'),
            contains_tables=data.get('contains_tables'),
            text_content=data.get('text_content'),
            case_name=data.get('case_name'),
            starting_office=data.get('starting_office'),
            output_json=data.get('output_json'),
            processing_time=data.get('processing_time'),
            llm_model=data.get('llm_model'),
            num_pages=data.get('num_pages'),
            num_tokens=data.get('num_tokens'),
            citcular_type=data.get('citcular_type'),
        )
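A hedged round-trip sketch for the metadata entity above; all field values here are hypothetical:

from documente_shared.domain.entities.document_metadata import DocumentProcessingMetadata

meta = DocumentProcessingMetadata(
    num_circular='ASFI/123/2024',  # hypothetical identifier
    contains_tables=True,
    llm_model='example-model',     # hypothetical model name
)
restored = DocumentProcessingMetadata.from_dict(meta.to_dict)
assert restored == meta  # dataclass equality compares all fields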
documente_shared/domain/entities/in_memory_document.py
@@ -0,0 +1,75 @@
import base64
from typing import Optional
from dataclasses import dataclass

from documente_shared.application.files import get_filename_from_path, remove_extension, remove_slash_from_path


@dataclass
class InMemoryDocument:
    file_path: Optional[str] = None
    file_bytes: Optional[bytes] = None
    file_base64: Optional[str] = None

    def __post_init__(self):
        if not self.file_path:
            return
        if self.file_base64 and not self.file_bytes:
            self.file_bytes = base64.b64decode(self.file_base64)
        elif self.file_bytes and not self.file_base64:
            self.file_base64 = base64.b64encode(self.file_bytes).decode()

    @property
    def is_valid(self) -> bool:
        return bool(self.file_path and self.file_bytes)

    @property
    def has_content(self) -> bool:
        return bool(self.file_bytes or self.file_base64)

    @property
    def file_name(self) -> Optional[str]:
        return get_filename_from_path(self.file_path) if self.file_path else None

    @property
    def raw_file_name(self) -> str:
        return remove_extension(self.file_name) if self.file_name else ''

    @property
    def file_key(self) -> Optional[str]:
        return remove_slash_from_path(self.file_path)

    @property
    def is_procesable(self) -> bool:
        return self.is_valid and self.has_content

    @property
    def to_dict(self) -> dict:
        return {
            "file_path": self.file_path,
            "file_base64": self.file_base64,
        }

    @property
    def to_queue_dict(self) -> dict:
        return {
            "file_path": self.file_path,
        }

    @classmethod
    def from_dict(cls, data: dict):
        file_bytes = data.get("file_bytes")
        file_base64 = data.get("file_base64")

        if file_bytes and not file_base64:
            file_base64 = base64.b64encode(file_bytes).decode()

        if file_base64 and not file_bytes:
            file_bytes = base64.b64decode(file_base64)

        return cls(
            file_path=data.get("file_path"),
            file_bytes=file_bytes,
            file_base64=file_base64,
        )
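A small sketch of the bytes/base64 synchronization above, using hypothetical file contents:

import base64

from documente_shared.domain.entities.in_memory_document import InMemoryDocument

doc = InMemoryDocument(file_path='tmp/sample.pdf', file_bytes=b'%PDF-1.7 ...')
# __post_init__ derives file_base64 from file_bytes when only one is provided.
assert doc.file_base64 == base64.b64encode(doc.file_bytes).decode()
assert doc.is_procesable

restored = InMemoryDocument.from_dict(doc.to_dict)  # to_dict carries only path + base64
assert restored.file_bytes == doc.file_bytes        # bytes rebuilt from base64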