kodexa-document 7.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodexa-document might be problematic. Click here for more details.
- kodexa_document-7.5.0/PKG-INFO +27 -0
- kodexa_document-7.5.0/README.md +0 -0
- kodexa_document-7.5.0/kodexa_document/connectors.py +456 -0
- kodexa_document-7.5.0/kodexa_document/model.py +3642 -0
- kodexa_document-7.5.0/kodexa_document/persistence.py +2057 -0
- kodexa_document-7.5.0/kodexa_document/persistence_models.py +421 -0
- kodexa_document-7.5.0/kodexa_document/selectors/__init__.py +5 -0
- kodexa_document-7.5.0/kodexa_document/selectors/ast.py +677 -0
- kodexa_document-7.5.0/kodexa_document/selectors/error.py +29 -0
- kodexa_document-7.5.0/kodexa_document/selectors/kodexa-ast-visitor.py +268 -0
- kodexa_document-7.5.0/kodexa_document/selectors/parser.py +91 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelector.interp +99 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelector.tokens +56 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorLexer.interp +119 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorLexer.py +204 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorLexer.tokens +56 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorListener.py +570 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorParser.py +3246 -0
- kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorVisitor.py +323 -0
- kodexa_document-7.5.0/kodexa_document/selectors/visitor.py +265 -0
- kodexa_document-7.5.0/kodexa_document/steps.py +109 -0
- kodexa_document-7.5.0/pyproject.toml +41 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: kodexa-document
|
|
3
|
+
Version: 7.5.0
|
|
4
|
+
Summary: Python SDK for the Kodexa Document Database (KDDB)
|
|
5
|
+
Author: Austin Redenbaugh
|
|
6
|
+
Author-email: austin@kodexa.com
|
|
7
|
+
Requires-Python: >=3.11,<4.0
|
|
8
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
+
Requires-Dist: addict (>=2.4.0,<3.0.0)
|
|
17
|
+
Requires-Dist: antlr4-python3-runtime (>=4.13.2,<5.0.0)
|
|
18
|
+
Requires-Dist: deepdiff (>=8.4.2,<9.0.0)
|
|
19
|
+
Requires-Dist: msgpack (>=1.1.0,<2.0.0)
|
|
20
|
+
Requires-Dist: peewee (>=3.18.1,<4.0.0)
|
|
21
|
+
Requires-Dist: pydantic (>=2.11.4,<3.0.0)
|
|
22
|
+
Requires-Dist: pytest (>=8.3.5,<9.0.0)
|
|
23
|
+
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
24
|
+
Requires-Dist: requests (>=2.32.0,<3.0.0)
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
|
|
File without changes
|
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utilities and base implementation of Connectors that allow a document to access a source native file or
|
|
3
|
+
stream upon which the document is based or derived.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import fnmatch
|
|
7
|
+
import inspect
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
import mimetypes
|
|
11
|
+
import os
|
|
12
|
+
import tempfile
|
|
13
|
+
import urllib
|
|
14
|
+
from os.path import join
|
|
15
|
+
from typing import Dict, Type
|
|
16
|
+
|
|
17
|
+
from kodexa_document.model import Document, DocumentMetadata, SourceMetadata
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_caller_dir():
    """Return the absolute directory of the file three frames up the call stack.

    The frame is picked with a fixed index of 3 into ``inspect.stack()``
    (index 0 is this function itself), so the result is the directory of the
    file that owns the frame three call levels above this function, which is
    not necessarily the immediate caller.

    Returns:
        str: The absolute path of the directory containing that frame's file.

    Raises:
        IndexError: If the call stack holds fewer than four frames.

    Example:
        >>> get_caller_dir()
        '/path/to/caller/directory'
    """
    # context=0 stops inspect from loading source lines for every frame;
    # only the filename of the selected frame is needed.
    frame_info = inspect.stack(0)[3]
    filepath = frame_info.filename
    # drop the reference to the stack frame to avoid reference cycles
    del frame_info

    # make the path absolute and keep only its directory part
    return os.path.dirname(os.path.abspath(filepath))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class FolderConnector:
    """Iterate over the files of a folder, producing one Document per matching file.

    The list of files is collected once, when the connector is constructed.
    Iterating the connector then yields a Document for each collected file.
    """

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "folder"

    def __init__(
        self,
        path,
        file_filter="*",
        recursive=False,
        relative=False,
        # NOTE(review): this default is evaluated once, when the class body
        # is executed, not on each call -- confirm that is intended.
        caller_path=get_caller_dir(),
        unpack=False,
    ):
        """Collect the files under ``path`` that match ``file_filter``.

        Args:
            path: Folder to scan; ``~`` is expanded.
            file_filter: ``fnmatch`` pattern applied to each file name
                (not to the full path).
            recursive: When true, sub-directories are scanned as well.
            relative: When true, ``path`` is joined onto ``caller_path``.
            caller_path: Base directory used when ``relative`` is true.
            unpack: When true, each file is loaded with
                ``Document.from_kdxa`` instead of being wrapped in a new
                Document.

        Raises:
            ValueError: If ``path`` is empty.
        """
        self.path = path
        self.file_filter = file_filter
        self.recursive = recursive
        self.relative = relative
        self.caller_path = caller_path
        self.unpack = unpack

        if not self.path:
            raise ValueError("You must provide a path")

        self.files = self.__get_files__()
        self.index = 0

    @staticmethod
    def get_source(document):
        """Open the file a document was created from.

        Args:
            document: Document whose ``source.original_path`` and
                ``source.original_filename`` identify the file.

        Returns:
            A binary file object opened for reading; the caller is
            responsible for closing it.
        """
        return open(
            join(document.source.original_path, document.source.original_filename), "rb"
        )

    def __iter__(self):
        return self

    def __next__(self):
        """Return the Document for the next collected file.

        Raises:
            StopIteration: When every collected file has been returned.
        """
        if self.index >= len(self.files):
            raise StopIteration

        file_path = self.files[self.index]
        self.index += 1

        if self.unpack:
            return Document.from_kdxa(file_path)

        document = Document(
            DocumentMetadata(
                {
                    "source_path": file_path,
                    "connector": self.get_name(),
                    # NOTE(review): guess_type returns a (type, encoding)
                    # tuple, so a tuple is stored here -- confirm consumers
                    # expect that rather than the bare type string.
                    "mime_type": mimetypes.guess_type(file_path),
                    "connector_options": {
                        "path": self.path,
                        "file_filter": self.file_filter,
                    },
                }
            )
        )
        document.source.original_filename = os.path.basename(file_path)
        # Record the directory the file was actually found in. get_source()
        # joins original_path with original_filename, so storing the
        # configured root (self.path) pointed at the wrong location for files
        # in sub-directories, for relative=True and for "~" paths.
        document.source.original_path = os.path.dirname(file_path)
        document.source.connector = self.get_name()

        # TODO we need to get the checksum and last_updated and created times
        return document

    def __get_files__(self):
        """Return the paths of all files under the folder that match the filter."""
        matches = []
        base_path = self.path

        if self.relative:
            base_path = os.path.join(self.caller_path, base_path)

        for dir_path, _dir_names, file_names in os.walk(os.path.expanduser(base_path)):
            matches.extend(
                os.path.join(dir_path, file_name)
                for file_name in file_names
                if fnmatch.fnmatch(file_name, self.file_filter)
            )

            # os.walk yields the top directory first; stopping here limits
            # the scan to that directory only.
            if not self.recursive:
                break

        return matches
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class FileHandleConnector:
    """
    A File Handle Connector can be used to create a document from a file path.

    Iterating the connector yields exactly one Document for that path.
    """

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "file-handle"

    def __init__(self, original_path: str):
        """Remember the file path the single Document will be built from.

        Args:
            original_path: Path of the file.
        """
        self.file = original_path
        self.index = 0
        self.completed = False

    @staticmethod
    def get_source(document: "Document"):
        """Open the file a document was created from.

        Args:
            document: Document whose ``source.original_path`` is the path of
                the file.

        Returns:
            A binary file object opened for reading; the caller is
            responsible for closing it.
        """
        return open(document.source.original_path, "rb")

    def __iter__(self):
        return self

    def __next__(self):
        """Return the Document for the file, once.

        Raises:
            StopIteration: On every call after the first.
        """
        if self.completed:
            raise StopIteration

        # Mark the single item as consumed; without this the iterator would
        # keep producing the same document forever.
        self.completed = True

        document = Document(
            DocumentMetadata(
                {
                    "source_path": self.file,
                    "connector": self.get_name(),
                    # NOTE(review): guess_type returns a (type, encoding)
                    # tuple, so a tuple is stored here -- confirm consumers
                    # expect that rather than the bare type string.
                    "mime_type": mimetypes.guess_type(self.file),
                    "connector_options": {"file": self.file},
                }
            )
        )
        # get_source() opens source.original_path, so that field must hold
        # the full path; the bare file name goes into original_filename.
        document.source.original_filename = os.path.basename(self.file)
        document.source.original_path = self.file
        document.source.connector = self.get_name()
        return document
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class UrlConnector:
    """Create a single Document that refers to a URL, and fetch that URL's content.

    Iterating the connector yields exactly one Document for the URL.
    """

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "url"

    def __init__(self, original_path, headers=None):
        """Remember the URL and the request headers to use for it.

        Args:
            original_path: The URL.
            headers: Optional mapping of header names to values; an empty
                dict is used when omitted.
        """
        if headers is None:
            headers = {}
        self.url = original_path
        self.headers = headers
        self.index = 0
        self.completed = False

    @staticmethod
    def get_source(document):
        """Download the content behind a document's URL.

        URLs starting with ``http`` are fetched with ``requests``; any other
        URL is fetched with a ``urllib`` opener. In both cases the whole
        response body is read into memory.

        Args:
            document: Document whose ``source.original_path`` is the URL and
                whose ``source.headers`` holds the request headers.

        Returns:
            io.BytesIO: The downloaded bytes.
        """
        url = document.source.original_path
        headers = document.source.headers

        # If we have an http URL then we should use requests, it is much
        # cleaner
        if url.startswith("http"):
            # Imported here so other URL schemes do not need requests.
            import requests

            # NOTE(review): the HTTP status is not checked, so the body of an
            # error response is returned as if it were the document content.
            response = requests.get(url, headers=headers)
            return io.BytesIO(response.content)

        # The file only imports the bare ``urllib`` package, which does not
        # guarantee that the ``request`` submodule is loaded.
        import urllib.request

        # Use a private opener instead of installing one process-wide, and
        # pass every header (assigning addheaders inside a loop kept only the
        # last one).
        opener = urllib.request.build_opener()
        if headers:
            opener.addheaders = [(name, headers[name]) for name in headers]

        # Reading straight into memory avoids reopening a temporary file by
        # name while it is scheduled for deletion.
        with opener.open(url) as response:
            return io.BytesIO(response.read())

    def __iter__(self):
        return self

    def __next__(self):
        """Return the Document for the URL, once.

        Raises:
            StopIteration: On every call after the first.
        """
        if self.completed:
            raise StopIteration

        self.completed = True
        document = Document(
            DocumentMetadata(
                {
                    "connector": self.get_name(),
                    "connector_options": {"url": self.url, "headers": self.headers},
                }
            )
        )
        document.source.connector = self.get_name()
        document.source.original_path = self.url
        document.source.headers = self.headers
        return document
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
# Registry of connectors, keyed by connector name
registered_connectors: Dict[str, Type] = {}


def get_connectors():
    """
    List the names under which connectors are currently registered.

    Returns:
        A view over the keys of the connector registry.
    """
    names = registered_connectors.keys()
    return names
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def get_connector(connector: str, source: SourceMetadata):
    """Look up a registered connector by name.

    Args:
        connector (str): The name of the connector.
        source (SourceMetadata): The metadata of the source. It is accepted
            for callers' convenience but is not used by the lookup.

    Returns:
        The object registered under the given name.

    Raises:
        Exception: If no connector is registered under that name.

    Example:
        >>> get_connector('folder', source_metadata)
        <class 'FolderConnector'>
    """
    if connector in registered_connectors:
        logger.info(f"Getting registered connector {connector}")
        return registered_connectors[connector]

    # Log through the module logger, like the success path above. The
    # logging module's top-level error() helper configures the root logger
    # implicitly when it has no handlers, which library code should not do.
    logger.error(f"Unable to find connector {connector}")
    raise Exception(f"Unable to find connector {connector}")
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def add_connector(connector):
    """Register a connector under the name it reports.

    Args:
        connector: Object exposing a ``get_name()`` callable. The value it
            returns becomes the registry key; an existing entry with the
            same key is replaced.

    Returns:
        None
    """
    registry_key = connector.get_name()
    registered_connectors[registry_key] = connector
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def get_source(document):
    """
    Fetch a document's source through the connector named in its source metadata.

    Args:
        document (Document): The document whose ``source.connector`` names
            the connector to use.

    Returns:
        Whatever the resolved connector's ``get_source`` returns for the
        document.

    Raises:
        Exception: If no connector is registered under the document's
            connector name.

    Example:
        >>> document = Document(...)
        >>> source = get_source(document)
    """
    source_metadata = document.source
    resolved = get_connector(source_metadata.connector, source_metadata)
    return resolved.get_source(document)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class DocumentStoreConnector:
    """
    A class for connecting to a document store.

    Methods:
        - get_name: Get the name this connector is registered under.
        - get_source: Get the source of a document from the document store.
    """

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "document-store"

    @staticmethod
    def get_source(document):
        """
        Get the source of a document from the document store.

        The store is looked up from ``document.source.headers["ref"]`` and
        the document family from ``document.source.headers["family"]``.

        Args:
            document (object): The document object.

        Returns:
            io.BytesIO: The source of the document as a BytesIO object.

        Raises:
            Exception: If the source of the document cannot be retrieved.
        """
        # Imported lazily so this module can be loaded without the kodexa
        # platform client being importable.
        from kodexa import KodexaClient
        from kodexa.platform.client import DocumentFamilyEndpoint
        from kodexa.platform.client import DocumentStoreEndpoint

        client = KodexaClient()

        document_store: DocumentStoreEndpoint = client.get_object_by_ref(
            "store", document.source.headers["ref"]
        )

        family: DocumentFamilyEndpoint = document_store.get_family(
            document.source.headers["family"]
        )
        document_bytes = family.get_native()
        if document_bytes is None:
            # Read "id" with .get() so a missing key cannot raise KeyError
            # here and hide the error that is actually being reported.
            raise Exception(
                f"Unable to get source, document with id {document.source.headers.get('id')} is missing?"
            )

        return io.BytesIO(document_bytes)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# Register the connectors defined above, in this order, so they can be
# looked up by name.
for _builtin_connector in (
    DocumentStoreConnector,
    FolderConnector,
    FileHandleConnector,
    UrlConnector,
):
    add_connector(_builtin_connector)
del _builtin_connector
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
class SessionConnector:
    """
    Connector that resolves document content through a shared event helper.

    Attributes
    ----------
    event_helper : object
        class-level helper used to fetch content objects, default is None

    Methods
    -------
    get_name():
        Returns the name this connector reports ("cloud-content").

    get_source(document):
        Returns the content object for the document's source path.
    """

    # Must be assigned on the class before get_source can be used.
    event_helper = None

    @classmethod
    def get_name(cls):
        """
        Gets the name this connector reports.

        Returns
        -------
        str
            The connector name.
        """
        return "cloud-content"

    @classmethod
    def get_source(cls, document):
        """
        Gets the content object for the document's source path.

        Parameters
        ----------
        document : object
            The document whose ``source.original_path`` identifies the
            content object.

        Raises
        ------
        Exception
            If the event_helper is not set.

        Returns
        -------
        object
            Whatever the event helper's ``get_content_object`` returns.
        """
        helper = cls.event_helper
        if helper is None:
            raise Exception("The event_helper needs to be set to use this connector")

        logger.info(f"Getting content object {document.source.original_path}")
        return helper.get_content_object(document.source.original_path)
|