kodexa-document 7.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodexa-document might be problematic. Click here for more details.

Files changed (22) hide show
  1. kodexa_document-7.5.0/PKG-INFO +27 -0
  2. kodexa_document-7.5.0/README.md +0 -0
  3. kodexa_document-7.5.0/kodexa_document/connectors.py +456 -0
  4. kodexa_document-7.5.0/kodexa_document/model.py +3642 -0
  5. kodexa_document-7.5.0/kodexa_document/persistence.py +2057 -0
  6. kodexa_document-7.5.0/kodexa_document/persistence_models.py +421 -0
  7. kodexa_document-7.5.0/kodexa_document/selectors/__init__.py +5 -0
  8. kodexa_document-7.5.0/kodexa_document/selectors/ast.py +677 -0
  9. kodexa_document-7.5.0/kodexa_document/selectors/error.py +29 -0
  10. kodexa_document-7.5.0/kodexa_document/selectors/kodexa-ast-visitor.py +268 -0
  11. kodexa_document-7.5.0/kodexa_document/selectors/parser.py +91 -0
  12. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelector.interp +99 -0
  13. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelector.tokens +56 -0
  14. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorLexer.interp +119 -0
  15. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorLexer.py +204 -0
  16. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorLexer.tokens +56 -0
  17. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorListener.py +570 -0
  18. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorParser.py +3246 -0
  19. kodexa_document-7.5.0/kodexa_document/selectors/resources/KodexaSelectorVisitor.py +323 -0
  20. kodexa_document-7.5.0/kodexa_document/selectors/visitor.py +265 -0
  21. kodexa_document-7.5.0/kodexa_document/steps.py +109 -0
  22. kodexa_document-7.5.0/pyproject.toml +41 -0
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.3
2
+ Name: kodexa-document
3
+ Version: 7.5.0
4
+ Summary: Python SDK for the Kodexa Document Database (KDDB)
5
+ Author: Austin Redenbaugh
6
+ Author-email: austin@kodexa.com
7
+ Requires-Python: >=3.11,<4.0
8
+ Classifier: Development Status :: 5 - Production/Stable
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Requires-Dist: addict (>=2.4.0,<3.0.0)
17
+ Requires-Dist: antlr4-python3-runtime (>=4.13.2,<5.0.0)
18
+ Requires-Dist: deepdiff (>=8.4.2,<9.0.0)
19
+ Requires-Dist: msgpack (>=1.1.0,<2.0.0)
20
+ Requires-Dist: peewee (>=3.18.1,<4.0.0)
21
+ Requires-Dist: pydantic (>=2.11.4,<3.0.0)
22
+ Requires-Dist: pytest (>=8.3.5,<9.0.0)
23
+ Requires-Dist: pyyaml (>=6.0,<7.0)
24
+ Requires-Dist: requests (>=2.32.0,<3.0.0)
25
+ Description-Content-Type: text/markdown
26
+
27
+
File without changes
@@ -0,0 +1,456 @@
1
+ """
2
+ Utilities and base implementation of Connectors that allow a document to access a source native file or
3
+ stream upon which the document is based or derived.
4
+ """
5
+
6
+ import fnmatch
7
+ import inspect
8
+ import io
9
+ import logging
10
+ import mimetypes
11
+ import os
12
+ import tempfile
13
+ import urllib
14
+ from os.path import join
15
+ from typing import Dict, Type
16
+
17
+ from kodexa_document.model import Document, DocumentMetadata, SourceMetadata
18
+
19
+ logger = logging.getLogger()
20
+
21
+
22
def get_caller_dir(depth=3):
    """Return the absolute directory of the file owning a frame on the call stack.

    Index ``0`` of the stack is this function itself, ``1`` is its direct
    caller, ``2`` is that caller's caller, and so on. The default of ``3``
    keeps the frame index that the no-argument form of this helper uses.

    Args:
        depth (int): Index into the call stack of the frame to inspect. When
            the stack is shallower than ``depth`` the outermost frame is used
            instead of raising ``IndexError``.

    Returns:
        str: Absolute path of the directory containing that frame's file.

    Example:
        >>> get_caller_dir(depth=1)  # doctest: +SKIP
        '/path/to/caller/directory'
    """
    # context=0 avoids reading source lines for every frame; only the
    # filename of one frame is needed.
    frames = inspect.stack(context=0)
    try:
        # Clamp the index so a short stack (for example when the module is
        # executed directly) falls back to the outermost frame.
        filepath = frames[min(depth, len(frames) - 1)].filename
    finally:
        # Drop the frame records promptly to avoid reference cycles.
        del frames

    # Frame filenames may be relative; normalise before taking the directory.
    return os.path.dirname(os.path.abspath(filepath))
43
+
44
+
45
class FolderConnector:
    """Iterate over the files of a folder, yielding one Document per match.

    The list of matching files is collected once, when the connector is
    constructed, and then handed out one at a time through the iterator
    protocol.
    """

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "folder"

    def __init__(
        self,
        path,
        file_filter="*",
        recursive=False,
        relative=False,
        # NOTE(review): this default is evaluated once, when the class body
        # runs, not on each call -- confirm it yields the intended directory
        # before relying on relative=True without an explicit caller_path.
        caller_path=get_caller_dir(),
        unpack=False,
    ):
        """Collect the files under ``path`` that match ``file_filter``.

        Args:
            path: Folder to scan; ``~`` is expanded.
            file_filter: ``fnmatch`` pattern applied to each file name.
            recursive: When true, sub-folders are scanned as well.
            relative: When true, ``path`` is joined onto ``caller_path``.
            caller_path: Base directory used when ``relative`` is true.
            unpack: When true, each file is loaded with
                ``Document.from_kdxa`` instead of being wrapped in a new
                Document.

        Raises:
            ValueError: If ``path`` is empty.
        """
        self.path = path
        self.file_filter = file_filter
        self.recursive = recursive
        self.relative = relative
        self.caller_path = caller_path
        self.unpack = unpack

        if not self.path:
            raise ValueError("You must provide a path")

        self.files = self.__get_files__()
        self.index = 0

    @staticmethod
    def get_source(document):
        """Open the file a document refers to.

        Args:
            document: A document whose ``source.original_path`` and
                ``source.original_filename`` together locate the file.

        Returns:
            A binary file object; the caller is responsible for closing it.
        """
        return open(
            join(document.source.original_path, document.source.original_filename), "rb"
        )

    def __iter__(self):
        return self

    def __next__(self):
        """Return the Document for the next collected file."""
        if self.index >= len(self.files):
            raise StopIteration

        file_path = self.files[self.index]
        self.index += 1

        if self.unpack:
            return Document.from_kdxa(file_path)

        document = Document(
            DocumentMetadata(
                {
                    "source_path": file_path,
                    "connector": self.get_name(),
                    # guess_type returns a (type, encoding) tuple, stored as-is.
                    # NOTE(review): confirm consumers expect the tuple and
                    # not just the type string.
                    "mime_type": mimetypes.guess_type(file_path),
                    "connector_options": {
                        "path": self.path,
                        "file_filter": self.file_filter,
                    },
                }
            )
        )
        document.source.original_filename = os.path.basename(file_path)
        # Record the directory the file was actually found in. Using
        # self.path here made get_source() build a wrong path for files in
        # sub-folders (recursive=True), for relative=True, and for paths
        # containing "~".
        document.source.original_path = os.path.dirname(file_path)
        document.source.connector = self.get_name()

        # TODO we need to get the checksum and last_updated and created times
        return document

    def __get_files__(self):
        """Return the full paths of all files matching the filter."""
        base_path = self.path
        if self.relative:
            base_path = os.path.join(self.caller_path, base_path)

        matches = []
        for dir_path, _dir_names, file_names in os.walk(os.path.expanduser(base_path)):
            matches.extend(
                os.path.join(dir_path, file_name)
                for file_name in file_names
                if fnmatch.fnmatch(file_name, self.file_filter)
            )

            # os.walk yields the top folder first, so stopping after one
            # iteration limits the scan to that folder.
            if not self.recursive:
                break

        return matches
136
+
137
+
138
class FileHandleConnector:
    """Yield a single Document that refers to one file identified by its path."""

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "file-handle"

    def __init__(self, original_path: str):
        """Remember the file to expose.

        Args:
            original_path: Path of the file the document is based on.
        """
        self.file = original_path
        self.index = 0
        self.completed = False

    @staticmethod
    def get_source(document: Document):
        """Open the file a document refers to.

        Args:
            document: A document whose ``source.original_path`` is the path
                of the file to open.

        Returns:
            A binary file object; the caller is responsible for closing it.
        """
        return open(document.source.original_path, "rb")

    def __iter__(self):
        return self

    def __next__(self):
        """Return the Document for the file, once; then stop iterating."""
        if self.completed:
            raise StopIteration

        # Mark the connector as exhausted; without this the iterator would
        # hand out the same document forever.
        self.completed = True

        document = Document(
            DocumentMetadata(
                {
                    "source_path": self.file,
                    "connector": self.get_name(),
                    # guess_type returns a (type, encoding) tuple, stored as-is.
                    "mime_type": mimetypes.guess_type(self.file),
                    "connector_options": {"file": self.file},
                }
            )
        )
        # get_source() opens source.original_path, so that field must hold the
        # full path and original_filename only the base name.
        document.source.original_filename = os.path.basename(self.file)
        document.source.original_path = self.file
        document.source.connector = self.get_name()
        return document
186
+
187
+
188
class UrlConnector:
    """Yield a single Document whose content is fetched from a URL."""

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "url"

    def __init__(self, original_path, headers=None):
        """Remember the URL and the request headers to use.

        Args:
            original_path: The URL the document is based on.
            headers: Optional mapping of request headers; defaults to an
                empty dict.
        """
        if headers is None:
            headers = {}
        self.url = original_path
        self.headers = headers
        self.index = 0
        self.completed = False

    @staticmethod
    def get_source(document):
        """Download the content a document refers to.

        Args:
            document: A document whose ``source.original_path`` is the URL
                and whose ``source.headers`` maps header names to values.

        Returns:
            A readable binary file-like object positioned at the start of
            the content: an ``io.BytesIO`` for URLs starting with ``http``,
            an anonymous temporary file otherwise. The caller is responsible
            for closing it.
        """
        url = document.source.original_path
        headers = document.source.headers

        # If we have an http URL then we should use requests, it is much
        # cleaner
        if url.startswith("http"):
            import requests

            response = requests.get(url, headers=headers)
            return io.BytesIO(response.content)

        # "import urllib" alone does not guarantee the request submodule is
        # loaded, so import it explicitly where it is used.
        import shutil
        import urllib.request

        if headers:
            # Use a private opener carrying *all* headers. Assigning inside a
            # loop kept only the last header, and installing the opener
            # globally leaked these headers into unrelated later requests.
            opener = urllib.request.build_opener()
            opener.addheaders = [(name, headers[name]) for name in headers]
            open_url = opener.open
        else:
            open_url = urllib.request.urlopen

        # The kodexa package is optional here; fall back to the default
        # temporary directory when it is not installed.
        try:
            from kodexa import KodexaPlatform

            temp_dir = KodexaPlatform.get_tempdir()
        except ImportError:
            temp_dir = None

        # Stream into an anonymous temporary file and hand back that same
        # handle, rather than re-opening by name a file that is being deleted.
        buffer = tempfile.TemporaryFile(dir=temp_dir)
        try:
            with open_url(url) as response:
                shutil.copyfileobj(response, buffer)
            buffer.seek(0)
        except Exception:
            buffer.close()
            raise
        return buffer

    def __iter__(self):
        return self

    def __next__(self):
        """Return the Document for the URL, once; then stop iterating."""
        if self.completed:
            raise StopIteration

        self.completed = True
        document = Document(
            DocumentMetadata(
                {
                    "connector": self.get_name(),
                    "connector_options": {"url": self.url, "headers": self.headers},
                }
            )
        )
        document.source.connector = self.get_name()
        document.source.original_path = self.url
        document.source.headers = self.headers
        return document
259
+
260
+
261
# Registry of connectors, keyed by the name each one reports from get_name().
registered_connectors: Dict[str, Type] = {}
263
+
264
+
265
def get_connectors():
    """
    Return the names of all registered connectors.

    Returns:
        The keys view of the connector registry (one entry per registered
        connector name).
    """
    connector_names = registered_connectors.keys()
    return connector_names
273
+
274
+
275
def get_connector(connector: str, source: SourceMetadata):
    """Look up a registered connector by name.

    Args:
        connector (str): The name of the connector.
        source (SourceMetadata): The metadata of the source. It is accepted
            for interface compatibility but not used by the lookup.

    Returns:
        The connector registered under the given name.

    Raises:
        Exception: If no connector is registered under that name.

    Example:
        >>> get_connector('folder', source_metadata)  # doctest: +SKIP
        <class 'FolderConnector'>
    """
    if connector in registered_connectors:
        logger.info(f"Getting registered connector {connector}")
        return registered_connectors[connector]

    # Log through the module's logger, like the success path does; the
    # module-level logging.error() would also configure the root logger as a
    # side effect when it has no handlers.
    logger.error(f"Unable to find connector {connector}")
    raise Exception(f"Unable to find connector {connector}")
298
+
299
+
300
def add_connector(connector):
    """Register a connector under the name it reports.

    An existing registration with the same name is replaced.

    Args:
        connector: The connector to register; it must provide ``get_name()``.

    Returns:
        None
    """
    connector_name = connector.get_name()
    registered_connectors[connector_name] = connector
313
+
314
+
315
def get_source(document):
    """
    Fetch the source content of a document through its registered connector.

    The connector is looked up by ``document.source.connector`` and its own
    ``get_source`` is asked for the content.

    Args:
        document (Document): The document whose source should be retrieved.

    Returns:
        Whatever the connector's ``get_source`` returns for the document.

    Raises:
        Exception: If no connector is registered under the document's
            connector name.

    Example:
        >>> source = get_source(document)  # doctest: +SKIP
    """
    source_metadata = document.source
    resolved = get_connector(source_metadata.connector, source_metadata)
    return resolved.get_source(document)
336
+
337
+
338
class DocumentStoreConnector:
    """
    Connector that reads a document's native content from a document store.

    Methods:
    - get_name: Get the name this connector is registered under.
    - get_source: Get the source of a document from the document store.
    """

    @staticmethod
    def get_name():
        """Return the name this connector is registered under."""
        return "document-store"

    @staticmethod
    def get_source(document):
        """
        Get the source of a document from the document store.

        The store is resolved from ``document.source.headers["ref"]`` and the
        document family from ``document.source.headers["family"]``.

        Args:
            document (object): The document object.

        Returns:
            io.BytesIO: The source of the document as a BytesIO object.

        Raises:
            Exception: If the source of the document cannot be retrieved.
        """
        # The kodexa imports are kept local so this module can be imported
        # without that package being installed.
        from kodexa import KodexaClient
        from kodexa.platform.client import DocumentFamilyEndpoint
        from kodexa.platform.client import DocumentStoreEndpoint

        headers = document.source.headers

        client = KodexaClient()
        document_store: DocumentStoreEndpoint = client.get_object_by_ref(
            "store", headers["ref"]
        )
        family: DocumentFamilyEndpoint = document_store.get_family(headers["family"])

        document_bytes = family.get_native()
        if document_bytes is None:
            # Only "ref" and "family" are known to be present; fall back to
            # the family id so a missing "id" header cannot mask this error
            # with a KeyError.
            try:
                document_id = headers["id"]
            except KeyError:
                document_id = headers["family"]
            raise Exception(
                f"Unable to get source, document with id {document_id} is missing?"
            )

        return io.BytesIO(document_bytes)
388
+
389
+
390
# Register the built-in connectors, in this order.
for _connector_class in (
    DocumentStoreConnector,
    FolderConnector,
    FileHandleConnector,
    UrlConnector,
):
    add_connector(_connector_class)
del _connector_class
395
+
396
+
397
+
398
class SessionConnector:
    """
    Connector that resolves document content through an event helper.

    Attributes
    ----------
    event_helper : object
        Helper used to fetch content objects; ``None`` until it is set on
        the class.

    Methods
    -------
    get_name():
        Returns the name of the connector.

    get_source(document):
        Returns the content object for the document source.
    """

    # Shared by all users of the class; must be assigned before get_source()
    # can be used.
    event_helper = None

    @classmethod
    def get_name(cls):
        """
        Return the name of this connector.

        Returns
        -------
        str
            The connector name, ``"cloud-content"``.
        """
        return "cloud-content"

    @classmethod
    def get_source(cls, document):
        """
        Fetch the content object a document's source points at.

        Parameters
        ----------
        document : object
            The document whose ``source.original_path`` identifies the
            content object.

        Raises
        ------
        Exception
            If the event_helper is not set.

        Returns
        -------
        object
            Whatever the event helper's ``get_content_object`` returns.
        """
        helper = cls.event_helper
        if helper is None:
            raise Exception("The event_helper needs to be set to use this connector")

        content_path = document.source.original_path
        logger.info(f"Getting content object {content_path}")
        return helper.get_content_object(content_path)