kodexa-document 8.0.1.post1746984682__tar.gz → 8.0.1.post1746986432__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodexa-document might be problematic. Click here for more details.

Files changed (24)
  1. kodexa_document-8.0.1.post1746986432/PKG-INFO +353 -0
  2. kodexa_document-8.0.1.post1746986432/README.md +326 -0
  3. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/pyproject.toml +1 -1
  4. kodexa_document-8.0.1.post1746984682/PKG-INFO +0 -27
  5. kodexa_document-8.0.1.post1746984682/README.md +0 -0
  6. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/connectors.py +0 -0
  7. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/model.py +0 -0
  8. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/persistence.py +0 -0
  9. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/persistence_models.py +0 -0
  10. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/__init__.py +0 -0
  11. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/ast.py +0 -0
  12. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/error.py +0 -0
  13. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/kodexa-ast-visitor.py +0 -0
  14. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/parser.py +0 -0
  15. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelector.interp +0 -0
  16. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelector.tokens +0 -0
  17. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelectorLexer.interp +0 -0
  18. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelectorLexer.py +0 -0
  19. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelectorLexer.tokens +0 -0
  20. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelectorListener.py +0 -0
  21. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelectorParser.py +0 -0
  22. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/resources/KodexaSelectorVisitor.py +0 -0
  23. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/selectors/visitor.py +0 -0
  24. {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746986432}/kodexa_document/steps.py +0 -0
@@ -0,0 +1,353 @@
1
+ Metadata-Version: 2.3
2
+ Name: kodexa-document
3
+ Version: 8.0.1.post1746986432
4
+ Summary: Python SDK for the Kodexa Document Database (KDDB)
5
+ Author: Austin Redenbaugh
6
+ Author-email: austin@kodexa.com
7
+ Requires-Python: >=3.11,<4.0
8
+ Classifier: Development Status :: 5 - Production/Stable
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Requires-Dist: addict (>=2.4.0,<3.0.0)
17
+ Requires-Dist: antlr4-python3-runtime (>=4.13.2,<5.0.0)
18
+ Requires-Dist: deepdiff (>=8.4.2,<9.0.0)
19
+ Requires-Dist: msgpack (>=1.1.0,<2.0.0)
20
+ Requires-Dist: peewee (>=3.18.1,<4.0.0)
21
+ Requires-Dist: pydantic (>=2.11.4,<3.0.0)
22
+ Requires-Dist: pytest (>=8.3.5,<9.0.0)
23
+ Requires-Dist: pyyaml (>=6.0,<7.0)
24
+ Requires-Dist: requests (>=2.32.0,<3.0.0)
25
+ Description-Content-Type: text/markdown
26
+
27
+ # Kodexa Document Python SDK
28
+
29
+ The Kodexa Document Python SDK provides a powerful framework for working with structured documents in the Kodexa Document Database (KDDB) format. This library enables developers to create, load, manipulate, and query documents with a hierarchical node structure.
30
+
31
+ ## Overview
32
+
33
+ The Kodexa Document Python SDK is designed to provide a robust document object model with persistence capabilities. At its core is the `Document` class, which represents a document as a hierarchical tree of content nodes. The SDK uses SQLite as its underlying storage mechanism through the KDDB (Kodexa Document Database) format.
34
+
35
+ Key features include:
36
+
37
+ - Document creation and manipulation through a hierarchical node structure
38
+ - Persistence to and from KDDB files
39
+ - Powerful selector language (similar to XPath) for querying document content
40
+ - Feature and tag management for document nodes
41
+ - Support for document metadata and source tracking
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install kodexa-document
47
+ ```
48
+
49
+ Or using Poetry:
50
+
51
+ ```bash
52
+ poetry add kodexa-document
53
+ ```
54
+
55
+ ## Working with KDDB Files
56
+
57
+ KDDB (Kodexa Document Database) is the default storage format for Kodexa documents. It provides high-performance storage and the ability to handle very large document objects. KDDB files are SQLite databases with a specific schema designed for efficient document storage and retrieval.
58
+
59
+ ### Reading KDDB Files
60
+
61
+ To read a document from a KDDB file:
62
+
63
+ ```python
64
+ from kodexa_document import Document
65
+
66
+ # Load a document from a KDDB file
67
+ document = Document.from_kddb("path/to/document.kddb")
68
+
69
+ # Access the document content
70
+ root_node = document.get_root()
71
+ print(root_node.get_all_content())
72
+
73
+ # Query the document using selectors
74
+ paragraphs = document.select("//paragraph")
75
+ for paragraph in paragraphs:
76
+     print(paragraph.content)
77
+ ```
78
+
79
+ You can also load a KDDB file from a bytes object:
80
+
81
+ ```python
82
+ # Load from bytes
83
+ with open("path/to/document.kddb", "rb") as f:
84
+     kddb_bytes = f.read()
85
+
86
+ document = Document.from_kddb(kddb_bytes)
87
+ ```
88
+
89
+ The `from_kddb` method accepts the following parameters:
90
+
91
+ - `source` (str or bytes): Path to a KDDB file or bytes containing KDDB data
92
+ - `detached` (bool, optional): Whether to load the document in detached mode (default: True)
93
+ - `inmemory` (bool, optional): Whether to load the document in memory for faster processing (default: False)
94
+
95
+ ### Writing KDDB Files
96
+
97
+ To save a document to a KDDB file:
98
+
99
+ ```python
100
+ # Save to a file
101
+ document.to_kddb("path/to/output.kddb")
102
+
103
+ # Or get the KDDB as bytes
104
+ kddb_bytes = document.to_kddb()
105
+ ```
106
+
107
+ The `to_kddb` method accepts an optional `path` parameter. If provided, the document will be written to the specified file. If not provided, the method will return a bytes object containing the KDDB data.
108
+
109
+ ### In-Memory Processing
110
+
111
+ For faster processing of documents, you can use the `inmemory` parameter when loading a KDDB file:
112
+
113
+ ```python
114
+ # Load document in memory for faster processing
115
+ document = Document.from_kddb("path/to/document.kddb", inmemory=True)
116
+
117
+ # Process the document...
118
+
119
+ # Save the document back to disk
120
+ document.to_kddb("path/to/output.kddb")
121
+ ```
122
+
123
+ ## Creating Documents
124
+
125
+ You can create new documents from scratch or from text:
126
+
127
+ ```python
128
+ # Create a new empty document
129
+ from kodexa_document import Document
130
+ document = Document()
131
+
132
+ # Create a root node
133
+ root_node = document.create_node(node_type="document")
134
+ document.content_node = root_node
135
+
136
+ # Add child nodes
137
+ paragraph = document.create_node(node_type="paragraph", content="This is a paragraph.")
138
+ root_node.add_child(paragraph)
139
+
140
+ # Create a document from text
141
+ text_document = Document.from_text("This is a sample document.")
142
+ print(text_document.get_root().content)
143
+
144
+ # Create a document with separated content
145
+ separated_document = Document.from_text("This is a sample document.", separator=" ")
146
+ # This creates a document with each word as a separate child node
147
+ print(len(separated_document.get_root().get_children())) # Outputs: 5
148
+ ```
149
+
150
+ ## Document Structure
151
+
152
+ A Kodexa document consists of:
153
+
154
+ - A root content node (`document.content_node` or `document.get_root()`)
155
+ - A hierarchical structure of content nodes
156
+ - Metadata about the document (`document.metadata`)
157
+ - Source information (`document.source`)
158
+
159
+ Each content node can have:
160
+
161
+ - Content (text)
162
+ - Features (metadata attached to nodes)
163
+ - Child nodes
164
+ - Tags (special features for marking up content)
165
+
166
+ ## Working with Content Nodes
167
+
168
+ Content nodes form the hierarchical structure of a document:
169
+
170
+ ```python
171
+ # Access the root node
172
+ root = document.get_root()
173
+
174
+ # Get all children
175
+ children = root.get_children()
176
+
177
+ # Access content
178
+ content = root.content
179
+
180
+ # Get all content (including from children)
181
+ all_content = root.get_all_content()
182
+
183
+ # Add a child node
184
+ new_node = document.create_node(node_type="paragraph", content="New paragraph")
185
+ root.add_child(new_node)
186
+ ```
187
+
188
+ ## Using Selectors
189
+
190
+ The SDK provides a powerful selector language (similar to XPath) for querying document content:
191
+
192
+ ```python
193
+ # Select all paragraph nodes
194
+ paragraphs = document.select("//paragraph")
195
+
196
+ # Select nodes with specific content
197
+ important_nodes = document.select("//paragraph[contains(., 'important')]")
198
+
199
+ # Select the first matching node
200
+ first_table = document.select_first("//table")
201
+
202
+ # Select nodes with specific features
203
+ tagged_nodes = document.select("//*[hasFeature('tag', 'highlight')]")
204
+ ```
205
+
206
+ ## Working with Features and Tags
207
+
208
+ Features are metadata attached to content nodes:
209
+
210
+ ```python
211
+ # Add a feature to a node
212
+ node.add_feature("category", "section", "introduction")
213
+
214
+ # Check if a node has a feature
215
+ if node.has_feature("category", "section"):
216
+     pass  # Do something
217
+
218
+ # Get feature value
219
+ category = node.get_feature_value("category", "section")
220
+ ```
221
+
222
+ Tags are special features used for marking up content:
223
+
224
+ ```python
225
+ # Tag a node
226
+ node.tag("highlight", tag_uuid="unique-id-123")
227
+
228
+ # Tag content matching a pattern
229
+ node.tag("person", content_re="John|Jane")
230
+
231
+ # Check if a node has tags
232
+ if node.has_tags():
233
+     pass  # Do something
234
+
235
+ # Get all tags on a node
236
+ tags = node.get_tags()
237
+ ```
238
+
239
+ ## Persistence Layer
240
+
241
+ The SDK uses SQLite as its persistence layer through the `SqliteDocumentPersistence` class. This class handles all database operations for storing and retrieving documents, nodes, and features.
242
+
243
+ The persistence layer is automatically created when you create a document or load a document from a KDDB file. You can access it through the `get_persistence()` method:
244
+
245
+ ```python
246
+ # Access the persistence layer
247
+ persistence = document.get_persistence()
248
+
249
+ # Close the document and clean up resources
250
+ document.close()
251
+ ```
252
+
253
+ ## Converting Between Formats
254
+
255
+ The SDK supports converting between different formats:
256
+
257
+ ```python
258
+ # Convert to/from JSON
259
+ json_str = document.to_json()
260
+ json_document = Document.from_json(json_str)
261
+
262
+ # Convert to/from dictionary
263
+ doc_dict = document.to_dict()
264
+ dict_document = Document.from_dict(doc_dict)
265
+
266
+ # Convert to/from MessagePack (KDXA format)
267
+ msgpack_bytes = document.to_msgpack()
268
+ msgpack_document = Document.from_msgpack(msgpack_bytes)
269
+
270
+ # Save to KDXA file
271
+ document.to_kdxa("document.kdxa")
272
+
273
+ # Load from KDXA file
274
+ kdxa_document = Document.from_kdxa("document.kdxa")
275
+ ```
276
+
277
+ ## Example: Processing a KDDB File
278
+
279
+ Here's a complete example of loading a KDDB file, processing its content, and saving it back:
280
+
281
+ ```python
282
+ from kodexa_document import Document
283
+
284
+ # Load a document from a KDDB file
285
+ document = Document.from_kddb("input.kddb")
286
+
287
+ # Get the root node
288
+ root = document.get_root()
289
+
290
+ # Select all paragraph nodes
291
+ paragraphs = root.select("//paragraph")
292
+
293
+ # Tag paragraphs containing specific text
294
+ for paragraph in paragraphs:
295
+     if "important" in paragraph.get_all_content():
296
+         paragraph.tag("important")
297
+
298
+ # Add a document-level label
299
+ document.add_label("processed")
300
+
301
+ # Save the modified document
302
+ document.to_kddb("output.kddb")
303
+
304
+ # Clean up resources
305
+ document.close()
306
+ ```
307
+
308
+ ## Advanced Features
309
+
310
+ ### External Data
311
+
312
+ You can store arbitrary data with a document:
313
+
314
+ ```python
315
+ # Store external data
316
+ document.set_external_data({"key": "value"})
317
+
318
+ # Store external data with a specific key
319
+ document.set_external_data({"status": "processed"}, "metadata")
320
+
321
+ # Retrieve external data
322
+ data = document.get_external_data()
323
+ metadata = document.get_external_data("metadata")
324
+
325
+ # Get all external data keys
326
+ keys = document.get_external_data_keys()
327
+ ```
328
+
329
+ ### Processing Steps
330
+
331
+ You can track processing steps applied to a document:
332
+
333
+ ```python
334
+ from kodexa_document.model import ProcessingStep
335
+
336
+ # Create processing steps
337
+ step1 = ProcessingStep(name="Extract Text")
338
+ step2 = ProcessingStep(name="Tag Entities")
339
+
340
+ # Add child steps
341
+ step1.add_child(step2)
342
+
343
+ # Set steps on the document
344
+ document.set_steps([step1, step2])
345
+
346
+ # Retrieve steps
347
+ steps = document.get_steps()
348
+ ```
349
+
350
+ ## License
351
+
352
+ This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
353
+
@@ -0,0 +1,326 @@
1
+ # Kodexa Document Python SDK
2
+
3
+ The Kodexa Document Python SDK provides a powerful framework for working with structured documents in the Kodexa Document Database (KDDB) format. This library enables developers to create, load, manipulate, and query documents with a hierarchical node structure.
4
+
5
+ ## Overview
6
+
7
+ The Kodexa Document Python SDK is designed to provide a robust document object model with persistence capabilities. At its core is the `Document` class, which represents a document as a hierarchical tree of content nodes. The SDK uses SQLite as its underlying storage mechanism through the KDDB (Kodexa Document Database) format.
8
+
9
+ Key features include:
10
+
11
+ - Document creation and manipulation through a hierarchical node structure
12
+ - Persistence to and from KDDB files
13
+ - Powerful selector language (similar to XPath) for querying document content
14
+ - Feature and tag management for document nodes
15
+ - Support for document metadata and source tracking
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install kodexa-document
21
+ ```
22
+
23
+ Or using Poetry:
24
+
25
+ ```bash
26
+ poetry add kodexa-document
27
+ ```
28
+
29
+ ## Working with KDDB Files
30
+
31
+ KDDB (Kodexa Document Database) is the default storage format for Kodexa documents. It provides high-performance storage and the ability to handle very large document objects. KDDB files are SQLite databases with a specific schema designed for efficient document storage and retrieval.
32
+
33
+ ### Reading KDDB Files
34
+
35
+ To read a document from a KDDB file:
36
+
37
+ ```python
38
+ from kodexa_document import Document
39
+
40
+ # Load a document from a KDDB file
41
+ document = Document.from_kddb("path/to/document.kddb")
42
+
43
+ # Access the document content
44
+ root_node = document.get_root()
45
+ print(root_node.get_all_content())
46
+
47
+ # Query the document using selectors
48
+ paragraphs = document.select("//paragraph")
49
+ for paragraph in paragraphs:
50
+     print(paragraph.content)
51
+ ```
52
+
53
+ You can also load a KDDB file from a bytes object:
54
+
55
+ ```python
56
+ # Load from bytes
57
+ with open("path/to/document.kddb", "rb") as f:
58
+     kddb_bytes = f.read()
59
+
60
+ document = Document.from_kddb(kddb_bytes)
61
+ ```
62
+
63
+ The `from_kddb` method accepts the following parameters:
64
+
65
+ - `source` (str or bytes): Path to a KDDB file or bytes containing KDDB data
66
+ - `detached` (bool, optional): Whether to load the document in detached mode (default: True)
67
+ - `inmemory` (bool, optional): Whether to load the document in memory for faster processing (default: False)
68
+
69
+ ### Writing KDDB Files
70
+
71
+ To save a document to a KDDB file:
72
+
73
+ ```python
74
+ # Save to a file
75
+ document.to_kddb("path/to/output.kddb")
76
+
77
+ # Or get the KDDB as bytes
78
+ kddb_bytes = document.to_kddb()
79
+ ```
80
+
81
+ The `to_kddb` method accepts an optional `path` parameter. If provided, the document will be written to the specified file. If not provided, the method will return a bytes object containing the KDDB data.
82
+
83
+ ### In-Memory Processing
84
+
85
+ For faster processing of documents, you can use the `inmemory` parameter when loading a KDDB file:
86
+
87
+ ```python
88
+ # Load document in memory for faster processing
89
+ document = Document.from_kddb("path/to/document.kddb", inmemory=True)
90
+
91
+ # Process the document...
92
+
93
+ # Save the document back to disk
94
+ document.to_kddb("path/to/output.kddb")
95
+ ```
96
+
97
+ ## Creating Documents
98
+
99
+ You can create new documents from scratch or from text:
100
+
101
+ ```python
102
+ # Create a new empty document
103
+ from kodexa_document import Document
104
+ document = Document()
105
+
106
+ # Create a root node
107
+ root_node = document.create_node(node_type="document")
108
+ document.content_node = root_node
109
+
110
+ # Add child nodes
111
+ paragraph = document.create_node(node_type="paragraph", content="This is a paragraph.")
112
+ root_node.add_child(paragraph)
113
+
114
+ # Create a document from text
115
+ text_document = Document.from_text("This is a sample document.")
116
+ print(text_document.get_root().content)
117
+
118
+ # Create a document with separated content
119
+ separated_document = Document.from_text("This is a sample document.", separator=" ")
120
+ # This creates a document with each word as a separate child node
121
+ print(len(separated_document.get_root().get_children())) # Outputs: 5
122
+ ```
123
+
124
+ ## Document Structure
125
+
126
+ A Kodexa document consists of:
127
+
128
+ - A root content node (`document.content_node` or `document.get_root()`)
129
+ - A hierarchical structure of content nodes
130
+ - Metadata about the document (`document.metadata`)
131
+ - Source information (`document.source`)
132
+
133
+ Each content node can have:
134
+
135
+ - Content (text)
136
+ - Features (metadata attached to nodes)
137
+ - Child nodes
138
+ - Tags (special features for marking up content)
139
+
140
+ ## Working with Content Nodes
141
+
142
+ Content nodes form the hierarchical structure of a document:
143
+
144
+ ```python
145
+ # Access the root node
146
+ root = document.get_root()
147
+
148
+ # Get all children
149
+ children = root.get_children()
150
+
151
+ # Access content
152
+ content = root.content
153
+
154
+ # Get all content (including from children)
155
+ all_content = root.get_all_content()
156
+
157
+ # Add a child node
158
+ new_node = document.create_node(node_type="paragraph", content="New paragraph")
159
+ root.add_child(new_node)
160
+ ```
161
+
162
+ ## Using Selectors
163
+
164
+ The SDK provides a powerful selector language (similar to XPath) for querying document content:
165
+
166
+ ```python
167
+ # Select all paragraph nodes
168
+ paragraphs = document.select("//paragraph")
169
+
170
+ # Select nodes with specific content
171
+ important_nodes = document.select("//paragraph[contains(., 'important')]")
172
+
173
+ # Select the first matching node
174
+ first_table = document.select_first("//table")
175
+
176
+ # Select nodes with specific features
177
+ tagged_nodes = document.select("//*[hasFeature('tag', 'highlight')]")
178
+ ```
179
+
180
+ ## Working with Features and Tags
181
+
182
+ Features are metadata attached to content nodes:
183
+
184
+ ```python
185
+ # Add a feature to a node
186
+ node.add_feature("category", "section", "introduction")
187
+
188
+ # Check if a node has a feature
189
+ if node.has_feature("category", "section"):
190
+     pass  # Do something
191
+
192
+ # Get feature value
193
+ category = node.get_feature_value("category", "section")
194
+ ```
195
+
196
+ Tags are special features used for marking up content:
197
+
198
+ ```python
199
+ # Tag a node
200
+ node.tag("highlight", tag_uuid="unique-id-123")
201
+
202
+ # Tag content matching a pattern
203
+ node.tag("person", content_re="John|Jane")
204
+
205
+ # Check if a node has tags
206
+ if node.has_tags():
207
+     pass  # Do something
208
+
209
+ # Get all tags on a node
210
+ tags = node.get_tags()
211
+ ```
212
+
213
+ ## Persistence Layer
214
+
215
+ The SDK uses SQLite as its persistence layer through the `SqliteDocumentPersistence` class. This class handles all database operations for storing and retrieving documents, nodes, and features.
216
+
217
+ The persistence layer is automatically created when you create a document or load a document from a KDDB file. You can access it through the `get_persistence()` method:
218
+
219
+ ```python
220
+ # Access the persistence layer
221
+ persistence = document.get_persistence()
222
+
223
+ # Close the document and clean up resources
224
+ document.close()
225
+ ```
226
+
227
+ ## Converting Between Formats
228
+
229
+ The SDK supports converting between different formats:
230
+
231
+ ```python
232
+ # Convert to/from JSON
233
+ json_str = document.to_json()
234
+ json_document = Document.from_json(json_str)
235
+
236
+ # Convert to/from dictionary
237
+ doc_dict = document.to_dict()
238
+ dict_document = Document.from_dict(doc_dict)
239
+
240
+ # Convert to/from MessagePack (KDXA format)
241
+ msgpack_bytes = document.to_msgpack()
242
+ msgpack_document = Document.from_msgpack(msgpack_bytes)
243
+
244
+ # Save to KDXA file
245
+ document.to_kdxa("document.kdxa")
246
+
247
+ # Load from KDXA file
248
+ kdxa_document = Document.from_kdxa("document.kdxa")
249
+ ```
250
+
251
+ ## Example: Processing a KDDB File
252
+
253
+ Here's a complete example of loading a KDDB file, processing its content, and saving it back:
254
+
255
+ ```python
256
+ from kodexa_document import Document
257
+
258
+ # Load a document from a KDDB file
259
+ document = Document.from_kddb("input.kddb")
260
+
261
+ # Get the root node
262
+ root = document.get_root()
263
+
264
+ # Select all paragraph nodes
265
+ paragraphs = root.select("//paragraph")
266
+
267
+ # Tag paragraphs containing specific text
268
+ for paragraph in paragraphs:
269
+     if "important" in paragraph.get_all_content():
270
+         paragraph.tag("important")
271
+
272
+ # Add a document-level label
273
+ document.add_label("processed")
274
+
275
+ # Save the modified document
276
+ document.to_kddb("output.kddb")
277
+
278
+ # Clean up resources
279
+ document.close()
280
+ ```
281
+
282
+ ## Advanced Features
283
+
284
+ ### External Data
285
+
286
+ You can store arbitrary data with a document:
287
+
288
+ ```python
289
+ # Store external data
290
+ document.set_external_data({"key": "value"})
291
+
292
+ # Store external data with a specific key
293
+ document.set_external_data({"status": "processed"}, "metadata")
294
+
295
+ # Retrieve external data
296
+ data = document.get_external_data()
297
+ metadata = document.get_external_data("metadata")
298
+
299
+ # Get all external data keys
300
+ keys = document.get_external_data_keys()
301
+ ```
302
+
303
+ ### Processing Steps
304
+
305
+ You can track processing steps applied to a document:
306
+
307
+ ```python
308
+ from kodexa_document.model import ProcessingStep
309
+
310
+ # Create processing steps
311
+ step1 = ProcessingStep(name="Extract Text")
312
+ step2 = ProcessingStep(name="Tag Entities")
313
+
314
+ # Add child steps
315
+ step1.add_child(step2)
316
+
317
+ # Set steps on the document
318
+ document.set_steps([step1, step2])
319
+
320
+ # Retrieve steps
321
+ steps = document.get_steps()
322
+ ```
323
+
324
+ ## License
325
+
326
+ This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "kodexa-document"
3
- version = "8.0.1-1746984682"
3
+ version = "8.0.1-1746986432"
4
4
  description = "Python SDK for the Kodexa Document Database (KDDB)"
5
5
  authors = ["Austin Redenbaugh <austin@kodexa.com>", "Philip Dodds <philip@kodexa.com>", "Romar Cablao <rcablao@kodexa.com>", "Amadea Paula Dodds <amadeapaula@kodexa.com>"]
6
6
  readme = "README.md"
@@ -1,27 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: kodexa-document
3
- Version: 8.0.1.post1746984682
4
- Summary: Python SDK for the Kodexa Document Database (KDDB)
5
- Author: Austin Redenbaugh
6
- Author-email: austin@kodexa.com
7
- Requires-Python: >=3.11,<4.0
8
- Classifier: Development Status :: 5 - Production/Stable
9
- Classifier: Intended Audience :: Developers
10
- Classifier: License :: OSI Approved :: Apache Software License
11
- Classifier: Programming Language :: Python :: 3
12
- Classifier: Programming Language :: Python :: 3.11
13
- Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
15
- Classifier: Topic :: Software Development :: Libraries
16
- Requires-Dist: addict (>=2.4.0,<3.0.0)
17
- Requires-Dist: antlr4-python3-runtime (>=4.13.2,<5.0.0)
18
- Requires-Dist: deepdiff (>=8.4.2,<9.0.0)
19
- Requires-Dist: msgpack (>=1.1.0,<2.0.0)
20
- Requires-Dist: peewee (>=3.18.1,<4.0.0)
21
- Requires-Dist: pydantic (>=2.11.4,<3.0.0)
22
- Requires-Dist: pytest (>=8.3.5,<9.0.0)
23
- Requires-Dist: pyyaml (>=6.0,<7.0)
24
- Requires-Dist: requests (>=2.32.0,<3.0.0)
25
- Description-Content-Type: text/markdown
26
-
27
-
File without changes