kodexa-document 8.0.1.post1746984682__tar.gz → 8.0.1.post1746985996__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodexa-document might be problematic. Click here for more details.
- kodexa_document-8.0.1.post1746985996/PKG-INFO +353 -0
- kodexa_document-8.0.1.post1746985996/README.md +326 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/pyproject.toml +1 -1
- kodexa_document-8.0.1.post1746984682/PKG-INFO +0 -27
- kodexa_document-8.0.1.post1746984682/README.md +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/connectors.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/model.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/persistence.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/persistence_models.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/__init__.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/ast.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/error.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/kodexa-ast-visitor.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/parser.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelector.interp +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelector.tokens +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelectorLexer.interp +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelectorLexer.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelectorLexer.tokens +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelectorListener.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelectorParser.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/resources/KodexaSelectorVisitor.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/selectors/visitor.py +0 -0
- {kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/kodexa_document/steps.py +0 -0
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: kodexa-document
|
|
3
|
+
Version: 8.0.1.post1746985996
|
|
4
|
+
Summary: Python SDK for the Kodexa Document Database (KDDB)
|
|
5
|
+
Author: Austin Redenbaugh
|
|
6
|
+
Author-email: austin@kodexa.com
|
|
7
|
+
Requires-Python: >=3.11,<4.0
|
|
8
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
+
Requires-Dist: addict (>=2.4.0,<3.0.0)
|
|
17
|
+
Requires-Dist: antlr4-python3-runtime (>=4.13.2,<5.0.0)
|
|
18
|
+
Requires-Dist: deepdiff (>=8.4.2,<9.0.0)
|
|
19
|
+
Requires-Dist: msgpack (>=1.1.0,<2.0.0)
|
|
20
|
+
Requires-Dist: peewee (>=3.18.1,<4.0.0)
|
|
21
|
+
Requires-Dist: pydantic (>=2.11.4,<3.0.0)
|
|
22
|
+
Requires-Dist: pytest (>=8.3.5,<9.0.0)
|
|
23
|
+
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
24
|
+
Requires-Dist: requests (>=2.32.0,<3.0.0)
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# Kodexa Document Python SDK
|
|
28
|
+
|
|
29
|
+
The Kodexa Document Python SDK provides a powerful framework for working with structured documents in the Kodexa Document Database (KDDB) format. This library enables developers to create, load, manipulate, and query documents with a hierarchical node structure.
|
|
30
|
+
|
|
31
|
+
## Overview
|
|
32
|
+
|
|
33
|
+
The Kodexa Document Python SDK is designed to provide a robust document object model with persistence capabilities. At its core is the `Document` class, which represents a document as a hierarchical tree of content nodes. The SDK uses SQLite as its underlying storage mechanism through the KDDB (Kodexa Document Database) format.
|
|
34
|
+
|
|
35
|
+
Key features include:
|
|
36
|
+
|
|
37
|
+
- Document creation and manipulation through a hierarchical node structure
|
|
38
|
+
- Persistence to and from KDDB files
|
|
39
|
+
- Powerful selector language (similar to XPath) for querying document content
|
|
40
|
+
- Feature and tag management for document nodes
|
|
41
|
+
- Support for document metadata and source tracking
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install kodexa-document
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Or using Poetry:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
poetry add kodexa-document
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Working with KDDB Files
|
|
56
|
+
|
|
57
|
+
KDDB (Kodexa Document Database) is the default storage format for Kodexa documents. It provides high-performance storage and the ability to handle very large document objects. KDDB files are SQLite databases with a specific schema designed for efficient document storage and retrieval.
|
|
58
|
+
|
|
59
|
+
### Reading KDDB Files
|
|
60
|
+
|
|
61
|
+
To read a document from a KDDB file:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from kodexa_document import Document
|
|
65
|
+
|
|
66
|
+
# Load a document from a KDDB file
|
|
67
|
+
document = Document.from_kddb("path/to/document.kddb")
|
|
68
|
+
|
|
69
|
+
# Access the document content
|
|
70
|
+
root_node = document.get_root()
|
|
71
|
+
print(root_node.get_all_content())
|
|
72
|
+
|
|
73
|
+
# Query the document using selectors
|
|
74
|
+
paragraphs = document.select("//paragraph")
|
|
75
|
+
for paragraph in paragraphs:
|
|
76
|
+
    print(paragraph.content)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
You can also load a KDDB file from a bytes object:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
# Load from bytes
|
|
83
|
+
with open("path/to/document.kddb", "rb") as f:
|
|
84
|
+
    kddb_bytes = f.read()
|
|
85
|
+
|
|
86
|
+
document = Document.from_kddb(kddb_bytes)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
The `from_kddb` method accepts the following parameters:
|
|
90
|
+
|
|
91
|
+
- `source` (str or bytes): Path to a KDDB file or bytes containing KDDB data
|
|
92
|
+
- `detached` (bool, optional): Whether to load the document in detached mode (default: True)
|
|
93
|
+
- `inmemory` (bool, optional): Whether to load the document in memory for faster processing (default: False)
|
|
94
|
+
|
|
95
|
+
### Writing KDDB Files
|
|
96
|
+
|
|
97
|
+
To save a document to a KDDB file:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# Save to a file
|
|
101
|
+
document.to_kddb("path/to/output.kddb")
|
|
102
|
+
|
|
103
|
+
# Or get the KDDB as bytes
|
|
104
|
+
kddb_bytes = document.to_kddb()
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The `to_kddb` method accepts an optional `path` parameter. If provided, the document will be written to the specified file. If not provided, the method will return a bytes object containing the KDDB data.
|
|
108
|
+
|
|
109
|
+
### In-Memory Processing
|
|
110
|
+
|
|
111
|
+
For faster processing of documents, you can use the `inmemory` parameter when loading a KDDB file:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
# Load document in memory for faster processing
|
|
115
|
+
document = Document.from_kddb("path/to/document.kddb", inmemory=True)
|
|
116
|
+
|
|
117
|
+
# Process the document...
|
|
118
|
+
|
|
119
|
+
# Save the document back to disk
|
|
120
|
+
document.to_kddb("path/to/output.kddb")
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Creating Documents
|
|
124
|
+
|
|
125
|
+
You can create new documents from scratch or from text:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
# Create a new empty document
|
|
129
|
+
from kodexa_document import Document
|
|
130
|
+
document = Document()
|
|
131
|
+
|
|
132
|
+
# Create a root node
|
|
133
|
+
root_node = document.create_node(node_type="document")
|
|
134
|
+
document.content_node = root_node
|
|
135
|
+
|
|
136
|
+
# Add child nodes
|
|
137
|
+
paragraph = document.create_node(node_type="paragraph", content="This is a paragraph.")
|
|
138
|
+
root_node.add_child(paragraph)
|
|
139
|
+
|
|
140
|
+
# Create a document from text
|
|
141
|
+
text_document = Document.from_text("This is a sample document.")
|
|
142
|
+
print(text_document.get_root().content)
|
|
143
|
+
|
|
144
|
+
# Create a document with separated content
|
|
145
|
+
separated_document = Document.from_text("This is a sample document.", separator=" ")
|
|
146
|
+
# This creates a document with each word as a separate child node
|
|
147
|
+
print(len(separated_document.get_root().get_children())) # Outputs: 5
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Document Structure
|
|
151
|
+
|
|
152
|
+
A Kodexa document consists of:
|
|
153
|
+
|
|
154
|
+
- A root content node (`document.content_node` or `document.get_root()`)
|
|
155
|
+
- A hierarchical structure of content nodes
|
|
156
|
+
- Metadata about the document (`document.metadata`)
|
|
157
|
+
- Source information (`document.source`)
|
|
158
|
+
|
|
159
|
+
Each content node can have:
|
|
160
|
+
|
|
161
|
+
- Content (text)
|
|
162
|
+
- Features (metadata attached to nodes)
|
|
163
|
+
- Child nodes
|
|
164
|
+
- Tags (special features for marking up content)
|
|
165
|
+
|
|
166
|
+
## Working with Content Nodes
|
|
167
|
+
|
|
168
|
+
Content nodes form the hierarchical structure of a document:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
# Access the root node
|
|
172
|
+
root = document.get_root()
|
|
173
|
+
|
|
174
|
+
# Get all children
|
|
175
|
+
children = root.get_children()
|
|
176
|
+
|
|
177
|
+
# Access content
|
|
178
|
+
content = root.content
|
|
179
|
+
|
|
180
|
+
# Get all content (including from children)
|
|
181
|
+
all_content = root.get_all_content()
|
|
182
|
+
|
|
183
|
+
# Add a child node
|
|
184
|
+
new_node = document.create_node(node_type="paragraph", content="New paragraph")
|
|
185
|
+
root.add_child(new_node)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Using Selectors
|
|
189
|
+
|
|
190
|
+
The SDK provides a powerful selector language (similar to XPath) for querying document content:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
# Select all paragraph nodes
|
|
194
|
+
paragraphs = document.select("//paragraph")
|
|
195
|
+
|
|
196
|
+
# Select nodes with specific content
|
|
197
|
+
important_nodes = document.select("//paragraph[contains(., 'important')]")
|
|
198
|
+
|
|
199
|
+
# Select the first matching node
|
|
200
|
+
first_table = document.select_first("//table")
|
|
201
|
+
|
|
202
|
+
# Select nodes with specific features
|
|
203
|
+
tagged_nodes = document.select("//*[hasFeature('tag', 'highlight')]")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Working with Features and Tags
|
|
207
|
+
|
|
208
|
+
Features are metadata attached to content nodes:
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
# Add a feature to a node
|
|
212
|
+
node.add_feature("category", "section", "introduction")
|
|
213
|
+
|
|
214
|
+
# Check if a node has a feature
|
|
215
|
+
if node.has_feature("category", "section"):
|
|
216
|
+
    # Do something
|
|
217
|
+
|
|
218
|
+
# Get feature value
|
|
219
|
+
category = node.get_feature_value("category", "section")
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Tags are special features used for marking up content:
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
# Tag a node
|
|
226
|
+
node.tag("highlight", tag_uuid="unique-id-123")
|
|
227
|
+
|
|
228
|
+
# Tag content matching a pattern
|
|
229
|
+
node.tag("person", content_re="John|Jane")
|
|
230
|
+
|
|
231
|
+
# Check if a node has tags
|
|
232
|
+
if node.has_tags():
|
|
233
|
+
    # Do something
|
|
234
|
+
|
|
235
|
+
# Get all tags on a node
|
|
236
|
+
tags = node.get_tags()
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## Persistence Layer
|
|
240
|
+
|
|
241
|
+
The SDK uses SQLite as its persistence layer through the `SqliteDocumentPersistence` class. This class handles all database operations for storing and retrieving documents, nodes, and features.
|
|
242
|
+
|
|
243
|
+
The persistence layer is automatically created when you create a document or load a document from a KDDB file. You can access it through the `get_persistence()` method:
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
# Access the persistence layer
|
|
247
|
+
persistence = document.get_persistence()
|
|
248
|
+
|
|
249
|
+
# Close the document and clean up resources
|
|
250
|
+
document.close()
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## Converting Between Formats
|
|
254
|
+
|
|
255
|
+
The SDK supports converting between different formats:
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
# Convert to/from JSON
|
|
259
|
+
json_str = document.to_json()
|
|
260
|
+
json_document = Document.from_json(json_str)
|
|
261
|
+
|
|
262
|
+
# Convert to/from dictionary
|
|
263
|
+
doc_dict = document.to_dict()
|
|
264
|
+
dict_document = Document.from_dict(doc_dict)
|
|
265
|
+
|
|
266
|
+
# Convert to/from MessagePack (KDXA format)
|
|
267
|
+
msgpack_bytes = document.to_msgpack()
|
|
268
|
+
msgpack_document = Document.from_msgpack(msgpack_bytes)
|
|
269
|
+
|
|
270
|
+
# Save to KDXA file
|
|
271
|
+
document.to_kdxa("document.kdxa")
|
|
272
|
+
|
|
273
|
+
# Load from KDXA file
|
|
274
|
+
kdxa_document = Document.from_kdxa("document.kdxa")
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
## Example: Processing a KDDB File
|
|
278
|
+
|
|
279
|
+
Here's a complete example of loading a KDDB file, processing its content, and saving it back:
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
from kodexa_document import Document
|
|
283
|
+
|
|
284
|
+
# Load a document from a KDDB file
|
|
285
|
+
document = Document.from_kddb("input.kddb")
|
|
286
|
+
|
|
287
|
+
# Get the root node
|
|
288
|
+
root = document.get_root()
|
|
289
|
+
|
|
290
|
+
# Select all paragraph nodes
|
|
291
|
+
paragraphs = root.select("//paragraph")
|
|
292
|
+
|
|
293
|
+
# Tag paragraphs containing specific text
|
|
294
|
+
for paragraph in paragraphs:
|
|
295
|
+
    if "important" in paragraph.get_all_content():
|
|
296
|
+
        paragraph.tag("important")
|
|
297
|
+
|
|
298
|
+
# Add a document-level label
|
|
299
|
+
document.add_label("processed")
|
|
300
|
+
|
|
301
|
+
# Save the modified document
|
|
302
|
+
document.to_kddb("output.kddb")
|
|
303
|
+
|
|
304
|
+
# Clean up resources
|
|
305
|
+
document.close()
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
## Advanced Features
|
|
309
|
+
|
|
310
|
+
### External Data
|
|
311
|
+
|
|
312
|
+
You can store arbitrary data with a document:
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
# Store external data
|
|
316
|
+
document.set_external_data({"key": "value"})
|
|
317
|
+
|
|
318
|
+
# Store external data with a specific key
|
|
319
|
+
document.set_external_data({"status": "processed"}, "metadata")
|
|
320
|
+
|
|
321
|
+
# Retrieve external data
|
|
322
|
+
data = document.get_external_data()
|
|
323
|
+
metadata = document.get_external_data("metadata")
|
|
324
|
+
|
|
325
|
+
# Get all external data keys
|
|
326
|
+
keys = document.get_external_data_keys()
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
### Processing Steps
|
|
330
|
+
|
|
331
|
+
You can track processing steps applied to a document:
|
|
332
|
+
|
|
333
|
+
```python
|
|
334
|
+
from kodexa_document.model import ProcessingStep
|
|
335
|
+
|
|
336
|
+
# Create processing steps
|
|
337
|
+
step1 = ProcessingStep(name="Extract Text")
|
|
338
|
+
step2 = ProcessingStep(name="Tag Entities")
|
|
339
|
+
|
|
340
|
+
# Add child steps
|
|
341
|
+
step1.add_child(step2)
|
|
342
|
+
|
|
343
|
+
# Set steps on the document
|
|
344
|
+
document.set_steps([step1, step2])
|
|
345
|
+
|
|
346
|
+
# Retrieve steps
|
|
347
|
+
steps = document.get_steps()
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
## License
|
|
351
|
+
|
|
352
|
+
This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
|
|
353
|
+
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
# Kodexa Document Python SDK
|
|
2
|
+
|
|
3
|
+
The Kodexa Document Python SDK provides a powerful framework for working with structured documents in the Kodexa Document Database (KDDB) format. This library enables developers to create, load, manipulate, and query documents with a hierarchical node structure.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The Kodexa Document Python SDK is designed to provide a robust document object model with persistence capabilities. At its core is the `Document` class, which represents a document as a hierarchical tree of content nodes. The SDK uses SQLite as its underlying storage mechanism through the KDDB (Kodexa Document Database) format.
|
|
8
|
+
|
|
9
|
+
Key features include:
|
|
10
|
+
|
|
11
|
+
- Document creation and manipulation through a hierarchical node structure
|
|
12
|
+
- Persistence to and from KDDB files
|
|
13
|
+
- Powerful selector language (similar to XPath) for querying document content
|
|
14
|
+
- Feature and tag management for document nodes
|
|
15
|
+
- Support for document metadata and source tracking
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install kodexa-document
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Or using Poetry:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
poetry add kodexa-document
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Working with KDDB Files
|
|
30
|
+
|
|
31
|
+
KDDB (Kodexa Document Database) is the default storage format for Kodexa documents. It provides high-performance storage and the ability to handle very large document objects. KDDB files are SQLite databases with a specific schema designed for efficient document storage and retrieval.
|
|
32
|
+
|
|
33
|
+
### Reading KDDB Files
|
|
34
|
+
|
|
35
|
+
To read a document from a KDDB file:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from kodexa_document import Document
|
|
39
|
+
|
|
40
|
+
# Load a document from a KDDB file
|
|
41
|
+
document = Document.from_kddb("path/to/document.kddb")
|
|
42
|
+
|
|
43
|
+
# Access the document content
|
|
44
|
+
root_node = document.get_root()
|
|
45
|
+
print(root_node.get_all_content())
|
|
46
|
+
|
|
47
|
+
# Query the document using selectors
|
|
48
|
+
paragraphs = document.select("//paragraph")
|
|
49
|
+
for paragraph in paragraphs:
|
|
50
|
+
    print(paragraph.content)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
You can also load a KDDB file from a bytes object:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
# Load from bytes
|
|
57
|
+
with open("path/to/document.kddb", "rb") as f:
|
|
58
|
+
    kddb_bytes = f.read()
|
|
59
|
+
|
|
60
|
+
document = Document.from_kddb(kddb_bytes)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The `from_kddb` method accepts the following parameters:
|
|
64
|
+
|
|
65
|
+
- `source` (str or bytes): Path to a KDDB file or bytes containing KDDB data
|
|
66
|
+
- `detached` (bool, optional): Whether to load the document in detached mode (default: True)
|
|
67
|
+
- `inmemory` (bool, optional): Whether to load the document in memory for faster processing (default: False)
|
|
68
|
+
|
|
69
|
+
### Writing KDDB Files
|
|
70
|
+
|
|
71
|
+
To save a document to a KDDB file:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
# Save to a file
|
|
75
|
+
document.to_kddb("path/to/output.kddb")
|
|
76
|
+
|
|
77
|
+
# Or get the KDDB as bytes
|
|
78
|
+
kddb_bytes = document.to_kddb()
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
The `to_kddb` method accepts an optional `path` parameter. If provided, the document will be written to the specified file. If not provided, the method will return a bytes object containing the KDDB data.
|
|
82
|
+
|
|
83
|
+
### In-Memory Processing
|
|
84
|
+
|
|
85
|
+
For faster processing of documents, you can use the `inmemory` parameter when loading a KDDB file:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
# Load document in memory for faster processing
|
|
89
|
+
document = Document.from_kddb("path/to/document.kddb", inmemory=True)
|
|
90
|
+
|
|
91
|
+
# Process the document...
|
|
92
|
+
|
|
93
|
+
# Save the document back to disk
|
|
94
|
+
document.to_kddb("path/to/output.kddb")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Creating Documents
|
|
98
|
+
|
|
99
|
+
You can create new documents from scratch or from text:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
# Create a new empty document
|
|
103
|
+
from kodexa_document import Document
|
|
104
|
+
document = Document()
|
|
105
|
+
|
|
106
|
+
# Create a root node
|
|
107
|
+
root_node = document.create_node(node_type="document")
|
|
108
|
+
document.content_node = root_node
|
|
109
|
+
|
|
110
|
+
# Add child nodes
|
|
111
|
+
paragraph = document.create_node(node_type="paragraph", content="This is a paragraph.")
|
|
112
|
+
root_node.add_child(paragraph)
|
|
113
|
+
|
|
114
|
+
# Create a document from text
|
|
115
|
+
text_document = Document.from_text("This is a sample document.")
|
|
116
|
+
print(text_document.get_root().content)
|
|
117
|
+
|
|
118
|
+
# Create a document with separated content
|
|
119
|
+
separated_document = Document.from_text("This is a sample document.", separator=" ")
|
|
120
|
+
# This creates a document with each word as a separate child node
|
|
121
|
+
print(len(separated_document.get_root().get_children())) # Outputs: 5
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Document Structure
|
|
125
|
+
|
|
126
|
+
A Kodexa document consists of:
|
|
127
|
+
|
|
128
|
+
- A root content node (`document.content_node` or `document.get_root()`)
|
|
129
|
+
- A hierarchical structure of content nodes
|
|
130
|
+
- Metadata about the document (`document.metadata`)
|
|
131
|
+
- Source information (`document.source`)
|
|
132
|
+
|
|
133
|
+
Each content node can have:
|
|
134
|
+
|
|
135
|
+
- Content (text)
|
|
136
|
+
- Features (metadata attached to nodes)
|
|
137
|
+
- Child nodes
|
|
138
|
+
- Tags (special features for marking up content)
|
|
139
|
+
|
|
140
|
+
## Working with Content Nodes
|
|
141
|
+
|
|
142
|
+
Content nodes form the hierarchical structure of a document:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
# Access the root node
|
|
146
|
+
root = document.get_root()
|
|
147
|
+
|
|
148
|
+
# Get all children
|
|
149
|
+
children = root.get_children()
|
|
150
|
+
|
|
151
|
+
# Access content
|
|
152
|
+
content = root.content
|
|
153
|
+
|
|
154
|
+
# Get all content (including from children)
|
|
155
|
+
all_content = root.get_all_content()
|
|
156
|
+
|
|
157
|
+
# Add a child node
|
|
158
|
+
new_node = document.create_node(node_type="paragraph", content="New paragraph")
|
|
159
|
+
root.add_child(new_node)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Using Selectors
|
|
163
|
+
|
|
164
|
+
The SDK provides a powerful selector language (similar to XPath) for querying document content:
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
# Select all paragraph nodes
|
|
168
|
+
paragraphs = document.select("//paragraph")
|
|
169
|
+
|
|
170
|
+
# Select nodes with specific content
|
|
171
|
+
important_nodes = document.select("//paragraph[contains(., 'important')]")
|
|
172
|
+
|
|
173
|
+
# Select the first matching node
|
|
174
|
+
first_table = document.select_first("//table")
|
|
175
|
+
|
|
176
|
+
# Select nodes with specific features
|
|
177
|
+
tagged_nodes = document.select("//*[hasFeature('tag', 'highlight')]")
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Working with Features and Tags
|
|
181
|
+
|
|
182
|
+
Features are metadata attached to content nodes:
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
# Add a feature to a node
|
|
186
|
+
node.add_feature("category", "section", "introduction")
|
|
187
|
+
|
|
188
|
+
# Check if a node has a feature
|
|
189
|
+
if node.has_feature("category", "section"):
|
|
190
|
+
    # Do something
|
|
191
|
+
|
|
192
|
+
# Get feature value
|
|
193
|
+
category = node.get_feature_value("category", "section")
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Tags are special features used for marking up content:
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
# Tag a node
|
|
200
|
+
node.tag("highlight", tag_uuid="unique-id-123")
|
|
201
|
+
|
|
202
|
+
# Tag content matching a pattern
|
|
203
|
+
node.tag("person", content_re="John|Jane")
|
|
204
|
+
|
|
205
|
+
# Check if a node has tags
|
|
206
|
+
if node.has_tags():
|
|
207
|
+
    # Do something
|
|
208
|
+
|
|
209
|
+
# Get all tags on a node
|
|
210
|
+
tags = node.get_tags()
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Persistence Layer
|
|
214
|
+
|
|
215
|
+
The SDK uses SQLite as its persistence layer through the `SqliteDocumentPersistence` class. This class handles all database operations for storing and retrieving documents, nodes, and features.
|
|
216
|
+
|
|
217
|
+
The persistence layer is automatically created when you create a document or load a document from a KDDB file. You can access it through the `get_persistence()` method:
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
# Access the persistence layer
|
|
221
|
+
persistence = document.get_persistence()
|
|
222
|
+
|
|
223
|
+
# Close the document and clean up resources
|
|
224
|
+
document.close()
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Converting Between Formats
|
|
228
|
+
|
|
229
|
+
The SDK supports converting between different formats:
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
# Convert to/from JSON
|
|
233
|
+
json_str = document.to_json()
|
|
234
|
+
json_document = Document.from_json(json_str)
|
|
235
|
+
|
|
236
|
+
# Convert to/from dictionary
|
|
237
|
+
doc_dict = document.to_dict()
|
|
238
|
+
dict_document = Document.from_dict(doc_dict)
|
|
239
|
+
|
|
240
|
+
# Convert to/from MessagePack (KDXA format)
|
|
241
|
+
msgpack_bytes = document.to_msgpack()
|
|
242
|
+
msgpack_document = Document.from_msgpack(msgpack_bytes)
|
|
243
|
+
|
|
244
|
+
# Save to KDXA file
|
|
245
|
+
document.to_kdxa("document.kdxa")
|
|
246
|
+
|
|
247
|
+
# Load from KDXA file
|
|
248
|
+
kdxa_document = Document.from_kdxa("document.kdxa")
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## Example: Processing a KDDB File
|
|
252
|
+
|
|
253
|
+
Here's a complete example of loading a KDDB file, processing its content, and saving it back:
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
from kodexa_document import Document
|
|
257
|
+
|
|
258
|
+
# Load a document from a KDDB file
|
|
259
|
+
document = Document.from_kddb("input.kddb")
|
|
260
|
+
|
|
261
|
+
# Get the root node
|
|
262
|
+
root = document.get_root()
|
|
263
|
+
|
|
264
|
+
# Select all paragraph nodes
|
|
265
|
+
paragraphs = root.select("//paragraph")
|
|
266
|
+
|
|
267
|
+
# Tag paragraphs containing specific text
|
|
268
|
+
for paragraph in paragraphs:
|
|
269
|
+
    if "important" in paragraph.get_all_content():
|
|
270
|
+
        paragraph.tag("important")
|
|
271
|
+
|
|
272
|
+
# Add a document-level label
|
|
273
|
+
document.add_label("processed")
|
|
274
|
+
|
|
275
|
+
# Save the modified document
|
|
276
|
+
document.to_kddb("output.kddb")
|
|
277
|
+
|
|
278
|
+
# Clean up resources
|
|
279
|
+
document.close()
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## Advanced Features
|
|
283
|
+
|
|
284
|
+
### External Data
|
|
285
|
+
|
|
286
|
+
You can store arbitrary data with a document:
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
# Store external data
|
|
290
|
+
document.set_external_data({"key": "value"})
|
|
291
|
+
|
|
292
|
+
# Store external data with a specific key
|
|
293
|
+
document.set_external_data({"status": "processed"}, "metadata")
|
|
294
|
+
|
|
295
|
+
# Retrieve external data
|
|
296
|
+
data = document.get_external_data()
|
|
297
|
+
metadata = document.get_external_data("metadata")
|
|
298
|
+
|
|
299
|
+
# Get all external data keys
|
|
300
|
+
keys = document.get_external_data_keys()
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### Processing Steps
|
|
304
|
+
|
|
305
|
+
You can track processing steps applied to a document:
|
|
306
|
+
|
|
307
|
+
```python
|
|
308
|
+
from kodexa_document.model import ProcessingStep
|
|
309
|
+
|
|
310
|
+
# Create processing steps
|
|
311
|
+
step1 = ProcessingStep(name="Extract Text")
|
|
312
|
+
step2 = ProcessingStep(name="Tag Entities")
|
|
313
|
+
|
|
314
|
+
# Add child steps
|
|
315
|
+
step1.add_child(step2)
|
|
316
|
+
|
|
317
|
+
# Set steps on the document
|
|
318
|
+
document.set_steps([step1, step2])
|
|
319
|
+
|
|
320
|
+
# Retrieve steps
|
|
321
|
+
steps = document.get_steps()
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
## License
|
|
325
|
+
|
|
326
|
+
This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
|
{kodexa_document-8.0.1.post1746984682 → kodexa_document-8.0.1.post1746985996}/pyproject.toml
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "kodexa-document"
|
|
3
|
-
version = "8.0.1-1746984682"
|
|
3
|
+
version = "8.0.1-1746985996"
|
|
4
4
|
description = "Python SDK for the Kodexa Document Database (KDDB)"
|
|
5
5
|
authors = ["Austin Redenbaugh <austin@kodexa.com>", "Philip Dodds <philip@kodexa.com>", "Romar Cablao <rcablao@kodexa.com>", "Amadea Paula Dodds <amadeapaula@kodexa.com>"]
|
|
6
6
|
readme = "README.md"
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: kodexa-document
|
|
3
|
-
Version: 8.0.1.post1746984682
|
|
4
|
-
Summary: Python SDK for the Kodexa Document Database (KDDB)
|
|
5
|
-
Author: Austin Redenbaugh
|
|
6
|
-
Author-email: austin@kodexa.com
|
|
7
|
-
Requires-Python: >=3.11,<4.0
|
|
8
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
9
|
-
Classifier: Intended Audience :: Developers
|
|
10
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
-
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
-
Requires-Dist: addict (>=2.4.0,<3.0.0)
|
|
17
|
-
Requires-Dist: antlr4-python3-runtime (>=4.13.2,<5.0.0)
|
|
18
|
-
Requires-Dist: deepdiff (>=8.4.2,<9.0.0)
|
|
19
|
-
Requires-Dist: msgpack (>=1.1.0,<2.0.0)
|
|
20
|
-
Requires-Dist: peewee (>=3.18.1,<4.0.0)
|
|
21
|
-
Requires-Dist: pydantic (>=2.11.4,<3.0.0)
|
|
22
|
-
Requires-Dist: pytest (>=8.3.5,<9.0.0)
|
|
23
|
-
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
24
|
-
Requires-Dist: requests (>=2.32.0,<3.0.0)
|
|
25
|
-
Description-Content-Type: text/markdown
|
|
26
|
-
|
|
27
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|