kodexa-document 8.0.0.dev20607935278__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kodexa_document-8.0.0.dev20607935278/CHANGELOG.md +5 -0
- kodexa_document-8.0.0.dev20607935278/MANIFEST.in +37 -0
- kodexa_document-8.0.0.dev20607935278/PKG-INFO +407 -0
- kodexa_document-8.0.0.dev20607935278/README.md +366 -0
- kodexa_document-8.0.0.dev20607935278/USAGE.md +926 -0
- kodexa_document-8.0.0.dev20607935278/docs/API_REFERENCE.md +2089 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/__init__.py +73 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/darwin-arm64/libkodexa_go.dylib +0 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/darwin-arm64/libkodexa_go.h +229 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/darwin-arm64/libkodexa_go_jni.dylib +0 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/linux-amd64/libkodexa_go.h +229 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/linux-amd64/libkodexa_go.so +0 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/linux-amd64/libkodexa_go_jni.so +0 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/_native.py +395 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document/errors.py +77 -0
- kodexa_document-8.0.0.dev20607935278/kodexa_document.egg-info/SOURCES.txt +66 -0
- kodexa_document-8.0.0.dev20607935278/pyproject.toml +103 -0
- kodexa_document-8.0.0.dev20607935278/setup.cfg +4 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_content_exception.py +275 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_content_feature_coverage.py +157 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_content_parts.py +231 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_basic.py +341 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_features_tags.py +791 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_legacy_compat.py +106 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_lifecycle.py +274 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_navigation.py +256 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_selectors.py +399 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_tree.py +376 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_delete_on_close.py +271 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_document.py +901 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_document_from_json.py +274 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_document_from_kddb.py +191 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_document_inmemory.py +93 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_document_metadata.py +181 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_document_statistics.py +75 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_document_to_json.py +107 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_enums.py +129 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_errors.py +51 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_exceptions.py +576 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_external_data.py +122 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_extraction.py +487 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_extraction_engine_get_validations.py +214 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_extraction_integration.py +325 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_extraction_process_and_save.py +148 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_extraction_validations.py +207 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_feature_append_behavior.py +235 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_feature_debug.py +39 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_feature_detailed.py +132 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_feature_fix.py +85 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_json_fixes.py +267 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_json_roundtrip.py +230 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_labels.py +281 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_memory.py +72 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_metadata_persistence.py +143 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_minimal.py +91 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_mixins.py +230 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_new_methods.py +120 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_pretty_print.py +423 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_processing_step.py +339 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_processing_step_integration.py +249 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_processing_steps.py +311 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_remaining_issues.py +150 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_rotation_methods.py +175 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_source_metadata.py +216 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_spatial_methods.py +293 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_tag_class.py +251 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_tag_instance.py +478 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_tag_integration.py +135 -0
- kodexa_document-8.0.0.dev20607935278/tests/test_to_kddb.py +211 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Include native libraries (required for functionality)
|
|
2
|
+
recursive-include kodexa_document/_native *.so *.dll *.dylib *.h
|
|
3
|
+
|
|
4
|
+
# Include essential user documentation (shipped with package)
|
|
5
|
+
include README.md
|
|
6
|
+
include USAGE.md
|
|
7
|
+
include CHANGELOG.md
|
|
8
|
+
include docs/API_REFERENCE.md
|
|
9
|
+
|
|
10
|
+
# Include packaging metadata (pyproject.toml)
|
|
11
|
+
include pyproject.toml
|
|
12
|
+
|
|
13
|
+
# Exclude build tools and build documentation (not needed by end users)
|
|
14
|
+
prune build/
|
|
15
|
+
global-exclude build_*.py
|
|
16
|
+
global-exclude build_*.sh
|
|
17
|
+
global-exclude build_*.bat
|
|
18
|
+
global-exclude fix_windows.py
|
|
19
|
+
|
|
20
|
+
# Exclude development/debug files
|
|
21
|
+
exclude .work/
|
|
22
|
+
prune .work/
|
|
23
|
+
exclude .pytest_cache/
|
|
24
|
+
prune .pytest_cache/
|
|
25
|
+
global-exclude test_contentnode_lifecycle_old.py
|
|
26
|
+
|
|
27
|
+
# Exclude compiled Python files and caches
|
|
28
|
+
global-exclude __pycache__/
|
|
29
|
+
global-exclude *.pyc
|
|
30
|
+
global-exclude *.pyo
|
|
31
|
+
global-exclude *.pyd
|
|
32
|
+
global-exclude .Python
|
|
33
|
+
global-exclude pip-log.txt
|
|
34
|
+
|
|
35
|
+
# Exclude distribution build artifacts
|
|
36
|
+
prune dist/
|
|
37
|
+
prune *.egg-info/
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kodexa-document
|
|
3
|
+
Version: 8.0.0.dev20607935278
|
|
4
|
+
Summary: High-performance Python bindings for the Go-based Kodexa Document SDK with in-memory processing
|
|
5
|
+
Author-email: Kodexa <support@kodexa.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/kodexa/kodexa-document
|
|
8
|
+
Project-URL: Repository, https://github.com/kodexa/kodexa-document
|
|
9
|
+
Project-URL: Documentation, https://docs.kodexa.com
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/kodexa/kodexa-document/issues
|
|
11
|
+
Keywords: document,processing,extraction,nlp,ai,sqlite,kddb
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Topic :: Text Processing
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Database
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
23
|
+
Requires-Python: >=3.12
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
Requires-Dist: cffi>=1.14.0
|
|
26
|
+
Requires-Dist: addict>=2.4.0
|
|
27
|
+
Requires-Dist: pydantic>=2.0.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
31
|
+
Requires-Dist: black; extra == "dev"
|
|
32
|
+
Requires-Dist: isort; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy; extra == "dev"
|
|
34
|
+
Requires-Dist: flake8; extra == "dev"
|
|
35
|
+
Requires-Dist: build; extra == "dev"
|
|
36
|
+
Requires-Dist: twine; extra == "dev"
|
|
37
|
+
Provides-Extra: test
|
|
38
|
+
Requires-Dist: pytest>=6.0; extra == "test"
|
|
39
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
40
|
+
Provides-Extra: lambda
|
|
41
|
+
|
|
42
|
+
# Kodexa Document Python
|
|
43
|
+
|
|
44
|
+
High-performance Python bindings for the Go-based Kodexa Document SDK using CFFI. Provides comprehensive document processing capabilities with ~100x performance improvement through in-memory operations.
|
|
45
|
+
|
|
46
|
+
## Overview
|
|
47
|
+
|
|
48
|
+
This package provides mature Python bindings for the Go-based Kodexa Document SDK. It uses CFFI (C Foreign Function Interface) to communicate with the Go library, offering full access to hierarchical document processing, advanced querying, and rich metadata management.
|
|
49
|
+
|
|
50
|
+
**Key Highlights:**
|
|
51
|
+
- **Production Ready**: 413+ comprehensive tests covering all functionality
|
|
52
|
+
- **High Performance**: ~100x faster with in-memory mode (1.19ms vs 121ms)
|
|
53
|
+
- **Full Feature Set**: Complete document manipulation, querying, and persistence
|
|
54
|
+
- **Cross Platform**: Linux, macOS (Intel/ARM), Windows, AWS Lambda
|
|
55
|
+
|
|
56
|
+
## Features
|
|
57
|
+
|
|
58
|
+
### Core Document Operations
|
|
59
|
+
- **Document Creation**: From text, JSON, KDDB files, or from scratch
|
|
60
|
+
- **In-Memory Processing**: ~100x performance boost for temporary operations
|
|
61
|
+
- **Context Managers**: Automatic resource cleanup with `with` statements
|
|
62
|
+
- **Multiple Formats**: JSON export/import, KDDB persistence, dict conversion
|
|
63
|
+
|
|
64
|
+
### Content Structure & Navigation
|
|
65
|
+
- **Hierarchical Nodes**: Document tree structure, similar to the DOM for web pages
|
|
66
|
+
- **Content Operations**: Rich text handling with content parts
|
|
67
|
+
- **Tree Navigation**: Parent/child relationships, sibling traversal, path queries
|
|
68
|
+
- **Node Management**: Create, modify, remove nodes with full hierarchy support
|
|
69
|
+
|
|
70
|
+
### Advanced Querying
|
|
71
|
+
- **Selector Language**: XPath-like queries (`//paragraph[contains(@content, 'text')]`)
|
|
72
|
+
- **Variable Support**: Parameterized queries with variable substitution
|
|
73
|
+
- **Performance Options**: First-only results, relative queries from nodes
|
|
74
|
+
- **Rich Filtering**: Content-based, tag-based, and feature-based selection
|
|
75
|
+
|
|
76
|
+
### Metadata & Annotations
|
|
77
|
+
- **Features System**: Key-value metadata with type organization
|
|
78
|
+
- **Tagging**: Content annotation with confidence scores and values
|
|
79
|
+
- **Document Labels**: Classification and categorization
|
|
80
|
+
- **Mixins**: Capability flags and behavior markers
|
|
81
|
+
- **External Data**: Arbitrary data storage with custom keys
|
|
82
|
+
- **Processing Steps**: Workflow tracking and validation rules
|
|
83
|
+
|
|
84
|
+
### Spatial & Geometric Operations
|
|
85
|
+
- **Bounding Boxes**: Position and dimension tracking
|
|
86
|
+
- **Spatial Queries**: Location-based content selection
|
|
87
|
+
- **Coordinate Systems**: Flexible positioning support
|
|
88
|
+
|
|
89
|
+
### Enterprise Features
|
|
90
|
+
- **Extraction Engine**: Advanced content extraction with taxonomies
|
|
91
|
+
- **Validation Framework**: Rule-based document validation
|
|
92
|
+
- **Statistics**: Comprehensive document metrics and analysis
|
|
93
|
+
- **Error Handling**: Comprehensive exception system with specific error types
|
|
94
|
+
- **Memory Management**: Automatic cleanup with finalizers
|
|
95
|
+
|
|
96
|
+
## Installation
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install kodexa-document
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Quick Start
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from kodexa_document import Document
|
|
106
|
+
|
|
107
|
+
# Create high-performance in-memory document
|
|
108
|
+
with Document(inmemory=True) as doc:
|
|
109
|
+
# Create document structure
|
|
110
|
+
root = doc.create_node("document", "My Document")
|
|
111
|
+
doc.content_node = root
|
|
112
|
+
|
|
113
|
+
section = doc.create_node("section", "Introduction", parent=root)
|
|
114
|
+
para = doc.create_node("paragraph", "Important content", parent=section)
|
|
115
|
+
|
|
116
|
+
# Add rich metadata
|
|
117
|
+
para.tag("important", confidence=0.95, value="key-point")
|
|
118
|
+
para.add_feature("style", "emphasis", "bold")
|
|
119
|
+
doc.add_label("technical-document")
|
|
120
|
+
|
|
121
|
+
# Query with selectors
|
|
122
|
+
important_nodes = doc.select("//paragraph[@tag='important']")
|
|
123
|
+
all_content = doc.select("//*[contains(@content, 'content')]")
|
|
124
|
+
|
|
125
|
+
# Export to different formats
|
|
126
|
+
json_str = doc.to_json(indent=2)
|
|
127
|
+
doc.save("output.kddb")
|
|
128
|
+
|
|
129
|
+
print(f"Found {len(important_nodes)} important paragraphs")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Advanced Usage Examples
|
|
133
|
+
|
|
134
|
+
### Document Processing Pipeline
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from kodexa_document import Document
|
|
138
|
+
from kodexa_document.errors import DocumentError
|
|
139
|
+
|
|
140
|
+
def process_document(input_path, output_path):
|
|
141
|
+
"""Complete document processing pipeline."""
|
|
142
|
+
with Document.from_kddb(input_path, inmemory=True) as doc:
|
|
143
|
+
# Analyze structure
|
|
144
|
+
all_nodes = doc.select("//*")
|
|
145
|
+
paragraphs = doc.select("//paragraph")
|
|
146
|
+
|
|
147
|
+
# Process content
|
|
148
|
+
for i, para in enumerate(paragraphs):
|
|
149
|
+
if len(para.content) > 100: # Long paragraphs
|
|
150
|
+
para.tag("detailed", confidence=0.8)
|
|
151
|
+
para.add_feature("analysis", "length", len(para.content))
|
|
152
|
+
|
|
153
|
+
if i == 0: # First paragraph
|
|
154
|
+
para.tag("introduction")
|
|
155
|
+
|
|
156
|
+
# Add document metadata
|
|
157
|
+
doc.set_metadata("processed", True)
|
|
158
|
+
doc.set_metadata("node_count", len(all_nodes))
|
|
159
|
+
doc.add_label("processed-document")
|
|
160
|
+
|
|
161
|
+
# Save results
|
|
162
|
+
doc.save(output_path)
|
|
163
|
+
|
|
164
|
+
return {
|
|
165
|
+
"uuid": doc.uuid,
|
|
166
|
+
"nodes": len(all_nodes),
|
|
167
|
+
"tagged": len(doc.get_all_tagged_nodes())
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
# Process with error handling
|
|
171
|
+
try:
|
|
172
|
+
result = process_document("input.kddb", "processed.kddb")
|
|
173
|
+
print(f"Processed document {result['uuid']}: {result['nodes']} nodes")
|
|
174
|
+
except DocumentError as e:
|
|
175
|
+
print(f"Processing failed: {e}")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Content Analysis and Extraction
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
# Load and analyze document structure
|
|
182
|
+
with Document.from_text("Chapter 1\nIntroduction\nContent here",
|
|
183
|
+
separator="\n", inmemory=True) as doc:
|
|
184
|
+
|
|
185
|
+
# Navigate document hierarchy
|
|
186
|
+
root = doc.content_node
|
|
187
|
+
children = root.get_children()
|
|
188
|
+
|
|
189
|
+
# Rich querying
|
|
190
|
+
headers = doc.select("//paragraph[1]") # First paragraphs (likely headers)
|
|
191
|
+
long_content = doc.select("//paragraph[string-length(@content) > 50]")
|
|
192
|
+
|
|
193
|
+
# Feature analysis
|
|
194
|
+
for node in children:
|
|
195
|
+
node.add_feature("position", "index", node.index)
|
|
196
|
+
if "Chapter" in node.content:
|
|
197
|
+
node.tag("chapter-header")
|
|
198
|
+
node.add_feature("structure", "type", "header")
|
|
199
|
+
|
|
200
|
+
# Get comprehensive statistics
|
|
201
|
+
stats = doc.get_statistics()
|
|
202
|
+
tagged_nodes = doc.get_all_tagged_nodes()
|
|
203
|
+
|
|
204
|
+
print(f"Document structure: {len(children)} top-level nodes")
|
|
205
|
+
print(f"Tagged content: {len(tagged_nodes)} nodes")
|
|
206
|
+
print(f"Statistics: {stats}")
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Performance Comparison
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
import time
|
|
213
|
+
|
|
214
|
+
# In-memory processing (recommended for temporary operations)
|
|
215
|
+
start = time.time()
|
|
216
|
+
with Document(inmemory=True) as doc:
|
|
217
|
+
root = doc.create_node("document", "Fast processing")
|
|
218
|
+
doc.content_node = root
|
|
219
|
+
for i in range(1000):
|
|
220
|
+
doc.create_node("item", f"Item {i}", parent=root)
|
|
221
|
+
nodes = doc.select("//*")
|
|
222
|
+
inmemory_time = time.time() - start
|
|
223
|
+
|
|
224
|
+
# File-based processing (for persistence)
|
|
225
|
+
start = time.time()
|
|
226
|
+
with Document(inmemory=False) as doc:
|
|
227
|
+
root = doc.create_node("document", "Persistent processing")
|
|
228
|
+
doc.content_node = root
|
|
229
|
+
for i in range(1000):
|
|
230
|
+
doc.create_node("item", f"Item {i}", parent=root)
|
|
231
|
+
nodes = doc.select("//*")
|
|
232
|
+
file_time = time.time() - start
|
|
233
|
+
|
|
234
|
+
print(f"In-memory: {inmemory_time:.3f}s")
|
|
235
|
+
print(f"File-based: {file_time:.3f}s")
|
|
236
|
+
print(f"Performance improvement: {file_time/inmemory_time:.1f}x faster")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## Loading Documents
|
|
240
|
+
|
|
241
|
+
The `from_kddb` method supports flexible loading modes:
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
# Standard loading modes
|
|
245
|
+
doc = Document.from_kddb("input.kddb") # Detached copy (safe, default)
|
|
246
|
+
doc = Document.from_kddb("input.kddb", detached=False) # In-place editing
|
|
247
|
+
doc = Document.from_kddb("input.kddb", inmemory=True) # ~100x performance boost
|
|
248
|
+
|
|
249
|
+
# Load from bytes (API responses, downloads, etc.)
|
|
250
|
+
with open("document.kddb", "rb") as f:
|
|
251
|
+
kddb_bytes = f.read()
|
|
252
|
+
doc = Document.from_kddb(kddb_bytes, inmemory=True)
|
|
253
|
+
|
|
254
|
+
# Temporary files with auto-cleanup
|
|
255
|
+
doc = Document.from_kddb("temp.kddb", delete_on_close=True)
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
| Parameter | Default | Description |
|
|
259
|
+
|-----------|---------|-------------|
|
|
260
|
+
| `detached` | `True` | Works on a detached copy when `True`; edits the original file in place when `False` |
|
|
261
|
+
| `inmemory` | `False` | Loads the document into memory for ~100x faster processing |
|
|
262
|
+
| `delete_on_close` | `False` | Auto-deletes file when document closes |
|
|
263
|
+
|
|
264
|
+
## Error Handling
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
from kodexa_document.errors import DocumentError, DocumentNotFoundError
|
|
268
|
+
|
|
269
|
+
# Robust error handling
|
|
270
|
+
try:
|
|
271
|
+
with Document.from_kddb("document.kddb", inmemory=True) as doc:
|
|
272
|
+
# Process document
|
|
273
|
+
nodes = doc.select("//paragraph")
|
|
274
|
+
for node in nodes:
|
|
275
|
+
node.tag("processed")
|
|
276
|
+
|
|
277
|
+
# Validate results
|
|
278
|
+
if not doc.uuid:
|
|
279
|
+
raise DocumentError("Invalid document state")
|
|
280
|
+
|
|
281
|
+
except DocumentNotFoundError:
|
|
282
|
+
print("Document file not found")
|
|
283
|
+
except DocumentError as e:
|
|
284
|
+
print(f"Document processing error: {e}")
|
|
285
|
+
except Exception as e:
|
|
286
|
+
print(f"Unexpected error: {e}")
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
## Architecture
|
|
290
|
+
|
|
291
|
+
```
|
|
292
|
+
Python Application
|
|
293
|
+
↓
|
|
294
|
+
CFFI Python Wrapper (413+ Tests)
|
|
295
|
+
↓
|
|
296
|
+
Go Shared Library (CGO)
|
|
297
|
+
↓
|
|
298
|
+
GORM Domain Layer
|
|
299
|
+
↓
|
|
300
|
+
SQLite Database (File/Memory)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
**Performance Modes:**
|
|
304
|
+
- **In-Memory SQLite**: `:memory:` database for maximum speed
|
|
305
|
+
- **File-Based SQLite**: Persistent `.kddb` files for storage
|
|
306
|
+
- **Hybrid Mode**: Load from file, process in-memory, save back
|
|
307
|
+
|
|
308
|
+
## Requirements
|
|
309
|
+
|
|
310
|
+
- Python 3.12+
|
|
311
|
+
- cffi >= 1.14.0, addict >= 2.4.0, pydantic >= 2.0.0
|
|
312
|
+
- Go shared library (automatically bundled in wheel)
|
|
313
|
+
|
|
314
|
+
## Platform Support
|
|
315
|
+
|
|
316
|
+
- **Linux x86_64** - Primary development platform
|
|
317
|
+
- **macOS x86_64 & ARM64** - Intel and Apple Silicon support
|
|
318
|
+
- **Windows x86_64** - Full Windows compatibility
|
|
319
|
+
- **AWS Lambda** - Amazon Linux 2 optimization
|
|
320
|
+
|
|
321
|
+
## Testing & Quality
|
|
322
|
+
|
|
323
|
+
- **413+ Comprehensive Tests** covering all functionality
|
|
324
|
+
- **100% Feature Coverage** - All advertised features are tested and working
|
|
325
|
+
- **Error Path Testing** - Comprehensive error handling validation
|
|
326
|
+
- **Performance Testing** - Memory usage and speed benchmarks
|
|
327
|
+
- **Cross-Platform Testing** - Validated on all supported platforms
|
|
328
|
+
|
|
329
|
+
```bash
|
|
330
|
+
# Run comprehensive test suite
|
|
331
|
+
cd lib/python
|
|
332
|
+
source ../../venv/bin/activate
|
|
333
|
+
python -m pytest tests/ -v
|
|
334
|
+
|
|
335
|
+
# Test categories
|
|
336
|
+
python -m pytest tests/test_document.py -v # Core document operations
|
|
337
|
+
python -m pytest tests/test_contentnode_features_tags.py -v # Features and tags
|
|
338
|
+
python -m pytest tests/test_contentnode_selectors.py -v # Query system
|
|
339
|
+
python -m pytest tests/test_extraction.py -v # Advanced extraction
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## Development Setup
|
|
343
|
+
|
|
344
|
+
```bash
|
|
345
|
+
# Quick setup from repository root
|
|
346
|
+
python3 -m venv venv
|
|
347
|
+
source venv/bin/activate
|
|
348
|
+
pip install cffi pytest
|
|
349
|
+
|
|
350
|
+
# Build Go library and Python bindings
|
|
351
|
+
cd lib/go && make linux # or: make darwin, make windows
|
|
352
|
+
cd ../python
|
|
353
|
+
|
|
354
|
+
# Test installation
|
|
355
|
+
python -c "from kodexa_document import Document; print('Success!')"
|
|
356
|
+
|
|
357
|
+
# Run tests
|
|
358
|
+
python -m pytest tests/ -v
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
## Documentation
|
|
362
|
+
|
|
363
|
+
### User Documentation
|
|
364
|
+
- **[USAGE.md](USAGE.md)** - Comprehensive usage examples and best practices
|
|
365
|
+
- **[docs/API_REFERENCE.md](docs/API_REFERENCE.md)** - Complete API reference
|
|
366
|
+
|
|
367
|
+
### Build Documentation (source repository only; not shipped in this package)
|
|
368
|
+
- **[docs/BUILD_SCRIPTS_GUIDE.md](docs/BUILD_SCRIPTS_GUIDE.md)** - Build automation guide
|
|
369
|
+
- **[build/docs/BUILD.md](build/docs/BUILD.md)** - Detailed build instructions
|
|
370
|
+
- **[build/docs/WINDOWS_SETUP.md](build/docs/WINDOWS_SETUP.md)** - Windows development setup
|
|
371
|
+
|
|
372
|
+
## Best Practices
|
|
373
|
+
|
|
374
|
+
1. **Use `inmemory=True`** for temporary processing (~100x faster)
|
|
375
|
+
2. **Use context managers** (`with` statements) for automatic cleanup
|
|
376
|
+
3. **Handle specific exceptions** (DocumentError, DocumentNotFoundError)
|
|
377
|
+
4. **Structure documents hierarchically** with proper parent-child relationships
|
|
378
|
+
5. **Leverage selectors** for efficient document querying
|
|
379
|
+
6. **Use features and tags** for rich content annotation
|
|
380
|
+
7. **Set meaningful metadata** for document tracking and organization
|
|
381
|
+
|
|
382
|
+
## Use Cases
|
|
383
|
+
|
|
384
|
+
- **Document Processing Pipelines** - ETL workflows for structured documents
|
|
385
|
+
- **Content Analysis** - Text mining, information extraction, document understanding
|
|
386
|
+
- **Document Transformation** - Format conversion, structure normalization
|
|
387
|
+
- **Search and Indexing** - Content indexing with rich metadata
|
|
388
|
+
- **Validation and Quality** - Document structure validation and quality assessment
|
|
389
|
+
- **Machine Learning** - Feature extraction for ML pipelines
|
|
390
|
+
- **Enterprise Integration** - High-performance document processing systems
|
|
391
|
+
|
|
392
|
+
## Performance Characteristics
|
|
393
|
+
|
|
394
|
+
| Operation | In-Memory | File-Based | Improvement |
|
|
395
|
+
|-----------|-----------|------------|-------------|
|
|
396
|
+
| Document Creation | ~1.2ms | ~121ms | 100x |
|
|
397
|
+
| Node Creation (1000 nodes) | ~15ms | ~1.5s | 100x |
|
|
398
|
+
| Selector Queries | ~2ms | ~45ms | 22x |
|
|
399
|
+
| Feature/Tag Operations | ~0.5ms | ~25ms | 50x |
|
|
400
|
+
|
|
401
|
+
## License
|
|
402
|
+
|
|
403
|
+
Apache-2.0, the same license as the main Kodexa Document SDK.
|
|
404
|
+
|
|
405
|
+
---
|
|
406
|
+
|
|
407
|
+
**Ready to get started?** Check out [USAGE.md](USAGE.md) for comprehensive examples and [run the test suite](#testing--quality) to see all features in action!
|