kodexa-document 8.0.0.dev20607935278__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. kodexa_document-8.0.0.dev20607935278/CHANGELOG.md +5 -0
  2. kodexa_document-8.0.0.dev20607935278/MANIFEST.in +37 -0
  3. kodexa_document-8.0.0.dev20607935278/PKG-INFO +407 -0
  4. kodexa_document-8.0.0.dev20607935278/README.md +366 -0
  5. kodexa_document-8.0.0.dev20607935278/USAGE.md +926 -0
  6. kodexa_document-8.0.0.dev20607935278/docs/API_REFERENCE.md +2089 -0
  7. kodexa_document-8.0.0.dev20607935278/kodexa_document/__init__.py +73 -0
  8. kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/darwin-arm64/libkodexa_go.dylib +0 -0
  9. kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/darwin-arm64/libkodexa_go.h +229 -0
  10. kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/darwin-arm64/libkodexa_go_jni.dylib +0 -0
  11. kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/linux-amd64/libkodexa_go.h +229 -0
  12. kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/linux-amd64/libkodexa_go.so +0 -0
  13. kodexa_document-8.0.0.dev20607935278/kodexa_document/_native/linux-amd64/libkodexa_go_jni.so +0 -0
  14. kodexa_document-8.0.0.dev20607935278/kodexa_document/_native.py +395 -0
  15. kodexa_document-8.0.0.dev20607935278/kodexa_document/errors.py +77 -0
  16. kodexa_document-8.0.0.dev20607935278/kodexa_document.egg-info/SOURCES.txt +66 -0
  17. kodexa_document-8.0.0.dev20607935278/pyproject.toml +103 -0
  18. kodexa_document-8.0.0.dev20607935278/setup.cfg +4 -0
  19. kodexa_document-8.0.0.dev20607935278/tests/test_content_exception.py +275 -0
  20. kodexa_document-8.0.0.dev20607935278/tests/test_content_feature_coverage.py +157 -0
  21. kodexa_document-8.0.0.dev20607935278/tests/test_content_parts.py +231 -0
  22. kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_basic.py +341 -0
  23. kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_features_tags.py +791 -0
  24. kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_legacy_compat.py +106 -0
  25. kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_lifecycle.py +274 -0
  26. kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_navigation.py +256 -0
  27. kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_selectors.py +399 -0
  28. kodexa_document-8.0.0.dev20607935278/tests/test_contentnode_tree.py +376 -0
  29. kodexa_document-8.0.0.dev20607935278/tests/test_delete_on_close.py +271 -0
  30. kodexa_document-8.0.0.dev20607935278/tests/test_document.py +901 -0
  31. kodexa_document-8.0.0.dev20607935278/tests/test_document_from_json.py +274 -0
  32. kodexa_document-8.0.0.dev20607935278/tests/test_document_from_kddb.py +191 -0
  33. kodexa_document-8.0.0.dev20607935278/tests/test_document_inmemory.py +93 -0
  34. kodexa_document-8.0.0.dev20607935278/tests/test_document_metadata.py +181 -0
  35. kodexa_document-8.0.0.dev20607935278/tests/test_document_statistics.py +75 -0
  36. kodexa_document-8.0.0.dev20607935278/tests/test_document_to_json.py +107 -0
  37. kodexa_document-8.0.0.dev20607935278/tests/test_enums.py +129 -0
  38. kodexa_document-8.0.0.dev20607935278/tests/test_errors.py +51 -0
  39. kodexa_document-8.0.0.dev20607935278/tests/test_exceptions.py +576 -0
  40. kodexa_document-8.0.0.dev20607935278/tests/test_external_data.py +122 -0
  41. kodexa_document-8.0.0.dev20607935278/tests/test_extraction.py +487 -0
  42. kodexa_document-8.0.0.dev20607935278/tests/test_extraction_engine_get_validations.py +214 -0
  43. kodexa_document-8.0.0.dev20607935278/tests/test_extraction_integration.py +325 -0
  44. kodexa_document-8.0.0.dev20607935278/tests/test_extraction_process_and_save.py +148 -0
  45. kodexa_document-8.0.0.dev20607935278/tests/test_extraction_validations.py +207 -0
  46. kodexa_document-8.0.0.dev20607935278/tests/test_feature_append_behavior.py +235 -0
  47. kodexa_document-8.0.0.dev20607935278/tests/test_feature_debug.py +39 -0
  48. kodexa_document-8.0.0.dev20607935278/tests/test_feature_detailed.py +132 -0
  49. kodexa_document-8.0.0.dev20607935278/tests/test_feature_fix.py +85 -0
  50. kodexa_document-8.0.0.dev20607935278/tests/test_json_fixes.py +267 -0
  51. kodexa_document-8.0.0.dev20607935278/tests/test_json_roundtrip.py +230 -0
  52. kodexa_document-8.0.0.dev20607935278/tests/test_labels.py +281 -0
  53. kodexa_document-8.0.0.dev20607935278/tests/test_memory.py +72 -0
  54. kodexa_document-8.0.0.dev20607935278/tests/test_metadata_persistence.py +143 -0
  55. kodexa_document-8.0.0.dev20607935278/tests/test_minimal.py +91 -0
  56. kodexa_document-8.0.0.dev20607935278/tests/test_mixins.py +230 -0
  57. kodexa_document-8.0.0.dev20607935278/tests/test_new_methods.py +120 -0
  58. kodexa_document-8.0.0.dev20607935278/tests/test_pretty_print.py +423 -0
  59. kodexa_document-8.0.0.dev20607935278/tests/test_processing_step.py +339 -0
  60. kodexa_document-8.0.0.dev20607935278/tests/test_processing_step_integration.py +249 -0
  61. kodexa_document-8.0.0.dev20607935278/tests/test_processing_steps.py +311 -0
  62. kodexa_document-8.0.0.dev20607935278/tests/test_remaining_issues.py +150 -0
  63. kodexa_document-8.0.0.dev20607935278/tests/test_rotation_methods.py +175 -0
  64. kodexa_document-8.0.0.dev20607935278/tests/test_source_metadata.py +216 -0
  65. kodexa_document-8.0.0.dev20607935278/tests/test_spatial_methods.py +293 -0
  66. kodexa_document-8.0.0.dev20607935278/tests/test_tag_class.py +251 -0
  67. kodexa_document-8.0.0.dev20607935278/tests/test_tag_instance.py +478 -0
  68. kodexa_document-8.0.0.dev20607935278/tests/test_tag_integration.py +135 -0
  69. kodexa_document-8.0.0.dev20607935278/tests/test_to_kddb.py +211 -0
@@ -0,0 +1,5 @@
1
+ # Changelog
2
+
3
+ All notable changes to the Python bindings for the Go-based Kodexa Document SDK will be documented in this file.
4
+
5
+ ## [Unreleased]
@@ -0,0 +1,37 @@
1
+ # Include native libraries (required for functionality)
2
+ recursive-include kodexa_document/_native *.so *.dll *.dylib *.h
3
+
4
+ # Include essential user documentation (shipped with package)
5
+ include README.md
6
+ include USAGE.md
7
+ include CHANGELOG.md
8
+ include docs/API_REFERENCE.md
9
+
10
+ # Include project metadata files
11
+ include pyproject.toml
12
+
13
+ # Exclude build tools and build documentation (not needed by end users)
14
+ prune build/
15
+ global-exclude build_*.py
16
+ global-exclude build_*.sh
17
+ global-exclude build_*.bat
18
+ global-exclude fix_windows.py
19
+
20
+ # Exclude development/debug files
21
+ exclude .work/
22
+ prune .work/
23
+ exclude .pytest_cache/
24
+ prune .pytest_cache/
25
+ global-exclude test_contentnode_lifecycle_old.py
26
+
27
+ # Exclude compiled Python files and caches
28
+ global-exclude __pycache__/
29
+ global-exclude *.pyc
30
+ global-exclude *.pyo
31
+ global-exclude *.pyd
32
+ global-exclude .Python
33
+ global-exclude pip-log.txt
34
+
35
+ # Exclude distribution build artifacts
36
+ prune dist/
37
+ prune *.egg-info/
@@ -0,0 +1,407 @@
1
+ Metadata-Version: 2.4
2
+ Name: kodexa-document
3
+ Version: 8.0.0.dev20607935278
4
+ Summary: High-performance Python bindings for the Go-based Kodexa Document SDK with in-memory processing
5
+ Author-email: Kodexa <support@kodexa.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/kodexa/kodexa-document
8
+ Project-URL: Repository, https://github.com/kodexa/kodexa-document
9
+ Project-URL: Documentation, https://docs.kodexa.com
10
+ Project-URL: Bug Tracker, https://github.com/kodexa/kodexa-document/issues
11
+ Keywords: document,processing,extraction,nlp,ai,sqlite,kddb
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Classifier: Topic :: Text Processing
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Text Processing :: Markup
23
+ Requires-Python: >=3.12
24
+ Description-Content-Type: text/markdown
25
+ Requires-Dist: cffi>=1.14.0
26
+ Requires-Dist: addict>=2.4.0
27
+ Requires-Dist: pydantic>=2.0.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=6.0; extra == "dev"
30
+ Requires-Dist: pytest-cov; extra == "dev"
31
+ Requires-Dist: black; extra == "dev"
32
+ Requires-Dist: isort; extra == "dev"
33
+ Requires-Dist: mypy; extra == "dev"
34
+ Requires-Dist: flake8; extra == "dev"
35
+ Requires-Dist: build; extra == "dev"
36
+ Requires-Dist: twine; extra == "dev"
37
+ Provides-Extra: test
38
+ Requires-Dist: pytest>=6.0; extra == "test"
39
+ Requires-Dist: pytest-cov; extra == "test"
40
+ Provides-Extra: lambda
41
+
42
+ # Kodexa Document Python
43
+
44
+ High-performance Python bindings for the Go-based Kodexa Document SDK using CFFI. Provides comprehensive document processing capabilities with ~100x performance improvement through in-memory operations.
45
+
46
+ ## Overview
47
+
48
+ This package provides mature Python bindings for the Go-based Kodexa Document SDK. It uses CFFI (C Foreign Function Interface) to communicate with the Go library, offering full access to hierarchical document processing, advanced querying, and rich metadata management.
49
+
50
+ **Key Highlights:**
51
+ - **Production Ready**: 413+ comprehensive tests covering all functionality
52
+ - **High Performance**: ~100x faster with in-memory mode (1.19ms vs 121ms)
53
+ - **Full Feature Set**: Complete document manipulation, querying, and persistence
54
+ - **Cross Platform**: Linux, macOS (Intel/ARM), Windows, AWS Lambda
55
+
56
+ ## Features
57
+
58
+ ### Core Document Operations
59
+ - **Document Creation**: From text, JSON, KDDB files, or from scratch
60
+ - **In-Memory Processing**: ~100x performance boost for temporary operations
61
+ - **Context Managers**: Automatic resource cleanup with `with` statements
62
+ - **Multiple Formats**: JSON export/import, KDDB persistence, dict conversion
63
+
64
+ ### Content Structure & Navigation
65
+ - **Hierarchical Nodes**: Document tree structure, similar to the DOM for web pages
66
+ - **Content Operations**: Rich text handling with content parts
67
+ - **Tree Navigation**: Parent/child relationships, sibling traversal, path queries
68
+ - **Node Management**: Create, modify, remove nodes with full hierarchy support
69
+
70
+ ### Advanced Querying
71
+ - **Selector Language**: XPath-like queries (`//paragraph[contains(@content, 'text')]`)
72
+ - **Variable Support**: Parameterized queries with variable substitution
73
+ - **Performance Options**: First-only results, relative queries from nodes
74
+ - **Rich Filtering**: Content-based, tag-based, and feature-based selection
75
+
76
+ ### Metadata & Annotations
77
+ - **Features System**: Key-value metadata with type organization
78
+ - **Tagging**: Content annotation with confidence scores and values
79
+ - **Document Labels**: Classification and categorization
80
+ - **Mixins**: Capability flags and behavior markers
81
+ - **External Data**: Arbitrary data storage with custom keys
82
+ - **Processing Steps**: Workflow tracking and validation rules
83
+
84
+ ### Spatial & Geometric Operations
85
+ - **Bounding Boxes**: Position and dimension tracking
86
+ - **Spatial Queries**: Location-based content selection
87
+ - **Coordinate Systems**: Flexible positioning support
88
+
89
+ ### Enterprise Features
90
+ - **Extraction Engine**: Advanced content extraction with taxonomies
91
+ - **Validation Framework**: Rule-based document validation
92
+ - **Statistics**: Comprehensive document metrics and analysis
93
+ - **Error Handling**: Comprehensive exception system with specific error types
94
+ - **Memory Management**: Automatic cleanup with finalizers
95
+
96
+ ## Installation
97
+
98
+ ```bash
99
+ pip install kodexa-document
100
+ ```
101
+
102
+ ## Quick Start
103
+
104
+ ```python
105
+ from kodexa_document import Document
106
+
107
+ # Create high-performance in-memory document
108
+ with Document(inmemory=True) as doc:
109
+ # Create document structure
110
+ root = doc.create_node("document", "My Document")
111
+ doc.content_node = root
112
+
113
+ section = doc.create_node("section", "Introduction", parent=root)
114
+ para = doc.create_node("paragraph", "Important content", parent=section)
115
+
116
+ # Add rich metadata
117
+ para.tag("important", confidence=0.95, value="key-point")
118
+ para.add_feature("style", "emphasis", "bold")
119
+ doc.add_label("technical-document")
120
+
121
+ # Query with selectors
122
+ important_nodes = doc.select("//paragraph[@tag='important']")
123
+ all_content = doc.select("//*[contains(@content, 'content')]")
124
+
125
+ # Export to different formats
126
+ json_str = doc.to_json(indent=2)
127
+ doc.save("output.kddb")
128
+
129
+ print(f"Found {len(important_nodes)} important paragraphs")
130
+ ```
131
+
132
+ ## Advanced Usage Examples
133
+
134
+ ### Document Processing Pipeline
135
+
136
+ ```python
137
+ from kodexa_document import Document
138
+ from kodexa_document.errors import DocumentError
139
+
140
+ def process_document(input_path, output_path):
141
+ """Complete document processing pipeline."""
142
+ with Document.from_kddb(input_path, inmemory=True) as doc:
143
+ # Analyze structure
144
+ all_nodes = doc.select("//*")
145
+ paragraphs = doc.select("//paragraph")
146
+
147
+ # Process content
148
+ for i, para in enumerate(paragraphs):
149
+ if len(para.content) > 100: # Long paragraphs
150
+ para.tag("detailed", confidence=0.8)
151
+ para.add_feature("analysis", "length", len(para.content))
152
+
153
+ if i == 0: # First paragraph
154
+ para.tag("introduction")
155
+
156
+ # Add document metadata
157
+ doc.set_metadata("processed", True)
158
+ doc.set_metadata("node_count", len(all_nodes))
159
+ doc.add_label("processed-document")
160
+
161
+ # Save results
162
+ doc.save(output_path)
163
+
164
+ return {
165
+ "uuid": doc.uuid,
166
+ "nodes": len(all_nodes),
167
+ "tagged": len(doc.get_all_tagged_nodes())
168
+ }
169
+
170
+ # Process with error handling
171
+ try:
172
+ result = process_document("input.kddb", "processed.kddb")
173
+ print(f"Processed document {result['uuid']}: {result['nodes']} nodes")
174
+ except DocumentError as e:
175
+ print(f"Processing failed: {e}")
176
+ ```
177
+
178
+ ### Content Analysis and Extraction
179
+
180
+ ```python
181
+ # Load and analyze document structure
182
+ with Document.from_text("Chapter 1\nIntroduction\nContent here",
183
+ separator="\n", inmemory=True) as doc:
184
+
185
+ # Navigate document hierarchy
186
+ root = doc.content_node
187
+ children = root.get_children()
188
+
189
+ # Rich querying
190
+ headers = doc.select("//paragraph[1]") # First paragraphs (likely headers)
191
+ long_content = doc.select("//paragraph[string-length(@content) > 50]")
192
+
193
+ # Feature analysis
194
+ for node in children:
195
+ node.add_feature("position", "index", node.index)
196
+ if "Chapter" in node.content:
197
+ node.tag("chapter-header")
198
+ node.add_feature("structure", "type", "header")
199
+
200
+ # Get comprehensive statistics
201
+ stats = doc.get_statistics()
202
+ tagged_nodes = doc.get_all_tagged_nodes()
203
+
204
+ print(f"Document structure: {len(children)} top-level nodes")
205
+ print(f"Tagged content: {len(tagged_nodes)} nodes")
206
+ print(f"Statistics: {stats}")
207
+ ```
208
+
209
+ ## Performance Comparison
210
+
211
+ ```python
212
+ import time
213
+
214
+ # In-memory processing (recommended for temporary operations)
215
+ start = time.time()
216
+ with Document(inmemory=True) as doc:
217
+ root = doc.create_node("document", "Fast processing")
218
+ doc.content_node = root
219
+ for i in range(1000):
220
+ doc.create_node("item", f"Item {i}", parent=root)
221
+ nodes = doc.select("//*")
222
+ inmemory_time = time.time() - start
223
+
224
+ # File-based processing (for persistence)
225
+ start = time.time()
226
+ with Document(inmemory=False) as doc:
227
+ root = doc.create_node("document", "Persistent processing")
228
+ doc.content_node = root
229
+ for i in range(1000):
230
+ doc.create_node("item", f"Item {i}", parent=root)
231
+ nodes = doc.select("//*")
232
+ file_time = time.time() - start
233
+
234
+ print(f"In-memory: {inmemory_time:.3f}s")
235
+ print(f"File-based: {file_time:.3f}s")
236
+ print(f"Performance improvement: {file_time/inmemory_time:.1f}x faster")
237
+ ```
238
+
239
+ ## Loading Documents
240
+
241
+ The `from_kddb` method supports flexible loading modes:
242
+
243
+ ```python
244
+ # Standard loading modes
245
+ doc = Document.from_kddb("input.kddb") # Detached copy (safe, default)
246
+ doc = Document.from_kddb("input.kddb", detached=False) # In-place editing
247
+ doc = Document.from_kddb("input.kddb", inmemory=True) # ~100x performance boost
248
+
249
+ # Load from bytes (API responses, downloads, etc.)
250
+ with open("document.kddb", "rb") as f:
251
+ kddb_bytes = f.read()
252
+ doc = Document.from_kddb(kddb_bytes, inmemory=True)
253
+
254
+ # Temporary files with auto-cleanup
255
+ doc = Document.from_kddb("temp.kddb", delete_on_close=True)
256
+ ```
257
+
258
+ | Parameter | Default | Description |
259
+ |-----------|---------|-------------|
260
+ | `detached` | `True` | `True` works on a detached copy; `False` edits the original file in place |
261
+ | `inmemory` | `False` | Loads the document into memory for ~100x faster processing |
262
+ | `delete_on_close` | `False` | Auto-deletes file when document closes |
263
+
264
+ ## Error Handling
265
+
266
+ ```python
267
+ from kodexa_document.errors import DocumentError, DocumentNotFoundError
268
+
269
+ # Robust error handling
270
+ try:
271
+ with Document.from_kddb("document.kddb", inmemory=True) as doc:
272
+ # Process document
273
+ nodes = doc.select("//paragraph")
274
+ for node in nodes:
275
+ node.tag("processed")
276
+
277
+ # Validate results
278
+ if not doc.uuid:
279
+ raise DocumentError("Invalid document state")
280
+
281
+ except DocumentNotFoundError:
282
+ print("Document file not found")
283
+ except DocumentError as e:
284
+ print(f"Document processing error: {e}")
285
+ except Exception as e:
286
+ print(f"Unexpected error: {e}")
287
+ ```
288
+
289
+ ## Architecture
290
+
291
+ ```
292
+ Python Application
293
+ ↓
294
+ CFFI Python Wrapper (413+ Tests)
295
+ ↓
296
+ Go Shared Library (CGO)
297
+ ↓
298
+ GORM Domain Layer
299
+ ↓
300
+ SQLite Database (File/Memory)
301
+ ```
302
+
303
+ **Performance Modes:**
304
+ - **In-Memory SQLite**: `:memory:` database for maximum speed
305
+ - **File-Based SQLite**: Persistent `.kddb` files for storage
306
+ - **Hybrid Mode**: Load from file, process in-memory, save back
307
+
308
+ ## Requirements
309
+
310
+ - Python 3.12+
311
+ - cffi >= 1.14.0, addict >= 2.4.0, pydantic >= 2.0.0
312
+ - Go shared library (automatically bundled in wheel)
313
+
314
+ ## Platform Support
315
+
316
+ - **Linux x86_64** - Primary development platform
317
+ - **macOS x86_64 & ARM64** - Intel and Apple Silicon support
318
+ - **Windows x86_64** - Full Windows compatibility
319
+ - **AWS Lambda** - Amazon Linux 2 optimization
320
+
321
+ ## Testing & Quality
322
+
323
+ - **413+ Comprehensive Tests** covering all functionality
324
+ - **100% Feature Coverage** - All advertised features are tested and working
325
+ - **Error Path Testing** - Comprehensive error handling validation
326
+ - **Performance Testing** - Memory usage and speed benchmarks
327
+ - **Cross-Platform Testing** - Validated on all supported platforms
328
+
329
+ ```bash
330
+ # Run comprehensive test suite
331
+ cd lib/python
332
+ source ../../venv/bin/activate
333
+ python -m pytest tests/ -v
334
+
335
+ # Test categories
336
+ python -m pytest tests/test_document.py -v # Core document operations
337
+ python -m pytest tests/test_contentnode_features_tags.py -v # Features and tags
338
+ python -m pytest tests/test_contentnode_selectors.py -v # Query system
339
+ python -m pytest tests/test_extraction.py -v # Advanced extraction
340
+ ```
341
+
342
+ ## Development Setup
343
+
344
+ ```bash
345
+ # Quick setup from repository root
346
+ python3 -m venv venv
347
+ source venv/bin/activate
348
+ pip install cffi pytest
349
+
350
+ # Build Go library and Python bindings
351
+ cd lib/go && make linux # or: make darwin, make windows
352
+ cd ../python
353
+
354
+ # Test installation
355
+ python -c "from kodexa_document import Document; print('Success!')"
356
+
357
+ # Run tests
358
+ python -m pytest tests/ -v
359
+ ```
360
+
361
+ ## Documentation
362
+
363
+ ### User Documentation
364
+ - **[USAGE.md](USAGE.md)** - Comprehensive usage examples and best practices
365
+ - **[docs/API_REFERENCE.md](docs/API_REFERENCE.md)** - Complete API reference
366
+
367
+ ### Build Documentation
368
+ - **[docs/BUILD_SCRIPTS_GUIDE.md](docs/BUILD_SCRIPTS_GUIDE.md)** - Build automation guide
369
+ - **[build/docs/BUILD.md](build/docs/BUILD.md)** - Detailed build instructions
370
+ - **[build/docs/WINDOWS_SETUP.md](build/docs/WINDOWS_SETUP.md)** - Windows development setup
371
+
372
+ ## Best Practices
373
+
374
+ 1. **Use `inmemory=True`** for temporary processing (~100x faster)
375
+ 2. **Use context managers** (`with` statements) for automatic cleanup
376
+ 3. **Handle specific exceptions** (DocumentError, DocumentNotFoundError)
377
+ 4. **Structure documents hierarchically** with proper parent-child relationships
378
+ 5. **Leverage selectors** for efficient document querying
379
+ 6. **Use features and tags** for rich content annotation
380
+ 7. **Set meaningful metadata** for document tracking and organization
381
+
382
+ ## Use Cases
383
+
384
+ - **Document Processing Pipelines** - ETL workflows for structured documents
385
+ - **Content Analysis** - Text mining, information extraction, document understanding
386
+ - **Document Transformation** - Format conversion, structure normalization
387
+ - **Search and Indexing** - Content indexing with rich metadata
388
+ - **Validation and Quality** - Document structure validation and quality assessment
389
+ - **Machine Learning** - Feature extraction for ML pipelines
390
+ - **Enterprise Integration** - High-performance document processing systems
391
+
392
+ ## Performance Characteristics
393
+
394
+ | Operation | In-Memory | File-Based | Improvement |
395
+ |-----------|-----------|------------|-------------|
396
+ | Document Creation | ~1.2ms | ~121ms | 100x |
397
+ | Node Creation (1000 nodes) | ~15ms | ~1.5s | 100x |
398
+ | Selector Queries | ~2ms | ~45ms | 22x |
399
+ | Feature/Tag Operations | ~0.5ms | ~25ms | 50x |
400
+
401
+ ## License
402
+
403
+ Apache-2.0, the same license as the main Kodexa Document SDK.
404
+
405
+ ---
406
+
407
+ **Ready to get started?** Check out [USAGE.md](USAGE.md) for comprehensive examples and [run the test suite](#testing--quality) to see all features in action!