lexiredact 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexiredact-0.1.0/LICENSE +21 -0
- lexiredact-0.1.0/MANIFEST.in +19 -0
- lexiredact-0.1.0/PKG-INFO +100 -0
- lexiredact-0.1.0/README.md +59 -0
- lexiredact-0.1.0/lexiredact/__init__.py +139 -0
- lexiredact-0.1.0/lexiredact/chunking/__init__.py +18 -0
- lexiredact-0.1.0/lexiredact/chunking/chunker.py +321 -0
- lexiredact-0.1.0/lexiredact/chunking/json_exporter.py +103 -0
- lexiredact-0.1.0/lexiredact/chunking/pdf_loader.py +86 -0
- lexiredact-0.1.0/lexiredact/cli.py +177 -0
- lexiredact-0.1.0/lexiredact/config/__init__.py +16 -0
- lexiredact-0.1.0/lexiredact/config/defaults.py +121 -0
- lexiredact-0.1.0/lexiredact/config/loader.py +119 -0
- lexiredact-0.1.0/lexiredact/implementations/__init__.py +41 -0
- lexiredact-0.1.0/lexiredact/implementations/cache/__init__.py +10 -0
- lexiredact-0.1.0/lexiredact/implementations/cache/generic.py +112 -0
- lexiredact-0.1.0/lexiredact/implementations/cache/memory.py +120 -0
- lexiredact-0.1.0/lexiredact/implementations/cache/redis.py +182 -0
- lexiredact-0.1.0/lexiredact/implementations/embedding/__init__.py +7 -0
- lexiredact-0.1.0/lexiredact/implementations/embedding/fastembed.py +97 -0
- lexiredact-0.1.0/lexiredact/implementations/embedding/generic.py +101 -0
- lexiredact-0.1.0/lexiredact/implementations/tracker/__init__.py +10 -0
- lexiredact-0.1.0/lexiredact/implementations/tracker/mlflow.py +147 -0
- lexiredact-0.1.0/lexiredact/implementations/vectorstore/__init__.py +11 -0
- lexiredact-0.1.0/lexiredact/implementations/vectorstore/chroma.py +271 -0
- lexiredact-0.1.0/lexiredact/implementations/vectorstore/generic.py +120 -0
- lexiredact-0.1.0/lexiredact/interfaces/__init__.py +18 -0
- lexiredact-0.1.0/lexiredact/interfaces/cache.py +58 -0
- lexiredact-0.1.0/lexiredact/interfaces/embedder.py +48 -0
- lexiredact-0.1.0/lexiredact/interfaces/tracker.py +67 -0
- lexiredact-0.1.0/lexiredact/interfaces/vectorstore.py +89 -0
- lexiredact-0.1.0/lexiredact/metrics/__init__.py +21 -0
- lexiredact-0.1.0/lexiredact/metrics/stats.py +386 -0
- lexiredact-0.1.0/lexiredact/pipeline/__init__.py +11 -0
- lexiredact-0.1.0/lexiredact/pipeline/ingest.py +587 -0
- lexiredact-0.1.0/lexiredact/privacy/__init__.py +15 -0
- lexiredact-0.1.0/lexiredact/privacy/pii_detector.py +176 -0
- lexiredact-0.1.0/lexiredact/privacy/policy.py +135 -0
- lexiredact-0.1.0/lexiredact/privacy/redactor.py +110 -0
- lexiredact-0.1.0/lexiredact/py.typed +1 -0
- lexiredact-0.1.0/lexiredact/registry/__init__.py +9 -0
- lexiredact-0.1.0/lexiredact/registry/loader.py +521 -0
- lexiredact-0.1.0/lexiredact/utils/__init__.py +17 -0
- lexiredact-0.1.0/lexiredact/utils/hashing.py +60 -0
- lexiredact-0.1.0/lexiredact/utils/timing.py +122 -0
- lexiredact-0.1.0/lexiredact.egg-info/PKG-INFO +100 -0
- lexiredact-0.1.0/lexiredact.egg-info/SOURCES.txt +52 -0
- lexiredact-0.1.0/lexiredact.egg-info/dependency_links.txt +1 -0
- lexiredact-0.1.0/lexiredact.egg-info/entry_points.txt +2 -0
- lexiredact-0.1.0/lexiredact.egg-info/requires.txt +21 -0
- lexiredact-0.1.0/lexiredact.egg-info/top_level.txt +1 -0
- lexiredact-0.1.0/pyproject.toml +65 -0
- lexiredact-0.1.0/setup.cfg +4 -0
- lexiredact-0.1.0/setup.py +3 -0
lexiredact-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Baihela Abid Hussain
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE*
|
|
3
|
+
|
|
4
|
+
recursive-include lexiredact *.py
|
|
5
|
+
recursive-include lexiredact *.typed
|
|
6
|
+
|
|
7
|
+
prune backend
|
|
8
|
+
prune benchmarks
|
|
9
|
+
prune data
|
|
10
|
+
prune dist
|
|
11
|
+
prune .tmp-build
|
|
12
|
+
prune lexiredact_data
|
|
13
|
+
prune venv
|
|
14
|
+
|
|
15
|
+
global-exclude *.py[cod]
|
|
16
|
+
global-exclude __pycache__
|
|
17
|
+
global-exclude *.so
|
|
18
|
+
exclude mlflow.db
|
|
19
|
+
exclude requirements-backend.txt
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lexiredact
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Privacy-First Vector Database for Sensitive Data
|
|
5
|
+
Author-email: Shwetan Londhe <shwetan.college@gmail.com>, Varad Limbkar <varadlimbkar@gmail.com>, Baihela Husain <baihelahusain@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lexiredact/lexiredact
|
|
8
|
+
Project-URL: Repository, https://github.com/lexiredact/lexiredact
|
|
9
|
+
Project-URL: Documentation, https://github.com/lexiredact/lexiredact#documentation
|
|
10
|
+
Project-URL: Issues, https://github.com/lexiredact/lexiredact/issues
|
|
11
|
+
Keywords: pii,privacy,vector-database,embedding,rag
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: presidio-analyzer>=2.2.0
|
|
24
|
+
Requires-Dist: presidio-anonymizer>=2.2.0
|
|
25
|
+
Requires-Dist: fastembed>=0.2.0
|
|
26
|
+
Requires-Dist: chromadb>=0.4.0
|
|
27
|
+
Requires-Dist: pydantic>=2.0.0
|
|
28
|
+
Requires-Dist: pyyaml>=6.0
|
|
29
|
+
Requires-Dist: numpy>=1.24.0
|
|
30
|
+
Provides-Extra: pdf
|
|
31
|
+
Requires-Dist: pypdf>=4.0.0; extra == "pdf"
|
|
32
|
+
Provides-Extra: redis
|
|
33
|
+
Requires-Dist: redis[async]>=5.0.0; extra == "redis"
|
|
34
|
+
Provides-Extra: mlflow
|
|
35
|
+
Requires-Dist: mlflow>=2.10.0; extra == "mlflow"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: pypdf>=4.0.0; extra == "all"
|
|
38
|
+
Requires-Dist: redis[async]>=5.0.0; extra == "all"
|
|
39
|
+
Requires-Dist: mlflow>=2.10.0; extra == "all"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
# LexiRedact
|
|
43
|
+
|
|
44
|
+
LexiRedact is a Python package for privacy-first document ingestion in RAG and vector database workflows. It detects PII, redacts sensitive text before storage, and preserves retrieval quality by generating embeddings from the original text while storing only sanitized content.
|
|
45
|
+
|
|
46
|
+
## Install
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install lexiredact
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Optional extras:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install "lexiredact[pdf]"
|
|
56
|
+
pip install "lexiredact[redis]"
|
|
57
|
+
pip install "lexiredact[mlflow]"
|
|
58
|
+
pip install "lexiredact[all]"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## What It Focuses On
|
|
62
|
+
|
|
63
|
+
- PII detection with Presidio
|
|
64
|
+
- safe redaction before vector-store persistence
|
|
65
|
+
- configurable ingestion pipeline components
|
|
66
|
+
- operational metrics for privacy and latency
|
|
67
|
+
- optional retrieval evaluation helpers for model comparison
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import asyncio
|
|
73
|
+
import lexiredact as lr
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def main() -> None:
|
|
77
|
+
pipeline = lr.IngestionPipeline()
|
|
78
|
+
await pipeline.initialize()
|
|
79
|
+
|
|
80
|
+
result = await pipeline.process_document(
|
|
81
|
+
lr.Document(
|
|
82
|
+
id="doc-1",
|
|
83
|
+
text="Contact Jane Doe at jane@example.com or 555-0101",
|
|
84
|
+
metadata={"source": "demo"},
|
|
85
|
+
)
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
print(result.clean_text)
|
|
89
|
+
print(result.pii_entities)
|
|
90
|
+
|
|
91
|
+
await pipeline.shutdown()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
asyncio.run(main())
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Docs And Examples
|
|
98
|
+
|
|
99
|
+
- docs: [`docs/`](./docs)
|
|
100
|
+
- examples: [`examples/`](./examples)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# LexiRedact
|
|
2
|
+
|
|
3
|
+
LexiRedact is a Python package for privacy-first document ingestion in RAG and vector database workflows. It detects PII, redacts sensitive text before storage, and preserves retrieval quality by generating embeddings from the original text while storing only sanitized content.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install lexiredact
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Optional extras:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install "lexiredact[pdf]"
|
|
15
|
+
pip install "lexiredact[redis]"
|
|
16
|
+
pip install "lexiredact[mlflow]"
|
|
17
|
+
pip install "lexiredact[all]"
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## What It Focuses On
|
|
21
|
+
|
|
22
|
+
- PII detection with Presidio
|
|
23
|
+
- safe redaction before vector-store persistence
|
|
24
|
+
- configurable ingestion pipeline components
|
|
25
|
+
- operational metrics for privacy and latency
|
|
26
|
+
- optional retrieval evaluation helpers for model comparison
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import asyncio
|
|
32
|
+
import lexiredact as lr
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def main() -> None:
|
|
36
|
+
pipeline = lr.IngestionPipeline()
|
|
37
|
+
await pipeline.initialize()
|
|
38
|
+
|
|
39
|
+
result = await pipeline.process_document(
|
|
40
|
+
lr.Document(
|
|
41
|
+
id="doc-1",
|
|
42
|
+
text="Contact Jane Doe at jane@example.com or 555-0101",
|
|
43
|
+
metadata={"source": "demo"},
|
|
44
|
+
)
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
print(result.clean_text)
|
|
48
|
+
print(result.pii_entities)
|
|
49
|
+
|
|
50
|
+
await pipeline.shutdown()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
asyncio.run(main())
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Docs And Examples
|
|
57
|
+
|
|
58
|
+
- docs: the `docs/` directory of the [source repository](https://github.com/lexiredact/lexiredact)
|
|
59
|
+
- examples: the `examples/` directory of the [source repository](https://github.com/lexiredact/lexiredact)
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""
LexiRedact - Privacy-Preserving RAG Middleware

A Python SDK for protecting PII in vector databases while maintaining
semantic search quality through intelligent embedding and redaction.

Key Features:
- Automatic PII detection and redaction using Microsoft Presidio
- Embedding generation from original text (Shadow Mode architecture)
- Only sanitized text stored in vector databases
- Redis caching for performance optimization
- Pluggable architecture via dependency injection
- Comprehensive metrics and tracking

The ``await`` calls in the examples below must run inside a coroutine
(for example one started with ``asyncio.run``).

Basic Usage:
    >>> import lexiredact as lr
    >>>
    >>> # Create pipeline with defaults
    >>> pipeline = lr.IngestionPipeline()
    >>> await pipeline.initialize()
    >>>
    >>> # Process documents
    >>> doc = lr.Document(id="1", text="Contact John at john@example.com")
    >>> result = await pipeline.process_document(doc)
    >>>
    >>> print(result.clean_text)    # "Contact <PERSON> at <EMAIL_ADDRESS>"
    >>> print(result.pii_entities)  # ["PERSON", "EMAIL_ADDRESS"]
    >>>
    >>> await pipeline.shutdown()

Custom Configuration:
    >>> from lexiredact import IngestionPipeline, load_config
    >>>
    >>> config = load_config(config_dict={
    ...     "embedding_model": "BAAI/bge-base-en-v1.5",
    ...     "cache_backend": "redis",
    ...     "redis_host": "localhost"
    ... })
    >>>
    >>> pipeline = IngestionPipeline(config=config)

Custom Components:
    >>> from lexiredact import IngestionPipeline
    >>> from lexiredact.interfaces import Embedder
    >>>
    >>> class MyEmbedder(Embedder):
    ...     # Custom implementation
    ...     pass
    >>>
    >>> pipeline = IngestionPipeline(embedder=MyEmbedder())
"""

# Package version string; the package metadata in this release also reports 0.1.0.
__version__ = "0.1.0"
|
|
54
|
+
|
|
55
|
+
# Core pipeline
|
|
56
|
+
from .pipeline import IngestionPipeline, Document, ProcessedDocument
|
|
57
|
+
|
|
58
|
+
# Configuration
|
|
59
|
+
from .config import load_config, get_default_config, save_config_to_yaml
|
|
60
|
+
|
|
61
|
+
# Privacy components
|
|
62
|
+
from .privacy import PIIDetector, PIIRedactor, PIIPolicy
|
|
63
|
+
|
|
64
|
+
# Interfaces (for custom implementations)
|
|
65
|
+
from .interfaces import CacheBackend, Embedder, VectorStore, Tracker
|
|
66
|
+
|
|
67
|
+
# Default implementations
|
|
68
|
+
from .implementations import (
|
|
69
|
+
MemoryCache,
|
|
70
|
+
RedisCache,
|
|
71
|
+
GenericCache,
|
|
72
|
+
FastEmbedEmbedder,
|
|
73
|
+
GenericEmbedder,
|
|
74
|
+
ChromaVectorStore,
|
|
75
|
+
GenericVectorStore,
|
|
76
|
+
MLflowTracker,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# Metrics
|
|
80
|
+
from .metrics import (
|
|
81
|
+
MetricsCollector,
|
|
82
|
+
AggregateStats,
|
|
83
|
+
RetrievalAggregateStats,
|
|
84
|
+
RetrievalMetricsEvaluator,
|
|
85
|
+
RetrievalQueryMetrics,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Utilities
|
|
89
|
+
from .utils import hash_text, generate_cache_key, Timer
|
|
90
|
+
|
|
91
|
+
# Names exported by ``from lexiredact import *``; every entry is bound by
# the assignments and imports earlier in this module.
__all__ = [
    # Version
    "__version__",

    # Core
    "IngestionPipeline",
    "Document",
    "ProcessedDocument",

    # Configuration
    "load_config",
    "get_default_config",
    "save_config_to_yaml",

    # Privacy
    "PIIDetector",
    "PIIRedactor",
    "PIIPolicy",

    # Interfaces
    "CacheBackend",
    "Embedder",
    "VectorStore",
    "Tracker",

    # Custom models
    "GenericCache",
    "GenericEmbedder",
    "GenericVectorStore",

    # Implementations
    "MemoryCache",
    "RedisCache",
    "FastEmbedEmbedder",
    "ChromaVectorStore",
    "MLflowTracker",

    # Metrics
    "MetricsCollector",
    "AggregateStats",
    "RetrievalAggregateStats",
    "RetrievalMetricsEvaluator",
    "RetrievalQueryMetrics",

    # Utils
    "hash_text",
    "generate_cache_key",
    "Timer",
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
Document chunking module for LexiRedact.

Converts PDFs and large text documents into manageable chunks
suitable for embedding and PII detection.

The names listed in ``__all__`` are re-exported from the ``chunker``,
``json_exporter`` and ``pdf_loader`` submodules.
"""

from .chunker import Chunk, DocumentChunker, ChunkingStrategy
from .json_exporter import JSONExporter
from .pdf_loader import PDFLoader

# Public API of the ``lexiredact.chunking`` package.
__all__ = [
    "Chunk",
    "DocumentChunker",
    "ChunkingStrategy",
    "PDFLoader",
    "JSONExporter",
]
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core document chunking logic.
|
|
3
|
+
Splits large documents into smaller chunks with overlap.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List, Dict, Any, Optional, Literal
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
import uuid
|
|
9
|
+
from enum import Enum
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ChunkingStrategy(str, Enum):
    """Strategies available for splitting a document into chunks.

    Members also subclass ``str``, so each one compares equal to its
    plain string value (for example ``"hybrid"``).
    """

    # Fixed token/char chunks.
    FIXED_SIZE = "fixed_size"
    # Split by sentences.
    SENTENCE = "sentence"
    # Split by paragraphs.
    PARAGRAPH = "paragraph"
    # Sentences grouped into chunks.
    HYBRID = "hybrid"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class Chunk:
    """One piece of a chunked document, with its position and metadata."""

    # Identifier of this chunk.
    id: str
    # Text content of the chunk.
    text: str
    # Zero-based position of the chunk within its source document.
    chunk_index: int
    # Character offset at which the chunk starts.
    start_char: int
    # Character offset at which the chunk ends.
    end_char: int
    # Free-form metadata attached to the chunk.
    metadata: Dict[str, Any]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DocumentChunker:
    """
    Convert large documents into LexiRedact-compatible chunks.

    Support for:
    - Fixed-size chunking (measured in characters), with overlap
    - Sentence-based chunking
    - Paragraph-based chunking
    - Hybrid chunking (whole sentences grouped up to the target size)

    ``overlap`` and ``preserve_sentences`` only influence the fixed-size
    strategy; the sentence, hybrid and paragraph strategies never repeat
    text between chunks.
    """

    # Sentence terminator followed by whitespace or the end of the window,
    # or a bare newline. Used to snap fixed-size chunk ends.
    _BOUNDARY_RE = re.compile(r"[.!?](?=\s|$)|\n")
    # Whitespace that follows a terminator and precedes an uppercase letter
    # or digit. Requiring the whitespace keeps "a@b.com" in one piece.
    _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9])")

    def __init__(
        self,
        chunk_size: int = 512,  # Max characters per chunk
        overlap: int = 100,  # Overlap between chunks (chars)
        strategy: ChunkingStrategy = ChunkingStrategy.FIXED_SIZE,
        preserve_sentences: bool = True,  # Don't split mid-sentence
    ):
        """
        Initialize chunker.

        Args:
            chunk_size: Target chunk size in characters
            overlap: Overlap between chunks (to preserve context)
            strategy: Chunking strategy to use; a ``ChunkingStrategy``
                member or its string value (e.g. ``"hybrid"``)
            preserve_sentences: Don't split in middle of sentence

        Raises:
            ValueError: If ``chunk_size`` is not positive, ``overlap`` is
                negative or not smaller than ``chunk_size``, or
                ``strategy`` is not a known strategy.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be greater than 0")
        if overlap < 0:
            raise ValueError("overlap must be greater than or equal to 0")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        # Accept plain strings too: the chunk metadata reads
        # ``strategy.value``, which a bare ``str`` does not have.
        try:
            strategy = ChunkingStrategy(strategy)
        except ValueError:
            raise ValueError(f"Unknown strategy: {strategy}") from None

        self.chunk_size = chunk_size
        self.overlap = overlap
        self.strategy = strategy
        self.preserve_sentences = preserve_sentences

    def chunk_text(
        self,
        text: str,
        doc_id: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[Chunk]:
        """
        Chunk a document.

        Args:
            text: Document text to chunk
            doc_id: Original document ID (source)
            metadata: Optional metadata to attach to chunks

        Returns:
            List of Chunk objects

        Raises:
            ValueError: If ``self.strategy`` is not a known strategy.
        """
        metadata = metadata or {}

        # Re-validate here because ``self.strategy`` is a public attribute
        # and may have been reassigned after construction.
        try:
            strategy = ChunkingStrategy(self.strategy)
        except ValueError:
            raise ValueError(f"Unknown strategy: {self.strategy}") from None

        handlers = {
            ChunkingStrategy.FIXED_SIZE: self._chunk_fixed_size,
            ChunkingStrategy.SENTENCE: self._chunk_by_sentence,
            ChunkingStrategy.PARAGRAPH: self._chunk_by_paragraph,
            ChunkingStrategy.HYBRID: self._chunk_hybrid,
        }
        return handlers[strategy](text, doc_id, metadata)

    def _build_chunk(
        self,
        chunk_text: str,
        chunk_index: int,
        start_char: int,
        end_char: int,
        doc_id: str,
        metadata: Dict[str, Any],
    ) -> Chunk:
        """Create a Chunk carrying the standard id and metadata keys."""
        return Chunk(
            id=f"{doc_id}_chunk_{chunk_index}",
            text=chunk_text,
            chunk_index=chunk_index,
            start_char=start_char,
            end_char=end_char,
            metadata={
                **metadata,
                "source_doc_id": doc_id,
                "chunk_number": chunk_index,
                "strategy": ChunkingStrategy(self.strategy).value,
            },
        )

    def _chunk_fixed_size(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Split into fixed-size chunks with overlap.

        ``start_char``/``end_char`` delimit the raw window in ``text``;
        the chunk's ``text`` is that window with surrounding whitespace
        stripped. Windows that are empty after stripping are skipped.
        """
        chunks: List[Chunk] = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = min(start + self.chunk_size, text_length)

            # If preserve_sentences, pull the end back to the last sentence
            # boundary inside the window.
            if self.preserve_sentences and end < text_length:
                boundaries = list(self._BOUNDARY_RE.finditer(text[start:end]))
                if boundaries:
                    snapped_end = start + boundaries[-1].end()
                    # Only snap when the next window would still begin after
                    # this one. A boundary within `overlap` characters of
                    # `start` would otherwise stall the loop or push `start`
                    # negative, silently dropping text.
                    if snapped_end - self.overlap > start:
                        end = snapped_end

            chunk_text = text[start:end].strip()

            if chunk_text:  # Skip empty chunks
                chunks.append(
                    self._build_chunk(
                        chunk_text, len(chunks), start, end, doc_id, metadata
                    )
                )

            if end >= text_length:
                break

            # Move start position (with overlap), always making progress
            # even if chunk_size/overlap were modified after construction.
            start = max(end - self.overlap, start + 1)

        return chunks

    def _chunk_sentence_groups(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Group consecutive sentences into chunks of roughly ``chunk_size``.

        A chunk is closed when adding the next sentence would push the sum
        of its sentence lengths past ``chunk_size``; a single sentence
        longer than ``chunk_size`` still becomes one chunk. Chunk text is
        the sentences joined by single spaces. ``start_char``/``end_char``
        are the real offsets in ``text`` of the first sentence's start and
        the last sentence's end.
        """
        chunks: List[Chunk] = []
        group: List[str] = []
        group_size = 0
        group_start = 0
        group_end = 0
        cursor = 0

        for sentence in self._split_sentences(text):
            # Sentences are exact substrings of `text`, in order, so
            # searching from the previous sentence's end finds this one.
            sentence_start = text.find(sentence, cursor)
            sentence_end = sentence_start + len(sentence)
            cursor = sentence_end

            # If adding this sentence exceeds chunk_size, save current chunk
            if group and group_size + len(sentence) > self.chunk_size:
                chunks.append(
                    self._build_chunk(
                        ' '.join(group), len(chunks),
                        group_start, group_end, doc_id, metadata,
                    )
                )
                group = []
                group_size = 0

            if not group:
                group_start = sentence_start
            group.append(sentence)
            group_size += len(sentence)
            group_end = sentence_end

        # Add remaining chunk
        if group:
            chunks.append(
                self._build_chunk(
                    ' '.join(group), len(chunks),
                    group_start, group_end, doc_id, metadata,
                )
            )

        return chunks

    def _chunk_by_sentence(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """Split by sentences while preserving punctuation-heavy tokens."""
        return self._chunk_sentence_groups(text, doc_id, metadata)

    def _chunk_by_paragraph(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Split by paragraphs (double newline).

        Each non-blank paragraph becomes one chunk with surrounding
        whitespace stripped. ``start_char``/``end_char`` locate the
        stripped paragraph in ``text``.
        """
        chunks: List[Chunk] = []
        cursor = 0  # offset in `text` of the current raw piece

        for raw_para in text.split('\n\n'):
            para = raw_para.strip()
            if para:
                # Skip the whitespace that strip() removed from the front.
                start_char = cursor + len(raw_para) - len(raw_para.lstrip())
                chunks.append(
                    self._build_chunk(
                        para, len(chunks),
                        start_char, start_char + len(para), doc_id, metadata,
                    )
                )
            # Advance past every piece, including blank ones, so that
            # later offsets stay aligned with `text`. +2 for '\n\n'.
            cursor += len(raw_para) + 2

        return chunks

    def _chunk_hybrid(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Hybrid: Group sentences into chunks of target size.
        Better than fixed_size because sentences stay together.
        """
        return self._chunk_sentence_groups(text, doc_id, metadata)

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text on sentence boundaries without breaking emails or domains.

        A boundary is whitespace that follows ``.``, ``!`` or ``?`` and
        precedes an uppercase ASCII letter or a digit. Returns the
        non-empty sentences in order; blank input gives an empty list.
        """
        normalized = text.strip()
        if not normalized:
            return []

        parts = self._SENTENCE_SPLIT_RE.split(normalized)
        return [part.strip() for part in parts if part.strip()]
|