lexiredact 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexiredact/__init__.py +139 -0
- lexiredact/chunking/__init__.py +18 -0
- lexiredact/chunking/chunker.py +321 -0
- lexiredact/chunking/json_exporter.py +103 -0
- lexiredact/chunking/pdf_loader.py +86 -0
- lexiredact/cli.py +177 -0
- lexiredact/config/__init__.py +16 -0
- lexiredact/config/defaults.py +121 -0
- lexiredact/config/loader.py +119 -0
- lexiredact/implementations/__init__.py +41 -0
- lexiredact/implementations/cache/__init__.py +10 -0
- lexiredact/implementations/cache/generic.py +112 -0
- lexiredact/implementations/cache/memory.py +120 -0
- lexiredact/implementations/cache/redis.py +182 -0
- lexiredact/implementations/embedding/__init__.py +7 -0
- lexiredact/implementations/embedding/fastembed.py +97 -0
- lexiredact/implementations/embedding/generic.py +101 -0
- lexiredact/implementations/tracker/__init__.py +10 -0
- lexiredact/implementations/tracker/mlflow.py +147 -0
- lexiredact/implementations/vectorstore/__init__.py +11 -0
- lexiredact/implementations/vectorstore/chroma.py +271 -0
- lexiredact/implementations/vectorstore/generic.py +120 -0
- lexiredact/interfaces/__init__.py +18 -0
- lexiredact/interfaces/cache.py +58 -0
- lexiredact/interfaces/embedder.py +48 -0
- lexiredact/interfaces/tracker.py +67 -0
- lexiredact/interfaces/vectorstore.py +89 -0
- lexiredact/metrics/__init__.py +21 -0
- lexiredact/metrics/stats.py +386 -0
- lexiredact/pipeline/__init__.py +11 -0
- lexiredact/pipeline/ingest.py +587 -0
- lexiredact/privacy/__init__.py +15 -0
- lexiredact/privacy/pii_detector.py +176 -0
- lexiredact/privacy/policy.py +135 -0
- lexiredact/privacy/redactor.py +110 -0
- lexiredact/py.typed +1 -0
- lexiredact/registry/__init__.py +9 -0
- lexiredact/registry/loader.py +521 -0
- lexiredact/utils/__init__.py +17 -0
- lexiredact/utils/hashing.py +60 -0
- lexiredact/utils/timing.py +122 -0
- lexiredact-0.1.0.dist-info/METADATA +100 -0
- lexiredact-0.1.0.dist-info/RECORD +47 -0
- lexiredact-0.1.0.dist-info/WHEEL +5 -0
- lexiredact-0.1.0.dist-info/entry_points.txt +2 -0
- lexiredact-0.1.0.dist-info/licenses/LICENSE +21 -0
- lexiredact-0.1.0.dist-info/top_level.txt +1 -0
lexiredact/__init__.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LexiRedact - Privacy-Preserving RAG Middleware
|
|
3
|
+
|
|
4
|
+
A Python SDK for protecting PII in vector databases while maintaining
|
|
5
|
+
semantic search quality through intelligent embedding and redaction.
|
|
6
|
+
|
|
7
|
+
Key Features:
|
|
8
|
+
- Automatic PII detection and redaction using Microsoft Presidio
|
|
9
|
+
- Embedding generation from original text (Shadow Mode architecture)
|
|
10
|
+
- Only sanitized text stored in vector databases
|
|
11
|
+
- Redis caching for performance optimization
|
|
12
|
+
- Pluggable architecture via dependency injection
|
|
13
|
+
- Comprehensive metrics and tracking
|
|
14
|
+
|
|
15
|
+
Basic Usage:
|
|
16
|
+
>>> import lexiredact as vs
|
|
17
|
+
>>>
|
|
18
|
+
>>> # Create pipeline with defaults
|
|
19
|
+
>>> pipeline = vs.IngestionPipeline()
|
|
20
|
+
>>> await pipeline.initialize()
|
|
21
|
+
>>>
|
|
22
|
+
>>> # Process documents
|
|
23
|
+
>>> doc = vs.Document(id="1", text="Contact John at john@example.com")
|
|
24
|
+
>>> result = await pipeline.process_document(doc)
|
|
25
|
+
>>>
|
|
26
|
+
>>> print(result.clean_text) # "Contact <PERSON> at <EMAIL_ADDRESS>"
|
|
27
|
+
>>> print(result.pii_entities) # ["PERSON", "EMAIL_ADDRESS"]
|
|
28
|
+
>>>
|
|
29
|
+
>>> await pipeline.shutdown()
|
|
30
|
+
|
|
31
|
+
Custom Configuration:
|
|
32
|
+
>>> from lexiredact import IngestionPipeline, load_config
|
|
33
|
+
>>>
|
|
34
|
+
>>> config = load_config(config_dict={
|
|
35
|
+
... "embedding_model": "BAAI/bge-base-en-v1.5",
|
|
36
|
+
... "cache_backend": "redis",
|
|
37
|
+
... "redis_host": "localhost"
|
|
38
|
+
... })
|
|
39
|
+
>>>
|
|
40
|
+
>>> pipeline = IngestionPipeline(config=config)
|
|
41
|
+
|
|
42
|
+
Custom Components:
|
|
43
|
+
>>> from lexiredact import IngestionPipeline
|
|
44
|
+
>>> from lexiredact.interfaces import Embedder
|
|
45
|
+
>>>
|
|
46
|
+
>>> class MyEmbedder(Embedder):
|
|
47
|
+
... # Custom implementation
|
|
48
|
+
... pass
|
|
49
|
+
>>>
|
|
50
|
+
>>> pipeline = IngestionPipeline(embedder=MyEmbedder())
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
__version__ = "0.1.0"
|
|
54
|
+
|
|
55
|
+
# Core pipeline
|
|
56
|
+
from .pipeline import IngestionPipeline, Document, ProcessedDocument
|
|
57
|
+
|
|
58
|
+
# Configuration
|
|
59
|
+
from .config import load_config, get_default_config, save_config_to_yaml
|
|
60
|
+
|
|
61
|
+
# Privacy components
|
|
62
|
+
from .privacy import PIIDetector, PIIRedactor, PIIPolicy
|
|
63
|
+
|
|
64
|
+
# Interfaces (for custom implementations)
|
|
65
|
+
from .interfaces import CacheBackend, Embedder, VectorStore, Tracker
|
|
66
|
+
|
|
67
|
+
# Default implementations
|
|
68
|
+
from .implementations import (
|
|
69
|
+
MemoryCache,
|
|
70
|
+
RedisCache,
|
|
71
|
+
GenericCache,
|
|
72
|
+
FastEmbedEmbedder,
|
|
73
|
+
GenericEmbedder,
|
|
74
|
+
ChromaVectorStore,
|
|
75
|
+
GenericVectorStore,
|
|
76
|
+
MLflowTracker,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# Metrics
|
|
80
|
+
from .metrics import (
|
|
81
|
+
MetricsCollector,
|
|
82
|
+
AggregateStats,
|
|
83
|
+
RetrievalAggregateStats,
|
|
84
|
+
RetrievalMetricsEvaluator,
|
|
85
|
+
RetrievalQueryMetrics,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Utilities
|
|
89
|
+
from .utils import hash_text, generate_cache_key, Timer
|
|
90
|
+
|
|
91
|
+
__all__ = [
|
|
92
|
+
# Version
|
|
93
|
+
"__version__",
|
|
94
|
+
|
|
95
|
+
# Core
|
|
96
|
+
"IngestionPipeline",
|
|
97
|
+
"Document",
|
|
98
|
+
"ProcessedDocument",
|
|
99
|
+
|
|
100
|
+
# Configuration
|
|
101
|
+
"load_config",
|
|
102
|
+
"get_default_config",
|
|
103
|
+
"save_config_to_yaml",
|
|
104
|
+
|
|
105
|
+
# Privacy
|
|
106
|
+
"PIIDetector",
|
|
107
|
+
"PIIRedactor",
|
|
108
|
+
"PIIPolicy",
|
|
109
|
+
|
|
110
|
+
# Interfaces
|
|
111
|
+
"CacheBackend",
|
|
112
|
+
"Embedder",
|
|
113
|
+
"VectorStore",
|
|
114
|
+
"Tracker",
|
|
115
|
+
|
|
116
|
+
# Generic adapters for custom models
|
|
117
|
+
"GenericCache",
|
|
118
|
+
"GenericEmbedder",
|
|
119
|
+
"GenericVectorStore",
|
|
120
|
+
|
|
121
|
+
# Implementations
|
|
122
|
+
"MemoryCache",
|
|
123
|
+
"RedisCache",
|
|
124
|
+
"FastEmbedEmbedder",
|
|
125
|
+
"ChromaVectorStore",
|
|
126
|
+
"MLflowTracker",
|
|
127
|
+
|
|
128
|
+
# Metrics
|
|
129
|
+
"MetricsCollector",
|
|
130
|
+
"AggregateStats",
|
|
131
|
+
"RetrievalAggregateStats",
|
|
132
|
+
"RetrievalMetricsEvaluator",
|
|
133
|
+
"RetrievalQueryMetrics",
|
|
134
|
+
|
|
135
|
+
# Utils
|
|
136
|
+
"hash_text",
|
|
137
|
+
"generate_cache_key",
|
|
138
|
+
"Timer",
|
|
139
|
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document chunking module for LexiRedact.
|
|
3
|
+
|
|
4
|
+
Converts PDFs and large text documents into manageable chunks
|
|
5
|
+
suitable for embedding and PII detection.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .chunker import Chunk, DocumentChunker, ChunkingStrategy
|
|
9
|
+
from .json_exporter import JSONExporter
|
|
10
|
+
from .pdf_loader import PDFLoader
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"Chunk",
|
|
14
|
+
"DocumentChunker",
|
|
15
|
+
"ChunkingStrategy",
|
|
16
|
+
"PDFLoader",
|
|
17
|
+
"JSONExporter",
|
|
18
|
+
]
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core document chunking logic.
|
|
3
|
+
Splits large documents into smaller chunks with overlap.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List, Dict, Any, Optional, Literal
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
import uuid
|
|
9
|
+
from enum import Enum
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ChunkingStrategy(str, Enum):
    """Chunking strategies understood by ``DocumentChunker``."""

    FIXED_SIZE = "fixed_size"  # Fixed-size character windows
    SENTENCE = "sentence"  # Sentence-aligned chunks
    PARAGRAPH = "paragraph"  # One chunk per paragraph
    HYBRID = "hybrid"  # Sentences grouped into chunks


@dataclass
class Chunk:
    """Single chunk of text."""

    id: str  # "<doc_id>_chunk_<chunk_index>"
    text: str  # Chunk content with surrounding whitespace removed
    chunk_index: int  # 0-based position of the chunk within its document
    start_char: int  # Start offset of the chunk's span in the source text
    end_char: int  # End offset (exclusive) of the chunk's span
    metadata: Dict[str, Any]  # Caller metadata plus source/strategy keys


class DocumentChunker:
    """
    Convert large documents into LexiRedact-compatible chunks.

    Support for:
    - Fixed-size chunking (characters), optionally snapped to sentence ends
    - Sentence-based chunking
    - Paragraph-based chunking
    - Overlap between chunks (fixed-size strategy only)
    """

    # A sentence terminator followed by whitespace / end of window, or a
    # line break. Used to snap fixed-size windows to a natural boundary.
    _WINDOW_BOUNDARY = re.compile(r"[.!?](?=\s|$)|\n")

    # Whitespace between a sentence terminator and an upper-case letter or
    # digit. Requiring whitespace keeps e-mail addresses and domain names
    # (which contain dots but no spaces) in one piece.
    _SENTENCE_GAP = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9])")

    def __init__(
        self,
        chunk_size: int = 512,  # Max characters per chunk
        overlap: int = 100,  # Overlap between chunks (chars)
        strategy: ChunkingStrategy = ChunkingStrategy.FIXED_SIZE,
        preserve_sentences: bool = True,  # Don't split mid-sentence
    ):
        """
        Initialize chunker.

        Args:
            chunk_size: Target chunk size in characters
            overlap: Overlap between chunks (to preserve context)
            strategy: Chunking strategy to use
            preserve_sentences: Don't split in middle of sentence

        Raises:
            ValueError: If ``chunk_size`` is not positive, ``overlap`` is
                negative, or ``overlap`` is not smaller than ``chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be greater than 0")
        if overlap < 0:
            raise ValueError("overlap must be greater than or equal to 0")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.overlap = overlap
        self.strategy = strategy
        self.preserve_sentences = preserve_sentences

    def chunk_text(
        self,
        text: str,
        doc_id: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[Chunk]:
        """
        Chunk a document.

        Args:
            text: Document text to chunk
            doc_id: Original document ID (source)
            metadata: Optional metadata to attach to chunks

        Returns:
            List of Chunk objects

        Raises:
            ValueError: If the configured strategy is not recognised.
        """
        metadata = metadata or {}

        # Accept both enum members and their plain string values.
        try:
            strategy = ChunkingStrategy(self.strategy)
        except ValueError:
            raise ValueError(f"Unknown strategy: {self.strategy}") from None

        handlers = {
            ChunkingStrategy.FIXED_SIZE: self._chunk_fixed_size,
            ChunkingStrategy.SENTENCE: self._chunk_by_sentence,
            ChunkingStrategy.PARAGRAPH: self._chunk_by_paragraph,
            ChunkingStrategy.HYBRID: self._chunk_hybrid,
        }
        return handlers[strategy](text, doc_id, metadata)

    def _make_chunk(
        self,
        doc_id: str,
        chunk_index: int,
        text: str,
        start_char: int,
        end_char: int,
        metadata: Dict[str, Any],
    ) -> Chunk:
        """Build a Chunk with the id and metadata keys shared by all strategies."""
        return Chunk(
            id=f"{doc_id}_chunk_{chunk_index}",
            text=text,
            chunk_index=chunk_index,
            start_char=start_char,
            end_char=end_char,
            metadata={
                **metadata,
                "source_doc_id": doc_id,
                "chunk_number": chunk_index,
                # Normalised so a strategy given as a plain string also works.
                "strategy": ChunkingStrategy(self.strategy).value,
            },
        )

    def _chunk_fixed_size(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Split into fixed-size chunks with overlap.

        ``start_char`` / ``end_char`` are the bounds of the window that was
        cut; ``text`` is that window with surrounding whitespace stripped.
        """
        chunks: List[Chunk] = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # If preserve_sentences, pull the end back to the last boundary.
            if self.preserve_sentences and end < text_len:
                boundaries = list(self._WINDOW_BOUNDARY.finditer(text[start:end]))
                # Only snap when the boundary lies beyond the overlap region.
                # Otherwise the next window would start at or before this
                # one and the loop would never advance.
                if boundaries and boundaries[-1].end() > self.overlap:
                    end = start + boundaries[-1].end()

            piece = text[start:end].strip()
            if piece:  # Skip whitespace-only windows
                chunks.append(
                    self._make_chunk(
                        doc_id, len(chunks), piece, start, end, metadata
                    )
                )

            if end >= text_len:
                break

            # Move start position (with overlap); always > previous start.
            start = end - self.overlap

        return chunks

    def _chunk_by_sentence(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """Group whole sentences into chunks of at most ``chunk_size`` characters."""
        return self._group_sentences(text, doc_id, metadata)

    def _chunk_by_paragraph(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """Split by paragraphs (double newline); blank paragraphs are skipped."""
        chunks: List[Chunk] = []
        cursor = 0  # Offset of the current raw paragraph in ``text``

        for raw in text.split('\n\n'):
            para = raw.strip()
            if para:
                # Account for leading whitespace removed by strip().
                start_char = cursor + (len(raw) - len(raw.lstrip()))
                chunks.append(
                    self._make_chunk(
                        doc_id,
                        len(chunks),
                        para,
                        start_char,
                        start_char + len(para),
                        metadata,
                    )
                )
            cursor += len(raw) + 2  # +2 for '\n\n'

        return chunks

    def _chunk_hybrid(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Hybrid: Group sentences into chunks of target size.
        Better than fixed_size because sentences stay together.
        """
        return self._group_sentences(text, doc_id, metadata)

    def _group_sentences(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Pack consecutive sentences into chunks without exceeding ``chunk_size``.

        A sentence longer than ``chunk_size`` becomes a chunk on its own.
        Chunk text joins the sentences with single spaces; ``start_char`` and
        ``end_char`` span from the first to the last grouped sentence in the
        original text.
        """
        chunks: List[Chunk] = []
        pending = []  # (start, end) spans waiting to be emitted
        pending_size = 0  # Sum of sentence lengths, excluding joining spaces

        def flush() -> None:
            joined = ' '.join(text[s:e] for s, e in pending)
            chunks.append(
                self._make_chunk(
                    doc_id,
                    len(chunks),
                    joined,
                    pending[0][0],
                    pending[-1][1],
                    metadata,
                )
            )

        for span in self._sentence_spans(text):
            size = span[1] - span[0]
            # If adding this sentence exceeds chunk_size, emit what we have.
            if pending and pending_size + size > self.chunk_size:
                flush()
                pending = []
                pending_size = 0
            pending.append(span)
            pending_size += size

        if pending:
            flush()

        return chunks

    def _sentence_spans(self, text: str) -> list:
        """Return ``(start, end)`` offsets into ``text`` for each sentence."""
        body = text.strip()
        if not body:
            return []

        lead = len(text) - len(text.lstrip())  # Offset of ``body`` in ``text``
        cuts = [(m.start(), m.end()) for m in self._SENTENCE_GAP.finditer(body)]
        cuts.append((len(body), len(body)))  # Sentinel closing the last sentence

        spans = []
        cursor = 0
        for gap_start, gap_end in cuts:
            raw = body[cursor:gap_start]
            core = raw.strip()
            if core:
                begin = lead + cursor + (len(raw) - len(raw.lstrip()))
                spans.append((begin, begin + len(core)))
            cursor = gap_end
        return spans

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text on sentence boundaries without breaking emails or domains.
        """
        return [text[begin:end] for begin, end in self._sentence_spans(text)]
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Export chunks as JSON in LexiRedact format.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import List, Dict, Any
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from .chunker import Chunk
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JSONExporter:
|
|
12
|
+
"""Export chunks to JSON format compatible with LexiRedact."""
|
|
13
|
+
|
|
14
|
+
@staticmethod
|
|
15
|
+
def to_lexiredact_format(
|
|
16
|
+
chunks: List[Chunk],
|
|
17
|
+
output_path: str | Path | None = None,
|
|
18
|
+
wrap_documents: bool = False,
|
|
19
|
+
) -> str | List[Dict[str, Any]] | Dict[str, List[Dict[str, Any]]]:
|
|
20
|
+
"""
|
|
21
|
+
Convert chunks to LexiRedact JSON format.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
chunks: List of Chunk objects
|
|
25
|
+
output_path: Optional path to save JSON file
|
|
26
|
+
wrap_documents: Export CLI-ready payload as
|
|
27
|
+
{"documents": [...]} when True
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
JSON string (if output_path) or payload object (if None)
|
|
31
|
+
|
|
32
|
+
Example output format:
|
|
33
|
+
[
|
|
34
|
+
{
|
|
35
|
+
"id": "doc1_chunk_0",
|
|
36
|
+
"text": "First chunk of text...",
|
|
37
|
+
"metadata": {
|
|
38
|
+
"source_doc_id": "doc1",
|
|
39
|
+
"chunk_number": 0,
|
|
40
|
+
"chunk_size": 512
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
...
|
|
44
|
+
]
|
|
45
|
+
"""
|
|
46
|
+
chunk_dicts = [
|
|
47
|
+
{
|
|
48
|
+
"id": chunk.id,
|
|
49
|
+
"text": chunk.text,
|
|
50
|
+
"metadata": chunk.metadata
|
|
51
|
+
}
|
|
52
|
+
for chunk in chunks
|
|
53
|
+
]
|
|
54
|
+
payload: List[Dict[str, Any]] | Dict[str, List[Dict[str, Any]]]
|
|
55
|
+
payload = {"documents": chunk_dicts} if wrap_documents else chunk_dicts
|
|
56
|
+
|
|
57
|
+
if output_path:
|
|
58
|
+
output_path = Path(output_path)
|
|
59
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
60
|
+
|
|
61
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
62
|
+
json.dump(payload, f, indent=2, ensure_ascii=False)
|
|
63
|
+
|
|
64
|
+
return json.dumps(payload, indent=2, ensure_ascii=False)
|
|
65
|
+
|
|
66
|
+
return payload
|
|
67
|
+
|
|
68
|
+
@staticmethod
|
|
69
|
+
def to_cli_input(
|
|
70
|
+
chunks: List[Chunk],
|
|
71
|
+
output_path: str | Path | None = None,
|
|
72
|
+
) -> str | Dict[str, List[Dict[str, Any]]]:
|
|
73
|
+
"""Export chunks in the JSON shape expected by `lexiredact process`."""
|
|
74
|
+
return JSONExporter.to_lexiredact_format(
|
|
75
|
+
chunks,
|
|
76
|
+
output_path=output_path,
|
|
77
|
+
wrap_documents=True,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
@staticmethod
|
|
81
|
+
def to_jsonl(
|
|
82
|
+
chunks: List[Chunk],
|
|
83
|
+
output_path: str | Path,
|
|
84
|
+
) -> None:
|
|
85
|
+
"""
|
|
86
|
+
Export chunks as JSONL (one JSON per line).
|
|
87
|
+
Useful for streaming/large datasets.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
chunks: List of Chunk objects
|
|
91
|
+
output_path: Path to save JSONL file
|
|
92
|
+
"""
|
|
93
|
+
output_path = Path(output_path)
|
|
94
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
95
|
+
|
|
96
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
97
|
+
for chunk in chunks:
|
|
98
|
+
line = {
|
|
99
|
+
"id": chunk.id,
|
|
100
|
+
"text": chunk.text,
|
|
101
|
+
"metadata": chunk.metadata
|
|
102
|
+
}
|
|
103
|
+
f.write(json.dumps(line, ensure_ascii=False) + '\n')
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF extraction for document chunking.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Dict, Any
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PDFLoader:
|
|
10
|
+
"""Load and extract text from PDF files."""
|
|
11
|
+
|
|
12
|
+
def __init__(self, use_ocr: bool = False):
|
|
13
|
+
"""
|
|
14
|
+
Initialize PDF loader.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
use_ocr: Use OCR for scanned PDFs (requires pytesseract)
|
|
18
|
+
"""
|
|
19
|
+
self.use_ocr = use_ocr
|
|
20
|
+
|
|
21
|
+
@staticmethod
|
|
22
|
+
def extract_text(pdf_path: str | Path) -> str:
|
|
23
|
+
"""
|
|
24
|
+
Extract text from PDF.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
pdf_path: Path to PDF file
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
Extracted text
|
|
31
|
+
|
|
32
|
+
Raises:
|
|
33
|
+
ImportError: If pypdf not installed
|
|
34
|
+
FileNotFoundError: If PDF not found
|
|
35
|
+
"""
|
|
36
|
+
try:
|
|
37
|
+
from pypdf import PdfReader
|
|
38
|
+
except ImportError:
|
|
39
|
+
raise ImportError(
|
|
40
|
+
"PDF support requires pypdf. "
|
|
41
|
+
"Install with: pip install pypdf"
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
pdf_path = Path(pdf_path)
|
|
45
|
+
if not pdf_path.exists():
|
|
46
|
+
raise FileNotFoundError(f"PDF not found: {pdf_path}")
|
|
47
|
+
|
|
48
|
+
text = []
|
|
49
|
+
with open(pdf_path, 'rb') as file:
|
|
50
|
+
reader = PdfReader(file)
|
|
51
|
+
for page in reader.pages:
|
|
52
|
+
text.append(page.extract_text())
|
|
53
|
+
|
|
54
|
+
return '\n\n'.join(text)
|
|
55
|
+
|
|
56
|
+
@staticmethod
|
|
57
|
+
def extract_metadata(pdf_path: str | Path) -> Dict[str, Any]:
|
|
58
|
+
"""
|
|
59
|
+
Extract metadata from PDF.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
pdf_path: Path to PDF file
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
Metadata dictionary
|
|
66
|
+
"""
|
|
67
|
+
try:
|
|
68
|
+
from pypdf import PdfReader
|
|
69
|
+
except ImportError:
|
|
70
|
+
raise ImportError("PDF support requires pypdf")
|
|
71
|
+
|
|
72
|
+
pdf_path = Path(pdf_path)
|
|
73
|
+
metadata = {}
|
|
74
|
+
|
|
75
|
+
with open(pdf_path, 'rb') as file:
|
|
76
|
+
reader = PdfReader(file)
|
|
77
|
+
if reader.metadata:
|
|
78
|
+
metadata = {
|
|
79
|
+
"title": reader.metadata.get("/Title", ""),
|
|
80
|
+
"author": reader.metadata.get("/Author", ""),
|
|
81
|
+
"subject": reader.metadata.get("/Subject", ""),
|
|
82
|
+
"pages": len(reader.pages),
|
|
83
|
+
"pdf_file": str(pdf_path.name),
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
return metadata
|