lexiredact 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. lexiredact-0.1.0/LICENSE +21 -0
  2. lexiredact-0.1.0/MANIFEST.in +19 -0
  3. lexiredact-0.1.0/PKG-INFO +100 -0
  4. lexiredact-0.1.0/README.md +59 -0
  5. lexiredact-0.1.0/lexiredact/__init__.py +139 -0
  6. lexiredact-0.1.0/lexiredact/chunking/__init__.py +18 -0
  7. lexiredact-0.1.0/lexiredact/chunking/chunker.py +321 -0
  8. lexiredact-0.1.0/lexiredact/chunking/json_exporter.py +103 -0
  9. lexiredact-0.1.0/lexiredact/chunking/pdf_loader.py +86 -0
  10. lexiredact-0.1.0/lexiredact/cli.py +177 -0
  11. lexiredact-0.1.0/lexiredact/config/__init__.py +16 -0
  12. lexiredact-0.1.0/lexiredact/config/defaults.py +121 -0
  13. lexiredact-0.1.0/lexiredact/config/loader.py +119 -0
  14. lexiredact-0.1.0/lexiredact/implementations/__init__.py +41 -0
  15. lexiredact-0.1.0/lexiredact/implementations/cache/__init__.py +10 -0
  16. lexiredact-0.1.0/lexiredact/implementations/cache/generic.py +112 -0
  17. lexiredact-0.1.0/lexiredact/implementations/cache/memory.py +120 -0
  18. lexiredact-0.1.0/lexiredact/implementations/cache/redis.py +182 -0
  19. lexiredact-0.1.0/lexiredact/implementations/embedding/__init__.py +7 -0
  20. lexiredact-0.1.0/lexiredact/implementations/embedding/fastembed.py +97 -0
  21. lexiredact-0.1.0/lexiredact/implementations/embedding/generic.py +101 -0
  22. lexiredact-0.1.0/lexiredact/implementations/tracker/__init__.py +10 -0
  23. lexiredact-0.1.0/lexiredact/implementations/tracker/mlflow.py +147 -0
  24. lexiredact-0.1.0/lexiredact/implementations/vectorstore/__init__.py +11 -0
  25. lexiredact-0.1.0/lexiredact/implementations/vectorstore/chroma.py +271 -0
  26. lexiredact-0.1.0/lexiredact/implementations/vectorstore/generic.py +120 -0
  27. lexiredact-0.1.0/lexiredact/interfaces/__init__.py +18 -0
  28. lexiredact-0.1.0/lexiredact/interfaces/cache.py +58 -0
  29. lexiredact-0.1.0/lexiredact/interfaces/embedder.py +48 -0
  30. lexiredact-0.1.0/lexiredact/interfaces/tracker.py +67 -0
  31. lexiredact-0.1.0/lexiredact/interfaces/vectorstore.py +89 -0
  32. lexiredact-0.1.0/lexiredact/metrics/__init__.py +21 -0
  33. lexiredact-0.1.0/lexiredact/metrics/stats.py +386 -0
  34. lexiredact-0.1.0/lexiredact/pipeline/__init__.py +11 -0
  35. lexiredact-0.1.0/lexiredact/pipeline/ingest.py +587 -0
  36. lexiredact-0.1.0/lexiredact/privacy/__init__.py +15 -0
  37. lexiredact-0.1.0/lexiredact/privacy/pii_detector.py +176 -0
  38. lexiredact-0.1.0/lexiredact/privacy/policy.py +135 -0
  39. lexiredact-0.1.0/lexiredact/privacy/redactor.py +110 -0
  40. lexiredact-0.1.0/lexiredact/py.typed +1 -0
  41. lexiredact-0.1.0/lexiredact/registry/__init__.py +9 -0
  42. lexiredact-0.1.0/lexiredact/registry/loader.py +521 -0
  43. lexiredact-0.1.0/lexiredact/utils/__init__.py +17 -0
  44. lexiredact-0.1.0/lexiredact/utils/hashing.py +60 -0
  45. lexiredact-0.1.0/lexiredact/utils/timing.py +122 -0
  46. lexiredact-0.1.0/lexiredact.egg-info/PKG-INFO +100 -0
  47. lexiredact-0.1.0/lexiredact.egg-info/SOURCES.txt +52 -0
  48. lexiredact-0.1.0/lexiredact.egg-info/dependency_links.txt +1 -0
  49. lexiredact-0.1.0/lexiredact.egg-info/entry_points.txt +2 -0
  50. lexiredact-0.1.0/lexiredact.egg-info/requires.txt +21 -0
  51. lexiredact-0.1.0/lexiredact.egg-info/top_level.txt +1 -0
  52. lexiredact-0.1.0/pyproject.toml +65 -0
  53. lexiredact-0.1.0/setup.cfg +4 -0
  54. lexiredact-0.1.0/setup.py +3 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Baihela Abid Hussain
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,19 @@
1
+ include README.md
2
+ include LICENSE*
3
+
4
+ recursive-include lexiredact *.py
5
+ recursive-include lexiredact *.typed
6
+
7
+ prune backend
8
+ prune benchmarks
9
+ prune data
10
+ prune dist
11
+ prune .tmp-build
12
+ prune lexiredact_data
13
+ prune venv
14
+
15
+ global-exclude *.py[cod]
16
+ global-exclude __pycache__
17
+ global-exclude *.so
18
+ exclude mlflow.db
19
+ exclude requirements-backend.txt
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: lexiredact
3
+ Version: 0.1.0
4
+ Summary: Privacy-First Vector Database for Sensitive Data
5
+ Author-email: Shwetan Londhe <shwetan.college@gmail.com>, Varad Limbkar <varadlimbkar@gmail.com>, Baihela Husain <baihelahusain@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/lexiredact/lexiredact
8
+ Project-URL: Repository, https://github.com/lexiredact/lexiredact
9
+ Project-URL: Documentation, https://github.com/lexiredact/lexiredact#documentation
10
+ Project-URL: Issues, https://github.com/lexiredact/lexiredact/issues
11
+ Keywords: pii,privacy,vector-database,embedding,rag
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: presidio-analyzer>=2.2.0
24
+ Requires-Dist: presidio-anonymizer>=2.2.0
25
+ Requires-Dist: fastembed>=0.2.0
26
+ Requires-Dist: chromadb>=0.4.0
27
+ Requires-Dist: pydantic>=2.0.0
28
+ Requires-Dist: pyyaml>=6.0
29
+ Requires-Dist: numpy>=1.24.0
30
+ Provides-Extra: pdf
31
+ Requires-Dist: pypdf>=4.0.0; extra == "pdf"
32
+ Provides-Extra: redis
33
+ Requires-Dist: redis[async]>=5.0.0; extra == "redis"
34
+ Provides-Extra: mlflow
35
+ Requires-Dist: mlflow>=2.10.0; extra == "mlflow"
36
+ Provides-Extra: all
37
+ Requires-Dist: pypdf>=4.0.0; extra == "all"
38
+ Requires-Dist: redis[async]>=5.0.0; extra == "all"
39
+ Requires-Dist: mlflow>=2.10.0; extra == "all"
40
+ Dynamic: license-file
41
+
42
+ # LexiRedact
43
+
44
+ LexiRedact is a Python package for privacy-first document ingestion in RAG and vector database workflows. It detects PII, redacts sensitive text before storage, and preserves retrieval quality by generating embeddings from the original text while storing only sanitized content.
45
+
46
+ ## Install
47
+
48
+ ```bash
49
+ pip install lexiredact
50
+ ```
51
+
52
+ Optional extras:
53
+
54
+ ```bash
55
+ pip install "lexiredact[pdf]"
56
+ pip install "lexiredact[redis]"
57
+ pip install "lexiredact[mlflow]"
58
+ pip install "lexiredact[all]"
59
+ ```
60
+
61
+ ## What It Focuses On
62
+
63
+ - PII detection with Presidio
64
+ - safe redaction before vector-store persistence
65
+ - configurable ingestion pipeline components
66
+ - operational metrics for privacy and latency
67
+ - optional retrieval evaluation helpers for model comparison
68
+
69
+ ## Quick Start
70
+
71
+ ```python
72
+ import asyncio
73
+ import lexiredact as lr
74
+
75
+
76
+ async def main() -> None:
77
+ pipeline = lr.IngestionPipeline()
78
+ await pipeline.initialize()
79
+
80
+ result = await pipeline.process_document(
81
+ lr.Document(
82
+ id="doc-1",
83
+ text="Contact Jane Doe at jane@example.com or 555-0101",
84
+ metadata={"source": "demo"},
85
+ )
86
+ )
87
+
88
+ print(result.clean_text)
89
+ print(result.pii_entities)
90
+
91
+ await pipeline.shutdown()
92
+
93
+
94
+ asyncio.run(main())
95
+ ```
96
+
97
+ ## Docs And Examples
98
+
99
+ - docs: [`docs/`](./docs)
100
+ - examples: [`examples/`](./examples)
@@ -0,0 +1,59 @@
1
+ # LexiRedact
2
+
3
+ LexiRedact is a Python package for privacy-first document ingestion in RAG and vector database workflows. It detects PII, redacts sensitive text before storage, and preserves retrieval quality by generating embeddings from the original text while storing only sanitized content.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install lexiredact
9
+ ```
10
+
11
+ Optional extras:
12
+
13
+ ```bash
14
+ pip install "lexiredact[pdf]"
15
+ pip install "lexiredact[redis]"
16
+ pip install "lexiredact[mlflow]"
17
+ pip install "lexiredact[all]"
18
+ ```
19
+
20
+ ## What It Focuses On
21
+
22
+ - PII detection with Presidio
23
+ - safe redaction before vector-store persistence
24
+ - configurable ingestion pipeline components
25
+ - operational metrics for privacy and latency
26
+ - optional retrieval evaluation helpers for model comparison
27
+
28
+ ## Quick Start
29
+
30
+ ```python
31
+ import asyncio
32
+ import lexiredact as lr
33
+
34
+
35
+ async def main() -> None:
36
+ pipeline = lr.IngestionPipeline()
37
+ await pipeline.initialize()
38
+
39
+ result = await pipeline.process_document(
40
+ lr.Document(
41
+ id="doc-1",
42
+ text="Contact Jane Doe at jane@example.com or 555-0101",
43
+ metadata={"source": "demo"},
44
+ )
45
+ )
46
+
47
+ print(result.clean_text)
48
+ print(result.pii_entities)
49
+
50
+ await pipeline.shutdown()
51
+
52
+
53
+ asyncio.run(main())
54
+ ```
55
+
56
+ ## Docs And Examples
57
+
58
+ - docs: [`docs/`](./docs)
59
+ - examples: [`examples/`](./examples)
@@ -0,0 +1,139 @@
1
+ """
2
+ LexiRedact - Privacy-Preserving RAG Middleware
3
+
4
+ A Python SDK for protecting PII in vector databases while maintaining
5
+ semantic search quality through intelligent embedding and redaction.
6
+
7
+ Key Features:
8
+ - Automatic PII detection and redaction using Microsoft Presidio
9
+ - Embedding generation from original text (Shadow Mode architecture)
10
+ - Only sanitized text stored in vector databases
11
+ - Redis caching for performance optimization
12
+ - Pluggable architecture via dependency injection
13
+ - Comprehensive metrics and tracking
14
+
15
+ Basic Usage:
16
+ >>> import lexiredact as lr
17
+ >>>
18
+ >>> # Create pipeline with defaults
19
+ >>> pipeline = lr.IngestionPipeline()
20
+ >>> await pipeline.initialize()
21
+ >>>
22
+ >>> # Process documents
23
+ >>> doc = lr.Document(id="1", text="Contact John at john@example.com")
24
+ >>> result = await pipeline.process_document(doc)
25
+ >>>
26
+ >>> print(result.clean_text) # "Contact <PERSON> at <EMAIL_ADDRESS>"
27
+ >>> print(result.pii_entities) # ["PERSON", "EMAIL_ADDRESS"]
28
+ >>>
29
+ >>> await pipeline.shutdown()
30
+
31
+ Custom Configuration:
32
+ >>> from lexiredact import IngestionPipeline, load_config
33
+ >>>
34
+ >>> config = load_config(config_dict={
35
+ ... "embedding_model": "BAAI/bge-base-en-v1.5",
36
+ ... "cache_backend": "redis",
37
+ ... "redis_host": "localhost"
38
+ ... })
39
+ >>>
40
+ >>> pipeline = IngestionPipeline(config=config)
41
+
42
+ Custom Components:
43
+ >>> from lexiredact import IngestionPipeline
44
+ >>> from lexiredact.interfaces import Embedder
45
+ >>>
46
+ >>> class MyEmbedder(Embedder):
47
+ ... # Custom implementation
48
+ ... pass
49
+ >>>
50
+ >>> pipeline = IngestionPipeline(embedder=MyEmbedder())
51
+ """
52
+
53
+ __version__ = "0.1.0"
54
+
55
+ # Core pipeline
56
+ from .pipeline import IngestionPipeline, Document, ProcessedDocument
57
+
58
+ # Configuration
59
+ from .config import load_config, get_default_config, save_config_to_yaml
60
+
61
+ # Privacy components
62
+ from .privacy import PIIDetector, PIIRedactor, PIIPolicy
63
+
64
+ # Interfaces (for custom implementations)
65
+ from .interfaces import CacheBackend, Embedder, VectorStore, Tracker
66
+
67
+ # Default implementations
68
+ from .implementations import (
69
+ MemoryCache,
70
+ RedisCache,
71
+ GenericCache,
72
+ FastEmbedEmbedder,
73
+ GenericEmbedder,
74
+ ChromaVectorStore,
75
+ GenericVectorStore,
76
+ MLflowTracker,
77
+ )
78
+
79
+ # Metrics
80
+ from .metrics import (
81
+ MetricsCollector,
82
+ AggregateStats,
83
+ RetrievalAggregateStats,
84
+ RetrievalMetricsEvaluator,
85
+ RetrievalQueryMetrics,
86
+ )
87
+
88
+ # Utilities
89
+ from .utils import hash_text, generate_cache_key, Timer
90
+
91
+ __all__ = [
92
+ # Version
93
+ "__version__",
94
+
95
+ # Core
96
+ "IngestionPipeline",
97
+ "Document",
98
+ "ProcessedDocument",
99
+
100
+ # Configuration
101
+ "load_config",
102
+ "get_default_config",
103
+ "save_config_to_yaml",
104
+
105
+ # Privacy
106
+ "PIIDetector",
107
+ "PIIRedactor",
108
+ "PIIPolicy",
109
+
110
+ # Interfaces
111
+ "CacheBackend",
112
+ "Embedder",
113
+ "VectorStore",
114
+ "Tracker",
115
+
116
+ # Custom models
117
+ "GenericCache",
118
+ "GenericEmbedder",
119
+ "GenericVectorStore",
120
+
121
+ # Implementations
122
+ "MemoryCache",
123
+ "RedisCache",
124
+ "FastEmbedEmbedder",
125
+ "ChromaVectorStore",
126
+ "MLflowTracker",
127
+
128
+ # Metrics
129
+ "MetricsCollector",
130
+ "AggregateStats",
131
+ "RetrievalAggregateStats",
132
+ "RetrievalMetricsEvaluator",
133
+ "RetrievalQueryMetrics",
134
+
135
+ # Utils
136
+ "hash_text",
137
+ "generate_cache_key",
138
+ "Timer",
139
+ ]
@@ -0,0 +1,18 @@
1
+ """
2
+ Document chunking module for LexiRedact.
3
+
4
+ Converts PDFs and large text documents into manageable chunks
5
+ suitable for embedding and PII detection.
6
+ """
7
+
8
+ from .chunker import Chunk, DocumentChunker, ChunkingStrategy
9
+ from .json_exporter import JSONExporter
10
+ from .pdf_loader import PDFLoader
11
+
12
+ __all__ = [
13
+ "Chunk",
14
+ "DocumentChunker",
15
+ "ChunkingStrategy",
16
+ "PDFLoader",
17
+ "JSONExporter",
18
+ ]
@@ -0,0 +1,321 @@
1
+ """
2
+ Core document chunking logic.
3
+ Splits large documents into smaller chunks with overlap.
4
+ """
5
+
6
+ from typing import List, Dict, Any, Optional, Literal
7
+ from dataclasses import dataclass
8
+ import uuid
9
+ from enum import Enum
10
+ import re
11
+
12
+
13
class ChunkingStrategy(str, Enum):
    """Chunking strategies accepted by ``DocumentChunker``.

    Members subclass ``str``, so each one compares equal to its plain string
    value (``ChunkingStrategy.HYBRID == "hybrid"``).
    """

    FIXED_SIZE = "fixed_size"  # Fixed-size character windows
    SENTENCE = "sentence"  # Sentence-aligned chunks
    PARAGRAPH = "paragraph"  # One chunk per paragraph
    HYBRID = "hybrid"  # Sentences grouped into chunks


@dataclass
class Chunk:
    """Single chunk of text cut from a source document."""

    # "<doc_id>_chunk_<chunk_index>"
    id: str
    # Chunk content with surrounding whitespace removed
    text: str
    # 0-based position of the chunk within its document
    chunk_index: int
    # Offset into the source text of the first character of the chunk
    start_char: int
    # Offset into the source text one past the last character of the chunk
    end_char: int
    # Caller metadata plus "source_doc_id", "chunk_number" and "strategy"
    metadata: Dict[str, Any]


class DocumentChunker:
    """
    Convert large documents into LexiRedact-compatible chunks.

    Support for:
    - Fixed-size chunking (character windows, with overlap)
    - Sentence-based chunking
    - Paragraph-based chunking
    - Hybrid chunking (sentences grouped up to ``chunk_size``)

    ``overlap`` is applied by the fixed-size strategy only; the sentence,
    paragraph and hybrid strategies produce non-overlapping chunks.

    Every chunk's ``start_char``/``end_char`` index into the text passed to
    ``chunk_text``. For the fixed-size and paragraph strategies
    ``text[start_char:end_char] == chunk.text`` always holds. For the
    sentence/hybrid strategies the offsets span the first to the last
    sentence of the chunk, while ``chunk.text`` joins those sentences with a
    single space (so runs of whitespace between sentences are collapsed).
    """

    # A sentence terminator followed by whitespace/end of input, or a newline.
    _WINDOW_BOUNDARY = re.compile(r"[.!?](?=\s|$)|\n")
    # Whitespace that follows a terminator and precedes an upper-case letter
    # or digit; dots inside emails/domains are not followed by whitespace and
    # therefore never split.
    _SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9])")

    def __init__(
        self,
        chunk_size: int = 512,  # Max characters per chunk
        overlap: int = 100,  # Overlap between chunks (chars)
        strategy: ChunkingStrategy = ChunkingStrategy.FIXED_SIZE,
        preserve_sentences: bool = True,  # Don't split mid-sentence
    ):
        """
        Initialize chunker.

        Args:
            chunk_size: Target chunk size in characters
            overlap: Overlap between fixed-size chunks (to preserve context)
            strategy: Chunking strategy to use (enum member or its string value)
            preserve_sentences: Prefer ending fixed-size chunks on a sentence
                boundary instead of cutting mid-sentence

        Raises:
            ValueError: If ``chunk_size`` is not positive, ``overlap`` is
                negative, or ``overlap`` is not smaller than ``chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be greater than 0")
        if overlap < 0:
            raise ValueError("overlap must be greater than or equal to 0")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.overlap = overlap
        self.strategy = strategy
        self.preserve_sentences = preserve_sentences

    def chunk_text(
        self,
        text: str,
        doc_id: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[Chunk]:
        """
        Chunk a document.

        Args:
            text: Document text to chunk
            doc_id: Original document ID (source)
            metadata: Optional metadata to attach to chunks (copied, never
                mutated)

        Returns:
            List of Chunk objects (empty when ``text`` has no non-whitespace
            content)

        Raises:
            ValueError: If the configured strategy is not a known
                ``ChunkingStrategy``.
        """
        metadata = metadata or {}

        # Accept both enum members and their plain string values.
        try:
            strategy = ChunkingStrategy(self.strategy)
        except ValueError:
            raise ValueError(f"Unknown strategy: {self.strategy}") from None

        handlers = {
            ChunkingStrategy.FIXED_SIZE: self._chunk_fixed_size,
            ChunkingStrategy.SENTENCE: self._chunk_by_sentence,
            ChunkingStrategy.PARAGRAPH: self._chunk_by_paragraph,
            ChunkingStrategy.HYBRID: self._chunk_hybrid,
        }
        return handlers[strategy](text, doc_id, metadata)

    def _make_chunk(
        self,
        doc_id: str,
        index: int,
        text: str,
        start_char: int,
        end_char: int,
        metadata: Dict[str, Any],
    ) -> Chunk:
        """Build a ``Chunk`` with the standard id and metadata fields."""
        return Chunk(
            id=f"{doc_id}_chunk_{index}",
            text=text,
            chunk_index=index,
            start_char=start_char,
            end_char=end_char,
            metadata={
                **metadata,
                "source_doc_id": doc_id,
                "chunk_number": index,
                "strategy": ChunkingStrategy(self.strategy).value,
            },
        )

    def _snap_to_boundary(self, text: str, start: int, end: int) -> int:
        """
        Return the best sentence-aligned end for the window ``text[start:end]``.

        The last boundary inside the window is used, provided it lies beyond
        the overlap region; otherwise ``end`` is returned unchanged. Requiring
        the boundary to be past the overlap guarantees that the next window
        (``end - overlap``) starts strictly after ``start``.
        """
        limit = end - start
        # One extra character of context lets the lookahead see what really
        # follows the final character of the window, so a "." inside a token
        # such as "example.com" is not mistaken for the end of a sentence.
        window = text[start:end + 1]
        best = None
        for match in self._WINDOW_BOUNDARY.finditer(window):
            if self.overlap < match.end() <= limit:
                best = match.end()
        return end if best is None else start + best

    def _chunk_fixed_size(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """Split into fixed-size chunks with overlap."""
        chunks: List[Chunk] = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # If preserve_sentences, pull the end back to a sentence boundary
            if self.preserve_sentences and end < text_len:
                end = self._snap_to_boundary(text, start, end)

            raw = text[start:end]
            body = raw.strip()

            if body:  # Skip whitespace-only windows
                # Report offsets of the stripped text, not of the raw window.
                begin = start + (len(raw) - len(raw.lstrip()))
                chunks.append(
                    self._make_chunk(
                        doc_id, len(chunks), body, begin, begin + len(body), metadata
                    )
                )

            if end >= text_len:
                break

            # Step back by the overlap, but always move forward by at least
            # one character so the loop terminates.
            start = max(end - self.overlap, start + 1)

        return chunks

    def _sentence_spans(self, text: str) -> List[tuple]:
        """Return ``(start, end)`` offsets in ``text`` of every sentence."""
        cuts = [(m.start(), m.end()) for m in self._SENTENCE_BREAK.finditer(text)]
        cuts.append((len(text), len(text)))

        spans = []
        cursor = 0
        for stop, resume in cuts:
            piece = text[cursor:stop]
            body = piece.strip()
            if body:
                begin = cursor + (len(piece) - len(piece.lstrip()))
                spans.append((begin, begin + len(body)))
            cursor = resume
        return spans

    def _group_sentences(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Pack consecutive sentences into chunks of at most ``chunk_size`` chars.

        The size check includes the single spaces used to join sentences. A
        sentence that is longer than ``chunk_size`` on its own is emitted as
        one oversized chunk rather than being cut.
        """
        chunks: List[Chunk] = []
        group: List[tuple] = []  # spans of the sentences in the pending chunk
        group_len = 0  # length of the pending chunk once joined with spaces

        for span in self._sentence_spans(text):
            sentence_len = span[1] - span[0]
            projected = group_len + 1 + sentence_len if group else sentence_len

            if group and projected > self.chunk_size:
                chunks.append(self._chunk_from_group(text, doc_id, len(chunks), group, metadata))
                group = [span]
                group_len = sentence_len
            else:
                group.append(span)
                group_len = projected

        if group:
            chunks.append(self._chunk_from_group(text, doc_id, len(chunks), group, metadata))

        return chunks

    def _chunk_from_group(
        self,
        text: str,
        doc_id: str,
        index: int,
        group: List[tuple],
        metadata: Dict[str, Any]
    ) -> Chunk:
        """Join the sentences in ``group`` into one chunk spanning all of them."""
        body = ' '.join(text[begin:end] for begin, end in group)
        return self._make_chunk(doc_id, index, body, group[0][0], group[-1][1], metadata)

    def _chunk_by_sentence(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """Split on sentence boundaries, packing sentences up to chunk_size."""
        return self._group_sentences(text, doc_id, metadata)

    def _chunk_by_paragraph(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """Split by paragraphs (double newline), one chunk per paragraph."""
        chunks: List[Chunk] = []
        cursor = 0  # offset in ``text`` of the current raw paragraph

        for raw in text.split('\n\n'):
            body = raw.strip()
            if body:
                begin = cursor + (len(raw) - len(raw.lstrip()))
                chunks.append(
                    self._make_chunk(
                        doc_id, len(chunks), body, begin, begin + len(body), metadata
                    )
                )
            # Advance past the raw (unstripped) paragraph and its separator,
            # including empty paragraphs, so offsets stay aligned with ``text``.
            cursor += len(raw) + 2

        return chunks

    def _chunk_hybrid(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Hybrid: Group sentences into chunks of target size.
        Better than fixed_size because sentences stay together.
        """
        return self._group_sentences(text, doc_id, metadata)

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text on sentence boundaries without breaking emails or domains.
        """
        return [text[begin:end] for begin, end in self._sentence_spans(text)]