lexiredact 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. lexiredact/__init__.py +139 -0
  2. lexiredact/chunking/__init__.py +18 -0
  3. lexiredact/chunking/chunker.py +321 -0
  4. lexiredact/chunking/json_exporter.py +103 -0
  5. lexiredact/chunking/pdf_loader.py +86 -0
  6. lexiredact/cli.py +177 -0
  7. lexiredact/config/__init__.py +16 -0
  8. lexiredact/config/defaults.py +121 -0
  9. lexiredact/config/loader.py +119 -0
  10. lexiredact/implementations/__init__.py +41 -0
  11. lexiredact/implementations/cache/__init__.py +10 -0
  12. lexiredact/implementations/cache/generic.py +112 -0
  13. lexiredact/implementations/cache/memory.py +120 -0
  14. lexiredact/implementations/cache/redis.py +182 -0
  15. lexiredact/implementations/embedding/__init__.py +7 -0
  16. lexiredact/implementations/embedding/fastembed.py +97 -0
  17. lexiredact/implementations/embedding/generic.py +101 -0
  18. lexiredact/implementations/tracker/__init__.py +10 -0
  19. lexiredact/implementations/tracker/mlflow.py +147 -0
  20. lexiredact/implementations/vectorstore/__init__.py +11 -0
  21. lexiredact/implementations/vectorstore/chroma.py +271 -0
  22. lexiredact/implementations/vectorstore/generic.py +120 -0
  23. lexiredact/interfaces/__init__.py +18 -0
  24. lexiredact/interfaces/cache.py +58 -0
  25. lexiredact/interfaces/embedder.py +48 -0
  26. lexiredact/interfaces/tracker.py +67 -0
  27. lexiredact/interfaces/vectorstore.py +89 -0
  28. lexiredact/metrics/__init__.py +21 -0
  29. lexiredact/metrics/stats.py +386 -0
  30. lexiredact/pipeline/__init__.py +11 -0
  31. lexiredact/pipeline/ingest.py +587 -0
  32. lexiredact/privacy/__init__.py +15 -0
  33. lexiredact/privacy/pii_detector.py +176 -0
  34. lexiredact/privacy/policy.py +135 -0
  35. lexiredact/privacy/redactor.py +110 -0
  36. lexiredact/py.typed +1 -0
  37. lexiredact/registry/__init__.py +9 -0
  38. lexiredact/registry/loader.py +521 -0
  39. lexiredact/utils/__init__.py +17 -0
  40. lexiredact/utils/hashing.py +60 -0
  41. lexiredact/utils/timing.py +122 -0
  42. lexiredact-0.1.0.dist-info/METADATA +100 -0
  43. lexiredact-0.1.0.dist-info/RECORD +47 -0
  44. lexiredact-0.1.0.dist-info/WHEEL +5 -0
  45. lexiredact-0.1.0.dist-info/entry_points.txt +2 -0
  46. lexiredact-0.1.0.dist-info/licenses/LICENSE +21 -0
  47. lexiredact-0.1.0.dist-info/top_level.txt +1 -0
lexiredact/__init__.py ADDED
@@ -0,0 +1,139 @@
1
+ """
2
+ LexiRedact - Privacy-Preserving RAG Middleware
3
+
4
+ A Python SDK for protecting PII in vector databases while maintaining
5
+ semantic search quality through intelligent embedding and redaction.
6
+
7
+ Key Features:
8
+ - Automatic PII detection and redaction using Microsoft Presidio
9
+ - Embedding generation from original text (Shadow Mode architecture)
10
+ - Only sanitized text stored in vector databases
11
+ - Redis caching for performance optimization
12
+ - Pluggable architecture via dependency injection
13
+ - Comprehensive metrics and tracking
14
+
15
+ Basic Usage:
16
+ >>> import lexiredact as vs
17
+ >>>
18
+ >>> # Create pipeline with defaults
19
+ >>> pipeline = vs.IngestionPipeline()
20
+ >>> await pipeline.initialize()
21
+ >>>
22
+ >>> # Process documents
23
+ >>> doc = vs.Document(id="1", text="Contact John at john@example.com")
24
+ >>> result = await pipeline.process_document(doc)
25
+ >>>
26
+ >>> print(result.clean_text) # "Contact <PERSON> at <EMAIL_ADDRESS>"
27
+ >>> print(result.pii_entities) # ["PERSON", "EMAIL_ADDRESS"]
28
+ >>>
29
+ >>> await pipeline.shutdown()
30
+
31
+ Custom Configuration:
32
+ >>> from lexiredact import IngestionPipeline, load_config
33
+ >>>
34
+ >>> config = load_config(config_dict={
35
+ ... "embedding_model": "BAAI/bge-base-en-v1.5",
36
+ ... "cache_backend": "redis",
37
+ ... "redis_host": "localhost"
38
+ ... })
39
+ >>>
40
+ >>> pipeline = IngestionPipeline(config=config)
41
+
42
+ Custom Components:
43
+ >>> from lexiredact import IngestionPipeline
44
+ >>> from lexiredact.interfaces import Embedder
45
+ >>>
46
+ >>> class MyEmbedder(Embedder):
47
+ ... # Custom implementation
48
+ ... pass
49
+ >>>
50
+ >>> pipeline = IngestionPipeline(embedder=MyEmbedder())
51
+ """
52
+
53
+ __version__ = "0.1.0"
54
+
55
+ # Core pipeline
56
+ from .pipeline import IngestionPipeline, Document, ProcessedDocument
57
+
58
+ # Configuration
59
+ from .config import load_config, get_default_config, save_config_to_yaml
60
+
61
+ # Privacy components
62
+ from .privacy import PIIDetector, PIIRedactor, PIIPolicy
63
+
64
+ # Interfaces (for custom implementations)
65
+ from .interfaces import CacheBackend, Embedder, VectorStore, Tracker
66
+
67
+ # Default implementations
68
+ from .implementations import (
69
+ MemoryCache,
70
+ RedisCache,
71
+ GenericCache,
72
+ FastEmbedEmbedder,
73
+ GenericEmbedder,
74
+ ChromaVectorStore,
75
+ GenericVectorStore,
76
+ MLflowTracker,
77
+ )
78
+
79
+ # Metrics
80
+ from .metrics import (
81
+ MetricsCollector,
82
+ AggregateStats,
83
+ RetrievalAggregateStats,
84
+ RetrievalMetricsEvaluator,
85
+ RetrievalQueryMetrics,
86
+ )
87
+
88
+ # Utilities
89
+ from .utils import hash_text, generate_cache_key, Timer
90
+
91
+ __all__ = [
92
+ # Version
93
+ "__version__",
94
+
95
+ # Core
96
+ "IngestionPipeline",
97
+ "Document",
98
+ "ProcessedDocument",
99
+
100
+ # Configuration
101
+ "load_config",
102
+ "get_default_config",
103
+ "save_config_to_yaml",
104
+
105
+ # Privacy
106
+ "PIIDetector",
107
+ "PIIRedactor",
108
+ "PIIPolicy",
109
+
110
+ # Interfaces
111
+ "CacheBackend",
112
+ "Embedder",
113
+ "VectorStore",
114
+ "Tracker",
115
+
116
+ # Custom models
117
+ "GenericCache",
118
+ "GenericEmbedder",
119
+ "GenericVectorStore",
120
+
121
+ # Implementations
122
+ "MemoryCache",
123
+ "RedisCache",
124
+ "FastEmbedEmbedder",
125
+ "ChromaVectorStore",
126
+ "MLflowTracker",
127
+
128
+ # Metrics
129
+ "MetricsCollector",
130
+ "AggregateStats",
131
+ "RetrievalAggregateStats",
132
+ "RetrievalMetricsEvaluator",
133
+ "RetrievalQueryMetrics",
134
+
135
+ # Utils
136
+ "hash_text",
137
+ "generate_cache_key",
138
+ "Timer",
139
+ ]
@@ -0,0 +1,18 @@
1
+ """
2
+ Document chunking module for LexiRedact.
3
+
4
+ Converts PDFs and large text documents into manageable chunks
5
+ suitable for embedding and PII detection.
6
+ """
7
+
8
+ from .chunker import Chunk, DocumentChunker, ChunkingStrategy
9
+ from .json_exporter import JSONExporter
10
+ from .pdf_loader import PDFLoader
11
+
12
+ __all__ = [
13
+ "Chunk",
14
+ "DocumentChunker",
15
+ "ChunkingStrategy",
16
+ "PDFLoader",
17
+ "JSONExporter",
18
+ ]
@@ -0,0 +1,321 @@
1
+ """
2
+ Core document chunking logic.
3
+ Splits large documents into smaller chunks with overlap.
4
+ """
5
+
6
+ from typing import List, Dict, Any, Optional, Literal
7
+ from dataclasses import dataclass
8
+ import uuid
9
+ from enum import Enum
10
+ import re
11
+
12
+
13
class ChunkingStrategy(str, Enum):
    """Strategies :class:`DocumentChunker` can use to split a document."""

    FIXED_SIZE = "fixed_size"  # Fixed-length character windows
    SENTENCE = "sentence"      # Split by sentences
    PARAGRAPH = "paragraph"    # Split by paragraphs (blank line separated)
    HYBRID = "hybrid"          # Sentences grouped into chunks


@dataclass
class Chunk:
    """Single chunk of text cut from a source document."""

    id: str                   # "<doc_id>_chunk_<chunk_index>"
    text: str                 # Chunk content with surrounding whitespace removed
    chunk_index: int          # 0-based position of the chunk within its document
    start_char: int           # Offset in the source text where the chunk span starts
    end_char: int             # Offset in the source text where the chunk span ends (exclusive)
    metadata: Dict[str, Any]  # Caller metadata plus source_doc_id / chunk_number / strategy


class DocumentChunker:
    """
    Convert large documents into LexiRedact-compatible chunks.

    Support for:
    - Fixed-size chunking (characters), optionally snapping to sentence ends
    - Sentence-based chunking
    - Paragraph-based chunking
    - Overlap between chunks (fixed-size strategy only)

    SENTENCE and HYBRID share the same grouping logic and differ only in the
    strategy label recorded in each chunk's metadata.
    """

    # Sentence-ending punctuation followed by whitespace / end of window, or a newline.
    _WINDOW_BOUNDARY_RE = re.compile(r"[.!?](?=\s|$)|\n")
    # Whitespace that follows sentence punctuation and precedes an upper-case
    # letter or digit. Requiring whitespace keeps "a.b@x.com" in one piece.
    _SENTENCE_GAP_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9])")

    def __init__(
        self,
        chunk_size: int = 512,   # Target characters per chunk
        overlap: int = 100,      # Overlap between chunks (chars)
        strategy: ChunkingStrategy = ChunkingStrategy.FIXED_SIZE,
        preserve_sentences: bool = True,  # Don't split mid-sentence
    ):
        """
        Initialize chunker.

        Args:
            chunk_size: Target chunk size in characters
            overlap: Overlap between chunks (to preserve context)
            strategy: Chunking strategy to use
            preserve_sentences: Don't split in middle of sentence
                (fixed-size strategy only)

        Raises:
            ValueError: If chunk_size <= 0, overlap < 0 or overlap >= chunk_size
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be greater than 0")
        if overlap < 0:
            raise ValueError("overlap must be greater than or equal to 0")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.overlap = overlap
        self.strategy = strategy
        self.preserve_sentences = preserve_sentences

    def chunk_text(
        self,
        text: str,
        doc_id: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[Chunk]:
        """
        Chunk a document.

        Args:
            text: Document text to chunk
            doc_id: Original document ID (source)
            metadata: Optional metadata to attach to chunks

        Returns:
            List of Chunk objects

        Raises:
            ValueError: If the configured strategy is not recognised
        """
        metadata = metadata or {}

        if self.strategy == ChunkingStrategy.FIXED_SIZE:
            return self._chunk_fixed_size(text, doc_id, metadata)
        elif self.strategy == ChunkingStrategy.SENTENCE:
            return self._chunk_by_sentence(text, doc_id, metadata)
        elif self.strategy == ChunkingStrategy.PARAGRAPH:
            return self._chunk_by_paragraph(text, doc_id, metadata)
        elif self.strategy == ChunkingStrategy.HYBRID:
            return self._chunk_hybrid(text, doc_id, metadata)
        else:
            raise ValueError(f"Unknown strategy: {self.strategy}")

    def _make_chunk(
        self,
        doc_id: str,
        index: int,
        text: str,
        start: int,
        end: int,
        metadata: Dict[str, Any],
    ) -> Chunk:
        """Build a Chunk with the standard id and metadata fields."""
        return Chunk(
            id=f"{doc_id}_chunk_{index}",
            text=text,
            chunk_index=index,
            start_char=start,
            end_char=end,
            metadata={
                **metadata,
                "source_doc_id": doc_id,
                "chunk_number": index,
                "strategy": self.strategy.value,
            },
        )

    def _chunk_fixed_size(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Split into fixed-size chunks with overlap.

        When preserve_sentences is set, a window that does not reach the end
        of the text is cut back to the last sentence boundary inside it.
        The start position always moves forward, so the loop terminates.
        """
        chunks: List[Chunk] = []
        text_len = len(text)
        start = 0
        prev_end = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            if self.preserve_sentences and end < text_len:
                boundaries = list(
                    self._WINDOW_BOUNDARY_RE.finditer(text[start:end])
                )
                if boundaries:
                    snapped = start + boundaries[-1].end()
                    # A boundary inside the overlap region was already used
                    # by the previous chunk; snapping to it again would
                    # re-emit that tail and stall. Keep the hard cut instead.
                    if snapped > prev_end:
                        end = snapped

            piece = text[start:end].strip()
            if piece:  # Skip empty chunks
                chunks.append(
                    self._make_chunk(
                        doc_id, len(chunks), piece, start, end, metadata
                    )
                )

            if end >= text_len:
                break

            prev_end = end
            # Step back by the overlap, but never to or before the current
            # start: a short chunk (shorter than the overlap) would otherwise
            # send the window backwards or pin it in place forever.
            next_start = end - self.overlap
            start = next_start if next_start > start else end

        return chunks

    def _chunk_by_sentence(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """Split by sentences while preserving punctuation-heavy tokens."""
        return self._group_sentences(text, doc_id, metadata)

    def _chunk_by_paragraph(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Split by paragraphs (double newline).

        Blank paragraphs are skipped. Offsets are tracked against the raw
        text so that text[start_char:end_char] equals the chunk text.
        """
        chunks: List[Chunk] = []
        cursor = 0  # Offset of the current raw paragraph in ``text``

        for raw in text.split('\n\n'):
            body = raw.strip()
            if body:
                start = cursor + (len(raw) - len(raw.lstrip()))
                chunks.append(
                    self._make_chunk(
                        doc_id,
                        len(chunks),
                        body,
                        start,
                        start + len(body),
                        metadata,
                    )
                )
            cursor += len(raw) + 2  # +2 for '\n\n'

        return chunks

    def _chunk_hybrid(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Hybrid: Group sentences into chunks of target size.
        Better than fixed_size because sentences stay together.
        """
        return self._group_sentences(text, doc_id, metadata)

    def _group_sentences(
        self,
        text: str,
        doc_id: str,
        metadata: Dict[str, Any]
    ) -> List[Chunk]:
        """
        Pack consecutive sentences into chunks of roughly chunk_size chars.

        A chunk is closed as soon as adding the next sentence would push the
        summed sentence lengths past chunk_size; a single sentence longer
        than chunk_size still becomes its own chunk. Sentences are joined
        with one space. start_char / end_char delimit the span from the first
        to the last sentence in the source text, so the span can be longer
        than the chunk text when the source had wider whitespace gaps.
        """
        chunks: List[Chunk] = []
        group: List[tuple[int, int]] = []
        group_size = 0

        def flush() -> None:
            joined = ' '.join(text[s:e] for s, e in group)
            chunks.append(
                self._make_chunk(
                    doc_id,
                    len(chunks),
                    joined,
                    group[0][0],
                    group[-1][1],
                    metadata,
                )
            )

        for span in self._sentence_spans(text):
            size = span[1] - span[0]
            if group and group_size + size > self.chunk_size:
                flush()
                group = [span]
                group_size = size
            else:
                group.append(span)
                group_size += size

        if group:
            flush()

        return chunks

    def _sentence_spans(self, text: str) -> List[tuple[int, int]]:
        """Return (start, end) offsets into ``text`` for every sentence."""
        normalized = text.strip()
        if not normalized:
            return []

        # Offsets are found on the stripped text, then shifted back.
        offset = len(text) - len(text.lstrip())
        spans: List[tuple[int, int]] = []
        cursor = 0
        for gap in self._SENTENCE_GAP_RE.finditer(normalized):
            spans.append((offset + cursor, offset + gap.start()))
            cursor = gap.end()
        spans.append((offset + cursor, offset + len(normalized)))
        return spans

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text on sentence boundaries without breaking emails or domains.
        """
        return [text[start:end] for start, end in self._sentence_spans(text)]
@@ -0,0 +1,103 @@
1
+ """
2
+ Export chunks as JSON in LexiRedact format.
3
+ """
4
+
5
+ import json
6
+ from typing import List, Dict, Any
7
+ from pathlib import Path
8
+ from .chunker import Chunk
9
+
10
+
11
class JSONExporter:
    """Export chunks to JSON format compatible with LexiRedact."""

    @staticmethod
    def _chunk_record(chunk: Chunk) -> Dict[str, Any]:
        """Return the id/text/metadata dictionary exported for one chunk."""
        return {
            "id": chunk.id,
            "text": chunk.text,
            "metadata": chunk.metadata,
        }

    @staticmethod
    def to_lexiredact_format(
        chunks: List[Chunk],
        output_path: str | Path | None = None,
        wrap_documents: bool = False,
    ) -> str | List[Dict[str, Any]] | Dict[str, List[Dict[str, Any]]]:
        """
        Convert chunks to LexiRedact JSON format.

        Args:
            chunks: List of Chunk objects
            output_path: Optional path to save JSON file; parent
                directories are created when missing
            wrap_documents: Export CLI-ready payload as
                {"documents": [...]} when True

        Returns:
            The JSON string that was written (if output_path is given),
            otherwise the payload object itself.

        Raises:
            TypeError: If a chunk's metadata is not JSON-serialisable
                (only when output_path is given). Raised before the
                output file is opened, so no partial file is left.

        Example output format (metadata is whatever each chunk carries):
            [
                {
                    "id": "doc1_chunk_0",
                    "text": "First chunk of text...",
                    "metadata": {
                        "source_doc_id": "doc1",
                        "chunk_number": 0,
                        "strategy": "fixed_size"
                    }
                },
                ...
            ]
        """
        records = [JSONExporter._chunk_record(chunk) for chunk in chunks]
        payload: List[Dict[str, Any]] | Dict[str, List[Dict[str, Any]]]
        payload = {"documents": records} if wrap_documents else records

        if not output_path:
            return payload

        # Serialise once, and before touching the filesystem, so a failure
        # cannot leave a truncated file behind.
        serialized = json.dumps(payload, indent=2, ensure_ascii=False)

        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as f:
            f.write(serialized)

        return serialized

    @staticmethod
    def to_cli_input(
        chunks: List[Chunk],
        output_path: str | Path | None = None,
    ) -> str | Dict[str, List[Dict[str, Any]]]:
        """Export chunks in the JSON shape expected by `lexiredact process`."""
        return JSONExporter.to_lexiredact_format(
            chunks,
            output_path=output_path,
            wrap_documents=True,
        )

    @staticmethod
    def to_jsonl(
        chunks: List[Chunk],
        output_path: str | Path,
    ) -> None:
        """
        Export chunks as JSONL (one JSON per line).
        Useful for streaming/large datasets.

        Args:
            chunks: List of Chunk objects
            output_path: Path to save JSONL file; parent directories are
                created when missing
        """
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)

        with open(target, 'w', encoding='utf-8') as f:
            # Records are written one at a time so the full output is never
            # held in memory.
            for chunk in chunks:
                record = JSONExporter._chunk_record(chunk)
                f.write(json.dumps(record, ensure_ascii=False) + '\n')
@@ -0,0 +1,86 @@
1
+ """
2
+ PDF extraction for document chunking.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ from pathlib import Path
7
+
8
+
9
class PDFLoader:
    """Load and extract text from PDF files."""

    def __init__(self, use_ocr: bool = False):
        """
        Initialize PDF loader.

        Args:
            use_ocr: Use OCR for scanned PDFs (requires pytesseract).
                NOTE(review): the flag is only stored; neither extraction
                method in this class reads it.
        """
        self.use_ocr = use_ocr

    @staticmethod
    def extract_text(pdf_path: str | Path) -> str:
        """
        Extract text from PDF.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Extracted text, one block per page joined by blank lines

        Raises:
            ImportError: If pypdf not installed
            FileNotFoundError: If PDF not found
        """
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            # Chain the cause so the underlying import failure stays visible.
            raise ImportError(
                "PDF support requires pypdf. "
                "Install with: pip install pypdf"
            ) from exc

        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            # Defensive: a page that yields no text object is treated as
            # empty so the join below cannot fail on a non-string.
            pages = [page.extract_text() or "" for page in reader.pages]

        return '\n\n'.join(pages)

    @staticmethod
    def extract_metadata(pdf_path: str | Path) -> Dict[str, Any]:
        """
        Extract metadata from PDF.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Metadata dictionary with title, author, subject, pages and
            pdf_file; empty when the PDF carries no document information

        Raises:
            ImportError: If pypdf not installed
            FileNotFoundError: If PDF not found
        """
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError("PDF support requires pypdf") from exc

        pdf_path = Path(pdf_path)
        # Same explicit check and message as extract_text.
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        metadata: Dict[str, Any] = {}

        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            info = reader.metadata
            if info:
                metadata = {
                    "title": info.get("/Title", ""),
                    "author": info.get("/Author", ""),
                    "subject": info.get("/Subject", ""),
                    "pages": len(reader.pages),
                    "pdf_file": str(pdf_path.name),
                }

        return metadata