keep-skill 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keep/__init__.py +53 -0
- keep/__main__.py +8 -0
- keep/api.py +686 -0
- keep/chunking.py +364 -0
- keep/cli.py +503 -0
- keep/config.py +323 -0
- keep/context.py +127 -0
- keep/indexing.py +208 -0
- keep/logging_config.py +73 -0
- keep/paths.py +67 -0
- keep/pending_summaries.py +166 -0
- keep/providers/__init__.py +40 -0
- keep/providers/base.py +416 -0
- keep/providers/documents.py +250 -0
- keep/providers/embedding_cache.py +260 -0
- keep/providers/embeddings.py +245 -0
- keep/providers/llm.py +371 -0
- keep/providers/mlx.py +256 -0
- keep/providers/summarization.py +107 -0
- keep/store.py +403 -0
- keep/types.py +65 -0
- keep_skill-0.1.0.dist-info/METADATA +290 -0
- keep_skill-0.1.0.dist-info/RECORD +26 -0
- keep_skill-0.1.0.dist-info/WHEEL +4 -0
- keep_skill-0.1.0.dist-info/entry_points.txt +2 -0
- keep_skill-0.1.0.dist-info/licenses/LICENSE +21 -0
keep/providers/base.py
ADDED
@@ -0,0 +1,416 @@
"""
Base provider protocols.

These define the interfaces that concrete providers must implement.
Using Protocol for structural subtyping - no explicit inheritance required.
"""

from dataclasses import dataclass
from typing import Any, Protocol, runtime_checkable


# -----------------------------------------------------------------------------
# Document Fetching
# -----------------------------------------------------------------------------

@dataclass
class Document:
    """
    A fetched document ready for processing.

    Attributes:
        uri: Original URI that was fetched
        content: Text content of the document
        content_type: MIME type if known (e.g., "text/markdown", "text/plain")
        metadata: Additional metadata from the source (headers, file stats, etc.)
    """
    uri: str
    content: str
    content_type: str | None = None
    metadata: dict[str, Any] | None = None


@runtime_checkable
class DocumentProvider(Protocol):
    """
    Fetches document content from a URI.

    Implementations handle specific URI schemes (file://, https://, s3://, etc.)
    and convert the content to text.

    Example implementation:
        class FileDocumentProvider:
            def supports(self, uri: str) -> bool:
                return uri.startswith("file://")

            def fetch(self, uri: str) -> Document:
                path = uri.removeprefix("file://")
                content = Path(path).read_text()
                return Document(uri=uri, content=content, content_type="text/plain")
    """

    def supports(self, uri: str) -> bool:
        """
        Check if this provider can handle the given URI.

        Args:
            uri: The URI to check

        Returns:
            True if this provider can fetch the URI
        """
        ...

    def fetch(self, uri: str) -> Document:
        """
        Fetch and return the document content.

        Args:
            uri: The URI to fetch

        Returns:
            Document with text content

        Raises:
            ValueError: If URI is malformed
            IOError: If document cannot be fetched
        """
        ...


# -----------------------------------------------------------------------------
# Embedding Generation
# -----------------------------------------------------------------------------

@runtime_checkable
class EmbeddingProvider(Protocol):
    """
    Generates vector embeddings from text.

    Embeddings enable semantic similarity search. The same provider instance
    must be used for both indexing and querying to ensure consistent vectors.

    Example implementation:
        class SentenceTransformerEmbedding:
            def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
                from sentence_transformers import SentenceTransformer
                self.model = SentenceTransformer(model_name)

            @property
            def dimension(self) -> int:
                return self.model.get_sentence_embedding_dimension()

            def embed(self, text: str) -> list[float]:
                return self.model.encode(text).tolist()

            def embed_batch(self, texts: list[str]) -> list[list[float]]:
                return self.model.encode(texts).tolist()
    """

    @property
    def dimension(self) -> int:
        """
        The dimensionality of the embedding vectors.

        This must be consistent across all calls. ChromaDb and other vector
        stores need this to configure the index.
        """
        ...

    def embed(self, text: str) -> list[float]:
        """
        Generate an embedding vector for the given text.

        Args:
            text: The text to embed

        Returns:
            A list of floats representing the embedding vector
        """
        ...

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """
        Generate embeddings for multiple texts.

        Batch processing is often more efficient than individual calls.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors, one per input text
        """
        ...


# -----------------------------------------------------------------------------
# Summarization
# -----------------------------------------------------------------------------

@runtime_checkable
class SummarizationProvider(Protocol):
    """
    Generates concise summaries of document content.

    Summaries are stored alongside items and enable quick recall without
    fetching the original document. They're also used for full-text search.

    Example implementation:
        class OpenAISummarization:
            def __init__(self, model: str = "gpt-4o-mini"):
                self.client = OpenAI()
                self.model = model

            def summarize(self, content: str, max_length: int = 500) -> str:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "Summarize concisely."},
                        {"role": "user", "content": content}
                    ],
                    max_tokens=max_length
                )
                return response.choices[0].message.content
    """

    def summarize(self, content: str, *, max_length: int = 500) -> str:
        """
        Generate a summary of the content.

        Args:
            content: The full document content
            max_length: Approximate maximum length in characters

        Returns:
            A concise summary of the content
        """
        ...


# -----------------------------------------------------------------------------
# Tagging
# -----------------------------------------------------------------------------

@runtime_checkable
class TaggingProvider(Protocol):
    """
    Generates structured tags from document content.

    Tags enable traditional navigation and filtering. The provider analyzes
    content and returns relevant key-value pairs.

    Example implementation:
        class OpenAITagging:
            def __init__(self, model: str = "gpt-4o-mini"):
                self.client = OpenAI()
                self.model = model

            def tag(self, content: str) -> dict[str, str]:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[...],
                    response_format={"type": "json_object"}
                )
                return json.loads(response.choices[0].message.content)
    """

    def tag(self, content: str) -> dict[str, str]:
        """
        Generate tags for the content.

        Args:
            content: The full document content

        Returns:
            Dictionary of tag key-value pairs

        Note:
            Keys should be lowercase with underscores (e.g., "content_type").
            Values should be simple strings.
            System tags (keys starting with "_") should not be generated here.
        """
        ...


# -----------------------------------------------------------------------------
# Provider Registry
# -----------------------------------------------------------------------------

class ProviderRegistry:
    """
    Registry for discovering and instantiating providers.

    Providers are registered by name and can be instantiated from configuration.
    This allows the store configuration (TOML) to specify providers by name
    rather than requiring code changes.

    Example:
        registry = ProviderRegistry()
        registry.register_embedding("sentence-transformers", SentenceTransformerEmbedding)
        registry.register_embedding("openai", OpenAIEmbedding)

        # Later, from config:
        provider = registry.create_embedding("sentence-transformers", {"model": "all-MiniLM-L6-v2"})
    """

    def __init__(self):
        self._embedding_providers: dict[str, type] = {}
        self._summarization_providers: dict[str, type] = {}
        self._tagging_providers: dict[str, type] = {}
        self._document_providers: dict[str, type] = {}
        self._lazy_loaded = False

    def _ensure_providers_loaded(self) -> None:
        """Lazily load all provider modules."""
        if self._lazy_loaded:
            return

        self._lazy_loaded = True

        # Import provider modules to trigger registration
        # These imports are safe - they only register classes, don't instantiate
        try:
            from . import documents
        except ImportError:
            pass  # Document provider might not be available

        try:
            from . import embeddings
        except ImportError:
            pass  # Embedding providers might not be available

        try:
            from . import summarization
        except ImportError:
            pass  # Summarization providers might not be available

        try:
            from . import llm
        except ImportError:
            pass  # LLM providers might not be available

        try:
            from . import mlx
        except ImportError:
            pass  # MLX providers might not be available

    # Registration methods

    def register_embedding(self, name: str, provider_class: type) -> None:
        """Register an embedding provider class."""
        self._embedding_providers[name] = provider_class

    def register_summarization(self, name: str, provider_class: type) -> None:
        """Register a summarization provider class."""
        self._summarization_providers[name] = provider_class

    def register_tagging(self, name: str, provider_class: type) -> None:
        """Register a tagging provider class."""
        self._tagging_providers[name] = provider_class

    def register_document(self, name: str, provider_class: type) -> None:
        """Register a document provider class."""
        self._document_providers[name] = provider_class

    # Factory methods

    def create_embedding(self, name: str, params: dict | None = None) -> EmbeddingProvider:
        """Create an embedding provider instance."""
        self._ensure_providers_loaded()
        if name not in self._embedding_providers:
            available = ", ".join(self._embedding_providers.keys()) or "none"
            raise ValueError(
                f"Unknown embedding provider: '{name}'. "
                f"Available providers: {available}. "
                f"Install missing dependencies or check provider name."
            )
        try:
            return self._embedding_providers[name](**(params or {}))
        except Exception as e:
            raise RuntimeError(
                f"Failed to create embedding provider '{name}': {e}\n"
                f"Make sure required dependencies are installed."
            ) from e

    def create_summarization(self, name: str, params: dict | None = None) -> SummarizationProvider:
        """Create a summarization provider instance."""
        self._ensure_providers_loaded()
        if name not in self._summarization_providers:
            available = ", ".join(self._summarization_providers.keys()) or "none"
            raise ValueError(
                f"Unknown summarization provider: '{name}'. "
                f"Available providers: {available}. "
                f"Install missing dependencies or check provider name."
            )
        try:
            return self._summarization_providers[name](**(params or {}))
        except Exception as e:
            raise RuntimeError(
                f"Failed to create summarization provider '{name}': {e}\n"
                f"Make sure required dependencies are installed."
            ) from e

    def create_tagging(self, name: str, params: dict | None = None) -> TaggingProvider:
        """Create a tagging provider instance."""
        self._ensure_providers_loaded()
        if name not in self._tagging_providers:
            available = ", ".join(self._tagging_providers.keys()) or "none"
            raise ValueError(
                f"Unknown tagging provider: '{name}'. "
                f"Available providers: {available}. "
                f"Install missing dependencies or check provider name."
            )
        try:
            return self._tagging_providers[name](**(params or {}))
        except Exception as e:
            raise RuntimeError(
                f"Failed to create tagging provider '{name}': {e}\n"
                f"Make sure required dependencies are installed."
            ) from e

    def create_document(self, name: str, params: dict | None = None) -> DocumentProvider:
        """Create a document provider instance."""
        self._ensure_providers_loaded()
        if name not in self._document_providers:
            available = ", ".join(self._document_providers.keys()) or "none"
            raise ValueError(
                f"Unknown document provider: '{name}'. "
                f"Available providers: {available}. "
                f"Install missing dependencies or check provider name."
            )
        try:
            return self._document_providers[name](**(params or {}))
        except Exception as e:
            raise RuntimeError(
                f"Failed to create document provider '{name}': {e}\n"
                f"Make sure required dependencies are installed."
            ) from e

    # Introspection

    def list_embedding_providers(self) -> list[str]:
        """List registered embedding provider names."""
        return list(self._embedding_providers.keys())

    def list_summarization_providers(self) -> list[str]:
        """List registered summarization provider names."""
        return list(self._summarization_providers.keys())

    def list_tagging_providers(self) -> list[str]:
        """List registered tagging provider names."""
        return list(self._tagging_providers.keys())

    def list_document_providers(self) -> list[str]:
        """List registered document provider names."""
        return list(self._document_providers.keys())


# Global registry instance
# Concrete providers register themselves on import
_registry = ProviderRegistry()


def get_registry() -> ProviderRegistry:
    """Get the global provider registry."""
    return _registry
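
The protocols and registry above rely on structural subtyping: any class with the right members satisfies the Protocol, gets registered under a name, and can then be created from configuration. A minimal sketch of that flow, using only the names defined in base.py plus a hypothetical DummyEmbedding class that is not part of this package:

from keep.providers.base import EmbeddingProvider, get_registry


class DummyEmbedding:
    # Toy provider for illustration: produces length-based placeholder vectors.
    def __init__(self, dimension: int = 8):
        self._dimension = dimension

    @property
    def dimension(self) -> int:
        return self._dimension

    def embed(self, text: str) -> list[float]:
        # Deterministic placeholder vector derived from the text length.
        return [float(len(text) % 7)] * self._dimension

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        return [self.embed(t) for t in texts]


registry = get_registry()
registry.register_embedding("dummy", DummyEmbedding)

# Instantiate by name + params, the way store configuration would.
provider = registry.create_embedding("dummy", {"dimension": 4})
assert isinstance(provider, EmbeddingProvider)  # runtime_checkable Protocol, no inheritance
print(provider.dimension, provider.embed_batch(["a", "bb"]))
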
keep/providers/documents.py
ADDED
@@ -0,0 +1,250 @@
"""
Document providers for fetching content from various URI schemes.
"""

from pathlib import Path
from urllib.parse import urlparse

from .base import Document, DocumentProvider, get_registry


class FileDocumentProvider:
    """
    Fetches documents from the local filesystem.

    Supports file:// URIs and attempts to detect content type from extension.
    Performs text extraction for PDF and HTML files.
    """

    EXTENSION_TYPES = {
        ".md": "text/markdown",
        ".markdown": "text/markdown",
        ".txt": "text/plain",
        ".py": "text/x-python",
        ".js": "text/javascript",
        ".ts": "text/typescript",
        ".json": "application/json",
        ".yaml": "text/yaml",
        ".yml": "text/yaml",
        ".html": "text/html",
        ".htm": "text/html",
        ".css": "text/css",
        ".xml": "application/xml",
        ".rst": "text/x-rst",
        ".pdf": "application/pdf",
    }

    def supports(self, uri: str) -> bool:
        """Check if this is a file:// URI or bare path."""
        return uri.startswith("file://") or uri.startswith("/")

    def fetch(self, uri: str) -> Document:
        """Read file content from the filesystem with text extraction for PDF/HTML."""
        # Normalize to path
        if uri.startswith("file://"):
            path_str = uri.removeprefix("file://")
        else:
            path_str = uri

        path = Path(path_str)

        if not path.exists():
            raise IOError(f"File not found: {path}")

        if not path.is_file():
            raise IOError(f"Not a file: {path}")

        # Detect content type
        suffix = path.suffix.lower()
        content_type = self.EXTENSION_TYPES.get(suffix, "text/plain")

        # Extract text based on file type
        if suffix == ".pdf":
            content = self._extract_pdf_text(path)
        elif suffix in (".html", ".htm"):
            content = self._extract_html_text(path)
        else:
            # Read as plain text
            try:
                content = path.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                raise IOError(f"Cannot read file as text: {path}")

        # Gather metadata
        stat = path.stat()
        metadata = {
            "size": stat.st_size,
            "modified": stat.st_mtime,
            "name": path.name,
        }

        return Document(
            uri=f"file://{path.resolve()}",  # Normalize to absolute
            content=content,
            content_type=content_type,
            metadata=metadata,
        )

    def _extract_pdf_text(self, path: Path) -> str:
        """Extract text from PDF file."""
        try:
            from pypdf import PdfReader
        except ImportError:
            raise IOError(
                f"PDF support requires 'pypdf' library. "
                f"Install with: pip install pypdf\n"
                f"Cannot read PDF: {path}"
            )

        try:
            reader = PdfReader(path)
            text_parts = []
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    text_parts.append(text)

            if not text_parts:
                raise IOError(f"No text extracted from PDF: {path}")

            return "\n\n".join(text_parts)
        except Exception as e:
            raise IOError(f"Failed to extract text from PDF {path}: {e}")

    def _extract_html_text(self, path: Path) -> str:
        """Extract text from HTML file."""
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise IOError(
                f"HTML text extraction requires 'beautifulsoup4' library. "
                f"Install with: pip install beautifulsoup4\n"
                f"Cannot extract text from HTML: {path}"
            )

        try:
            html_content = path.read_text(encoding="utf-8")
            soup = BeautifulSoup(html_content, "html.parser")

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Get text
            text = soup.get_text()

            # Clean up whitespace
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            raise IOError(f"Failed to extract text from HTML {path}: {e}")


class HttpDocumentProvider:
    """
    Fetches documents from HTTP/HTTPS URLs.

    Requires the `requests` library (optional dependency).
    """

    def __init__(self, timeout: int = 30, max_size: int = 10_000_000):
        """
        Args:
            timeout: Request timeout in seconds
            max_size: Maximum content size in bytes
        """
        self.timeout = timeout
        self.max_size = max_size

    def supports(self, uri: str) -> bool:
        """Check if this is an HTTP(S) URL."""
        return uri.startswith("http://") or uri.startswith("https://")

    def fetch(self, uri: str) -> Document:
        """Fetch content from HTTP URL."""
        try:
            import requests
        except ImportError:
            raise RuntimeError("HTTP document fetching requires 'requests' library")

        try:
            # Use context manager to ensure connection is closed
            with requests.get(
                uri,
                timeout=self.timeout,
                headers={"User-Agent": "keep/0.1"},
                stream=True,
            ) as response:
                response.raise_for_status()

                # Check size
                content_length = response.headers.get("content-length")
                if content_length and int(content_length) > self.max_size:
                    raise IOError(f"Content too large: {content_length} bytes")

                # Read content with size limit
                content = response.text[:self.max_size]

                # Get content type
                content_type = response.headers.get("content-type", "text/plain")
                if ";" in content_type:
                    content_type = content_type.split(";")[0].strip()

                return Document(
                    uri=uri,
                    content=content,
                    content_type=content_type,
                    metadata={
                        "status_code": response.status_code,
                        "headers": dict(response.headers),
                    },
                )
        except requests.RequestException as e:
            raise IOError(f"Failed to fetch {uri}: {e}")


class CompositeDocumentProvider:
    """
    Combines multiple document providers, delegating to the appropriate one.

    This is the default provider used by Keeper.
    """

    def __init__(self, providers: list[DocumentProvider] | None = None):
        """
        Args:
            providers: List of providers to try. If None, uses defaults.
        """
        if providers is None:
            self._providers = [
                FileDocumentProvider(),
                HttpDocumentProvider(),
            ]
        else:
            self._providers = list(providers)

    def supports(self, uri: str) -> bool:
        """Check if any provider supports this URI."""
        return any(p.supports(uri) for p in self._providers)

    def fetch(self, uri: str) -> Document:
        """Fetch using the first provider that supports this URI."""
        for provider in self._providers:
            if provider.supports(uri):
                return provider.fetch(uri)

        raise ValueError(f"No provider supports URI: {uri}")

    def add_provider(self, provider: DocumentProvider) -> None:
        """Add a provider to the list (checked first)."""
        self._providers.insert(0, provider)


# Register providers
_registry = get_registry()
_registry.register_document("file", FileDocumentProvider)
_registry.register_document("http", HttpDocumentProvider)
_registry.register_document("composite", CompositeDocumentProvider)
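
For orientation, a short usage sketch of the composite provider defined above. The path /tmp/notes.md is purely illustrative and assumed to exist; everything else comes from the code in this file:

from keep.providers.documents import CompositeDocumentProvider

provider = CompositeDocumentProvider()  # defaults to file + HTTP providers

# file:// URIs and bare absolute paths are handled by FileDocumentProvider;
# http(s):// URLs fall through to HttpDocumentProvider.
doc = provider.fetch("file:///tmp/notes.md")
print(doc.content_type)        # "text/markdown", from the extension table
print(doc.metadata["name"])    # "notes.md"
print(doc.content[:80])
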