keep-skill 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/providers/base.py ADDED
@@ -0,0 +1,416 @@
1
+ """
2
+ Base provider protocols.
3
+
4
+ These define the interfaces that concrete providers must implement.
5
+ Using Protocol for structural subtyping - no explicit inheritance required.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Any, Protocol, runtime_checkable
10
+
11
+
12
+ # -----------------------------------------------------------------------------
13
+ # Document Fetching
14
+ # -----------------------------------------------------------------------------
15
+
16
+ @dataclass
17
+ class Document:
18
+ """
19
+ A fetched document ready for processing.
20
+
21
+ Attributes:
22
+ uri: Original URI that was fetched
23
+ content: Text content of the document
24
+ content_type: MIME type if known (e.g., "text/markdown", "text/plain")
25
+ metadata: Additional metadata from the source (headers, file stats, etc.)
26
+ """
27
+ uri: str
28
+ content: str
29
+ content_type: str | None = None
30
+ metadata: dict[str, Any] | None = None
31
+
32
+
33
@runtime_checkable
class DocumentProvider(Protocol):
    """
    Structural interface for anything that can turn a URI into a Document.

    Each implementation is responsible for particular URI schemes
    (file://, https://, s3://, etc.) and for converting what it retrieves
    into text. Because this is a Protocol, implementations do not need to
    inherit from it.

    Example implementation:
        class FileDocumentProvider:
            def supports(self, uri: str) -> bool:
                return uri.startswith("file://")

            def fetch(self, uri: str) -> Document:
                path = uri.removeprefix("file://")
                content = Path(path).read_text()
                return Document(uri=uri, content=content, content_type="text/plain")
    """

    def supports(self, uri: str) -> bool:
        """
        Report whether this provider is able to handle a URI.

        Args:
            uri: The URI being considered.

        Returns:
            True when the provider can fetch the URI.
        """
        ...

    def fetch(self, uri: str) -> Document:
        """
        Retrieve the content behind a URI.

        Args:
            uri: The URI to retrieve.

        Returns:
            A Document carrying the text content.

        Raises:
            ValueError: The URI is malformed.
            IOError: The document cannot be fetched.
        """
        ...
79
+
80
+
81
+ # -----------------------------------------------------------------------------
82
+ # Embedding Generation
83
+ # -----------------------------------------------------------------------------
84
+
85
@runtime_checkable
class EmbeddingProvider(Protocol):
    """
    Structural interface for turning text into embedding vectors.

    The vectors are what make semantic similarity search possible. Index
    and query with the same provider instance so that the vectors remain
    comparable with each other.

    Example implementation:
        class SentenceTransformerEmbedding:
            def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
                from sentence_transformers import SentenceTransformer
                self.model = SentenceTransformer(model_name)

            @property
            def dimension(self) -> int:
                return self.model.get_sentence_embedding_dimension()

            def embed(self, text: str) -> list[float]:
                return self.model.encode(text).tolist()

            def embed_batch(self, texts: list[str]) -> list[list[float]]:
                return self.model.encode(texts).tolist()
    """

    @property
    def dimension(self) -> int:
        """
        Number of components in every vector this provider produces.

        The value has to stay the same for every call; vector stores such
        as ChromaDB rely on it when configuring their index.
        """
        ...

    def embed(self, text: str) -> list[float]:
        """
        Embed a single piece of text.

        Args:
            text: The text to embed.

        Returns:
            The embedding vector as a list of floats.
        """
        ...

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """
        Embed several texts in one call.

        Batching is often cheaper than embedding the texts one at a time.

        Args:
            texts: The texts to embed.

        Returns:
            One embedding vector per input text.
        """
        ...
145
+
146
+
147
+ # -----------------------------------------------------------------------------
148
+ # Summarization
149
+ # -----------------------------------------------------------------------------
150
+
151
@runtime_checkable
class SummarizationProvider(Protocol):
    """
    Structural interface for producing short summaries of document content.

    A summary is stored next to its item so the item can be recalled
    quickly without re-fetching the original document. Summaries are also
    used for full-text search.

    Example implementation:
        class OpenAISummarization:
            def __init__(self, model: str = "gpt-4o-mini"):
                self.client = OpenAI()
                self.model = model

            def summarize(self, content: str, *, max_length: int = 500) -> str:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "Summarize concisely."},
                        {"role": "user", "content": content}
                    ],
                    max_tokens=max_length
                )
                return response.choices[0].message.content
    """

    def summarize(self, content: str, *, max_length: int = 500) -> str:
        """
        Summarize the given content.

        Args:
            content: The complete document content.
            max_length: Approximate upper bound on the summary length, in
                characters. Keyword-only.

        Returns:
            A concise summary of the content.
        """
        ...
189
+
190
+
191
+ # -----------------------------------------------------------------------------
192
+ # Tagging
193
+ # -----------------------------------------------------------------------------
194
+
195
@runtime_checkable
class TaggingProvider(Protocol):
    """
    Structural interface for deriving structured tags from document content.

    Tags support conventional navigation and filtering. An implementation
    inspects the content and returns the key-value pairs it finds relevant.

    Example implementation:
        class OpenAITagging:
            def __init__(self, model: str = "gpt-4o-mini"):
                self.client = OpenAI()
                self.model = model

            def tag(self, content: str) -> dict[str, str]:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[...],
                    response_format={"type": "json_object"}
                )
                return json.loads(response.choices[0].message.content)
    """

    def tag(self, content: str) -> dict[str, str]:
        """
        Produce tags for the given content.

        Args:
            content: The complete document content.

        Returns:
            A dictionary mapping tag keys to tag values.

        Note:
            Use lowercase keys with underscores (e.g., "content_type") and
            plain string values. Keys that begin with "_" are system tags
            and should not be generated here.
        """
        ...
234
+
235
+
236
+ # -----------------------------------------------------------------------------
237
+ # Provider Registry
238
+ # -----------------------------------------------------------------------------
239
+
240
+ class ProviderRegistry:
241
+ """
242
+ Registry for discovering and instantiating providers.
243
+
244
+ Providers are registered by name and can be instantiated from configuration.
245
+ This allows the store configuration (TOML) to specify providers by name
246
+ rather than requiring code changes.
247
+
248
+ Example:
249
+ registry = ProviderRegistry()
250
+ registry.register_embedding("sentence-transformers", SentenceTransformerEmbedding)
251
+ registry.register_embedding("openai", OpenAIEmbedding)
252
+
253
+ # Later, from config:
254
+ provider = registry.create_embedding("sentence-transformers", {"model": "all-MiniLM-L6-v2"})
255
+ """
256
+
257
+ def __init__(self):
258
+ self._embedding_providers: dict[str, type] = {}
259
+ self._summarization_providers: dict[str, type] = {}
260
+ self._tagging_providers: dict[str, type] = {}
261
+ self._document_providers: dict[str, type] = {}
262
+ self._lazy_loaded = False
263
+
264
+ def _ensure_providers_loaded(self) -> None:
265
+ """Lazily load all provider modules."""
266
+ if self._lazy_loaded:
267
+ return
268
+
269
+ self._lazy_loaded = True
270
+
271
+ # Import provider modules to trigger registration
272
+ # These imports are safe - they only register classes, don't instantiate
273
+ try:
274
+ from . import documents
275
+ except ImportError:
276
+ pass # Document provider might not be available
277
+
278
+ try:
279
+ from . import embeddings
280
+ except ImportError:
281
+ pass # Embedding providers might not be available
282
+
283
+ try:
284
+ from . import summarization
285
+ except ImportError:
286
+ pass # Summarization providers might not be available
287
+
288
+ try:
289
+ from . import llm
290
+ except ImportError:
291
+ pass # LLM providers might not be available
292
+
293
+ try:
294
+ from . import mlx
295
+ except ImportError:
296
+ pass # MLX providers might not be available
297
+
298
+ # Registration methods
299
+
300
+ def register_embedding(self, name: str, provider_class: type) -> None:
301
+ """Register an embedding provider class."""
302
+ self._embedding_providers[name] = provider_class
303
+
304
+ def register_summarization(self, name: str, provider_class: type) -> None:
305
+ """Register a summarization provider class."""
306
+ self._summarization_providers[name] = provider_class
307
+
308
+ def register_tagging(self, name: str, provider_class: type) -> None:
309
+ """Register a tagging provider class."""
310
+ self._tagging_providers[name] = provider_class
311
+
312
+ def register_document(self, name: str, provider_class: type) -> None:
313
+ """Register a document provider class."""
314
+ self._document_providers[name] = provider_class
315
+
316
+ # Factory methods
317
+
318
+ def create_embedding(self, name: str, params: dict | None = None) -> EmbeddingProvider:
319
+ """Create an embedding provider instance."""
320
+ self._ensure_providers_loaded()
321
+ if name not in self._embedding_providers:
322
+ available = ", ".join(self._embedding_providers.keys()) or "none"
323
+ raise ValueError(
324
+ f"Unknown embedding provider: '{name}'. "
325
+ f"Available providers: {available}. "
326
+ f"Install missing dependencies or check provider name."
327
+ )
328
+ try:
329
+ return self._embedding_providers[name](**(params or {}))
330
+ except Exception as e:
331
+ raise RuntimeError(
332
+ f"Failed to create embedding provider '{name}': {e}\n"
333
+ f"Make sure required dependencies are installed."
334
+ ) from e
335
+
336
+ def create_summarization(self, name: str, params: dict | None = None) -> SummarizationProvider:
337
+ """Create a summarization provider instance."""
338
+ self._ensure_providers_loaded()
339
+ if name not in self._summarization_providers:
340
+ available = ", ".join(self._summarization_providers.keys()) or "none"
341
+ raise ValueError(
342
+ f"Unknown summarization provider: '{name}'. "
343
+ f"Available providers: {available}. "
344
+ f"Install missing dependencies or check provider name."
345
+ )
346
+ try:
347
+ return self._summarization_providers[name](**(params or {}))
348
+ except Exception as e:
349
+ raise RuntimeError(
350
+ f"Failed to create summarization provider '{name}': {e}\n"
351
+ f"Make sure required dependencies are installed."
352
+ ) from e
353
+
354
+ def create_tagging(self, name: str, params: dict | None = None) -> TaggingProvider:
355
+ """Create a tagging provider instance."""
356
+ self._ensure_providers_loaded()
357
+ if name not in self._tagging_providers:
358
+ available = ", ".join(self._tagging_providers.keys()) or "none"
359
+ raise ValueError(
360
+ f"Unknown tagging provider: '{name}'. "
361
+ f"Available providers: {available}. "
362
+ f"Install missing dependencies or check provider name."
363
+ )
364
+ try:
365
+ return self._tagging_providers[name](**(params or {}))
366
+ except Exception as e:
367
+ raise RuntimeError(
368
+ f"Failed to create tagging provider '{name}': {e}\n"
369
+ f"Make sure required dependencies are installed."
370
+ ) from e
371
+
372
+ def create_document(self, name: str, params: dict | None = None) -> DocumentProvider:
373
+ """Create a document provider instance."""
374
+ self._ensure_providers_loaded()
375
+ if name not in self._document_providers:
376
+ available = ", ".join(self._document_providers.keys()) or "none"
377
+ raise ValueError(
378
+ f"Unknown document provider: '{name}'. "
379
+ f"Available providers: {available}. "
380
+ f"Install missing dependencies or check provider name."
381
+ )
382
+ try:
383
+ return self._document_providers[name](**(params or {}))
384
+ except Exception as e:
385
+ raise RuntimeError(
386
+ f"Failed to create document provider '{name}': {e}\n"
387
+ f"Make sure required dependencies are installed."
388
+ ) from e
389
+
390
+ # Introspection
391
+
392
+ def list_embedding_providers(self) -> list[str]:
393
+ """List registered embedding provider names."""
394
+ return list(self._embedding_providers.keys())
395
+
396
+ def list_summarization_providers(self) -> list[str]:
397
+ """List registered summarization provider names."""
398
+ return list(self._summarization_providers.keys())
399
+
400
+ def list_tagging_providers(self) -> list[str]:
401
+ """List registered tagging provider names."""
402
+ return list(self._tagging_providers.keys())
403
+
404
+ def list_document_providers(self) -> list[str]:
405
+ """List registered document provider names."""
406
+ return list(self._document_providers.keys())
407
+
408
+
409
# Single module-level registry shared by every caller.
# Concrete provider modules add themselves to it when they are imported.
_registry = ProviderRegistry()


def get_registry() -> ProviderRegistry:
    """Return the shared module-level ProviderRegistry instance."""
    return _registry
@@ -0,0 +1,250 @@
1
+ """
2
+ Document providers for fetching content from various URI schemes.
3
+ """
4
+
5
+ from pathlib import Path
6
+ from urllib.parse import urlparse
7
+
8
+ from .base import Document, DocumentProvider, get_registry
9
+
10
+
11
class FileDocumentProvider:
    """
    Fetches documents from the local filesystem.

    Accepts file:// URIs and bare absolute paths, guesses the content type
    from the file extension, and extracts plain text from PDF and HTML
    files. PDF support needs the optional 'pypdf' package and HTML support
    needs 'beautifulsoup4'; both are imported only when required.
    """

    # Extension -> MIME type; unknown extensions fall back to "text/plain".
    EXTENSION_TYPES = {
        ".md": "text/markdown",
        ".markdown": "text/markdown",
        ".txt": "text/plain",
        ".py": "text/x-python",
        ".js": "text/javascript",
        ".ts": "text/typescript",
        ".json": "application/json",
        ".yaml": "text/yaml",
        ".yml": "text/yaml",
        ".html": "text/html",
        ".htm": "text/html",
        ".css": "text/css",
        ".xml": "application/xml",
        ".rst": "text/x-rst",
        ".pdf": "application/pdf",
    }

    def supports(self, uri: str) -> bool:
        """Check if this is a file:// URI or a bare absolute path."""
        return uri.startswith(("file://", "/"))

    @staticmethod
    def _path_from_uri(uri: str) -> Path:
        """
        Convert a file:// URI or bare path into a filesystem path.

        The text after "file://" is used verbatim when it names an existing
        path. Otherwise percent-escapes (such as %20) are decoded and that
        path is used if it exists, so properly encoded file URIs resolve
        without changing the result for inputs that already worked.
        """
        if not uri.startswith("file://"):
            return Path(uri)

        from urllib.parse import unquote

        raw = uri.removeprefix("file://")
        literal = Path(raw)
        if not literal.exists():
            decoded = Path(unquote(raw))
            if decoded.exists():
                return decoded
        return literal

    def fetch(self, uri: str) -> Document:
        """
        Read a file and return its text, extracting text from PDF/HTML.

        Args:
            uri: A file:// URI or a bare filesystem path.

        Returns:
            Document whose uri is the absolute file:// form of the path and
            whose metadata holds the file's size, modification time and name.

        Raises:
            IOError: The path does not exist, is not a regular file, cannot
                be decoded as UTF-8 text, or text extraction failed.
        """
        path = self._path_from_uri(uri)

        if not path.exists():
            raise IOError(f"File not found: {path}")

        if not path.is_file():
            raise IOError(f"Not a file: {path}")

        # Detect content type from the extension.
        suffix = path.suffix.lower()
        content_type = self.EXTENSION_TYPES.get(suffix, "text/plain")

        # Extract text based on file type.
        if suffix == ".pdf":
            content = self._extract_pdf_text(path)
        elif suffix in (".html", ".htm"):
            content = self._extract_html_text(path)
        else:
            try:
                content = path.read_text(encoding="utf-8")
            except UnicodeDecodeError as exc:
                raise IOError(f"Cannot read file as text: {path}") from exc

        stat = path.stat()
        metadata = {
            "size": stat.st_size,
            "modified": stat.st_mtime,
            "name": path.name,
        }

        return Document(
            uri=f"file://{path.resolve()}",  # Normalize to absolute
            content=content,
            content_type=content_type,
            metadata=metadata,
        )

    def _extract_pdf_text(self, path: Path) -> str:
        """
        Extract text from a PDF file, one blank line between pages.

        Raises:
            IOError: 'pypdf' is not installed, the PDF cannot be parsed, or
                no page yields any text.
        """
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise IOError(
                "PDF support requires 'pypdf' library. "
                "Install with: pip install pypdf\n"
                f"Cannot read PDF: {path}"
            ) from exc

        try:
            reader = PdfReader(path)
            text_parts = [
                text for page in reader.pages if (text := page.extract_text())
            ]
        except Exception as e:
            raise IOError(f"Failed to extract text from PDF {path}: {e}") from e

        # Checked outside the try block so this error is not caught and
        # re-wrapped by the generic handler above.
        if not text_parts:
            raise IOError(f"No text extracted from PDF: {path}")

        return "\n\n".join(text_parts)

    def _extract_html_text(self, path: Path) -> str:
        """
        Extract the visible text from an HTML file.

        Script and style elements are dropped, then whitespace is
        normalized so that each non-empty phrase ends up on its own line.

        Raises:
            IOError: 'beautifulsoup4' is not installed, or the file cannot
                be read or parsed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as exc:
            raise IOError(
                "HTML text extraction requires 'beautifulsoup4' library. "
                "Install with: pip install beautifulsoup4\n"
                f"Cannot extract text from HTML: {path}"
            ) from exc

        try:
            html_content = path.read_text(encoding="utf-8")
            soup = BeautifulSoup(html_content, "html.parser")

            # Remove script and style elements; they carry no readable text.
            for element in soup(["script", "style"]):
                element.decompose()

            text = soup.get_text()
        except Exception as e:
            raise IOError(f"Failed to extract text from HTML {path}: {e}") from e

        # Break lines on runs of two spaces (layout gaps) rather than on
        # every single space, which would put each word on its own line.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return "\n".join(chunk for chunk in chunks if chunk)
144
+
145
+
146
class HttpDocumentProvider:
    """
    Fetches documents from HTTP/HTTPS URLs.

    Requires the `requests` library (optional dependency), which is
    imported only when a fetch is attempted.
    """

    def __init__(self, timeout: int = 30, max_size: int = 10_000_000):
        """
        Args:
            timeout: Request timeout in seconds
            max_size: Maximum content size in bytes
        """
        self.timeout = timeout
        self.max_size = max_size

    def supports(self, uri: str) -> bool:
        """Check if this is an HTTP(S) URL."""
        return uri.startswith(("http://", "https://"))

    def _read_limited(self, response) -> str:
        """
        Read at most ``max_size`` bytes of the response body and decode it.

        The body is consumed in chunks and reading stops once the limit is
        reached, so an oversized or unbounded response is never loaded in
        full. The bytes are decoded with the encoding declared by the
        response, falling back to UTF-8 when none is declared or the
        declared codec is unknown; undecodable bytes are replaced.
        """
        remaining = self.max_size
        parts = []
        for chunk in response.iter_content(chunk_size=65536):
            if remaining <= 0:
                break
            parts.append(chunk[:remaining])
            remaining -= len(chunk)
        raw = b"".join(parts)

        try:
            return raw.decode(response.encoding or "utf-8", errors="replace")
        except LookupError:
            # The server declared a codec Python does not know.
            return raw.decode("utf-8", errors="replace")

    def fetch(self, uri: str) -> Document:
        """
        Fetch content from an HTTP(S) URL.

        Args:
            uri: The URL to fetch.

        Returns:
            Document with the decoded body (truncated to ``max_size``
            bytes), the media type from the Content-Type header without
            parameters, and the status code and headers as metadata.

        Raises:
            RuntimeError: The 'requests' library is not installed.
            IOError: The request failed, returned an error status, or the
                declared Content-Length exceeds ``max_size``.
        """
        try:
            import requests
        except ImportError as exc:
            raise RuntimeError(
                "HTTP document fetching requires 'requests' library"
            ) from exc

        try:
            # Context manager ensures the streamed connection is closed,
            # including when reading stops early at the size limit.
            with requests.get(
                uri,
                timeout=self.timeout,
                headers={"User-Agent": "keep/0.1"},
                stream=True,
            ) as response:
                response.raise_for_status()

                # Reject early when the server declares an oversized body;
                # a malformed Content-Length value is ignored.
                content_length = response.headers.get("content-length")
                if (
                    content_length
                    and content_length.strip().isdigit()
                    and int(content_length) > self.max_size
                ):
                    raise IOError(f"Content too large: {content_length} bytes")

                content = self._read_limited(response)

                # Keep only the media type, dropping parameters like charset.
                content_type = response.headers.get("content-type", "text/plain")
                if ";" in content_type:
                    content_type = content_type.split(";")[0].strip()

                return Document(
                    uri=uri,
                    content=content,
                    content_type=content_type,
                    metadata={
                        "status_code": response.status_code,
                        "headers": dict(response.headers),
                    },
                )
        except requests.RequestException as e:
            raise IOError(f"Failed to fetch {uri}: {e}") from e
207
+
208
+
209
+ class CompositeDocumentProvider:
210
+ """
211
+ Combines multiple document providers, delegating to the appropriate one.
212
+
213
+ This is the default provider used by Keeper.
214
+ """
215
+
216
+ def __init__(self, providers: list[DocumentProvider] | None = None):
217
+ """
218
+ Args:
219
+ providers: List of providers to try. If None, uses defaults.
220
+ """
221
+ if providers is None:
222
+ self._providers = [
223
+ FileDocumentProvider(),
224
+ HttpDocumentProvider(),
225
+ ]
226
+ else:
227
+ self._providers = list(providers)
228
+
229
+ def supports(self, uri: str) -> bool:
230
+ """Check if any provider supports this URI."""
231
+ return any(p.supports(uri) for p in self._providers)
232
+
233
+ def fetch(self, uri: str) -> Document:
234
+ """Fetch using the first provider that supports this URI."""
235
+ for provider in self._providers:
236
+ if provider.supports(uri):
237
+ return provider.fetch(uri)
238
+
239
+ raise ValueError(f"No provider supports URI: {uri}")
240
+
241
+ def add_provider(self, provider: DocumentProvider) -> None:
242
+ """Add a provider to the list (checked first)."""
243
+ self._providers.insert(0, provider)
244
+
245
+
246
# Make the document providers defined above available by name through the
# shared registry.
_registry = get_registry()
for _name, _provider_class in (
    ("file", FileDocumentProvider),
    ("http", HttpDocumentProvider),
    ("composite", CompositeDocumentProvider),
):
    _registry.register_document(_name, _provider_class)
del _name, _provider_class  # keep loop variables out of the module namespace