admina-framework 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. admina/__init__.py +34 -0
  2. admina/cli/__init__.py +14 -0
  3. admina/cli/commands/__init__.py +14 -0
  4. admina/cli/main.py +1522 -0
  5. admina/cli/templates/admina.yaml.j2 +77 -0
  6. admina/cli/templates/docker-compose.yml.j2 +254 -0
  7. admina/cli/templates/env.j2 +10 -0
  8. admina/cli/templates/main.py.j2 +95 -0
  9. admina/cli/templates/plugin.py.j2 +145 -0
  10. admina/cli/templates/plugin_pyproject.toml.j2 +15 -0
  11. admina/cli/templates/plugin_readme.md.j2 +27 -0
  12. admina/cli/templates/plugin_test.py.j2 +48 -0
  13. admina/core/__init__.py +14 -0
  14. admina/core/config.py +497 -0
  15. admina/core/event_bus.py +112 -0
  16. admina/core/secrets.py +257 -0
  17. admina/core/types.py +146 -0
  18. admina/dashboard/__init__.py +8 -0
  19. admina/dashboard/static/heimdall.png +0 -0
  20. admina/dashboard/static/index.html +1045 -0
  21. admina/dashboard/static/vendor/alpinejs.min.js +5 -0
  22. admina/domains/__init__.py +14 -0
  23. admina/domains/agent_security/__init__.py +41 -0
  24. admina/domains/agent_security/firewall.py +634 -0
  25. admina/domains/agent_security/loop_breaker.py +176 -0
  26. admina/domains/ai_infra/__init__.py +79 -0
  27. admina/domains/ai_infra/llm_engine.py +477 -0
  28. admina/domains/ai_infra/rag.py +817 -0
  29. admina/domains/ai_infra/webui.py +292 -0
  30. admina/domains/compliance/__init__.py +109 -0
  31. admina/domains/compliance/cross_regulation.py +314 -0
  32. admina/domains/compliance/eu_ai_act.py +367 -0
  33. admina/domains/compliance/forensic.py +380 -0
  34. admina/domains/compliance/gdpr.py +331 -0
  35. admina/domains/compliance/nis2.py +258 -0
  36. admina/domains/compliance/oisg.py +658 -0
  37. admina/domains/compliance/otel.py +101 -0
  38. admina/domains/data_sovereignty/__init__.py +42 -0
  39. admina/domains/data_sovereignty/classification.py +102 -0
  40. admina/domains/data_sovereignty/pii.py +260 -0
  41. admina/domains/data_sovereignty/residency.py +121 -0
  42. admina/integrations/__init__.py +14 -0
  43. admina/integrations/_engines.py +63 -0
  44. admina/integrations/cheshirecat/__init__.py +13 -0
  45. admina/integrations/cheshirecat/admina-plugin/admina_governance.py +207 -0
  46. admina/integrations/crewai/__init__.py +13 -0
  47. admina/integrations/crewai/callbacks.py +347 -0
  48. admina/integrations/langchain/__init__.py +13 -0
  49. admina/integrations/langchain/callbacks.py +341 -0
  50. admina/integrations/n8n/__init__.py +14 -0
  51. admina/integrations/openclaw/__init__.py +14 -0
  52. admina/plugins/__init__.py +49 -0
  53. admina/plugins/base.py +633 -0
  54. admina/plugins/builtin/__init__.py +14 -0
  55. admina/plugins/builtin/adapters/__init__.py +14 -0
  56. admina/plugins/builtin/adapters/ollama.py +120 -0
  57. admina/plugins/builtin/adapters/openai.py +138 -0
  58. admina/plugins/builtin/alerts/__init__.py +14 -0
  59. admina/plugins/builtin/alerts/log.py +66 -0
  60. admina/plugins/builtin/alerts/webhook.py +102 -0
  61. admina/plugins/builtin/auth/__init__.py +14 -0
  62. admina/plugins/builtin/auth/apikey.py +138 -0
  63. admina/plugins/builtin/compliance/__init__.py +14 -0
  64. admina/plugins/builtin/compliance/eu_ai_act.py +202 -0
  65. admina/plugins/builtin/connectors/__init__.py +14 -0
  66. admina/plugins/builtin/connectors/chromadb.py +137 -0
  67. admina/plugins/builtin/connectors/filesystem.py +111 -0
  68. admina/plugins/builtin/forensic/__init__.py +14 -0
  69. admina/plugins/builtin/forensic/filesystem.py +163 -0
  70. admina/plugins/builtin/forensic/minio.py +180 -0
  71. admina/plugins/builtin/guards/__init__.py +0 -0
  72. admina/plugins/builtin/guards/guardrailsai_guard.py +172 -0
  73. admina/plugins/builtin/pii/__init__.py +14 -0
  74. admina/plugins/builtin/pii/spacy_regex.py +160 -0
  75. admina/plugins/builtin/transports/__init__.py +14 -0
  76. admina/plugins/builtin/transports/http_rest.py +97 -0
  77. admina/plugins/builtin/transports/mcp.py +173 -0
  78. admina/plugins/registry.py +356 -0
  79. admina/proxy/__init__.py +15 -0
  80. admina/proxy/api/__init__.py +17 -0
  81. admina/proxy/api/dashboard.py +925 -0
  82. admina/proxy/api/integration.py +153 -0
  83. admina/proxy/config.py +214 -0
  84. admina/proxy/engine_bridge.py +306 -0
  85. admina/proxy/governance.py +232 -0
  86. admina/proxy/main.py +1484 -0
  87. admina/proxy/multi_upstream.py +156 -0
  88. admina/proxy/state.py +97 -0
  89. admina/py.typed +0 -0
  90. admina/sdk/__init__.py +34 -0
  91. admina/sdk/_compat.py +43 -0
  92. admina/sdk/compliance_kit.py +359 -0
  93. admina/sdk/governed_agent.py +391 -0
  94. admina/sdk/governed_data.py +434 -0
  95. admina/sdk/governed_model.py +241 -0
  96. admina_framework-0.9.0.dist-info/METADATA +575 -0
  97. admina_framework-0.9.0.dist-info/RECORD +102 -0
  98. admina_framework-0.9.0.dist-info/WHEEL +5 -0
  99. admina_framework-0.9.0.dist-info/entry_points.txt +2 -0
  100. admina_framework-0.9.0.dist-info/licenses/LICENSE +191 -0
  101. admina_framework-0.9.0.dist-info/licenses/NOTICE +16 -0
  102. admina_framework-0.9.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,817 @@
1
+ # Copyright © 2025–2026 Stefano Noferi & Admina contributors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Admina — RAG pipeline module.
16
+
17
+ Document ingest (PDF, DOCX, HTML, CSV, XML), chunking (recursive character
18
+ and semantic), embedding (via Ollama or sentence-transformers), vector store
19
+ (ChromaDB default), and retrieval with ranking and source citation.
20
+
21
+ Heavy operations (container start) are orchestrated by the CLI / Docker
22
+ Compose template. This module provides the pure-Python pipeline logic,
23
+ structured configuration, and Compose fragment generation for the ChromaDB
24
+ container.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import asyncio
30
+ import hashlib
31
+ import logging
32
+ import re
33
+ from dataclasses import dataclass, field
34
+ from enum import Enum
35
+ from pathlib import Path
36
+ from typing import Any, Protocol, runtime_checkable
37
+
38
+ logger = logging.getLogger("admina.ai_infra.rag")
39
+
40
+
41
+ # ── Document types ───────────────────────────────────────────
42
+
43
+
44
class DocumentFormat(str, Enum):
    """File formats recognised by the ingest pipeline.

    Every member is also a ``str`` whose value is the lowercase format
    name, so ``DocumentFormat.PDF == "pdf"`` holds.
    """

    PDF = "pdf"
    DOCX = "docx"
    HTML = "html"
    CSV = "csv"
    XML = "xml"
    TXT = "txt"
53
+
54
+
55
@dataclass
class Document:
    """Raw input document, before parsing and chunking.

    When ``doc_id`` is left empty it is derived in ``__post_init__`` from a
    SHA-256 digest of ``source`` followed by the first 256 characters of
    ``content``, truncated to 16 hexadecimal characters.
    """

    content: str
    metadata: dict[str, Any] = field(default_factory=dict)
    source: str = ""
    format: DocumentFormat = DocumentFormat.TXT
    doc_id: str = ""

    def __post_init__(self) -> None:
        # An explicitly supplied identifier always wins.
        if self.doc_id:
            return
        fingerprint = self.source + self.content[:256]
        digest = hashlib.sha256(fingerprint.encode()).hexdigest()
        self.doc_id = digest[:16]
70
+
71
+
72
@dataclass
class Chunk:
    """One piece of text emitted by a chunking strategy.

    When ``chunk_id`` is left empty it is derived in ``__post_init__`` from a
    SHA-256 digest of ``"<doc_id>:<chunk_index>:<first 64 chars of text>"``,
    truncated to 16 hexadecimal characters. The identifier is computed once
    at construction time; later attribute changes do not refresh it.
    """

    text: str
    metadata: dict[str, Any] = field(default_factory=dict)
    chunk_index: int = 0
    doc_id: str = ""
    chunk_id: str = ""

    def __post_init__(self) -> None:
        # An explicitly supplied identifier always wins.
        if self.chunk_id:
            return
        fingerprint = f"{self.doc_id}:{self.chunk_index}:{self.text[:64]}"
        digest = hashlib.sha256(fingerprint.encode()).hexdigest()
        self.chunk_id = digest[:16]
87
+
88
+
89
@dataclass
class RetrievalResult:
    """One retrieved chunk together with its score and citation fields."""

    # Retrieved chunk text and its similarity score.
    text: str
    score: float
    # Citation details; all optional and empty by default.
    source: str = ""
    doc_id: str = ""
    chunk_index: int = 0
    metadata: dict[str, Any] = field(default_factory=dict)
99
+
100
+
101
+ # ── Document parsing ─────────────────────────────────────────
102
+
103
+
104
# Lowercase file suffix (leading dot included) -> document format.
_FORMAT_BY_SUFFIX: dict[str, DocumentFormat] = dict(
    (
        (".pdf", DocumentFormat.PDF),
        (".docx", DocumentFormat.DOCX),
        (".html", DocumentFormat.HTML),
        (".htm", DocumentFormat.HTML),
        (".csv", DocumentFormat.CSV),
        (".xml", DocumentFormat.XML),
        (".txt", DocumentFormat.TXT),
    )
)
113
+
114
+
115
def detect_format(path: str | Path) -> DocumentFormat:
    """Infer the document format from a path's file extension.

    The lookup is case-insensitive because the suffix is lowercased first.

    Args:
        path: File path or bare file name.

    Returns:
        The matching :class:`DocumentFormat`, or ``DocumentFormat.TXT`` when
        the extension is missing or not recognised.
    """
    extension = Path(path).suffix.lower()
    if extension in _FORMAT_BY_SUFFIX:
        return _FORMAT_BY_SUFFIX[extension]
    return DocumentFormat.TXT
126
+
127
+
128
def parse_plain_text(content: str) -> str:
    """Return *content* unchanged; used for plain text and as the fallback parser."""
    return content
131
+
132
+
133
def parse_html(content: str) -> str:
    """Strip HTML markup and return whitespace-normalised plain text.

    Processing order:

    1. Comments are replaced by a space, including comments that contain ``>``.
    2. ``<script>`` and ``<style>`` elements are removed together with their
       bodies, matching tag names case-insensitively.
    3. Every remaining tag is replaced by a space.
    4. Character references such as ``&amp;`` are decoded. This happens
       after tag removal so that escaped markup (``&lt;b&gt;``) survives as
       literal text instead of being stripped as a tag.
    5. Runs of whitespace are collapsed to a single space.

    Args:
        content: Raw HTML source.

    Returns:
        The extracted text with leading and trailing whitespace removed.
    """
    import html  # local import: the module header may not import it

    text = re.sub(r"<!--.*?-->", " ", content, flags=re.DOTALL)
    text = re.sub(
        r"<(script|style)\b[^>]*>.*?</\1\s*>",
        "",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    text = re.sub(r"<[^>]+>", " ", text)
    text = html.unescape(text)
    return re.sub(r"\s+", " ", text).strip()
140
+
141
+
142
def parse_csv(content: str) -> str:
    """Return CSV content as text with one record per line.

    Surrounding whitespace is trimmed and every line ending is normalised
    to ``"\\n"``. Fields are not parsed or re-quoted.
    """
    return "\n".join(content.strip().splitlines())
146
+
147
+
148
def parse_xml(content: str) -> str:
    """Replace every XML tag with a space and collapse whitespace.

    Only the text between tags is kept; attribute values are discarded along
    with the tags, and character entities are left as written.
    """
    without_tags = re.sub(r"<[^>]+>", " ", content)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
153
+
154
+
155
def parse_document(content: str, fmt: DocumentFormat) -> str:
    """Convert raw content to plain text with the parser matching *fmt*.

    Args:
        content: Raw file content as a string.
        fmt: The document format.

    Returns:
        Extracted plain text.

    Note:
        PDF and DOCX are binary formats that this function does not parse
        (that would need ``PyPDF2`` / ``python-docx``). For both, a warning
        is logged and *content* is returned unchanged.
    """
    # Pairs are compared with ``==`` (not hashed) so that a plain string
    # equal to a member's value selects the same parser.
    text_parsers = (
        (DocumentFormat.HTML, parse_html),
        (DocumentFormat.CSV, parse_csv),
        (DocumentFormat.XML, parse_xml),
    )
    for candidate, parser in text_parsers:
        if fmt == candidate:
            return parser(content)

    if fmt == DocumentFormat.PDF:
        logger.warning("PDF binary parsing requires PyPDF2; returning raw text")
        return content
    if fmt == DocumentFormat.DOCX:
        logger.warning("DOCX binary parsing requires python-docx; returning raw text")
        return content

    return parse_plain_text(content)
183
+
184
+
185
+ # ── Chunking strategies ──────────────────────────────────────
186
+
187
+
188
class ChunkingStrategy(str, Enum):
    """Names of the chunking strategies the pipeline can be configured with."""

    RECURSIVE_CHARACTER = "recursive_character"
    SEMANTIC = "semantic"
193
+
194
+
195
def chunk_recursive_character(
    text: str,
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    separators: list[str] | None = None,
) -> list[str]:
    """Split text using recursive character splitting.

    Separators are tried in order. A segment that still exceeds
    *chunk_size* is split again with the next separator, and finally by
    fixed-size character windows.

    Args:
        text: Input text.
        chunk_size: Maximum characters per chunk before overlap is added.
        chunk_overlap: Number of trailing characters of the previous chunk
            prepended to each following chunk. Negative values are treated
            as ``0``.
        separators: Separator hierarchy (default: paragraph, line,
            sentence, word, character).

    Returns:
        List of text chunks; empty when *text* is empty or *chunk_size* is
        not positive.
    """
    if separators is None:
        separators = ["\n\n", "\n", ". ", " ", ""]

    if not text or chunk_size <= 0:
        return []

    return _recursive_split(text, separators, chunk_size, max(chunk_overlap, 0))


def _recursive_split(
    text: str,
    separators: list[str],
    chunk_size: int,
    chunk_overlap: int,
) -> list[str]:
    """Split *text* along the separator hierarchy, then add overlap once.

    Overlap is applied a single time to the final chunk list. Applying it
    inside the recursion would prepend another prefix at every level the
    result passes through.
    """
    if len(text) <= chunk_size:
        return [text] if text.strip() else []

    # With no usable separator the text is windowed directly; the windows
    # themselves carry the overlap, so no prefix is added afterwards.
    if not separators or separators[0] == "":
        return _fixed_size_split(text, chunk_size, chunk_overlap)

    chunks = _split_on_separators(text, separators, chunk_size)
    return _apply_overlap(chunks, chunk_overlap)


def _split_on_separators(text: str, separators: list[str], chunk_size: int) -> list[str]:
    """Return overlap-free chunks of at most *chunk_size* characters."""
    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    sep = separators[0] if separators else ""
    if sep == "":
        return _fixed_size_split(text, chunk_size, 0)
    remaining_seps = separators[1:]

    chunks: list[str] = []
    current = ""
    for part in text.split(sep):
        candidate = f"{current}{sep}{part}" if current else part
        if len(candidate) <= chunk_size:
            current = candidate
            continue

        # The buffer cannot take this part: flush it, skipping
        # whitespace-only buffers so no empty chunk is emitted.
        if current.strip():
            chunks.append(current.strip())

        if len(part) > chunk_size:
            # Too large on its own: refine with the finer separators. An
            # exhausted separator list falls back to fixed-size windows.
            chunks.extend(_split_on_separators(part, remaining_seps, chunk_size))
            current = ""
        else:
            current = part

    if current.strip():
        chunks.append(current.strip())
    return chunks


def _fixed_size_split(text: str, size: int, overlap: int) -> list[str]:
    """Character-level fixed-size split with overlapping windows.

    Consecutive windows start ``size - overlap`` characters apart. An
    overlap outside ``[0, size)`` disables overlapping. Windowing stops as
    soon as a window reaches the end of the text, so no trailing chunk is
    emitted that is already fully contained in the previous one.
    """
    if size <= 0:
        return []
    step = size - overlap if 0 <= overlap < size else size

    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = min(start + size, len(text))
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= len(text):
            break
        start += step
    return chunks


def _apply_overlap(chunks: list[str], overlap: int) -> list[str]:
    """Prefix each chunk after the first with the previous chunk's tail."""
    if len(chunks) <= 1 or overlap <= 0:
        return chunks
    overlapped = [chunks[0]]
    overlapped.extend(prev[-overlap:] + cur for prev, cur in zip(chunks, chunks[1:]))
    return overlapped
287
+
288
+
289
def chunk_semantic(
    text: str,
    *,
    chunk_size: int = 512,
    min_chunk_size: int = 100,
) -> list[str]:
    """Split text at sentence boundaries respecting chunk size limits.

    Sentences are accumulated into a buffer, joined by single spaces. The
    buffer is emitted as a chunk when the next sentence would push it past
    *chunk_size*. A buffer still shorter than *min_chunk_size* keeps growing
    instead, so no text is ever discarded.

    Args:
        text: Input text.
        chunk_size: Target maximum characters per chunk. A chunk exceeds it
            when a single sentence is longer, or when a buffer below
            *min_chunk_size* absorbs the next sentence.
        min_chunk_size: Minimum characters a buffer must reach before it may
            be emitted ahead of the next sentence. The final chunk may be
            shorter.

    Returns:
        List of text chunks; empty when *text* is empty or *chunk_size* is
        not positive.
    """
    if not text or chunk_size <= 0:
        return []

    # Split after sentence-ending punctuation; drop whitespace-only pieces.
    pieces = (piece.strip() for piece in re.split(r"(?<=[.!?])\s+", text))
    sentences = [piece for piece in pieces if piece]

    chunks: list[str] = []
    current = ""
    for sentence in sentences:
        if not current:
            current = sentence
            continue
        candidate = f"{current} {sentence}"
        if len(candidate) <= chunk_size or len(current) < min_chunk_size:
            # Either it fits, or the buffer is too small to stand alone.
            current = candidate
        else:
            chunks.append(current)
            current = sentence

    if current:
        chunks.append(current)
    return chunks
328
+
329
+
330
+ # ── Embedding interface ──────────────────────────────────────
331
+
332
+
333
class EmbeddingBackend(str, Enum):
    """Names of the embedding backends the pipeline can be configured with."""

    OLLAMA = "ollama"
    SENTENCE_TRANSFORMERS = "sentence-transformers"
338
+
339
+
340
@runtime_checkable
class EmbeddingProvider(Protocol):
    """Structural interface every embedding provider must satisfy.

    The protocol is runtime-checkable, so ``isinstance`` verifies that the
    ``embed`` and ``dimension`` members are present.
    """

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding vector per entry of *texts*."""
        ...

    @property
    def dimension(self) -> int:
        """Length of the embedding vectors the provider produces."""
        ...
352
+
353
+
354
@dataclass
class OllamaEmbedder:
    """Embedding provider using the Ollama API.

    Args:
        base_url: Ollama server URL; a trailing slash is tolerated.
        model: Embedding model name.
    """

    base_url: str = "http://localhost:11434"
    model: str = "nomic-embed-text"
    # Initial value; replaced by the length of the vectors actually
    # returned once ``embed`` has succeeded.
    _dimension: int = 768

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings via Ollama ``/api/embed``.

        One request is sent per text, all over a single HTTP client.

        Args:
            texts: Texts to embed.

        Returns:
            List of embedding vectors, one per input text. An empty input
            returns an empty list without contacting the server.

        Raises:
            ImportError: When the ``httpx`` package is not installed.
            RuntimeError: When the Ollama API is unreachable, returns an
                error status, or responds without an embedding vector.
        """
        if not texts:
            return []

        try:
            import httpx  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'httpx' package is required for OllamaEmbedder. "
                "Install it with: pip install httpx"
            ) from exc

        endpoint = f"{self.base_url.rstrip('/')}/api/embed"
        embeddings: list[list[float]] = []
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                for text in texts:
                    resp = await client.post(
                        endpoint,
                        json={"model": self.model, "input": text},
                    )
                    resp.raise_for_status()
                    vectors = resp.json().get("embeddings") or []
                    if not vectors or not vectors[0]:
                        # An empty vector would only fail later, inside the
                        # vector store, with a less useful error.
                        raise RuntimeError(
                            f"Ollama returned no embedding for model {self.model!r}"
                        )
                    embeddings.append(vectors[0])
        except httpx.HTTPError as exc:
            # Convert transport/status errors to the documented exception
            # type so callers that catch RuntimeError see them.
            raise RuntimeError(f"Ollama embedding request failed: {exc}") from exc

        self._dimension = len(embeddings[-1])
        return embeddings

    @property
    def dimension(self) -> int:
        """Embedding vector dimension."""
        return self._dimension
407
+
408
+
409
@dataclass
class SentenceTransformerEmbedder:
    """Embedding provider using sentence-transformers.

    Args:
        model_name: HuggingFace model name.
    """

    model_name: str = "all-MiniLM-L6-v2"
    # Loaded lazily on first use so construction stays cheap.
    _model: Any = field(default=None, repr=False)
    # Initial value; replaced by the model's reported dimension on load.
    _dimension: int = 384

    def _get_model(self) -> Any:
        """Lazily load the sentence-transformers model.

        Raises:
            ImportError: When ``sentence-transformers`` is not installed.
        """
        if self._model is None:
            try:
                from sentence_transformers import (
                    SentenceTransformer,  # type: ignore[import-untyped]
                )
            except ImportError as exc:
                raise ImportError(
                    "The 'sentence-transformers' package is required. "
                    "Install it with: pip install sentence-transformers"
                ) from exc
            self._model = SentenceTransformer(self.model_name)
            reported = self._model.get_sentence_embedding_dimension()
            if reported:
                # Keep the previous value when the model reports none.
                self._dimension = reported
        return self._model

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings via sentence-transformers.

        Encoding runs in the default executor so the event loop is not
        blocked.

        Args:
            texts: Texts to embed.

        Returns:
            List of embedding vectors, one per input text. An empty input
            returns an empty list without loading the model.
        """
        if not texts:
            return []
        model = self._get_model()
        # Inside a coroutine a loop is always running; get_running_loop()
        # replaces the deprecated get_event_loop() lookup.
        loop = asyncio.get_running_loop()
        vectors = await loop.run_in_executor(None, model.encode, texts)
        return [v.tolist() for v in vectors]

    @property
    def dimension(self) -> int:
        """Embedding vector dimension."""
        return self._dimension
455
+
456
+
457
+ # ── Vector store interface ───────────────────────────────────
458
+
459
+
460
@dataclass
class ChromaDBConfig:
    """Container configuration for ChromaDB."""

    image: str = "chromadb/chroma:0.5.23"
    container_name: str = "admina-chromadb"
    # Host port; the container side of the mapping is always 8000.
    port: int = 8000
    # Directory inside the container where the data is persisted.
    persist_directory: str = "/chroma/chroma"

    def to_compose_dict(self) -> dict[str, Any]:
        """Return a docker-compose service fragment.

        The named volume ``chromadb-data`` is mounted at
        ``persist_directory``, so the mount always matches the
        ``PERSIST_DIRECTORY`` environment variable passed to the container.
        """
        return {
            "image": self.image,
            "container_name": self.container_name,
            "ports": [f"{self.port}:8000"],
            "volumes": [f"chromadb-data:{self.persist_directory}"],
            "environment": [
                "IS_PERSISTENT=TRUE",
                f"PERSIST_DIRECTORY={self.persist_directory}",
                "ANONYMIZED_TELEMETRY=FALSE",
            ],
            "healthcheck": {
                # Runs inside the container, hence the fixed port 8000.
                "test": ["CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat"],
                "interval": "15s",
                "timeout": "5s",
                "retries": 5,
            },
            "networks": ["admina"],
            "restart": "unless-stopped",
        }
490
+
491
+
492
+ # ── Ingest result ────────────────────────────────────────────
493
+
494
+
495
@dataclass
class IngestResult:
    """Outcome counters and messages collected during one ingest run."""

    # Number of documents processed and chunks produced.
    doc_count: int = 0
    chunk_count: int = 0
    # Sources of the processed documents, and messages for any failures.
    sources: list[str] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
503
+
504
+
505
+ # ── RAG Pipeline ─────────────────────────────────────────────
506
+
507
+
508
@dataclass
class RAGPipeline:
    """Orchestrates the full RAG pipeline.

    Handles document ingest (parse → chunk → embed → store) and retrieval
    (query → embed → search → rank → cite).
    """

    chunk_size: int = 512
    chunk_overlap: int = 50
    chunking_strategy: ChunkingStrategy = ChunkingStrategy.RECURSIVE_CHARACTER
    embedding_backend: EmbeddingBackend = EmbeddingBackend.OLLAMA
    embedding_model: str = "nomic-embed-text"
    chromadb_host: str = "localhost"
    chromadb_port: int = 8000
    collection_name: str = "admina_default"

    # ── Factory ──────────────────────────────────────────────

    @classmethod
    def from_config(
        cls,
        *,
        backend: str = "chromadb",
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        chunking_strategy: str = "recursive_character",
        embedding_backend: str = "ollama",
        embedding_model: str = "nomic-embed-text",
        chromadb_host: str = "localhost",
        chromadb_port: int = 8000,
        collection_name: str = "admina_default",
    ) -> RAGPipeline:
        """Create a pipeline from admina.yaml values.

        Args:
            backend: Vector store backend (currently only ``"chromadb"``).
                Any other value is logged as a warning and ChromaDB is used.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Overlap between consecutive chunks.
            chunking_strategy: ``"recursive_character"`` or ``"semantic"``.
            embedding_backend: ``"ollama"`` or ``"sentence-transformers"``.
            embedding_model: Model name for embeddings.
            chromadb_host: ChromaDB server host.
            chromadb_port: ChromaDB server port.
            collection_name: Default collection name.

        Raises:
            ValueError: When *chunking_strategy* or *embedding_backend* is
                not one of the supported names.
        """
        if backend != "chromadb":
            # Surface the mismatch instead of silently ignoring the setting.
            logger.warning(
                "Unsupported vector store backend %r; using 'chromadb'",
                backend,
            )
        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            chunking_strategy=ChunkingStrategy(chunking_strategy),
            embedding_backend=EmbeddingBackend(embedding_backend),
            embedding_model=embedding_model,
            chromadb_host=chromadb_host,
            chromadb_port=chromadb_port,
            collection_name=collection_name,
        )

    # ── Document parsing ─────────────────────────────────────

    def parse(self, content: str, fmt: DocumentFormat) -> str:
        """Parse raw content into plain text.

        Args:
            content: Raw file content.
            fmt: Document format.

        Returns:
            Extracted plain text.
        """
        return parse_document(content, fmt)

    # ── Chunking ─────────────────────────────────────────────

    def chunk(self, text: str) -> list[Chunk]:
        """Split text into chunks using the configured strategy.

        Args:
            text: Plain text to chunk.

        Returns:
            List of :class:`Chunk` objects indexed from 0. They carry no
            ``doc_id``; :meth:`ingest_documents` attaches it.
        """
        if self.chunking_strategy == ChunkingStrategy.SEMANTIC:
            raw_chunks = chunk_semantic(
                text,
                chunk_size=self.chunk_size,
            )
        else:
            raw_chunks = chunk_recursive_character(
                text,
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )

        return [Chunk(text=c, chunk_index=i) for i, c in enumerate(raw_chunks)]

    # ── Embedding ────────────────────────────────────────────

    def _get_embedder(self) -> OllamaEmbedder | SentenceTransformerEmbedder:
        """Return a new embedding provider for the configured backend."""
        if self.embedding_backend == EmbeddingBackend.SENTENCE_TRANSFORMERS:
            return SentenceTransformerEmbedder(model_name=self.embedding_model)
        return OllamaEmbedder(model=self.embedding_model)

    # ── Full ingest pipeline ─────────────────────────────────

    async def ingest_documents(
        self,
        documents: list[Document],
    ) -> IngestResult:
        """Run the full ingest pipeline: parse → chunk → embed → store.

        Each chunk is rebuilt with its document's ``doc_id`` before the
        chunk id is derived, so ids differ between documents even when their
        text is identical. ``source``, ``doc_id`` and ``chunk_index`` are
        written to the chunk metadata, which is where :meth:`_rank_results`
        reads its citation fields from.

        Args:
            documents: Documents to ingest.

        Returns:
            An :class:`IngestResult` with counts and any errors.

        Note:
            Requires a running ChromaDB instance and embedding backend.
            In unit tests, mock the ``_store_chunks`` method.
        """
        result = IngestResult()
        all_chunks: list[Chunk] = []

        for doc in documents:
            try:
                text = self.parse(doc.content, doc.format)
                chunks = [
                    Chunk(
                        text=piece.text,
                        chunk_index=piece.chunk_index,
                        doc_id=doc.doc_id,
                        metadata={
                            **piece.metadata,
                            **doc.metadata,
                            # Pipeline-owned keys override user metadata.
                            "source": doc.source,
                            "doc_id": doc.doc_id,
                            "chunk_index": piece.chunk_index,
                        },
                    )
                    for piece in self.chunk(text)
                ]
            except (OSError, ValueError, RuntimeError) as exc:
                result.errors.append(f"{doc.source}: {exc}")
                logger.error("Ingest error for %s: %s", doc.source, exc)
            else:
                all_chunks.extend(chunks)
                result.doc_count += 1
                result.sources.append(doc.source)

        result.chunk_count = len(all_chunks)

        if all_chunks:
            try:
                await self._store_chunks(all_chunks)
            except (OSError, ValueError, RuntimeError) as exc:
                result.errors.append(f"store: {exc}")
                logger.error("Failed to store chunks: %s", exc)

        return result

    def ingest_documents_sync(self, documents: list[Document]) -> IngestResult:
        """Synchronous convenience wrapper for :meth:`ingest_documents`.

        Runs the coroutine on a fresh event loop via ``asyncio.run``.

        Raises:
            RuntimeError: When called while an event loop is already running
                in the current thread.
        """
        return asyncio.run(self.ingest_documents(documents))

    async def _store_chunks(self, chunks: list[Chunk]) -> None:
        """Embed chunks and add them to the configured ChromaDB collection.

        Args:
            chunks: Processed chunks to store.

        Raises:
            ImportError: When the ``chromadb`` package is not installed.
        """
        try:
            import chromadb  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'chromadb' package is required for vector storage. "
                "Install it with: pip install chromadb"
            ) from exc

        client = chromadb.HttpClient(
            host=self.chromadb_host,
            port=self.chromadb_port,
        )
        collection = client.get_or_create_collection(name=self.collection_name)

        embedder = self._get_embedder()
        texts = [c.text for c in chunks]
        embeddings = await embedder.embed(texts)

        collection.add(
            ids=[c.chunk_id for c in chunks],
            documents=texts,
            embeddings=embeddings,
            metadatas=[c.metadata for c in chunks],
        )

    # ── Retrieval ────────────────────────────────────────────

    async def retrieve(
        self,
        query: str,
        *,
        top_k: int = 5,
        min_score: float = 0.0,
    ) -> list[RetrievalResult]:
        """Query the vector store and return ranked results with citations.

        Args:
            query: Search query string.
            top_k: Maximum number of results; non-positive values return an
                empty list without contacting any service.
            min_score: Minimum similarity score threshold.

        Returns:
            Ranked list of :class:`RetrievalResult` objects.

        Raises:
            ImportError: When the ``chromadb`` package is not installed.
        """
        if top_k <= 0:
            return []

        try:
            import chromadb  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'chromadb' package is required for retrieval. "
                "Install it with: pip install chromadb"
            ) from exc

        client = chromadb.HttpClient(
            host=self.chromadb_host,
            port=self.chromadb_port,
        )
        collection = client.get_or_create_collection(name=self.collection_name)

        embedder = self._get_embedder()
        query_embedding = (await embedder.embed([query]))[0]

        raw = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )

        return self._rank_results(raw, min_score=min_score)

    def retrieve_sync(
        self,
        query: str,
        *,
        top_k: int = 5,
        min_score: float = 0.0,
    ) -> list[RetrievalResult]:
        """Synchronous convenience wrapper for :meth:`retrieve`.

        Runs the coroutine on a fresh event loop via ``asyncio.run``.

        Raises:
            RuntimeError: When called while an event loop is already running
                in the current thread.
        """
        return asyncio.run(self.retrieve(query, top_k=top_k, min_score=min_score))

    @staticmethod
    def _rank_results(
        raw: dict[str, Any],
        *,
        min_score: float = 0.0,
    ) -> list[RetrievalResult]:
        """Convert ChromaDB query results to ranked RetrievalResults.

        Only the first query's results are read. Each distance ``d`` is
        mapped to the score ``1 / (1 + d)``, rounded to four decimals.

        Args:
            raw: Raw ChromaDB query output.
            min_score: Filter results below this score.

        Returns:
            List of :class:`RetrievalResult` sorted by descending score.
        """

        def first_batch(key: str) -> list[Any]:
            # Tolerate a missing key, a ``None`` value and an empty outer list.
            batches = raw.get(key) or [[]]
            return list(batches[0] or [])

        documents = first_batch("documents")
        metadatas = first_batch("metadatas")
        distances = first_batch("distances")
        if not metadatas:
            # Keep the documents even when no metadata was returned.
            metadatas = [None] * len(documents)

        results: list[RetrievalResult] = []
        for text, meta, dist in zip(documents, metadatas, distances):
            score = round(1.0 / (1.0 + dist), 4)
            if score < min_score:
                continue
            meta = meta or {}
            results.append(
                RetrievalResult(
                    text=text,
                    score=score,
                    source=meta.get("source", ""),
                    doc_id=meta.get("doc_id", ""),
                    chunk_index=meta.get("chunk_index", 0),
                    metadata=meta,
                )
            )

        results.sort(key=lambda r: r.score, reverse=True)
        return results

    # ── Compose generation ───────────────────────────────────

    def compose_service(
        self,
        project_name: str = "admina",
    ) -> dict[str, Any]:
        """Return the docker-compose service dict for ChromaDB.

        Args:
            project_name: Used for container naming
                (``"<project_name>-chromadb"``).
        """
        cfg = ChromaDBConfig(
            container_name=f"{project_name}-chromadb",
            port=self.chromadb_port,
        )
        return cfg.to_compose_dict()

    # ── Status ───────────────────────────────────────────────

    def summary(self) -> dict[str, Any]:
        """Return a JSON-serialisable summary of pipeline config."""
        return {
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "chunking_strategy": self.chunking_strategy.value,
            "embedding_backend": self.embedding_backend.value,
            "embedding_model": self.embedding_model,
            "chromadb_host": self.chromadb_host,
            "chromadb_port": self.chromadb_port,
            "collection_name": self.collection_name,
        }