admina-framework 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- admina/__init__.py +34 -0
- admina/cli/__init__.py +14 -0
- admina/cli/commands/__init__.py +14 -0
- admina/cli/main.py +1522 -0
- admina/cli/templates/admina.yaml.j2 +77 -0
- admina/cli/templates/docker-compose.yml.j2 +254 -0
- admina/cli/templates/env.j2 +10 -0
- admina/cli/templates/main.py.j2 +95 -0
- admina/cli/templates/plugin.py.j2 +145 -0
- admina/cli/templates/plugin_pyproject.toml.j2 +15 -0
- admina/cli/templates/plugin_readme.md.j2 +27 -0
- admina/cli/templates/plugin_test.py.j2 +48 -0
- admina/core/__init__.py +14 -0
- admina/core/config.py +497 -0
- admina/core/event_bus.py +112 -0
- admina/core/secrets.py +257 -0
- admina/core/types.py +146 -0
- admina/dashboard/__init__.py +8 -0
- admina/dashboard/static/heimdall.png +0 -0
- admina/dashboard/static/index.html +1045 -0
- admina/dashboard/static/vendor/alpinejs.min.js +5 -0
- admina/domains/__init__.py +14 -0
- admina/domains/agent_security/__init__.py +41 -0
- admina/domains/agent_security/firewall.py +634 -0
- admina/domains/agent_security/loop_breaker.py +176 -0
- admina/domains/ai_infra/__init__.py +79 -0
- admina/domains/ai_infra/llm_engine.py +477 -0
- admina/domains/ai_infra/rag.py +817 -0
- admina/domains/ai_infra/webui.py +292 -0
- admina/domains/compliance/__init__.py +109 -0
- admina/domains/compliance/cross_regulation.py +314 -0
- admina/domains/compliance/eu_ai_act.py +367 -0
- admina/domains/compliance/forensic.py +380 -0
- admina/domains/compliance/gdpr.py +331 -0
- admina/domains/compliance/nis2.py +258 -0
- admina/domains/compliance/oisg.py +658 -0
- admina/domains/compliance/otel.py +101 -0
- admina/domains/data_sovereignty/__init__.py +42 -0
- admina/domains/data_sovereignty/classification.py +102 -0
- admina/domains/data_sovereignty/pii.py +260 -0
- admina/domains/data_sovereignty/residency.py +121 -0
- admina/integrations/__init__.py +14 -0
- admina/integrations/_engines.py +63 -0
- admina/integrations/cheshirecat/__init__.py +13 -0
- admina/integrations/cheshirecat/admina-plugin/admina_governance.py +207 -0
- admina/integrations/crewai/__init__.py +13 -0
- admina/integrations/crewai/callbacks.py +347 -0
- admina/integrations/langchain/__init__.py +13 -0
- admina/integrations/langchain/callbacks.py +341 -0
- admina/integrations/n8n/__init__.py +14 -0
- admina/integrations/openclaw/__init__.py +14 -0
- admina/plugins/__init__.py +49 -0
- admina/plugins/base.py +633 -0
- admina/plugins/builtin/__init__.py +14 -0
- admina/plugins/builtin/adapters/__init__.py +14 -0
- admina/plugins/builtin/adapters/ollama.py +120 -0
- admina/plugins/builtin/adapters/openai.py +138 -0
- admina/plugins/builtin/alerts/__init__.py +14 -0
- admina/plugins/builtin/alerts/log.py +66 -0
- admina/plugins/builtin/alerts/webhook.py +102 -0
- admina/plugins/builtin/auth/__init__.py +14 -0
- admina/plugins/builtin/auth/apikey.py +138 -0
- admina/plugins/builtin/compliance/__init__.py +14 -0
- admina/plugins/builtin/compliance/eu_ai_act.py +202 -0
- admina/plugins/builtin/connectors/__init__.py +14 -0
- admina/plugins/builtin/connectors/chromadb.py +137 -0
- admina/plugins/builtin/connectors/filesystem.py +111 -0
- admina/plugins/builtin/forensic/__init__.py +14 -0
- admina/plugins/builtin/forensic/filesystem.py +163 -0
- admina/plugins/builtin/forensic/minio.py +180 -0
- admina/plugins/builtin/guards/__init__.py +0 -0
- admina/plugins/builtin/guards/guardrailsai_guard.py +172 -0
- admina/plugins/builtin/pii/__init__.py +14 -0
- admina/plugins/builtin/pii/spacy_regex.py +160 -0
- admina/plugins/builtin/transports/__init__.py +14 -0
- admina/plugins/builtin/transports/http_rest.py +97 -0
- admina/plugins/builtin/transports/mcp.py +173 -0
- admina/plugins/registry.py +356 -0
- admina/proxy/__init__.py +15 -0
- admina/proxy/api/__init__.py +17 -0
- admina/proxy/api/dashboard.py +925 -0
- admina/proxy/api/integration.py +153 -0
- admina/proxy/config.py +214 -0
- admina/proxy/engine_bridge.py +306 -0
- admina/proxy/governance.py +232 -0
- admina/proxy/main.py +1484 -0
- admina/proxy/multi_upstream.py +156 -0
- admina/proxy/state.py +97 -0
- admina/py.typed +0 -0
- admina/sdk/__init__.py +34 -0
- admina/sdk/_compat.py +43 -0
- admina/sdk/compliance_kit.py +359 -0
- admina/sdk/governed_agent.py +391 -0
- admina/sdk/governed_data.py +434 -0
- admina/sdk/governed_model.py +241 -0
- admina_framework-0.9.0.dist-info/METADATA +575 -0
- admina_framework-0.9.0.dist-info/RECORD +102 -0
- admina_framework-0.9.0.dist-info/WHEEL +5 -0
- admina_framework-0.9.0.dist-info/entry_points.txt +2 -0
- admina_framework-0.9.0.dist-info/licenses/LICENSE +191 -0
- admina_framework-0.9.0.dist-info/licenses/NOTICE +16 -0
- admina_framework-0.9.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,817 @@
|
|
|
1
|
+
# Copyright © 2025–2026 Stefano Noferi & Admina contributors
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Admina — RAG pipeline module.
|
|
16
|
+
|
|
17
|
+
Document ingest (PDF, DOCX, HTML, CSV, XML), chunking (recursive character
|
|
18
|
+
and semantic), embedding (via Ollama or sentence-transformers), vector store
|
|
19
|
+
(ChromaDB default), and retrieval with ranking and source citation.
|
|
20
|
+
|
|
21
|
+
Heavy operations (container start) are orchestrated by the CLI / Docker
|
|
22
|
+
Compose template. This module provides the pure-Python pipeline logic,
|
|
23
|
+
structured configuration, and Compose fragment generation for the ChromaDB
|
|
24
|
+
container.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import asyncio
|
|
30
|
+
import hashlib
|
|
31
|
+
import logging
|
|
32
|
+
import re
|
|
33
|
+
from dataclasses import dataclass, field
|
|
34
|
+
from enum import Enum
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Any, Protocol, runtime_checkable
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger("admina.ai_infra.rag")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ── Document types ───────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DocumentFormat(str, Enum):
    """Document formats recognised by the ingest pipeline.

    Members subclass ``str``, so each one compares equal to its lowercase
    value (for example ``DocumentFormat.PDF == "pdf"``).
    """

    # Binary document formats.
    PDF = "pdf"
    DOCX = "docx"

    # Markup and tabular text formats.
    HTML = "html"
    CSV = "csv"
    XML = "xml"

    # Plain text.
    TXT = "txt"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class Document:
    """Raw document content plus provenance, prior to chunking.

    When ``doc_id`` is left empty, a deterministic 16-hex-character
    identifier is derived from ``source`` followed by the first 256
    characters of ``content``.
    """

    content: str
    metadata: dict[str, Any] = field(default_factory=dict)
    source: str = ""
    format: DocumentFormat = DocumentFormat.TXT
    doc_id: str = ""

    def __post_init__(self) -> None:
        """Derive ``doc_id`` from source and content when none was supplied."""
        if self.doc_id:
            return
        fingerprint = self.source + self.content[:256]
        digest = hashlib.sha256(fingerprint.encode()).hexdigest()
        self.doc_id = digest[:16]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class Chunk:
    """One piece of text emitted by a chunking strategy.

    When ``chunk_id`` is left empty, a deterministic 16-hex-character
    identifier is derived from ``doc_id``, ``chunk_index`` and the first
    64 characters of ``text`` at construction time.
    """

    text: str
    metadata: dict[str, Any] = field(default_factory=dict)
    chunk_index: int = 0
    doc_id: str = ""
    chunk_id: str = ""

    def __post_init__(self) -> None:
        """Derive ``chunk_id`` when the caller did not provide one."""
        if self.chunk_id:
            return
        seed = f"{self.doc_id}:{self.chunk_index}:{self.text[:64]}"
        self.chunk_id = hashlib.sha256(seed.encode()).hexdigest()[:16]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class RetrievalResult:
    """One retrieval hit: the matched text, its score, and citation fields."""

    # Matched chunk text and its similarity score.
    text: str
    score: float

    # Citation fields; empty / zero when not supplied.
    source: str = ""
    doc_id: str = ""
    chunk_index: int = 0

    # Metadata associated with the hit (a fresh dict per instance).
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ── Document parsing ─────────────────────────────────────────
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# Lower-cased file extension (including the leading dot) -> document format.
_FORMAT_BY_SUFFIX: dict[str, DocumentFormat] = {
    ".pdf": DocumentFormat.PDF,
    ".docx": DocumentFormat.DOCX,
    ".html": DocumentFormat.HTML,
    ".htm": DocumentFormat.HTML,
    ".csv": DocumentFormat.CSV,
    ".xml": DocumentFormat.XML,
    ".txt": DocumentFormat.TXT,
}


def detect_format(path: str | Path) -> DocumentFormat:
    """Map a file path to a document format using its extension.

    The extension is compared case-insensitively.

    Args:
        path: File path or bare file name.

    Returns:
        The matching :class:`DocumentFormat`; ``DocumentFormat.TXT`` when
        the extension is missing or not recognised.
    """
    extension = Path(path).suffix.lower()
    try:
        return _FORMAT_BY_SUFFIX[extension]
    except KeyError:
        return DocumentFormat.TXT
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def parse_plain_text(content: str) -> str:
    """Return *content* unchanged (plain-text / fallback parser)."""
    return content
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def parse_html(content: str) -> str:
    """Strip HTML markup and return whitespace-normalised plain text.

    Processing order:

    1. HTML comments are removed (including comments containing ``>``).
    2. ``<script>`` and ``<style>`` elements are removed together with
       their bodies, matched case-insensitively.
    3. Every remaining tag is replaced by a space.
    4. Character references such as ``&amp;`` are decoded.
    5. Runs of whitespace are collapsed to a single space.

    This is a lightweight regex-based extractor, not a full HTML parser.

    Args:
        content: HTML source as a string.

    Returns:
        The extracted text, stripped of leading/trailing whitespace.
    """
    # Imported locally so the block does not depend on a module-level import.
    from html import unescape

    # Comments first: a generic tag regex would stop at the first ">" inside.
    text = re.sub(r"<!--.*?-->", " ", content, flags=re.DOTALL)
    # Drop script/style bodies entirely; tag names are case-insensitive in HTML.
    text = re.sub(
        r"<script\b[^>]*>.*?</script\s*>", "", text, flags=re.DOTALL | re.IGNORECASE
    )
    text = re.sub(
        r"<style\b[^>]*>.*?</style\s*>", "", text, flags=re.DOTALL | re.IGNORECASE
    )
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tags are gone so "&lt;b&gt;" stays literal text.
    text = unescape(text)
    return re.sub(r"\s+", " ", text).strip()
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def parse_csv(content: str) -> str:
    """Return CSV content as text with one row per ``\\n``-terminated line.

    Surrounding whitespace is stripped and every line ending is normalised
    to ``\\n``; cell contents and delimiters are left untouched.
    """
    return "\n".join(content.strip().splitlines())
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def parse_xml(content: str) -> str:
    """Replace XML tags with spaces and collapse whitespace.

    Args:
        content: XML source as a string.

    Returns:
        The remaining text with runs of whitespace reduced to one space
        and no leading/trailing whitespace.
    """
    without_tags = re.compile(r"<[^>]+>").sub(" ", content)
    collapsed = re.compile(r"\s+").sub(" ", without_tags)
    return collapsed.strip()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def parse_document(content: str, fmt: DocumentFormat) -> str:
    """Convert raw document content to plain text for the given format.

    Args:
        content: Raw file content as a string.
        fmt: Format that selects the parser.

    Returns:
        Extracted plain text.

    Note:
        PDF and DOCX are not parsed here: a warning naming the library
        required for binary parsing (``PyPDF2`` / ``python-docx``) is
        logged and *content* is returned unchanged. Any other
        unrecognised format is treated as plain text.
    """
    # Text formats with a dedicated parser. Compared with ``==`` rather than
    # looked up in a dict so plain strings equal to a member value still match.
    text_parsers = (
        (DocumentFormat.HTML, parse_html),
        (DocumentFormat.CSV, parse_csv),
        (DocumentFormat.XML, parse_xml),
    )
    for candidate, parser in text_parsers:
        if fmt == candidate:
            return parser(content)

    if fmt == DocumentFormat.PDF:
        logger.warning("PDF binary parsing requires PyPDF2; returning raw text")
        return content

    if fmt == DocumentFormat.DOCX:
        logger.warning("DOCX binary parsing requires python-docx; returning raw text")
        return content

    return parse_plain_text(content)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ── Chunking strategies ──────────────────────────────────────
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class ChunkingStrategy(str, Enum):
    """Names of the chunking strategies the pipeline can be configured with.

    Members subclass ``str`` and compare equal to their configuration value.
    """

    # Separator-hierarchy splitting with a character-level fallback.
    RECURSIVE_CHARACTER = "recursive_character"
    # Sentence-boundary splitting.
    SEMANTIC = "semantic"
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def chunk_recursive_character(
    text: str,
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    separators: list[str] | None = None,
) -> list[str]:
    """Split text using recursive character splitting.

    Each separator is tried in order. Parts are greedily packed into
    pieces of at most *chunk_size* characters; a part that is still too
    large is split again with the next separator, and finally by fixed
    character windows when no separator is left.

    Overlap is applied exactly once, after all pieces are known: each
    chunk except the first is prefixed with the last *chunk_overlap*
    characters of the preceding piece, so such chunks may be up to
    ``chunk_size + chunk_overlap`` characters long. When the first
    separator is ``""`` the text is cut with a sliding window instead and
    every chunk stays within *chunk_size*.

    Args:
        text: Input text.
        chunk_size: Maximum characters per piece before overlap is added.
        chunk_overlap: Characters of context shared between consecutive
            chunks. Values ``<= 0`` disable overlap.
        separators: Separator hierarchy (default: paragraph, line,
            sentence, word, character).

    Returns:
        List of non-empty text chunks; empty when *text* is empty or
        *chunk_size* is not positive.
    """
    if separators is None:
        separators = ["\n\n", "\n", ". ", " ", ""]

    if not text or chunk_size <= 0:
        return []

    return _recursive_split(text, separators, chunk_size, chunk_overlap)


def _split_without_overlap(
    text: str,
    separators: list[str],
    chunk_size: int,
) -> list[str]:
    """Split *text* into non-empty pieces of at most *chunk_size*, no overlap."""
    if len(text) <= chunk_size:
        return [text] if text.strip() else []

    sep = separators[0] if separators else ""
    if sep == "":
        # No separator left: cut by fixed windows so no piece is oversized.
        return _fixed_size_split(text, chunk_size, 0)

    remaining_seps = separators[1:]
    pieces: list[str] = []
    current = ""

    for part in text.split(sep):
        candidate = f"{current}{sep}{part}" if current else part
        if len(candidate) <= chunk_size:
            current = candidate
            continue

        # The accumulator is full: emit it, unless it is only whitespace.
        if current.strip():
            pieces.append(current.strip())

        if len(part) > chunk_size:
            # Too large on its own: refine with the finer separators (or
            # fixed windows once ``remaining_seps`` is exhausted).
            pieces.extend(_split_without_overlap(part, remaining_seps, chunk_size))
            current = ""
        else:
            current = part

    if current.strip():
        pieces.append(current.strip())

    return pieces


def _recursive_split(
    text: str,
    separators: list[str],
    chunk_size: int,
    chunk_overlap: int,
) -> list[str]:
    """Split *text* recursively, then apply overlap once at the top level.

    Applying overlap only here (instead of at every recursion depth)
    keeps prefixes from compounding when a part had to be refined with
    finer separators.
    """
    if len(text) <= chunk_size:
        return [text] if text.strip() else []

    if not separators or separators[0] == "":
        # Pure character-level splitting already overlaps via its window.
        return _fixed_size_split(text, chunk_size, chunk_overlap)

    pieces = _split_without_overlap(text, separators, chunk_size)
    return _apply_overlap(pieces, chunk_overlap) if chunk_overlap > 0 else pieces


def _fixed_size_split(text: str, size: int, overlap: int) -> list[str]:
    """Character-level sliding-window split.

    Args:
        text: Text to cut.
        size: Window width in characters; non-positive values yield ``[]``.
        overlap: Characters shared by consecutive windows. Values outside
            ``[0, size)`` are treated as no overlap so the window always
            advances and never skips text.

    Returns:
        Stripped, non-empty windows in order.
    """
    if size <= 0:
        return []

    step = size - overlap if 0 <= overlap < size else size
    total = len(text)
    chunks: list[str] = []
    start = 0
    while start < total:
        end = min(start + size, total)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == total:
            # The window reached the end; another step would only emit a
            # suffix of this chunk.
            break
        start += step
    return chunks


def _apply_overlap(chunks: list[str], overlap: int) -> list[str]:
    """Prefix each chunk after the first with the tail of its predecessor.

    The prefix is taken from the predecessor as passed in, not from its
    already-prefixed form.
    """
    if len(chunks) <= 1 or overlap <= 0:
        return chunks
    return [chunks[0]] + [
        previous[-overlap:] + current for previous, current in zip(chunks, chunks[1:])
    ]
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def chunk_semantic(
    text: str,
    *,
    chunk_size: int = 512,
    min_chunk_size: int = 100,
) -> list[str]:
    """Split text at sentence boundaries respecting chunk size limits.

    Sentences (split after ``.``, ``!`` or ``?`` followed by whitespace)
    are accumulated until adding the next one would exceed *chunk_size*.
    No text is ever discarded: when the accumulated text is still shorter
    than *min_chunk_size* it keeps absorbing sentences even past
    *chunk_size*, so *chunk_size* is a target rather than a hard limit.
    A single sentence longer than *chunk_size* is kept whole.

    Args:
        text: Input text.
        chunk_size: Target maximum characters per chunk.
        min_chunk_size: Minimum characters a chunk should reach before it
            is closed. The final chunk may be shorter.

    Returns:
        List of non-empty, stripped text chunks; empty when *text* is
        empty or *chunk_size* is not positive.
    """
    if not text or chunk_size <= 0:
        return []

    chunks: list[str] = []
    current = ""

    for sentence in re.split(r"(?<=[.!?])\s+", text):
        candidate = f"{current} {sentence}".strip() if current else sentence
        too_short_to_close = bool(current) and len(current) < min_chunk_size
        if len(candidate) <= chunk_size or too_short_to_close:
            # Either it fits, or closing now would leave an undersized
            # fragment; keep accumulating instead of dropping it.
            current = candidate
            continue

        if current.strip():
            chunks.append(current.strip())
        current = sentence

    if current.strip():
        chunks.append(current.strip())

    return chunks
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
# ── Embedding interface ──────────────────────────────────────
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class EmbeddingBackend(str, Enum):
    """Names of the embedding backends the pipeline can be configured with.

    Members subclass ``str`` and compare equal to their configuration value.
    """

    # Embeddings served over HTTP by an Ollama server.
    OLLAMA = "ollama"
    # Embeddings computed with the sentence-transformers library.
    SENTENCE_TRANSFORMERS = "sentence-transformers"
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
@runtime_checkable
class EmbeddingProvider(Protocol):
    """Structural interface every embedding provider satisfies.

    Being ``runtime_checkable``, ``isinstance`` checks only verify that
    the ``embed`` and ``dimension`` members exist, not their signatures.
    """

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding vector per input text, in input order."""
        ...

    @property
    def dimension(self) -> int:
        """Length of the vectors produced by :meth:`embed`."""
        ...
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
@dataclass
class OllamaEmbedder:
    """Embedding provider using the Ollama API.

    Args:
        base_url: Ollama server URL (a trailing slash is tolerated).
        model: Embedding model name.
    """

    base_url: str = "http://localhost:11434"
    model: str = "nomic-embed-text"
    # Reported dimension; updated from the vectors actually returned.
    _dimension: int = 768

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings via Ollama ``/api/embed``.

        One request is sent per text, sequentially, over a single client.

        Args:
            texts: Texts to embed.

        Returns:
            List of embedding vectors, one per input text in input order.
            An empty input returns ``[]`` without contacting the server.

        Raises:
            ImportError: When the ``httpx`` package is not installed.
            RuntimeError: When the Ollama API is unreachable, returns an
                error status, or responds without an embedding.
        """
        if not texts:
            return []

        try:
            import httpx  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'httpx' package is required for OllamaEmbedder. "
                "Install it with: pip install httpx"
            ) from exc

        # Avoid "//api/embed" when base_url is configured with a trailing slash.
        url = f"{self.base_url.rstrip('/')}/api/embed"
        embeddings: list[list[float]] = []
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                for text in texts:
                    resp = await client.post(
                        url,
                        json={"model": self.model, "input": text},
                    )
                    resp.raise_for_status()
                    vectors = resp.json().get("embeddings") or []
                    if not vectors or not vectors[0]:
                        # Storing an empty vector would only fail later, in
                        # the vector store, with a less helpful message.
                        raise RuntimeError(
                            f"Ollama returned no embedding for model {self.model!r}"
                        )
                    embedding = vectors[0]
                    embeddings.append(embedding)
                    self._dimension = len(embedding)
        except httpx.HTTPError as exc:
            # Surface transport/status failures as the documented error type
            # so callers catching RuntimeError see them.
            raise RuntimeError(f"Ollama embedding request to {url} failed: {exc}") from exc
        return embeddings

    @property
    def dimension(self) -> int:
        """Embedding vector dimension."""
        return self._dimension
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
@dataclass
class SentenceTransformerEmbedder:
    """Embedding provider using sentence-transformers.

    Args:
        model_name: HuggingFace model name.
    """

    model_name: str = "all-MiniLM-L6-v2"
    # Loaded on first use; may be pre-populated with a ready model object.
    _model: Any = field(default=None, repr=False)
    # Reported dimension; refreshed from the model once it is loaded.
    _dimension: int = 384

    def _get_model(self) -> Any:
        """Lazily load the sentence-transformers model.

        Raises:
            ImportError: When ``sentence-transformers`` is not installed.
        """
        if self._model is None:
            try:
                from sentence_transformers import (
                    SentenceTransformer,  # type: ignore[import-untyped]
                )
            except ImportError as exc:
                raise ImportError(
                    "The 'sentence-transformers' package is required. "
                    "Install it with: pip install sentence-transformers"
                ) from exc
            self._model = SentenceTransformer(self.model_name)
            reported = self._model.get_sentence_embedding_dimension()
            if reported:
                # Keep the default when the model does not report a dimension.
                self._dimension = reported
        return self._model

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings via sentence-transformers.

        Encoding runs in the default executor so the event loop is not
        blocked while the model computes.

        Args:
            texts: Texts to embed.

        Returns:
            List of embedding vectors, one per input text. An empty input
            returns ``[]`` without loading the model.
        """
        if not texts:
            return []

        model = self._get_model()
        # Inside a coroutine the running loop is the one to schedule on.
        loop = asyncio.get_running_loop()
        vectors = await loop.run_in_executor(None, model.encode, texts)
        return [v.tolist() for v in vectors]

    @property
    def dimension(self) -> int:
        """Embedding vector dimension."""
        return self._dimension
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
# ── Vector store interface ───────────────────────────────────
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
@dataclass
class ChromaDBConfig:
    """Container configuration for ChromaDB.

    Args:
        image: Container image reference.
        container_name: Name given to the container.
        port: Host port published to the container's port 8000.
        persist_directory: Directory inside the container where data is
            persisted; the named volume is mounted at this path.
    """

    image: str = "chromadb/chroma:0.5.23"
    container_name: str = "admina-chromadb"
    port: int = 8000
    persist_directory: str = "/chroma/chroma"

    def to_compose_dict(self) -> dict[str, Any]:
        """Return a docker-compose service fragment.

        Returns:
            A dict with the ``image``, ``container_name``, ``ports``,
            ``volumes``, ``environment``, ``healthcheck``, ``networks``
            and ``restart`` keys of the service definition.
        """
        return {
            "image": self.image,
            "container_name": self.container_name,
            # Only the host side is configurable; the container side is 8000.
            "ports": [f"{self.port}:8000"],
            # Mount the volume where PERSIST_DIRECTORY points; otherwise a
            # custom persist_directory would live outside the volume.
            "volumes": [f"chromadb-data:{self.persist_directory}"],
            "environment": [
                "IS_PERSISTENT=TRUE",
                f"PERSIST_DIRECTORY={self.persist_directory}",
                "ANONYMIZED_TELEMETRY=FALSE",
            ],
            "healthcheck": {
                "test": ["CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat"],
                "interval": "15s",
                "timeout": "5s",
                "retries": 5,
            },
            "networks": ["admina"],
            "restart": "unless-stopped",
        }
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
# ── Ingest result ────────────────────────────────────────────
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
@dataclass
class IngestResult:
    """Outcome of an ingest run: counters, sources, and error messages."""

    # Counters for documents and chunks.
    doc_count: int = 0
    chunk_count: int = 0

    # Document sources and error messages (fresh lists per instance).
    sources: list[str] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
# ── RAG Pipeline ─────────────────────────────────────────────
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
@dataclass
class RAGPipeline:
    """Orchestrates the full RAG pipeline.

    Handles document ingest (parse → chunk → embed → store) and retrieval
    (query → embed → search → rank → cite).
    """

    chunk_size: int = 512
    chunk_overlap: int = 50
    chunking_strategy: ChunkingStrategy = ChunkingStrategy.RECURSIVE_CHARACTER
    embedding_backend: EmbeddingBackend = EmbeddingBackend.OLLAMA
    embedding_model: str = "nomic-embed-text"
    chromadb_host: str = "localhost"
    chromadb_port: int = 8000
    collection_name: str = "admina_default"

    # ── Factory ──────────────────────────────────────────────

    @classmethod
    def from_config(
        cls,
        *,
        backend: str = "chromadb",
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        chunking_strategy: str = "recursive_character",
        embedding_backend: str = "ollama",
        embedding_model: str = "nomic-embed-text",
        chromadb_host: str = "localhost",
        chromadb_port: int = 8000,
        collection_name: str = "admina_default",
    ) -> RAGPipeline:
        """Create a pipeline from admina.yaml values.

        Args:
            backend: Vector store backend (currently only ``"chromadb"``).
                Any other value is reported with a warning and ChromaDB
                is used.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Overlap between consecutive chunks.
            chunking_strategy: ``"recursive_character"`` or ``"semantic"``.
            embedding_backend: ``"ollama"`` or ``"sentence-transformers"``.
            embedding_model: Model name for embeddings.
            chromadb_host: ChromaDB server host.
            chromadb_port: ChromaDB server port.
            collection_name: Default collection name.

        Raises:
            ValueError: When *chunking_strategy* or *embedding_backend* is
                not one of the supported values.
        """
        if backend != "chromadb":
            # Warn instead of raising so existing configurations keep working.
            logger.warning(
                "Unsupported vector store backend %r; using 'chromadb'", backend
            )
        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            chunking_strategy=ChunkingStrategy(chunking_strategy),
            embedding_backend=EmbeddingBackend(embedding_backend),
            embedding_model=embedding_model,
            chromadb_host=chromadb_host,
            chromadb_port=chromadb_port,
            collection_name=collection_name,
        )

    # ── Document parsing ─────────────────────────────────────

    def parse(self, content: str, fmt: DocumentFormat) -> str:
        """Parse raw content into plain text.

        Args:
            content: Raw file content.
            fmt: Document format.

        Returns:
            Extracted plain text.
        """
        return parse_document(content, fmt)

    # ── Chunking ─────────────────────────────────────────────

    def chunk(self, text: str) -> list[Chunk]:
        """Split text into chunks using the configured strategy.

        The returned chunks carry no ``doc_id``; :meth:`ingest_documents`
        rebuilds them with the owning document's id.

        Args:
            text: Plain text to chunk.

        Returns:
            List of :class:`Chunk` objects, indexed from 0.
        """
        if self.chunking_strategy == ChunkingStrategy.SEMANTIC:
            raw_chunks = chunk_semantic(
                text,
                chunk_size=self.chunk_size,
            )
        else:
            raw_chunks = chunk_recursive_character(
                text,
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )

        return [Chunk(text=c, chunk_index=i) for i, c in enumerate(raw_chunks)]

    # ── Embedding ────────────────────────────────────────────

    def _get_embedder(self) -> OllamaEmbedder | SentenceTransformerEmbedder:
        """Return a new embedding provider for the configured backend."""
        if self.embedding_backend == EmbeddingBackend.SENTENCE_TRANSFORMERS:
            return SentenceTransformerEmbedder(model_name=self.embedding_model)
        return OllamaEmbedder(model=self.embedding_model)

    # ── Full ingest pipeline ─────────────────────────────────

    async def ingest_documents(
        self,
        documents: list[Document],
    ) -> IngestResult:
        """Run the full ingest pipeline: parse → chunk → embed → store.

        Each chunk is tagged with its document's ``doc_id`` *before* its
        ``chunk_id`` is derived, so identical text at the same index in
        two different documents gets distinct ids. ``source``, ``doc_id``
        and ``chunk_index`` are written to the chunk metadata so retrieval
        can cite them; these keys override same-named document metadata.

        Args:
            documents: Documents to ingest.

        Returns:
            An :class:`IngestResult` with counts and any errors.
            ``chunk_count`` is the number of chunks produced, even when
            storing them failed (see ``errors``).

        Note:
            Requires a running ChromaDB instance and embedding backend.
            In unit tests, mock the ``_store_chunks`` method.
        """
        result = IngestResult()
        all_chunks: list[Chunk] = []

        for doc in documents:
            try:
                text = self.parse(doc.content, doc.format)
                chunks = [
                    # Rebuilt (not mutated) so chunk_id is hashed with doc_id.
                    Chunk(
                        text=piece.text,
                        metadata={
                            **piece.metadata,
                            **doc.metadata,
                            "source": doc.source,
                            "doc_id": doc.doc_id,
                            "chunk_index": piece.chunk_index,
                        },
                        chunk_index=piece.chunk_index,
                        doc_id=doc.doc_id,
                    )
                    for piece in self.chunk(text)
                ]
            except (OSError, ValueError, RuntimeError) as exc:
                result.errors.append(f"{doc.source}: {exc}")
                logger.error("Ingest error for %s: %s", doc.source, exc)
                continue

            all_chunks.extend(chunks)
            result.doc_count += 1
            result.sources.append(doc.source)

        result.chunk_count = len(all_chunks)

        if all_chunks:
            try:
                await self._store_chunks(all_chunks)
            except (OSError, ValueError, RuntimeError) as exc:
                result.errors.append(f"store: {exc}")
                logger.error("Failed to store chunks: %s", exc)

        return result

    def ingest_documents_sync(self, documents: list[Document]) -> IngestResult:
        """Synchronous convenience wrapper for :meth:`ingest_documents`.

        Runs the coroutine on a fresh event loop, so it must not be called
        from code that is already running inside an event loop.
        """
        return asyncio.run(self.ingest_documents(documents))

    async def _store_chunks(self, chunks: list[Chunk]) -> None:
        """Embed chunks and add them to the configured ChromaDB collection.

        Args:
            chunks: Processed chunks to store.

        Raises:
            ImportError: When the ``chromadb`` package is not installed.
        """
        try:
            import chromadb  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'chromadb' package is required for vector storage. "
                "Install it with: pip install chromadb"
            ) from exc

        if not chunks:
            return

        client = chromadb.HttpClient(
            host=self.chromadb_host,
            port=self.chromadb_port,
        )
        collection = client.get_or_create_collection(name=self.collection_name)

        embedder = self._get_embedder()
        texts = [c.text for c in chunks]
        embeddings = await embedder.embed(texts)

        collection.add(
            ids=[c.chunk_id for c in chunks],
            documents=texts,
            embeddings=embeddings,
            metadatas=[c.metadata for c in chunks],
        )

    # ── Retrieval ────────────────────────────────────────────

    async def retrieve(
        self,
        query: str,
        *,
        top_k: int = 5,
        min_score: float = 0.0,
    ) -> list[RetrievalResult]:
        """Query the vector store and return ranked results with citations.

        Args:
            query: Search query string.
            top_k: Maximum number of results; non-positive values return
                ``[]`` without contacting the store.
            min_score: Minimum similarity score threshold.

        Returns:
            Ranked list of :class:`RetrievalResult` objects.

        Raises:
            ImportError: When the ``chromadb`` package is not installed.
        """
        try:
            import chromadb  # type: ignore[import-untyped]
        except ImportError as exc:
            raise ImportError(
                "The 'chromadb' package is required for retrieval. "
                "Install it with: pip install chromadb"
            ) from exc

        if top_k <= 0:
            return []

        client = chromadb.HttpClient(
            host=self.chromadb_host,
            port=self.chromadb_port,
        )
        collection = client.get_or_create_collection(name=self.collection_name)

        embedder = self._get_embedder()
        query_embedding = (await embedder.embed([query]))[0]

        raw = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )

        return self._rank_results(raw, min_score=min_score)

    def retrieve_sync(
        self,
        query: str,
        *,
        top_k: int = 5,
        min_score: float = 0.0,
    ) -> list[RetrievalResult]:
        """Synchronous convenience wrapper for :meth:`retrieve`.

        Runs the coroutine on a fresh event loop, so it must not be called
        from code that is already running inside an event loop.
        """
        return asyncio.run(self.retrieve(query, top_k=top_k, min_score=min_score))

    @staticmethod
    def _rank_results(
        raw: dict[str, Any],
        *,
        min_score: float = 0.0,
    ) -> list[RetrievalResult]:
        """Convert ChromaDB query results to ranked RetrievalResults.

        Only the first query's results are read. A distance ``d`` is
        mapped to the score ``1 / (1 + d)``; negative distances are
        treated as 0 so the score stays in ``(0, 1]`` and the division
        cannot fail.

        Args:
            raw: Raw ChromaDB query output.
            min_score: Filter results below this score.

        Returns:
            List of :class:`RetrievalResult` sorted by descending score.
            Missing, ``None`` or empty result lists yield ``[]``.
        """

        def first_batch(key: str) -> list[Any]:
            # Tolerate a missing key, a None value, and an empty outer list.
            batches = raw.get(key) or [[]]
            return batches[0] or []

        documents = first_batch("documents")
        distances = first_batch("distances")
        # Without metadata, still return the hits, just with empty citations.
        metadatas = first_batch("metadatas") or [None] * len(documents)

        results: list[RetrievalResult] = []
        for text, meta, dist in zip(documents, metadatas, distances):
            score = round(1.0 / (1.0 + max(dist, 0.0)), 4)
            if score < min_score:
                continue
            meta = meta or {}
            results.append(
                RetrievalResult(
                    text=text,
                    score=score,
                    source=meta.get("source", ""),
                    doc_id=meta.get("doc_id", ""),
                    chunk_index=meta.get("chunk_index", 0),
                    metadata=meta,
                )
            )

        results.sort(key=lambda r: r.score, reverse=True)
        return results

    # ── Compose generation ───────────────────────────────────

    def compose_service(
        self,
        project_name: str = "admina",
    ) -> dict[str, Any]:
        """Return the docker-compose service dict for ChromaDB.

        Args:
            project_name: Used for container naming
                (``"<project_name>-chromadb"``).
        """
        cfg = ChromaDBConfig(
            container_name=f"{project_name}-chromadb",
            port=self.chromadb_port,
        )
        return cfg.to_compose_dict()

    # ── Status ───────────────────────────────────────────────

    def summary(self) -> dict[str, Any]:
        """Return a JSON-serialisable summary of pipeline config."""
        return {
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "chunking_strategy": self.chunking_strategy.value,
            "embedding_backend": self.embedding_backend.value,
            "embedding_model": self.embedding_model,
            "chromadb_host": self.chromadb_host,
            "chromadb_port": self.chromadb_port,
            "collection_name": self.collection_name,
        }
|