msaas-rag 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msaas_rag-1.0.0/.gitignore +21 -0
- msaas_rag-1.0.0/PKG-INFO +20 -0
- msaas_rag-1.0.0/pyproject.toml +39 -0
- msaas_rag-1.0.0/src/rag/__init__.py +54 -0
- msaas_rag-1.0.0/src/rag/chunking.py +202 -0
- msaas_rag-1.0.0/src/rag/config.py +46 -0
- msaas_rag-1.0.0/src/rag/embeddings.py +136 -0
- msaas_rag-1.0.0/src/rag/models.py +73 -0
- msaas_rag-1.0.0/src/rag/pipeline.py +110 -0
- msaas_rag-1.0.0/src/rag/reranker.py +113 -0
- msaas_rag-1.0.0/src/rag/vector_store.py +293 -0
- msaas_rag-1.0.0/tests/__init__.py +0 -0
- msaas_rag-1.0.0/tests/test_chunking.py +122 -0
- msaas_rag-1.0.0/tests/test_config.py +35 -0
- msaas_rag-1.0.0/tests/test_embeddings.py +87 -0
- msaas_rag-1.0.0/tests/test_models.py +79 -0
- msaas_rag-1.0.0/tests/test_pipeline.py +106 -0
- msaas_rag-1.0.0/tests/test_reranker.py +102 -0
- msaas_rag-1.0.0/tests/test_vector_store.py +258 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
node_modules/
|
|
2
|
+
dist/
|
|
3
|
+
.next/
|
|
4
|
+
.turbo/
|
|
5
|
+
*.pyc
|
|
6
|
+
__pycache__/
|
|
7
|
+
.venv/
|
|
8
|
+
*.egg-info/
|
|
9
|
+
.pytest_cache/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
.env
|
|
12
|
+
.env.local
|
|
13
|
+
.env.*.local
|
|
14
|
+
.DS_Store
|
|
15
|
+
coverage/
|
|
16
|
+
|
|
17
|
+
# Runtime artifacts
|
|
18
|
+
logs_llm/
|
|
19
|
+
vectors.db
|
|
20
|
+
vectors.db-shm
|
|
21
|
+
vectors.db-wal
|
msaas_rag-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: msaas-rag
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: RAG pipeline library — chunking, embeddings, vector search, and retrieval for the Willian SaaS platform
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: asyncpg>=0.30.0
|
|
8
|
+
Requires-Dist: pydantic>=2.0
|
|
9
|
+
Provides-Extra: all
|
|
10
|
+
Requires-Dist: numpy>=2.0; extra == 'all'
|
|
11
|
+
Requires-Dist: openai>=1.50.0; extra == 'all'
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: numpy>=2.0; extra == 'dev'
|
|
14
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: ruff>=0.8; extra == 'dev'
|
|
17
|
+
Provides-Extra: numpy
|
|
18
|
+
Requires-Dist: numpy>=2.0; extra == 'numpy'
|
|
19
|
+
Provides-Extra: openai
|
|
20
|
+
Requires-Dist: openai>=1.50.0; extra == 'openai'
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "msaas-rag"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "RAG pipeline library — chunking, embeddings, vector search, and retrieval for the Willian SaaS platform"
|
|
5
|
+
requires-python = ">=3.12"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
dependencies = [
|
|
8
|
+
"pydantic>=2.0",
|
|
9
|
+
"asyncpg>=0.30.0",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[project.optional-dependencies]
|
|
13
|
+
openai = ["openai>=1.50.0"]
|
|
14
|
+
numpy = ["numpy>=2.0"]
|
|
15
|
+
all = ["openai>=1.50.0", "numpy>=2.0"]
|
|
16
|
+
dev = [
|
|
17
|
+
"pytest>=8.0",
|
|
18
|
+
"pytest-asyncio>=0.24.0",
|
|
19
|
+
"numpy>=2.0",
|
|
20
|
+
"ruff>=0.8",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["hatchling"]
|
|
25
|
+
build-backend = "hatchling.build"
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = ["src/rag"]
|
|
29
|
+
|
|
30
|
+
[tool.ruff]
|
|
31
|
+
target-version = "py312"
|
|
32
|
+
line-length = 100
|
|
33
|
+
|
|
34
|
+
[tool.ruff.lint]
|
|
35
|
+
select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
|
|
36
|
+
|
|
37
|
+
[tool.pytest.ini_options]
|
|
38
|
+
testpaths = ["tests"]
|
|
39
|
+
asyncio_mode = "auto"
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Willian RAG — retrieval-augmented generation pipeline library."""
|
|
2
|
+
|
|
3
|
+
from rag.chunking import MarkdownChunker, TextChunker
|
|
4
|
+
from rag.config import get_config, get_pipeline, init_rag
|
|
5
|
+
from rag.embeddings import EmbeddingProvider, LocalEmbeddings, OpenAIEmbeddings
|
|
6
|
+
from rag.models import (
|
|
7
|
+
Chunk,
|
|
8
|
+
ChunkingConfig,
|
|
9
|
+
Document,
|
|
10
|
+
EmbeddingConfig,
|
|
11
|
+
RAGConfig,
|
|
12
|
+
SearchResult,
|
|
13
|
+
VectorStoreConfig,
|
|
14
|
+
)
|
|
15
|
+
from rag.pipeline import RAGPipeline
|
|
16
|
+
from rag.reranker import (
|
|
17
|
+
CrossEncoderReranker,
|
|
18
|
+
LLMReranker,
|
|
19
|
+
Reranker,
|
|
20
|
+
reciprocal_rank_fusion,
|
|
21
|
+
)
|
|
22
|
+
from rag.vector_store import InMemoryVectorStore, PgVectorStore, VectorStore
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
# Pipeline
|
|
26
|
+
"RAGPipeline",
|
|
27
|
+
"init_rag",
|
|
28
|
+
"get_config",
|
|
29
|
+
"get_pipeline",
|
|
30
|
+
# Models
|
|
31
|
+
"Document",
|
|
32
|
+
"Chunk",
|
|
33
|
+
"SearchResult",
|
|
34
|
+
"RAGConfig",
|
|
35
|
+
"ChunkingConfig",
|
|
36
|
+
"EmbeddingConfig",
|
|
37
|
+
"VectorStoreConfig",
|
|
38
|
+
# Chunking
|
|
39
|
+
"TextChunker",
|
|
40
|
+
"MarkdownChunker",
|
|
41
|
+
# Embeddings
|
|
42
|
+
"EmbeddingProvider",
|
|
43
|
+
"OpenAIEmbeddings",
|
|
44
|
+
"LocalEmbeddings",
|
|
45
|
+
# Vector stores
|
|
46
|
+
"VectorStore",
|
|
47
|
+
"PgVectorStore",
|
|
48
|
+
"InMemoryVectorStore",
|
|
49
|
+
# Reranking
|
|
50
|
+
"Reranker",
|
|
51
|
+
"CrossEncoderReranker",
|
|
52
|
+
"LLMReranker",
|
|
53
|
+
"reciprocal_rank_fusion",
|
|
54
|
+
]
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Text chunking strategies for document processing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import uuid
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from rag.models import Chunk
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TextChunker:
    """Split plain text into chunks using configurable strategies.

    Supported strategies:
    - ``fixed_size``: split at exact character boundaries
    - ``recursive``: split on paragraph > line > sentence > word separators, trying the
      coarsest separator first and only descending when a piece is still too long
    - ``semantic``: sentence-based splitting that groups sentences up to chunk_size

    ``start_idx``/``end_idx`` of every chunk are character offsets into the text passed
    to :meth:`chunk`. For ``fixed_size`` and ``recursive`` the chunk text is exactly
    ``text[start_idx:end_idx]``. ``semantic`` re-joins sentences with a single space, so
    its offsets delimit the covered span of the source rather than an exact slice.
    """

    SEPARATORS = ["\n\n", "\n", ". ", " "]

    # A sentence boundary is a whitespace run that directly follows ".", "!" or "?".
    _SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+")

    def __init__(
        self,
        strategy: str = "recursive",
        chunk_size: int = 512,
        chunk_overlap: int = 64,
    ) -> None:
        """Configure the chunker.

        Args:
            strategy: One of ``fixed_size``, ``recursive`` or ``semantic``.
            chunk_size: Target maximum chunk length in characters (must be >= 1).
            chunk_overlap: Characters shared between consecutive chunks (must be >= 0).

        Raises:
            ValueError: If the strategy is unknown or a size argument is out of range.
        """
        if strategy not in ("fixed_size", "recursive", "semantic"):
            msg = f"Unknown strategy: {strategy!r}. Use fixed_size, recursive, or semantic."
            raise ValueError(msg)
        if chunk_size < 1:
            msg = f"chunk_size must be a positive integer, got {chunk_size!r}."
            raise ValueError(msg)
        if chunk_overlap < 0:
            msg = f"chunk_overlap must not be negative, got {chunk_overlap!r}."
            raise ValueError(msg)
        self.strategy = strategy
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk(
        self,
        text: str,
        document_id: str,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """Split *text* into Chunk objects using the configured strategy.

        Args:
            text: The text to split.
            document_id: Identifier stored on every produced chunk.
            metadata: Optional metadata attached to every produced chunk.

        Returns:
            The chunks in document order.
        """
        meta = metadata or {}
        if self.strategy == "fixed_size":
            return self._fixed_size(text, document_id, meta)
        if self.strategy == "semantic":
            return self._semantic(text, document_id, meta)
        return self._recursive(text, document_id, meta)

    # -- strategies ----------------------------------------------------------

    def _fixed_size(self, text: str, document_id: str, meta: dict[str, Any]) -> list[Chunk]:
        """Cut *text* into windows of ``chunk_size`` characters with ``chunk_overlap`` overlap."""
        chunks: list[Chunk] = []
        # Guard against a zero/negative stride when overlap >= chunk_size.
        step = max(1, self.chunk_size - self.chunk_overlap)
        for start in range(0, len(text), step):
            end = min(start + self.chunk_size, len(text))
            chunks.append(self._make(text[start:end], document_id, start, end, meta))
            if end == len(text):
                # The window already reached the end; further strides would only
                # produce suffixes of this chunk.
                break
        return chunks

    def _recursive(self, text: str, document_id: str, meta: dict[str, Any]) -> list[Chunk]:
        """Split *text* with the full separator hierarchy."""
        return self._recursive_split(text, document_id, meta, self.SEPARATORS)

    def _recursive_split(
        self,
        text: str,
        document_id: str,
        meta: dict[str, Any],
        separators: list[str],
        base_offset: int = 0,
    ) -> list[Chunk]:
        """Split *text* on ``separators`` and wrap every piece in a Chunk.

        ``base_offset`` is the position of *text* inside the original document; it is
        added to every offset so nested splits report document-relative positions.
        """
        return [
            self._make(piece, document_id, start, end, meta)
            for piece, start, end in self._split_spans(text, separators, base_offset)
        ]

    def _split_spans(
        self,
        text: str,
        separators: list[str],
        base: int = 0,
    ) -> list[tuple[str, int, int]]:
        """Return ``(piece, start, end)`` triples for the recursive strategy.

        Pieces are built greedily from the parts produced by the first separator. A
        single part that is too long on its own is re-split with the remaining
        separators. ``start``/``end`` are offsets into the original document
        (``base`` + position inside *text*), and ``piece`` is exactly that slice.
        """
        if len(text) <= self.chunk_size:
            return [(text, base, base + len(text))]

        sep = separators[0] if separators else ""
        parts = text.split(sep) if sep else list(text)
        remaining = separators[1:]

        spans: list[tuple[str, int, int]] = []
        current = ""
        current_start = 0  # position of ``current`` inside ``text``
        pos = 0  # position of the part being visited inside ``text``

        for part in parts:
            candidate = current + sep + part if current else part
            too_long = len(candidate) > self.chunk_size
            if too_long and current:
                spans.append((current, base + current_start, base + current_start + len(current)))
                overlap_start = max(0, len(current) - self.chunk_overlap)
                if overlap_start < len(current):
                    # Carry the tail of the flushed piece forward; the new buffer
                    # therefore begins inside the previous piece.
                    # NOTE: ``part`` is carried whole even if it is longer than
                    # chunk_size, so a later piece can exceed chunk_size.
                    current_start += overlap_start
                    current = current[overlap_start:] + sep + part
                else:
                    current_start = pos
                    current = part
            elif too_long and remaining:
                # The part alone is too long: descend to the finer separators and
                # tell the nested call where the part sits in the document.
                spans.extend(self._split_spans(part, remaining, base + pos))
                current = ""
            else:
                if not current:
                    current_start = pos
                current = candidate
            pos += len(part) + len(sep)

        if current.strip():
            spans.append((current, base + current_start, base + current_start + len(current)))
        return spans

    def _semantic(self, text: str, document_id: str, meta: dict[str, Any]) -> list[Chunk]:
        """Sentence-based chunking: group sentences up to chunk_size."""
        return [
            self._make(piece, document_id, start, end, meta)
            for piece, start, end in self._semantic_spans(text)
        ]

    def _semantic_spans(self, text: str) -> list[tuple[str, int, int]]:
        """Return ``(piece, start, end)`` triples for the semantic strategy.

        Sentences are joined with a single space until adding the next one would
        exceed ``chunk_size``. ``start`` is the source position of the first sentence
        in the piece and ``end`` the source position just past its last non-empty
        sentence, regardless of how much whitespace separated them in *text*.
        """
        spans: list[tuple[str, int, int]] = []
        current = ""
        start = end = 0

        for s_start, s_end in self._sentence_spans(text):
            sentence = text[s_start:s_end]
            candidate = (current + " " + sentence).strip() if current else sentence
            if current and len(candidate) > self.chunk_size:
                spans.append((current, start, end))
                current, start, end = sentence, s_start, s_end
            else:
                if not current:
                    start = s_start
                current = candidate
                if sentence:
                    # An empty trailing sentence must not pull ``end`` past the
                    # whitespace that follows the last real sentence.
                    end = s_end
        if current.strip():
            spans.append((current, start, end))
        return spans

    @classmethod
    def _sentence_spans(cls, text: str) -> list[tuple[int, int]]:
        """Return the ``(start, end)`` span of every sentence in *text*.

        Produces the same pieces as ``re.split`` on the sentence-break pattern, but
        keeps their positions in the source text.
        """
        spans: list[tuple[int, int]] = []
        cursor = 0
        for match in cls._SENTENCE_BREAK.finditer(text):
            spans.append((cursor, match.start()))
            cursor = match.end()
        spans.append((cursor, len(text)))
        return spans

    @staticmethod
    def _make(text: str, document_id: str, start: int, end: int, meta: dict[str, Any]) -> Chunk:
        """Build a Chunk with a fresh random hex id."""
        return Chunk(
            id=uuid.uuid4().hex,
            text=text,
            document_id=document_id,
            start_idx=start,
            end_idx=end,
            metadata=meta,
        )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class MarkdownChunker:
    """Split Markdown documents by headers, preserving structure.

    Each header section becomes a chunk. Sections exceeding ``chunk_size`` are
    further split by an internal ``recursive`` TextChunker built with the same
    ``chunk_size`` and ``chunk_overlap``.

    Chunk offsets are relative to the start of the whole document. A section's text
    is its header line joined to its stripped body by one newline, so when blank
    lines separate them in the source the end offsets are approximate.
    """

    # Only horizontal whitespace may follow the hashes: ``\s+`` would also match
    # newlines and let a bare "#" line swallow the following line into the header.
    HEADER_RE = re.compile(r"^(#{1,6})[ \t]+(.*)", re.MULTILINE)

    def __init__(self, chunk_size: int = 1024, chunk_overlap: int = 64) -> None:
        """Store the size limits and build the fallback chunker for large sections."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self._fallback = TextChunker(
            strategy="recursive", chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    def chunk(
        self,
        text: str,
        document_id: str,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """Split *text* into one or more chunks per Markdown section.

        Args:
            text: Markdown source.
            document_id: Identifier stored on every produced chunk.
            metadata: Optional metadata copied onto every chunk; sections that have a
                header additionally get a ``"header"`` entry holding the header line.

        Returns:
            The chunks in document order.
        """
        meta = metadata or {}
        chunks: list[Chunk] = []

        for header, body, start in self._split_by_headers(text):
            section_meta = {**meta}
            if header:
                section_meta["header"] = header
            full_text = f"{header}\n{body}".strip() if header else body.strip()

            if len(full_text) <= self.chunk_size:
                chunks.append(
                    TextChunker._make(
                        full_text, document_id, start, start + len(full_text), section_meta
                    )
                )
                continue

            for sub in self._fallback.chunk(full_text, document_id, section_meta):
                # The fallback chunker only saw this section, so its offsets are
                # section-relative; shift them to match the document-relative
                # offsets used for sections that fit in one chunk.
                sub.start_idx += start
                sub.end_idx += start
                chunks.append(sub)
        return chunks

    def _split_by_headers(self, text: str) -> list[tuple[str, str, int]]:
        """Return list of (header_line, body_text, start_index).

        ``start_index`` is the position in *text* where the section begins: the
        header's position, or the first non-whitespace character for text that
        precedes the first header. Text without any header is returned unchanged as
        a single header-less section starting at 0.
        """
        matches = list(self.HEADER_RE.finditer(text))
        if not matches:
            return [("", text, 0)]

        sections: list[tuple[str, str, int]] = []

        # Text before the first header becomes a header-less section.
        preamble = text[: matches[0].start()]
        stripped = preamble.strip()
        if stripped:
            leading = len(preamble) - len(preamble.lstrip())
            sections.append(("", stripped, leading))

        for i, match in enumerate(matches):
            # A section's body runs up to the next header, or to the end of the text.
            body_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            body = text[match.end() : body_end].strip()
            sections.append((match.group(0), body, match.start()))

        return sections
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Global configuration and pipeline singleton management."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from rag.models import RAGConfig
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from rag.pipeline import RAGPipeline
|
|
11
|
+
|
|
12
|
+
_pipeline: RAGPipeline | None = None
|
|
13
|
+
_config: RAGConfig | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def init_rag(config: RAGConfig | None = None) -> RAGConfig:
    """Install the process-wide RAG configuration and return it.

    When *config* is omitted (or falsy) a default ``RAGConfig`` is built instead.
    Any previously cached pipeline is discarded so the next ``get_pipeline()`` call
    constructs one from the new configuration.
    """
    global _config, _pipeline
    active = config or RAGConfig()
    # Store the config and invalidate the cached pipeline together.
    _config, _pipeline = active, None
    return active
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_config() -> RAGConfig:
    """Return the active global RAG config, creating a default one on first use."""
    global _config
    if _config is not None:
        return _config
    # Nothing configured yet: fall back to defaults and remember them.
    _config = RAGConfig()
    return _config
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_pipeline() -> RAGPipeline:
    """Return the shared RAGPipeline, building it on first access.

    The pipeline is constructed from the current global config, so call
    ``init_rag()`` beforehand to customize it.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline

    # Imported at call time; at module level the name is only imported under
    # TYPE_CHECKING (presumably to avoid an import cycle — confirm).
    from rag.pipeline import RAGPipeline

    _pipeline = RAGPipeline(config=get_config())
    return _pipeline
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Embedding providers for converting text to vector representations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import hashlib
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EmbeddingProvider(ABC):
    """Contract shared by every embedding backend."""

    @abstractmethod
    def dimensions(self) -> int:
        """Report the length of the vectors produced by :meth:`embed`."""

    @abstractmethod
    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Produce an embedding vector for each entry of *texts*."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class OpenAIEmbeddings(EmbeddingProvider):
    """OpenAI text-embedding-3-small/large provider with caching and batching.

    Requires the ``openai`` extra: ``pip install msaas-rag[openai]``

    Embeddings are cached per instance, keyed by the SHA-256 of the text; the cache
    is unbounded and lives as long as the instance.
    """

    MODEL_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        api_key: str | None = None,
        batch_size: int = 100,
        max_retries: int = 3,
        retry_delay: float = 1.0,
    ) -> None:
        """Create the async OpenAI client and an empty embedding cache.

        Args:
            model: Embedding model name sent with every request.
            api_key: Passed straight to ``openai.AsyncOpenAI``.
            batch_size: Maximum number of texts sent per API request.
            max_retries: Maximum number of attempts per batch (at least one is made).
            retry_delay: Base delay in seconds; attempt ``k`` waits ``k * retry_delay``.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        try:
            import openai  # noqa: F811
        except ImportError as exc:
            msg = "Install the openai extra: pip install msaas-rag[openai]"
            raise ImportError(msg) from exc

        self.model = model
        self.batch_size = batch_size
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self._client = openai.AsyncOpenAI(api_key=api_key)
        self._cache: dict[str, list[float]] = {}

    def dimensions(self) -> int:
        """Return the vector size for the configured model (1536 for unknown models)."""
        return self.MODEL_DIMENSIONS.get(self.model, 1536)

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed texts with batching, retries, and caching.

        Texts already in the cache are served from it; the rest are sent to the API
        in batches of at most ``batch_size``. The result has one vector per input,
        in input order.
        """
        results: dict[int, list[float]] = {}
        uncached: list[tuple[int, str]] = []

        for i, text in enumerate(texts):
            cached = self._cache.get(self._cache_key(text))
            if cached is not None:
                results[i] = cached
            else:
                uncached.append((i, text))

        # A non-positive batch size would make range() raise (0) or skip every
        # batch (< 0); fall back to one text per request instead.
        step = max(1, self.batch_size)
        for batch_start in range(0, len(uncached), step):
            batch = uncached[batch_start : batch_start + step]
            embeddings = await self._embed_with_retry([t for _, t in batch])
            for (idx, text), emb in zip(batch, embeddings):
                self._cache[self._cache_key(text)] = emb
                results[idx] = emb

        return [results[i] for i in range(len(texts))]

    async def _embed_with_retry(self, texts: list[str]) -> list[list[float]]:
        """Request embeddings for one batch, retrying with a linearly growing delay.

        Raises:
            RuntimeError: If every attempt failed; the last error is chained as cause.
        """
        last_error: Exception | None = None
        # Always try at least once, even when max_retries is configured as 0.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                response = await self._client.embeddings.create(model=self.model, input=texts)
                return [item.embedding for item in response.data]
            except Exception as exc:
                last_error = exc
                if attempt < attempts - 1:
                    await asyncio.sleep(self.retry_delay * (attempt + 1))
        msg = f"Embedding failed after {self.max_retries} retries"
        raise RuntimeError(msg) from last_error

    @staticmethod
    def _cache_key(text: str) -> str:
        """Return the SHA-256 hex digest of *text*, used as the cache key."""
        return hashlib.sha256(text.encode()).hexdigest()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class LocalEmbeddings(EmbeddingProvider):
    """Stand-in for a local sentence-transformers embedding model.

    Produces deterministic pseudo-embeddings derived from a hash of the text, which
    is enough for tests. Swap the ``embed`` method for a real model in production.
    """

    def __init__(self, dims: int = 384) -> None:
        self._dims = dims

    def dimensions(self) -> int:
        return self._dims

    async def embed(self, texts: list[str]) -> list[list[float]]:
        vectors: list[list[float]] = []
        for text in texts:
            vectors.append(self._pseudo_embedding(text))
        return vectors

    def _pseudo_embedding(self, text: str) -> list[float]:
        """Derive a deterministic unit-length vector from the SHA-256 of *text*."""
        digest = hashlib.sha256(text.encode()).digest()
        values = [byte / 255.0 for byte in digest]

        # Repeat the digest-derived values until there are enough, then cut to size.
        size = self._dims
        if len(values) < size:
            repeats = -(-size // len(values))  # ceiling division
            values = values * repeats
        values = values[:size]

        # Scale to unit length; an all-zero vector is returned as zeros.
        magnitude = sum(v * v for v in values) ** 0.5
        if magnitude > 0:
            return [v / magnitude for v in values]
        return [0.0 for _ in values]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _build_provider(config: Any) -> EmbeddingProvider:
    """Factory: instantiate the EmbeddingProvider selected by ``config.provider``.

    ``"openai"`` uses ``config.model`` and ``config.batch_size``; ``"local"`` uses
    ``config.dimensions``. Any other provider name raises ``ValueError``.
    """
    kind = config.provider
    if kind == "local":
        return LocalEmbeddings(dims=config.dimensions)
    if kind == "openai":
        return OpenAIEmbeddings(model=config.model, batch_size=config.batch_size)
    raise ValueError(f"Unknown embedding provider: {kind!r}")
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Core domain models for the RAG pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _default_id() -> str:
    """Return a fresh random identifier: the 32-character hex form of a UUID4."""
    fresh = uuid.uuid4()
    return fresh.hex
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Document(BaseModel):
    """A source document to be ingested into the RAG pipeline."""

    # Identifier; when omitted, a random UUID4 hex string from ``_default_id``.
    id: str = Field(default_factory=_default_id)
    # Document content (required).
    text: str
    # Free-form key/value data; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Chunk(BaseModel):
    """A text chunk derived from a document."""

    # Identifier; when omitted, a random UUID4 hex string from ``_default_id``.
    id: str = Field(default_factory=_default_id)
    # Chunk content (required).
    text: str
    # Identifier of the document this chunk was produced from (required).
    document_id: str
    # Start/end positions of the chunk; the chunkers in ``rag.chunking`` fill these
    # with character offsets into the text they were given.
    start_idx: int
    end_idx: int
    # Free-form key/value data; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
    # Embedding vector for ``text``; ``None`` until one is assigned.
    embedding: list[float] | None = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SearchResult(BaseModel):
    """A single result returned from vector search."""

    # The matched chunk (required).
    chunk: Chunk
    # Score assigned to the match (required).
    # NOTE(review): scale and direction depend on the vector store — confirm there.
    score: float
    # Free-form key/value data about this result; empty dict by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ChunkingConfig(BaseModel):
    """Configuration for text chunking."""

    # Chunking strategy name; ``TextChunker`` accepts "fixed_size", "recursive"
    # and "semantic". Not validated here.
    strategy: str = "recursive"
    # Target chunk length; same default as ``TextChunker``.
    chunk_size: int = 512
    # Overlap between consecutive chunks; same default as ``TextChunker``.
    chunk_overlap: int = 64
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class EmbeddingConfig(BaseModel):
    """Configuration for the embedding provider."""

    # Provider name; ``rag.embeddings._build_provider`` accepts "openai" and "local".
    provider: str = "openai"
    # Model name handed to the OpenAI provider.
    model: str = "text-embedding-3-small"
    # Vector size; ``_build_provider`` passes it only to the "local" provider.
    dimensions: int = 1536
    # Number of texts per embedding request, handed to the OpenAI provider.
    batch_size: int = 100
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class VectorStoreConfig(BaseModel):
    """Configuration for the vector store backend."""

    # Backend name.
    # NOTE(review): accepted values are decided by the pipeline/vector store — confirm there.
    backend: str = "pgvector"
    # Name of the table holding the chunks (presumably for the pgvector backend).
    table_name: str = "rag_chunks"
    # Database connection string; empty by default.
    dsn: str = ""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class RAGConfig(BaseModel):
    """Top-level configuration for the entire RAG pipeline."""

    # Each section defaults to a fresh instance of its config class with that
    # class's own defaults.
    chunking: ChunkingConfig = Field(default_factory=ChunkingConfig)
    embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig)
    vector_store: VectorStoreConfig = Field(default_factory=VectorStoreConfig)
|