msaas-rag 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ node_modules/
2
+ dist/
3
+ .next/
4
+ .turbo/
5
+ *.pyc
6
+ __pycache__/
7
+ .venv/
8
+ *.egg-info/
9
+ .pytest_cache/
10
+ .ruff_cache/
11
+ .env
12
+ .env.local
13
+ .env.*.local
14
+ .DS_Store
15
+ coverage/
16
+
17
+ # Runtime artifacts
18
+ logs_llm/
19
+ vectors.db
20
+ vectors.db-shm
21
+ vectors.db-wal
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: msaas-rag
3
+ Version: 1.0.0
4
+ Summary: RAG pipeline library — chunking, embeddings, vector search, and retrieval for the Willian SaaS platform
5
+ License: MIT
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: asyncpg>=0.30.0
8
+ Requires-Dist: pydantic>=2.0
9
+ Provides-Extra: all
10
+ Requires-Dist: numpy>=2.0; extra == 'all'
11
+ Requires-Dist: openai>=1.50.0; extra == 'all'
12
+ Provides-Extra: dev
13
+ Requires-Dist: numpy>=2.0; extra == 'dev'
14
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
15
+ Requires-Dist: pytest>=8.0; extra == 'dev'
16
+ Requires-Dist: ruff>=0.8; extra == 'dev'
17
+ Provides-Extra: numpy
18
+ Requires-Dist: numpy>=2.0; extra == 'numpy'
19
+ Provides-Extra: openai
20
+ Requires-Dist: openai>=1.50.0; extra == 'openai'
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "msaas-rag"
3
+ version = "1.0.0"
4
+ description = "RAG pipeline library — chunking, embeddings, vector search, and retrieval for the Willian SaaS platform"
5
+ requires-python = ">=3.12"
6
+ license = { text = "MIT" }
7
+ dependencies = [
8
+ "pydantic>=2.0",
9
+ "asyncpg>=0.30.0",
10
+ ]
11
+
12
+ [project.optional-dependencies]
13
+ openai = ["openai>=1.50.0"]
14
+ numpy = ["numpy>=2.0"]
15
+ all = ["openai>=1.50.0", "numpy>=2.0"]
16
+ dev = [
17
+ "pytest>=8.0",
18
+ "pytest-asyncio>=0.24.0",
19
+ "numpy>=2.0",
20
+ "ruff>=0.8",
21
+ ]
22
+
23
+ [build-system]
24
+ requires = ["hatchling"]
25
+ build-backend = "hatchling.build"
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["src/rag"]
29
+
30
+ [tool.ruff]
31
+ target-version = "py312"
32
+ line-length = 100
33
+
34
+ [tool.ruff.lint]
35
+ select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
36
+
37
+ [tool.pytest.ini_options]
38
+ testpaths = ["tests"]
39
+ asyncio_mode = "auto"
@@ -0,0 +1,54 @@
1
+ """Willian RAG — retrieval-augmented generation pipeline library."""
2
+
3
+ from rag.chunking import MarkdownChunker, TextChunker
4
+ from rag.config import get_config, get_pipeline, init_rag
5
+ from rag.embeddings import EmbeddingProvider, LocalEmbeddings, OpenAIEmbeddings
6
+ from rag.models import (
7
+ Chunk,
8
+ ChunkingConfig,
9
+ Document,
10
+ EmbeddingConfig,
11
+ RAGConfig,
12
+ SearchResult,
13
+ VectorStoreConfig,
14
+ )
15
+ from rag.pipeline import RAGPipeline
16
+ from rag.reranker import (
17
+ CrossEncoderReranker,
18
+ LLMReranker,
19
+ Reranker,
20
+ reciprocal_rank_fusion,
21
+ )
22
+ from rag.vector_store import InMemoryVectorStore, PgVectorStore, VectorStore
23
+
24
+ __all__ = [
25
+ # Pipeline
26
+ "RAGPipeline",
27
+ "init_rag",
28
+ "get_config",
29
+ "get_pipeline",
30
+ # Models
31
+ "Document",
32
+ "Chunk",
33
+ "SearchResult",
34
+ "RAGConfig",
35
+ "ChunkingConfig",
36
+ "EmbeddingConfig",
37
+ "VectorStoreConfig",
38
+ # Chunking
39
+ "TextChunker",
40
+ "MarkdownChunker",
41
+ # Embeddings
42
+ "EmbeddingProvider",
43
+ "OpenAIEmbeddings",
44
+ "LocalEmbeddings",
45
+ # Vector stores
46
+ "VectorStore",
47
+ "PgVectorStore",
48
+ "InMemoryVectorStore",
49
+ # Reranking
50
+ "Reranker",
51
+ "CrossEncoderReranker",
52
+ "LLMReranker",
53
+ "reciprocal_rank_fusion",
54
+ ]
@@ -0,0 +1,202 @@
1
+ """Text chunking strategies for document processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import uuid
7
+ from typing import Any
8
+
9
+ from rag.models import Chunk
10
+
11
+
12
class TextChunker:
    """Split plain text into chunks using configurable strategies.

    Supported strategies:
    - ``fixed_size``: split at exact character boundaries
    - ``recursive``: split on ``SEPARATORS`` from coarsest to finest, preferring natural breaks
    - ``semantic``: sentence-based splitting that groups sentences up to chunk_size

    ``start_idx`` / ``end_idx`` of every produced chunk are character offsets into the
    text that was passed to :meth:`chunk`.
    """

    SEPARATORS = ["\n\n", "\n", ". ", " "]

    _STRATEGIES = ("fixed_size", "recursive", "semantic")
    # Whitespace that directly follows sentence-ending punctuation.
    _SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+")

    def __init__(
        self,
        strategy: str = "recursive",
        chunk_size: int = 512,
        chunk_overlap: int = 64,
    ) -> None:
        """Configure the chunker.

        Args:
            strategy: One of ``fixed_size``, ``recursive`` or ``semantic``.
            chunk_size: Target maximum chunk length in characters (must be >= 1).
            chunk_overlap: Characters shared between consecutive chunks (must be >= 0).
                Used by ``fixed_size`` and ``recursive``; ``semantic`` ignores it.

        Raises:
            ValueError: On an unknown strategy or an out-of-range size/overlap.
        """
        if strategy not in self._STRATEGIES:
            msg = f"Unknown strategy: {strategy!r}. Use fixed_size, recursive, or semantic."
            raise ValueError(msg)
        if chunk_size < 1:
            msg = f"chunk_size must be >= 1, got {chunk_size!r}"
            raise ValueError(msg)
        if chunk_overlap < 0:
            msg = f"chunk_overlap must be >= 0, got {chunk_overlap!r}"
            raise ValueError(msg)
        self.strategy = strategy
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk(
        self,
        text: str,
        document_id: str,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """Split *text* into Chunk objects using the configured strategy.

        Args:
            text: The document text to split.
            document_id: Stored on every chunk as ``document_id``.
            metadata: Optional metadata copied onto every chunk.

        Returns:
            The chunks in document order.
        """
        meta = metadata or {}
        if self.strategy == "fixed_size":
            return self._fixed_size(text, document_id, meta)
        if self.strategy == "semantic":
            return self._semantic(text, document_id, meta)
        return self._recursive(text, document_id, meta)

    # -- strategies ----------------------------------------------------------

    def _fixed_size(self, text: str, document_id: str, meta: dict[str, Any]) -> list[Chunk]:
        """Cut *text* into windows of ``chunk_size`` characters."""
        chunks: list[Chunk] = []
        # max(1, ...) keeps the window advancing even when overlap >= chunk_size.
        step = max(1, self.chunk_size - self.chunk_overlap)
        for start in range(0, len(text), step):
            end = min(start + self.chunk_size, len(text))
            chunks.append(self._make(text[start:end], document_id, start, end, meta))
            if end == len(text):
                # The tail is covered; further windows would only repeat it.
                break
        return chunks

    def _recursive(self, text: str, document_id: str, meta: dict[str, Any]) -> list[Chunk]:
        """Split *text* with the class-level separator hierarchy."""
        return self._recursive_split(text, document_id, meta, self.SEPARATORS)

    def _recursive_split(
        self,
        text: str,
        document_id: str,
        meta: dict[str, Any],
        separators: list[str],
        base_offset: int = 0,
    ) -> list[Chunk]:
        """Greedily pack pieces of *text* into chunks of at most ``chunk_size`` characters.

        *text* is split on ``separators[0]``. Consecutive pieces are joined back with
        that separator while they fit. A piece that is too large on its own is split
        again with the remaining, finer separators. A piece that no separator can
        divide is emitted oversized.

        Args:
            text: Text to split.
            document_id: Stored on every chunk.
            meta: Metadata for every chunk.
            separators: Separators to try, coarsest first.
            base_offset: Position of *text* inside the original document. It is added
                to every offset so nested calls still report document-level indices.

        Returns:
            Chunks in order. Whitespace-only input yields no chunks.
        """
        if not text.strip():
            return []
        if len(text) <= self.chunk_size:
            return [self._make(text, document_id, base_offset, base_offset + len(text), meta)]

        sep = separators[0] if separators else ""
        parts = text.split(sep) if sep else list(text)
        finer = separators[1:]

        chunks: list[Chunk] = []
        current = ""  # always a contiguous slice of text starting at current_start
        current_start = 0
        next_part_start = 0

        for part in parts:
            part_start = next_part_start
            next_part_start += len(part) + len(sep)

            if current:
                if len(current) + len(sep) + len(part) <= self.chunk_size:
                    current += sep + part
                    continue

                begin = base_offset + current_start
                chunks.append(
                    self._make(current, document_id, begin, begin + len(current), meta)
                )
                # Seed the next chunk with the tail of the one just emitted, but never
                # with more than still fits next to the incoming part.
                room = self.chunk_size - len(sep) - len(part)
                keep = max(0, min(self.chunk_overlap, room))
                if keep:
                    current_start += len(current) - keep
                    current = current[len(current) - keep :] + sep + part
                    continue
                current = ""

            if len(part) > self.chunk_size and finer:
                chunks.extend(
                    self._recursive_split(
                        part, document_id, meta, finer, base_offset + part_start
                    )
                )
            else:
                current = part
                current_start = part_start

        if current.strip():
            begin = base_offset + current_start
            chunks.append(self._make(current, document_id, begin, begin + len(current), meta))
        return chunks

    def _semantic(self, text: str, document_id: str, meta: dict[str, Any]) -> list[Chunk]:
        """Sentence-based chunking: group sentences up to chunk_size.

        Sentences inside a chunk are joined with a single space. ``start_idx`` and
        ``end_idx`` delimit the span of the original text that the chunk covers, from
        the first character of its first sentence to the end of its last one. A single
        sentence longer than ``chunk_size`` is emitted as its own chunk.
        """
        chunks: list[Chunk] = []
        group: list[str] = []
        group_len = 0
        group_start = 0
        group_end = 0

        for start, end in self._sentence_spans(text):
            sentence = text[start:end]
            joined_len = group_len + 1 + len(sentence) if group else len(sentence)
            if group and joined_len > self.chunk_size:
                chunks.append(
                    self._make(" ".join(group), document_id, group_start, group_end, meta)
                )
                group = []
                joined_len = len(sentence)
            if not group:
                group_start = start
            group.append(sentence)
            group_len = joined_len
            group_end = end

        if group:
            chunks.append(self._make(" ".join(group), document_id, group_start, group_end, meta))
        return chunks

    @classmethod
    def _sentence_spans(cls, text: str) -> list[tuple[int, int]]:
        """Return ``(start, end)`` offsets of each non-blank, whitespace-trimmed sentence."""
        breaks = [(m.start(), m.end()) for m in cls._SENTENCE_BREAK.finditer(text)]
        breaks.append((len(text), len(text)))

        spans: list[tuple[int, int]] = []
        cursor = 0
        for stop, resume in breaks:
            raw = text[cursor:stop]
            body = raw.strip()
            if body:
                lead = len(raw) - len(raw.lstrip())
                spans.append((cursor + lead, cursor + lead + len(body)))
            cursor = resume
        return spans

    @staticmethod
    def _make(text: str, document_id: str, start: int, end: int, meta: dict[str, Any]) -> Chunk:
        """Build a Chunk with a fresh random hex id and its own copy of *meta*."""
        return Chunk(
            id=uuid.uuid4().hex,
            text=text,
            document_id=document_id,
            start_idx=start,
            end_idx=end,
            metadata=dict(meta),
        )
138
+
139
+
140
class MarkdownChunker:
    """Split Markdown documents by headers, preserving structure.

    Each header section becomes a chunk whose metadata carries the header line under
    the ``"header"`` key. Sections exceeding ``chunk_size`` are further split by a
    ``recursive`` TextChunker.
    """

    HEADER_RE = re.compile(r"^(#{1,6})\s+(.*)", re.MULTILINE)

    def __init__(self, chunk_size: int = 1024, chunk_overlap: int = 64) -> None:
        """Store the size limits and build the fallback chunker for oversized sections."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self._fallback = TextChunker(
            strategy="recursive", chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    def chunk(
        self,
        text: str,
        document_id: str,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """Split *text* into one chunk per header section.

        Args:
            text: Markdown source.
            document_id: Stored on every chunk.
            metadata: Optional metadata copied onto every chunk.

        Returns:
            Chunks in document order; a blank document yields an empty list.
            ``start_idx`` is the section's position in *text*. Section text is rebuilt
            as ``header + "\\n" + stripped body``, so offsets past the header line are
            exact only when the body directly follows the header.
        """
        meta = metadata or {}
        chunks: list[Chunk] = []

        for header, body, start in self._split_by_headers(text):
            section_meta = {**meta}
            if header:
                section_meta["header"] = header
            full_text = f"{header}\n{body}".strip() if header else body.strip()
            if not full_text:
                # Nothing to index; avoid emitting an empty chunk.
                continue
            if len(full_text) <= self.chunk_size:
                chunks.append(
                    TextChunker._make(
                        full_text, document_id, start, start + len(full_text), section_meta
                    )
                )
                continue
            # The fallback reports offsets relative to the section text; shift them so
            # they point into the document like every other chunk.
            for piece in self._fallback.chunk(full_text, document_id, section_meta):
                piece.start_idx += start
                piece.end_idx += start
                chunks.append(piece)
        return chunks

    def _split_by_headers(self, text: str) -> list[tuple[str, str, int]]:
        """Return list of (header_line, body_text, start_index).

        ``header_line`` is ``""`` for text that is not under any header. ``body_text``
        is whitespace-stripped and ``start_index`` is where the section begins in
        *text* (for header-less text: the first non-whitespace character).
        """
        matches = list(self.HEADER_RE.finditer(text))
        if not matches:
            stripped = text.strip()
            if not stripped:
                return []
            return [("", stripped, len(text) - len(text.lstrip()))]

        sections: list[tuple[str, str, int]] = []
        # Text before first header
        preamble = text[: matches[0].start()]
        stripped_preamble = preamble.strip()
        if stripped_preamble:
            lead = len(preamble) - len(preamble.lstrip())
            sections.append(("", stripped_preamble, lead))

        for i, match in enumerate(matches):
            header_line = match.group(0)
            body_start = match.end()
            body_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            body = text[body_start:body_end].strip()
            sections.append((header_line, body, match.start()))

        return sections
@@ -0,0 +1,46 @@
1
+ """Global configuration and pipeline singleton management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ from rag.models import RAGConfig
8
+
9
+ if TYPE_CHECKING:
10
+ from rag.pipeline import RAGPipeline
11
+
12
+ _pipeline: RAGPipeline | None = None
13
+ _config: RAGConfig | None = None
14
+
15
+
16
def init_rag(config: RAGConfig | None = None) -> RAGConfig:
    """Install *config* as the process-wide RAG configuration.

    A default ``RAGConfig`` is created when *config* is omitted. Any pipeline cached
    by ``get_pipeline()`` is discarded so the next call rebuilds it from the new
    configuration.

    Returns:
        The configuration that is now active.
    """
    global _config, _pipeline
    active = config or RAGConfig()
    _config = active
    # Force get_pipeline() to construct a new pipeline from the new config.
    _pipeline = None
    return active
26
+
27
+
28
def get_config() -> RAGConfig:
    """Return the global RAG config, creating and storing a default one on first use."""
    global _config
    if _config is not None:
        return _config
    _config = RAGConfig()
    return _config
34
+
35
+
36
def get_pipeline() -> RAGPipeline:
    """Return the global RAGPipeline, building it on first use.

    The pipeline is constructed from ``get_config()`` and then cached; call
    ``init_rag()`` beforehand to customize the configuration.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline

    # Imported at call time rather than at module import
    # (presumably to avoid a circular import with rag.pipeline — confirm).
    from rag.pipeline import RAGPipeline

    _pipeline = RAGPipeline(config=get_config())
    return _pipeline
@@ -0,0 +1,136 @@
1
+ """Embedding providers for converting text to vector representations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import hashlib
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any
9
+
10
+
11
class EmbeddingProvider(ABC):
    """Interface that every embedding backend implements."""

    @abstractmethod
    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding vector for each entry of *texts*."""

    @abstractmethod
    def dimensions(self) -> int:
        """Return the length of the vectors produced by :meth:`embed`."""
21
+
22
+
23
class OpenAIEmbeddings(EmbeddingProvider):
    """OpenAI embedding provider with in-memory caching, batching and retries.

    Requires the ``openai`` extra: ``pip install msaas-rag[openai]``
    """

    MODEL_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        api_key: str | None = None,
        batch_size: int = 100,
        max_retries: int = 3,
        retry_delay: float = 1.0,
    ) -> None:
        """Create the async OpenAI client and an empty embedding cache.

        Args:
            model: Embedding model name sent with every request.
            api_key: Passed to ``openai.AsyncOpenAI``.
            batch_size: Maximum number of texts per API request (must be >= 1).
            max_retries: Number of attempts per batch; at least one attempt is
                always made.
            retry_delay: Base delay in seconds; attempt *k* waits ``retry_delay * k``.

        Raises:
            ImportError: If the ``openai`` package is not installed.
            ValueError: If *batch_size* is smaller than 1.
        """
        try:
            import openai
        except ImportError as exc:
            msg = "Install the openai extra: pip install msaas-rag[openai]"
            raise ImportError(msg) from exc

        if batch_size < 1:
            msg = f"batch_size must be >= 1, got {batch_size!r}"
            raise ValueError(msg)

        self.model = model
        self.batch_size = batch_size
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self._client = openai.AsyncOpenAI(api_key=api_key)
        # Maps sha256(text) -> embedding. Never evicted, so it grows with the number
        # of distinct texts embedded by this instance.
        self._cache: dict[str, list[float]] = {}

    def dimensions(self) -> int:
        """Return the vector size for ``self.model`` (1536 for models not in the table)."""
        return self.MODEL_DIMENSIONS.get(self.model, 1536)

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed *texts*, reusing cached vectors and batching the remaining requests.

        Each distinct uncached text is sent to the API once, even if it occurs
        several times in *texts*.

        Returns:
            One vector per input text, in input order.

        Raises:
            RuntimeError: If a batch still fails after all attempts.
            ValueError: If the API returns a different number of vectors than
                texts that were sent.
        """
        keys = [self._cache_key(text) for text in texts]

        # Distinct texts that are not cached yet, in first-seen order.
        pending: dict[str, str] = {}
        for key, text in zip(keys, texts, strict=True):
            if key not in self._cache:
                pending.setdefault(key, text)

        todo = list(pending.items())
        for batch_start in range(0, len(todo), self.batch_size):
            batch = todo[batch_start : batch_start + self.batch_size]
            embeddings = await self._embed_with_retry([text for _, text in batch])
            for (key, _), emb in zip(batch, embeddings, strict=True):
                self._cache[key] = emb

        return [self._cache[key] for key in keys]

    async def _embed_with_retry(self, texts: list[str]) -> list[list[float]]:
        """Request embeddings for one batch, retrying with a linearly growing delay."""
        attempts = max(1, self.max_retries)
        last_error: Exception | None = None
        for attempt in range(attempts):
            try:
                response = await self._client.embeddings.create(model=self.model, input=texts)
            except Exception as exc:
                last_error = exc
                if attempt < attempts - 1:
                    await asyncio.sleep(self.retry_delay * (attempt + 1))
            else:
                # Each item carries the index of its input; order by it explicitly.
                ordered = sorted(response.data, key=lambda item: item.index)
                return [item.embedding for item in ordered]
        msg = f"Embedding failed after {attempts} retries"
        raise RuntimeError(msg) from last_error

    @staticmethod
    def _cache_key(text: str) -> str:
        """Return the SHA-256 hex digest of *text*, used as the cache key."""
        return hashlib.sha256(text.encode()).hexdigest()
98
+
99
+
100
class LocalEmbeddings(EmbeddingProvider):
    """Stand-in for a local sentence-transformers model.

    Produces deterministic pseudo-embeddings derived from a hash of the text, which
    is useful for tests. Swap out ``embed`` for a real model in production.
    """

    def __init__(self, dims: int = 384) -> None:
        self._dims = dims

    def dimensions(self) -> int:
        return self._dims

    async def embed(self, texts: list[str]) -> list[list[float]]:
        vectors: list[list[float]] = []
        for text in texts:
            vectors.append(self._pseudo_embedding(text))
        return vectors

    def _pseudo_embedding(self, text: str) -> list[float]:
        """Derive a unit-length vector of ``self._dims`` floats from SHA-256 of *text*."""
        digest = hashlib.sha256(text.encode()).digest()
        scaled = [byte / 255.0 for byte in digest]
        # Repeat the 32 scaled bytes often enough to cover the requested size, then cut.
        repeats = max(1, -(-self._dims // len(scaled)))
        values = (scaled * repeats)[: self._dims]
        # Scale to unit length; an all-zero vector is returned unchanged.
        norm = sum(v * v for v in values) ** 0.5
        if norm > 0:
            return [v / norm for v in values]
        return [0.0 for _ in values]
127
+
128
+
129
+ def _build_provider(config: Any) -> EmbeddingProvider:
130
+ """Factory: create an EmbeddingProvider from an EmbeddingConfig."""
131
+ if config.provider == "openai":
132
+ return OpenAIEmbeddings(model=config.model, batch_size=config.batch_size)
133
+ if config.provider == "local":
134
+ return LocalEmbeddings(dims=config.dimensions)
135
+ msg = f"Unknown embedding provider: {config.provider!r}"
136
+ raise ValueError(msg)
@@ -0,0 +1,73 @@
1
+ """Core domain models for the RAG pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import uuid
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
+ def _default_id() -> str:
12
+ return uuid.uuid4().hex
13
+
14
+
15
class Document(BaseModel):
    """Source document handed to the RAG pipeline for ingestion."""

    # Generated by _default_id() when the caller does not supply one.
    id: str = Field(default_factory=_default_id)
    # Full document text.
    text: str
    # Free-form metadata; a new empty dict per instance by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
21
+
22
+
23
class Chunk(BaseModel):
    """Piece of text cut out of a document."""

    # Generated by _default_id() when the caller does not supply one.
    id: str = Field(default_factory=_default_id)
    # Chunk content.
    text: str
    # Identifier of the document this chunk was derived from.
    document_id: str
    # Start and end character offsets recorded by the chunker.
    start_idx: int
    end_idx: int
    # Free-form metadata; a new empty dict per instance by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
    # Embedding vector; None until one has been assigned.
    embedding: list[float] | None = None
33
+
34
+
35
class SearchResult(BaseModel):
    """One hit returned by a vector search."""

    # The matched chunk.
    chunk: Chunk
    # Score assigned to the match by the search backend.
    score: float
    # Free-form result metadata; a new empty dict per instance by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
41
+
42
+
43
class ChunkingConfig(BaseModel):
    """Settings that control how text is chunked."""

    # Name of the chunking strategy.
    strategy: str = "recursive"
    # Target chunk length.
    chunk_size: int = 512
    # Amount of text shared between consecutive chunks.
    chunk_overlap: int = 64
49
+
50
+
51
class EmbeddingConfig(BaseModel):
    """Settings that select and tune the embedding provider."""

    # Provider name.
    provider: str = "openai"
    # Embedding model name.
    model: str = "text-embedding-3-small"
    # Embedding vector size.
    dimensions: int = 1536
    # Number of texts sent per embedding request.
    batch_size: int = 100
58
+
59
+
60
class VectorStoreConfig(BaseModel):
    """Settings for the vector store backend."""

    # Backend name.
    backend: str = "pgvector"
    # Table that holds the stored chunks.
    table_name: str = "rag_chunks"
    # Connection string; empty by default.
    dsn: str = ""
66
+
67
+
68
class RAGConfig(BaseModel):
    """Root configuration bundling every section of the RAG pipeline."""

    # Each section defaults to a freshly constructed config with its own defaults.
    chunking: ChunkingConfig = Field(default_factory=ChunkingConfig)
    embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig)
    vector_store: VectorStoreConfig = Field(default_factory=VectorStoreConfig)