swiftrag 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swiftrag/__init__.py +58 -0
- swiftrag/chunking.py +213 -0
- swiftrag/core.py +508 -0
- swiftrag/embeddings.py +201 -0
- swiftrag/exceptions.py +27 -0
- swiftrag/llms.py +213 -0
- swiftrag/py.typed +0 -0
- swiftrag/store.py +189 -0
- swiftrag/types.py +69 -0
- swiftrag-0.1.0.dist-info/METADATA +229 -0
- swiftrag-0.1.0.dist-info/RECORD +13 -0
- swiftrag-0.1.0.dist-info/WHEEL +4 -0
- swiftrag-0.1.0.dist-info/licenses/LICENSE +21 -0
swiftrag/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""swiftrag — instant, optimized Retrieval-Augmented Generation.
|
|
2
|
+
|
|
3
|
+
Pass your text and (optionally) a model. Get a RAG-powered LLM in one line.
|
|
4
|
+
|
|
5
|
+
from swiftrag import RAG
|
|
6
|
+
|
|
7
|
+
rag = RAG(
|
|
8
|
+
documents="your knowledge as a string (or a list of strings/dicts)",
|
|
9
|
+
embedding_model="openai:text-embedding-3-small",
|
|
10
|
+
llm_model="openai:gpt-4o-mini",
|
|
11
|
+
)
|
|
12
|
+
print(rag.query("your question"))
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from .chunking import chunk_text, count_tokens
|
|
18
|
+
from .core import RAG
|
|
19
|
+
from .embeddings import (
|
|
20
|
+
EmbeddingProvider,
|
|
21
|
+
HashEmbeddings,
|
|
22
|
+
OpenAIEmbeddings,
|
|
23
|
+
SentenceTransformerEmbeddings,
|
|
24
|
+
)
|
|
25
|
+
from .exceptions import (
|
|
26
|
+
ConfigurationError,
|
|
27
|
+
DependencyError,
|
|
28
|
+
EmptyCorpusError,
|
|
29
|
+
SwiftRagError,
|
|
30
|
+
)
|
|
31
|
+
from .llms import AnthropicLLM, CallableLLM, EchoLLM, LLMProvider, OpenAILLM
|
|
32
|
+
from .types import Chunk, Document, RAGResponse, ScoredChunk
|
|
33
|
+
|
|
34
|
+
__version__ = "0.1.0"
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"RAG",
|
|
38
|
+
"chunk_text",
|
|
39
|
+
"count_tokens",
|
|
40
|
+
"Document",
|
|
41
|
+
"Chunk",
|
|
42
|
+
"ScoredChunk",
|
|
43
|
+
"RAGResponse",
|
|
44
|
+
"EmbeddingProvider",
|
|
45
|
+
"HashEmbeddings",
|
|
46
|
+
"OpenAIEmbeddings",
|
|
47
|
+
"SentenceTransformerEmbeddings",
|
|
48
|
+
"LLMProvider",
|
|
49
|
+
"OpenAILLM",
|
|
50
|
+
"AnthropicLLM",
|
|
51
|
+
"EchoLLM",
|
|
52
|
+
"CallableLLM",
|
|
53
|
+
"SwiftRagError",
|
|
54
|
+
"ConfigurationError",
|
|
55
|
+
"DependencyError",
|
|
56
|
+
"EmptyCorpusError",
|
|
57
|
+
"__version__",
|
|
58
|
+
]
|
swiftrag/chunking.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Text chunking.
|
|
2
|
+
|
|
3
|
+
Strategy (optimized for retrieval quality + speed):
|
|
4
|
+
|
|
5
|
+
1. If ``tiktoken`` is available we chunk by *tokens*, which matches how the
|
|
6
|
+
downstream embedding/LLM models actually see text. This gives uniform chunk
|
|
7
|
+
sizes regardless of language/whitespace.
|
|
8
|
+
2. Otherwise we fall back to a fast, dependency-free splitter that respects
|
|
9
|
+
natural boundaries (paragraphs -> sentences -> words) and approximates token
|
|
10
|
+
counts with a characters-per-token heuristic.
|
|
11
|
+
|
|
12
|
+
Both paths support configurable overlap so context isn't lost at boundaries.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from functools import lru_cache
|
|
19
|
+
|
|
20
|
+
# Rough average for English text with common BPE tokenizers. Used only by the
# dependency-free fallback path to convert token budgets into character
# budgets (and to estimate token counts) when tiktoken is unavailable.
_CHARS_PER_TOKEN = 4

# A paragraph break is a newline, optional whitespace, then another newline
# (i.e. one or more blank / whitespace-only lines).
_PARAGRAPH_RE = re.compile(r"\n\s*\n")
# Split on sentence terminators while keeping reasonable behavior on edge cases:
# the split point is whitespace that follows ``.``, ``!`` or ``?`` and precedes
# an uppercase ASCII letter, a digit, a quote or an opening parenthesis. The
# lookarounds keep the terminator and the next character in their sentences.
_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9\"'(])")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@lru_cache(maxsize=4)
|
|
29
|
+
def _get_tiktoken_encoding(name: str):
|
|
30
|
+
try:
|
|
31
|
+
import tiktoken
|
|
32
|
+
except ImportError:
|
|
33
|
+
return None
|
|
34
|
+
try:
|
|
35
|
+
return tiktoken.get_encoding(name)
|
|
36
|
+
except Exception:
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in ``text``.

    Args:
        text: The text to measure. Falsy input (e.g. ``""``) counts as 0.
        encoding_name: tiktoken encoding to use when tiktoken is available.

    Returns:
        The exact tiktoken count when the encoding can be loaded; otherwise an
        estimate of ``len(text) // _CHARS_PER_TOKEN``, never less than 1 for
        non-empty text.
    """
    if text:
        encoder = _get_tiktoken_encoding(encoding_name)
        if encoder is None:
            # No tokenizer: approximate, but never report 0 for real text.
            return max(1, len(text) // _CHARS_PER_TOKEN)
        return len(encoder.encode(text))
    return 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def chunk_text(
    text: str,
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    encoding_name: str = "cl100k_base",
) -> list[str]:
    """Split ``text`` into overlapping chunks of ~``chunk_size`` tokens.

    Args:
        text: The input text. Leading/trailing whitespace is stripped first.
        chunk_size: Target chunk length in tokens. Must be positive.
        chunk_overlap: Number of tokens shared between consecutive chunks.
            Negative values are treated as 0; values ``>= chunk_size`` are
            replaced by ``chunk_size // 4`` so the window always advances.
        encoding_name: tiktoken encoding to use when available.

    Returns:
        The list of chunks; empty when ``text`` is empty or whitespace-only.

    Raises:
        ValueError: If ``chunk_size`` is not positive (for non-empty text).
    """
    text = text.strip()
    if not text:
        return []
    if chunk_size <= 0:
        # Without this check the token path either fails inside ``range()``
        # with an unrelated message or silently returns no chunks at all.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    # A negative overlap would make the token stride larger than the window,
    # silently dropping the tokens that fall between consecutive windows.
    chunk_overlap = max(0, chunk_overlap)
    if chunk_overlap >= chunk_size:
        chunk_overlap = chunk_size // 4

    enc = _get_tiktoken_encoding(encoding_name)
    if enc is not None:
        return _chunk_by_tokens(text, enc, chunk_size, chunk_overlap)
    # No tokenizer available: convert the token budgets to character budgets.
    return _chunk_by_chars(text, chunk_size * _CHARS_PER_TOKEN, chunk_overlap * _CHARS_PER_TOKEN)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _chunk_by_tokens(text: str, enc, chunk_size: int, chunk_overlap: int) -> list[str]:
|
|
82
|
+
tokens = enc.encode(text)
|
|
83
|
+
if len(tokens) <= chunk_size:
|
|
84
|
+
return [text]
|
|
85
|
+
step = chunk_size - chunk_overlap
|
|
86
|
+
chunks: list[str] = []
|
|
87
|
+
for start in range(0, len(tokens), step):
|
|
88
|
+
window = tokens[start : start + chunk_size]
|
|
89
|
+
if not window:
|
|
90
|
+
break
|
|
91
|
+
chunks.append(enc.decode(window).strip())
|
|
92
|
+
if start + chunk_size >= len(tokens):
|
|
93
|
+
break
|
|
94
|
+
return [c for c in chunks if c]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _chunk_by_chars(text: str, max_chars: int, overlap_chars: int) -> list[str]:
|
|
98
|
+
if len(text) <= max_chars:
|
|
99
|
+
return [text]
|
|
100
|
+
|
|
101
|
+
# Build semantic units: paragraphs, then sentences for oversized paragraphs.
|
|
102
|
+
units: list[str] = []
|
|
103
|
+
for para in _PARAGRAPH_RE.split(text):
|
|
104
|
+
para = para.strip()
|
|
105
|
+
if not para:
|
|
106
|
+
continue
|
|
107
|
+
if len(para) <= max_chars:
|
|
108
|
+
units.append(para)
|
|
109
|
+
else:
|
|
110
|
+
units.extend(_split_oversized(para, max_chars))
|
|
111
|
+
|
|
112
|
+
chunks: list[str] = []
|
|
113
|
+
buf: list[str] = []
|
|
114
|
+
buf_len = 0
|
|
115
|
+
for unit in units:
|
|
116
|
+
unit_len = len(unit) + 1
|
|
117
|
+
if buf and buf_len + unit_len > max_chars:
|
|
118
|
+
chunks.append("\n".join(buf))
|
|
119
|
+
buf, buf_len = _carry_overlap(buf, overlap_chars)
|
|
120
|
+
buf.append(unit)
|
|
121
|
+
buf_len += unit_len
|
|
122
|
+
if buf:
|
|
123
|
+
chunks.append("\n".join(buf))
|
|
124
|
+
return [c.strip() for c in chunks if c.strip()]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _split_oversized(para: str, max_chars: int) -> list[str]:
    """Break a paragraph longer than ``max_chars`` into smaller pieces.

    The paragraph is split into sentences, which are packed greedily (joined
    by single spaces) into pieces within the budget. A sentence that is itself
    longer than ``max_chars`` is handed to ``_split_words``.
    """
    out: list[str] = []
    pending: list[str] = []
    pending_len = 0
    for sentence in (raw.strip() for raw in _SENTENCE_RE.split(para)):
        if not sentence:
            continue
        too_long = len(sentence) > max_chars
        cost = len(sentence) + 1  # +1 for the joining space
        # Flush what we have when the sentence cannot join the current piece.
        if pending and (too_long or pending_len + cost > max_chars):
            out.append(" ".join(pending))
            pending, pending_len = [], 0
        if too_long:
            out.extend(_split_words(sentence, max_chars))
        else:
            pending.append(sentence)
            pending_len += cost
    if pending:
        out.append(" ".join(pending))
    return out
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _split_words(text: str, max_chars: int) -> list[str]:
|
|
154
|
+
words = text.split()
|
|
155
|
+
pieces: list[str] = []
|
|
156
|
+
buf: list[str] = []
|
|
157
|
+
buf_len = 0
|
|
158
|
+
for word in words:
|
|
159
|
+
if buf and buf_len + len(word) + 1 > max_chars:
|
|
160
|
+
pieces.append(" ".join(buf))
|
|
161
|
+
buf, buf_len = [], 0
|
|
162
|
+
buf.append(word)
|
|
163
|
+
buf_len += len(word) + 1
|
|
164
|
+
if buf:
|
|
165
|
+
pieces.append(" ".join(buf))
|
|
166
|
+
return pieces
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _carry_overlap(buf: list[str], overlap_chars: int) -> tuple[list[str], int]:
|
|
170
|
+
"""Keep trailing units from ``buf`` to seed the next chunk's overlap."""
|
|
171
|
+
if overlap_chars <= 0:
|
|
172
|
+
return [], 0
|
|
173
|
+
carried: list[str] = []
|
|
174
|
+
length = 0
|
|
175
|
+
for unit in reversed(buf):
|
|
176
|
+
if length >= overlap_chars:
|
|
177
|
+
break
|
|
178
|
+
carried.insert(0, unit)
|
|
179
|
+
length += len(unit) + 1
|
|
180
|
+
return carried, length
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def chunk_documents(
    docs,
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    encoding_name: str = "cl100k_base",
) -> list:
    """Split every document in ``docs`` into ``Chunk`` objects.

    Each document's ``text`` is passed to ``chunk_text`` with the given
    settings. Every resulting piece becomes a ``Chunk`` carrying the document's
    ``id`` (or ``""`` when falsy), its position within the document, and a
    shallow copy of the document's ``metadata``.

    Returns:
        All chunks, in document order and then in chunk order.
    """
    from .types import Chunk  # local import to avoid cycle at module import time

    collected: list[Chunk] = []
    for document in docs:
        pieces = chunk_text(
            document.text,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            encoding_name=encoding_name,
        )
        collected.extend(
            Chunk(
                text=piece,
                doc_id=document.id or "",
                chunk_index=position,
                # Copy so chunks never share (or mutate) the document's dict.
                metadata=dict(document.metadata),
            )
            for position, piece in enumerate(pieces)
        )
    return collected
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
__all__ = ["chunk_text", "chunk_documents", "count_tokens"]
|