swiftrag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
swiftrag/__init__.py ADDED
@@ -0,0 +1,58 @@
1
+ """swiftrag — instant, optimized Retrieval-Augmented Generation.
2
+
3
+ Pass your text and (optionally) a model. Get a RAG-powered LLM in one line.
4
+
5
+ from swiftrag import RAG
6
+
7
+ rag = RAG(
8
+ documents="your knowledge as a string (or a list of strings/dicts)",
9
+ embedding_model="openai:text-embedding-3-small",
10
+ llm_model="openai:gpt-4o-mini",
11
+ )
12
+ print(rag.query("your question"))
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from .chunking import chunk_text, count_tokens
18
+ from .core import RAG
19
+ from .embeddings import (
20
+ EmbeddingProvider,
21
+ HashEmbeddings,
22
+ OpenAIEmbeddings,
23
+ SentenceTransformerEmbeddings,
24
+ )
25
+ from .exceptions import (
26
+ ConfigurationError,
27
+ DependencyError,
28
+ EmptyCorpusError,
29
+ SwiftRagError,
30
+ )
31
+ from .llms import AnthropicLLM, CallableLLM, EchoLLM, LLMProvider, OpenAILLM
32
+ from .types import Chunk, Document, RAGResponse, ScoredChunk
33
+
34
+ __version__ = "0.1.0"
35
+
36
+ __all__ = [
37
+ "RAG",
38
+ "chunk_text",
39
+ "count_tokens",
40
+ "Document",
41
+ "Chunk",
42
+ "ScoredChunk",
43
+ "RAGResponse",
44
+ "EmbeddingProvider",
45
+ "HashEmbeddings",
46
+ "OpenAIEmbeddings",
47
+ "SentenceTransformerEmbeddings",
48
+ "LLMProvider",
49
+ "OpenAILLM",
50
+ "AnthropicLLM",
51
+ "EchoLLM",
52
+ "CallableLLM",
53
+ "SwiftRagError",
54
+ "ConfigurationError",
55
+ "DependencyError",
56
+ "EmptyCorpusError",
57
+ "__version__",
58
+ ]
swiftrag/chunking.py ADDED
@@ -0,0 +1,213 @@
1
+ """Text chunking.
2
+
3
+ Strategy (optimized for retrieval quality + speed):
4
+
5
+ 1. If ``tiktoken`` is available we chunk by *tokens*, which matches how the
6
+ downstream embedding/LLM models actually see text. This gives uniform chunk
7
+ sizes regardless of language/whitespace.
8
+ 2. Otherwise we fall back to a fast, dependency-free splitter that respects
9
+ natural boundaries (paragraphs -> sentences -> words) and approximates token
10
+ counts with a characters-per-token heuristic.
11
+
12
+ Both paths support configurable overlap so context isn't lost at boundaries.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from functools import lru_cache
19
+
20
+ # Rough average for English text with common BPE tokenizers.
21
+ _CHARS_PER_TOKEN = 4
22
+
23
+ _PARAGRAPH_RE = re.compile(r"\n\s*\n")
24
+ # Split on sentence terminators while keeping reasonable behavior on edge cases.
25
+ _SENTENCE_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9\"'(])")
26
+
27
+
28
+ @lru_cache(maxsize=4)
29
+ def _get_tiktoken_encoding(name: str):
30
+ try:
31
+ import tiktoken
32
+ except ImportError:
33
+ return None
34
+ try:
35
+ return tiktoken.get_encoding(name)
36
+ except Exception:
37
+ return None
38
+
39
+
40
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in ``text``.

    When the requested tiktoken encoding can be obtained, the count is the
    length of its encoding of ``text``. Otherwise the count is estimated as
    ``len(text) // _CHARS_PER_TOKEN``, with a floor of 1 for non-empty text.
    Empty text always counts as 0.
    """
    if not text:
        return 0

    encoder = _get_tiktoken_encoding(encoding_name)
    if encoder is None:
        # No tokenizer: approximate from the character count, never below 1.
        estimate = len(text) // _CHARS_PER_TOKEN
        return estimate if estimate > 1 else 1
    return len(encoder.encode(text))
52
+
53
+
54
def chunk_text(
    text: str,
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    encoding_name: str = "cl100k_base",
) -> list[str]:
    """Split ``text`` into overlapping chunks of ~``chunk_size`` tokens.

    Args:
        text: The input text. Leading/trailing whitespace is stripped first.
        chunk_size: Target chunk length in tokens. Must be at least 1.
        chunk_overlap: Number of tokens shared between consecutive chunks.
            Values ``>= chunk_size`` fall back to ``chunk_size // 4``;
            negative values are treated as 0.
        encoding_name: tiktoken encoding to use when available.

    Returns:
        The list of chunks, or ``[]`` when ``text`` is empty or whitespace.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A non-positive size has no meaningful result: downstream it produced
    # either an opaque ``range()`` error, an empty list, or one-word chunks.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    text = text.strip()
    if not text:
        return []

    if chunk_overlap >= chunk_size:
        chunk_overlap = chunk_size // 4
    elif chunk_overlap < 0:
        # A negative overlap would make the stride larger than the window,
        # silently dropping the text that falls between two chunks.
        chunk_overlap = 0

    enc = _get_tiktoken_encoding(encoding_name)
    if enc is not None:
        return _chunk_by_tokens(text, enc, chunk_size, chunk_overlap)
    # No tokenizer available: convert token budgets to character budgets.
    return _chunk_by_chars(
        text,
        chunk_size * _CHARS_PER_TOKEN,
        chunk_overlap * _CHARS_PER_TOKEN,
    )
79
+
80
+
81
+ def _chunk_by_tokens(text: str, enc, chunk_size: int, chunk_overlap: int) -> list[str]:
82
+ tokens = enc.encode(text)
83
+ if len(tokens) <= chunk_size:
84
+ return [text]
85
+ step = chunk_size - chunk_overlap
86
+ chunks: list[str] = []
87
+ for start in range(0, len(tokens), step):
88
+ window = tokens[start : start + chunk_size]
89
+ if not window:
90
+ break
91
+ chunks.append(enc.decode(window).strip())
92
+ if start + chunk_size >= len(tokens):
93
+ break
94
+ return [c for c in chunks if c]
95
+
96
+
97
+ def _chunk_by_chars(text: str, max_chars: int, overlap_chars: int) -> list[str]:
98
+ if len(text) <= max_chars:
99
+ return [text]
100
+
101
+ # Build semantic units: paragraphs, then sentences for oversized paragraphs.
102
+ units: list[str] = []
103
+ for para in _PARAGRAPH_RE.split(text):
104
+ para = para.strip()
105
+ if not para:
106
+ continue
107
+ if len(para) <= max_chars:
108
+ units.append(para)
109
+ else:
110
+ units.extend(_split_oversized(para, max_chars))
111
+
112
+ chunks: list[str] = []
113
+ buf: list[str] = []
114
+ buf_len = 0
115
+ for unit in units:
116
+ unit_len = len(unit) + 1
117
+ if buf and buf_len + unit_len > max_chars:
118
+ chunks.append("\n".join(buf))
119
+ buf, buf_len = _carry_overlap(buf, overlap_chars)
120
+ buf.append(unit)
121
+ buf_len += unit_len
122
+ if buf:
123
+ chunks.append("\n".join(buf))
124
+ return [c.strip() for c in chunks if c.strip()]
125
+
126
+
127
def _split_oversized(para: str, max_chars: int) -> list[str]:
    """Break a paragraph longer than ``max_chars`` into smaller pieces.

    The paragraph is split into sentences with ``_SENTENCE_RE``. Consecutive
    sentences are packed into space-joined pieces within the ``max_chars``
    budget; a sentence that alone exceeds the budget is delegated to
    ``_split_words``.
    """
    out: list[str] = []
    pending: list[str] = []
    pending_len = 0

    def flush() -> None:
        # Emit the accumulated sentences (if any) as one piece and reset.
        nonlocal pending, pending_len
        if pending:
            out.append(" ".join(pending))
        pending, pending_len = [], 0

    for raw in _SENTENCE_RE.split(para):
        sentence = raw.strip()
        if not sentence:
            continue
        if len(sentence) > max_chars:
            # Too long for any piece: close the current one and split by words.
            flush()
            out.extend(_split_words(sentence, max_chars))
        else:
            cost = len(sentence) + 1  # +1 for the joining space
            if pending and pending_len + cost > max_chars:
                flush()
            pending.append(sentence)
            pending_len += cost
    flush()
    return out
151
+
152
+
153
+ def _split_words(text: str, max_chars: int) -> list[str]:
154
+ words = text.split()
155
+ pieces: list[str] = []
156
+ buf: list[str] = []
157
+ buf_len = 0
158
+ for word in words:
159
+ if buf and buf_len + len(word) + 1 > max_chars:
160
+ pieces.append(" ".join(buf))
161
+ buf, buf_len = [], 0
162
+ buf.append(word)
163
+ buf_len += len(word) + 1
164
+ if buf:
165
+ pieces.append(" ".join(buf))
166
+ return pieces
167
+
168
+
169
+ def _carry_overlap(buf: list[str], overlap_chars: int) -> tuple[list[str], int]:
170
+ """Keep trailing units from ``buf`` to seed the next chunk's overlap."""
171
+ if overlap_chars <= 0:
172
+ return [], 0
173
+ carried: list[str] = []
174
+ length = 0
175
+ for unit in reversed(buf):
176
+ if length >= overlap_chars:
177
+ break
178
+ carried.insert(0, unit)
179
+ length += len(unit) + 1
180
+ return carried, length
181
+
182
+
183
def chunk_documents(
    docs,
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    encoding_name: str = "cl100k_base",
) -> list:
    """Split every document in ``docs`` into ``Chunk`` objects.

    Each document's ``text`` is passed to ``chunk_text`` with the given
    options. Every resulting part becomes a ``Chunk`` carrying the document's
    ``id`` (``""`` when falsy), the part's position within that document, and
    a fresh ``dict`` copy of the document's ``metadata``.
    """
    # Imported here rather than at module level to avoid an import cycle.
    from .types import Chunk

    split_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "encoding_name": encoding_name,
    }
    result: list[Chunk] = []
    for document in docs:
        pieces = chunk_text(document.text, **split_options)
        result.extend(
            Chunk(
                text=piece,
                doc_id=document.id or "",
                chunk_index=position,
                metadata=dict(document.metadata),
            )
            for position, piece in enumerate(pieces)
        )
    return result
211
+
212
+
213
+ __all__ = ["chunk_text", "chunk_documents", "count_tokens"]