ragit 0.7.3__tar.gz → 0.7.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragit-0.7.3 → ragit-0.7.5}/PKG-INFO +74 -1
- {ragit-0.7.3 → ragit-0.7.5}/README.md +71 -0
- {ragit-0.7.3 → ragit-0.7.5}/pyproject.toml +2 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/config.py +1 -1
- {ragit-0.7.3 → ragit-0.7.5}/ragit/providers/ollama.py +198 -47
- {ragit-0.7.3 → ragit-0.7.5}/ragit/version.py +1 -1
- {ragit-0.7.3 → ragit-0.7.5}/ragit.egg-info/PKG-INFO +74 -1
- {ragit-0.7.3 → ragit-0.7.5}/ragit.egg-info/requires.txt +2 -0
- {ragit-0.7.3 → ragit-0.7.5}/LICENSE +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/__init__.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/assistant.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/core/__init__.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/core/experiment/__init__.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/core/experiment/experiment.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/core/experiment/results.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/loaders.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/providers/__init__.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/providers/base.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit/utils/__init__.py +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit.egg-info/SOURCES.txt +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit.egg-info/dependency_links.txt +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/ragit.egg-info/top_level.txt +0 -0
- {ragit-0.7.3 → ragit-0.7.5}/setup.cfg +0 -0
{ragit-0.7.3 → ragit-0.7.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragit
-Version: 0.7.3
+Version: 0.7.5
 Summary: Automatic RAG Pattern Optimization Engine
 Author: RODMENA LIMITED
 Maintainer-email: RODMENA LIMITED <info@rodmena.co.uk>
@@ -26,6 +26,8 @@ Requires-Dist: pydantic>=2.0.0
 Requires-Dist: python-dotenv>=1.0.0
 Requires-Dist: scikit-learn>=1.5.0
 Requires-Dist: tqdm>=4.66.0
+Requires-Dist: trio>=0.24.0
+Requires-Dist: httpx>=0.27.0
 Provides-Extra: dev
 Requires-Dist: ragit[test]; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
@@ -443,6 +445,77 @@ print(f"Score: {best.score:.3f}")

 The experiment tests different combinations of chunk sizes, overlaps, and retrieval parameters to find what works best for your content.

+## Performance Features
+
+Ragit includes several optimizations for production workloads:
+
+### Connection Pooling
+
+`OllamaProvider` uses HTTP connection pooling via `requests.Session()` for faster sequential requests:
+
+```python
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider()
+
+# All requests reuse the same connection pool
+for text in texts:
+    provider.embed(text, model="mxbai-embed-large")
+
+# Explicitly close when done (optional, auto-closes on garbage collection)
+provider.close()
+```
+
+### Async Parallel Embedding
+
+For large batches, use `embed_batch_async()` with trio for 5-10x faster embedding:
+
+```python
+import trio
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider()
+
+async def embed_documents():
+    texts = ["doc1...", "doc2...", "doc3...", ...]  # hundreds of texts
+    embeddings = await provider.embed_batch_async(
+        texts,
+        model="mxbai-embed-large",
+        max_concurrent=10  # Adjust based on server capacity
+    )
+    return embeddings
+
+# Run with trio
+results = trio.run(embed_documents)
+```
+
+### Embedding Cache
+
+Repeated embedding calls are cached automatically (2048 entries LRU):
+
+```python
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider(use_cache=True)  # Default
+
+# First call hits the API
+provider.embed("Hello world", model="mxbai-embed-large")
+
+# Second call returns cached result instantly
+provider.embed("Hello world", model="mxbai-embed-large")
+
+# View cache statistics
+print(OllamaProvider.embedding_cache_info())
+# {'hits': 1, 'misses': 1, 'maxsize': 2048, 'currsize': 1}
+
+# Clear cache if needed
+OllamaProvider.clear_embedding_cache()
+```
+
+### Pre-normalized Embeddings
+
+Vector similarity uses pre-normalized embeddings, making cosine similarity a simple dot product (O(1) per comparison).
+
 ## API Reference

 ### Document Loading
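The "Pre-normalized Embeddings" note added above is the one performance claim in this section without an accompanying snippet. As a rough illustration of the idea only (not ragit's internal code): if every vector is scaled to unit length once at index time, cosine similarity at query time reduces to a plain dot product, so no norms are recomputed per comparison (the dot product itself still costs one multiply-add per dimension).

```python
import numpy as np

# Illustration only -- not ragit's internal implementation.
def normalize(v: np.ndarray) -> np.ndarray:
    """Scale a vector to unit length once, at index time."""
    return v / np.linalg.norm(v)

doc = normalize(np.array([0.3, 0.1, 0.9]))
query = normalize(np.array([0.2, 0.0, 1.0]))

# With pre-normalized vectors, cosine similarity is just a dot product.
print(float(doc @ query))
```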
{ragit-0.7.3 → ragit-0.7.5}/README.md

@@ -398,6 +398,77 @@ print(f"Score: {best.score:.3f}")

 The experiment tests different combinations of chunk sizes, overlaps, and retrieval parameters to find what works best for your content.

+## Performance Features
+
+Ragit includes several optimizations for production workloads:
+
+### Connection Pooling
+
+`OllamaProvider` uses HTTP connection pooling via `requests.Session()` for faster sequential requests:
+
+```python
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider()
+
+# All requests reuse the same connection pool
+for text in texts:
+    provider.embed(text, model="mxbai-embed-large")
+
+# Explicitly close when done (optional, auto-closes on garbage collection)
+provider.close()
+```
+
+### Async Parallel Embedding
+
+For large batches, use `embed_batch_async()` with trio for 5-10x faster embedding:
+
+```python
+import trio
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider()
+
+async def embed_documents():
+    texts = ["doc1...", "doc2...", "doc3...", ...]  # hundreds of texts
+    embeddings = await provider.embed_batch_async(
+        texts,
+        model="mxbai-embed-large",
+        max_concurrent=10  # Adjust based on server capacity
+    )
+    return embeddings
+
+# Run with trio
+results = trio.run(embed_documents)
+```
+
+### Embedding Cache
+
+Repeated embedding calls are cached automatically (2048 entries LRU):
+
+```python
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider(use_cache=True)  # Default
+
+# First call hits the API
+provider.embed("Hello world", model="mxbai-embed-large")
+
+# Second call returns cached result instantly
+provider.embed("Hello world", model="mxbai-embed-large")
+
+# View cache statistics
+print(OllamaProvider.embedding_cache_info())
+# {'hits': 1, 'misses': 1, 'maxsize': 2048, 'currsize': 1}
+
+# Clear cache if needed
+OllamaProvider.clear_embedding_cache()
+```
+
+### Pre-normalized Embeddings
+
+Vector similarity uses pre-normalized embeddings, making cosine similarity a simple dot product (O(1) per comparison).
+
 ## API Reference

 ### Document Loading
{ragit-0.7.3 → ragit-0.7.5}/ragit/config.py

@@ -41,7 +41,7 @@ class Config:

     # Default Models
     DEFAULT_LLM_MODEL: str = os.getenv("RAGIT_DEFAULT_LLM_MODEL", "qwen3-vl:235b-instruct")
-    DEFAULT_EMBEDDING_MODEL: str = os.getenv("RAGIT_DEFAULT_EMBEDDING_MODEL", "
+    DEFAULT_EMBEDDING_MODEL: str = os.getenv("RAGIT_DEFAULT_EMBEDDING_MODEL", "nomic-embed-text:latest")

     # Logging
     LOG_LEVEL: str = os.getenv("RAGIT_LOG_LEVEL", "INFO")
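Since the new default embedding model is read with `os.getenv()` when the `Config` class body executes, an override has to be in the environment before `ragit.config` is imported. A minimal sketch (the `mxbai-embed-large` value is just an example model name):

```python
import os

# Must be set before ragit.config is imported: the default is captured by
# os.getenv() at class-definition time.
os.environ["RAGIT_DEFAULT_EMBEDDING_MODEL"] = "mxbai-embed-large"

from ragit.config import config  # noqa: E402

print(config.DEFAULT_EMBEDDING_MODEL)  # mxbai-embed-large
```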
{ragit-0.7.3 → ragit-0.7.5}/ragit/providers/ollama.py

@@ -7,8 +7,17 @@ Ollama provider for LLM and Embedding operations.

 This provider connects to a local or remote Ollama server.
 Configuration is loaded from environment variables.
+
+Performance optimizations:
+- Connection pooling via requests.Session()
+- Async parallel embedding via trio + httpx
+- LRU cache for repeated embedding queries
 """

+from functools import lru_cache
+from typing import Any
+
+import httpx
 import requests

 from ragit.config import config
@@ -20,10 +29,37 @@ from ragit.providers.base import (
 )


+# Module-level cache for embeddings (shared across instances)
+@lru_cache(maxsize=2048)
+def _cached_embedding(text: str, model: str, embedding_url: str, timeout: int) -> tuple[float, ...]:
+    """Cache embedding results to avoid redundant API calls."""
+    # Truncate oversized inputs
+    if len(text) > OllamaProvider.MAX_EMBED_CHARS:
+        text = text[: OllamaProvider.MAX_EMBED_CHARS]
+
+    response = requests.post(
+        f"{embedding_url}/api/embed",
+        headers={"Content-Type": "application/json"},
+        json={"model": model, "input": text},
+        timeout=timeout,
+    )
+    response.raise_for_status()
+    data = response.json()
+    embeddings = data.get("embeddings", [])
+    if not embeddings or not embeddings[0]:
+        raise ValueError("Empty embedding returned from Ollama")
+    return tuple(embeddings[0])
+
+
 class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
     """
     Ollama provider for both LLM and Embedding operations.

+    Performance features:
+    - Connection pooling via requests.Session() for faster sequential requests
+    - Native batch embedding via /api/embed endpoint (single API call)
+    - LRU cache for repeated embedding queries (2048 entries)
+
     Parameters
     ----------
     base_url : str, optional
@@ -32,6 +68,8 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
         API key for authentication (default: from OLLAMA_API_KEY env var)
     timeout : int, optional
         Request timeout in seconds (default: from OLLAMA_TIMEOUT env var)
+    use_cache : bool, optional
+        Enable embedding cache (default: True)

     Examples
     --------
@@ -39,12 +77,12 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
     >>> response = provider.generate("What is RAG?", model="llama3")
     >>> print(response.text)

-    >>> embedding
-    >>>
+    >>> # Batch embedding (single API call)
+    >>> embeddings = provider.embed_batch(texts, "mxbai-embed-large")
     """

     # Known embedding model dimensions
-    EMBEDDING_DIMENSIONS = {
+    EMBEDDING_DIMENSIONS: dict[str, int] = {
         "nomic-embed-text": 768,
         "nomic-embed-text:latest": 768,
         "mxbai-embed-large": 1024,
@@ -57,7 +95,7 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
     }

     # Max characters per embedding request (safe limit for 512 token models)
-    MAX_EMBED_CHARS =
+    MAX_EMBED_CHARS = 2000

     def __init__(
         self,
@@ -65,14 +103,39 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
         embedding_url: str | None = None,
         api_key: str | None = None,
         timeout: int | None = None,
+        use_cache: bool = True,
     ) -> None:
         self.base_url = (base_url or config.OLLAMA_BASE_URL).rstrip("/")
         self.embedding_url = (embedding_url or config.OLLAMA_EMBEDDING_URL).rstrip("/")
         self.api_key = api_key or config.OLLAMA_API_KEY
         self.timeout = timeout or config.OLLAMA_TIMEOUT
+        self.use_cache = use_cache
         self._current_embed_model: str | None = None
         self._current_dimensions: int = 768  # default

+        # Connection pooling via session
+        self._session: requests.Session | None = None
+
+    @property
+    def session(self) -> requests.Session:
+        """Lazy-initialized session for connection pooling."""
+        if self._session is None:
+            self._session = requests.Session()
+            self._session.headers.update({"Content-Type": "application/json"})
+            if self.api_key:
+                self._session.headers.update({"Authorization": f"Bearer {self.api_key}"})
+        return self._session
+
+    def close(self) -> None:
+        """Close the session and release resources."""
+        if self._session is not None:
+            self._session.close()
+            self._session = None
+
+    def __del__(self) -> None:
+        """Cleanup on garbage collection."""
+        self.close()
+
     def _get_headers(self, include_auth: bool = True) -> dict[str, str]:
         """Get request headers including authentication if API key is set."""
         headers = {"Content-Type": "application/json"}
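The `session` property above is created lazily and `close()` releases it; `__del__` also calls `close()`, so explicit cleanup is optional. A small sketch of deterministic cleanup with try/finally (the diff does not show the provider as a context manager, so `with` is not assumed):

```python
from ragit.providers import OllamaProvider

provider = OllamaProvider()
try:
    # Every call below reuses the same pooled requests.Session.
    response = provider.embed("Hello world", model="mxbai-embed-large")
    print(response.dimensions)
finally:
    # Release the session now instead of waiting for garbage collection.
    provider.close()
```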
@@ -91,21 +154,19 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
     def is_available(self) -> bool:
         """Check if Ollama server is reachable."""
         try:
-            response =
+            response = self.session.get(
                 f"{self.base_url}/api/tags",
-                headers=self._get_headers(),
                 timeout=5,
             )
             return response.status_code == 200
         except requests.RequestException:
             return False

-    def list_models(self) -> list[dict[str,
+    def list_models(self) -> list[dict[str, Any]]:
         """List available models on the Ollama server."""
         try:
-            response =
+            response = self.session.get(
                 f"{self.base_url}/api/tags",
-                headers=self._get_headers(),
                 timeout=10,
             )
             response.raise_for_status()
@@ -138,9 +199,8 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
             payload["system"] = system_prompt

         try:
-            response =
+            response = self.session.post(
                 f"{self.base_url}/api/generate",
-                headers=self._get_headers(),
                 json=payload,
                 timeout=self.timeout,
             )
@@ -161,33 +221,34 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
             raise ConnectionError(f"Ollama generate failed: {e}") from e

     def embed(self, text: str, model: str) -> EmbeddingResponse:
-        """Generate embedding using Ollama
+        """Generate embedding using Ollama with optional caching."""
         self._current_embed_model = model
         self._current_dimensions = self.EMBEDDING_DIMENSIONS.get(model, 768)

-        # Truncate oversized inputs to prevent context length errors
-        if len(text) > self.MAX_EMBED_CHARS:
-            text = text[: self.MAX_EMBED_CHARS]
-
         try:
-
-
-
-
-
-
-
-
-
-
-
-
+            if self.use_cache:
+                # Use cached version
+                embedding = _cached_embedding(text, model, self.embedding_url, self.timeout)
+            else:
+                # Direct call without cache
+                truncated = text[: self.MAX_EMBED_CHARS] if len(text) > self.MAX_EMBED_CHARS else text
+                response = self.session.post(
+                    f"{self.embedding_url}/api/embed",
+                    json={"model": model, "input": truncated},
+                    timeout=self.timeout,
+                )
+                response.raise_for_status()
+                data = response.json()
+                embeddings = data.get("embeddings", [])
+                if not embeddings or not embeddings[0]:
+                    raise ValueError("Empty embedding returned from Ollama")
+                embedding = tuple(embeddings[0])

             # Update dimensions from actual response
             self._current_dimensions = len(embedding)

             return EmbeddingResponse(
-                embedding=
+                embedding=embedding,
                 model=model,
                 provider=self.provider_name,
                 dimensions=len(embedding),
@@ -196,44 +257,115 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
             raise ConnectionError(f"Ollama embed failed: {e}") from e

     def embed_batch(self, texts: list[str], model: str) -> list[EmbeddingResponse]:
-        """Generate embeddings for multiple texts
+        """Generate embeddings for multiple texts in a single API call.

-
+        The /api/embed endpoint supports batch inputs natively.
         """
         self._current_embed_model = model
         self._current_dimensions = self.EMBEDDING_DIMENSIONS.get(model, 768)

-
+        # Truncate oversized inputs
+        truncated_texts = [text[: self.MAX_EMBED_CHARS] if len(text) > self.MAX_EMBED_CHARS else text for text in texts]
+
         try:
-
-
-
-
-
-
-
-
-
+            response = self.session.post(
+                f"{self.embedding_url}/api/embed",
+                json={"model": model, "input": truncated_texts},
+                timeout=self.timeout,
+            )
+            response.raise_for_status()
+            data = response.json()
+            embeddings_list = data.get("embeddings", [])
+
+            if not embeddings_list:
+                raise ValueError("Empty embeddings returned from Ollama")
+
+            results = []
+            for embedding_data in embeddings_list:
+                embedding = tuple(embedding_data) if embedding_data else ()
+                if embedding:
+                    self._current_dimensions = len(embedding)
+
+                results.append(
+                    EmbeddingResponse(
+                        embedding=embedding,
+                        model=model,
+                        provider=self.provider_name,
+                        dimensions=len(embedding),
+                    )
+                )
+            return results
+        except requests.RequestException as e:
+            raise ConnectionError(f"Ollama batch embed failed: {e}") from e
+
+    async def embed_batch_async(
+        self,
+        texts: list[str],
+        model: str,
+        max_concurrent: int = 10,  # kept for API compatibility, no longer used
+    ) -> list[EmbeddingResponse]:
+        """Generate embeddings for multiple texts asynchronously.
+
+        The /api/embed endpoint supports batch inputs natively, so this
+        makes a single async HTTP request for all texts.
+
+        Parameters
+        ----------
+        texts : list[str]
+            Texts to embed.
+        model : str
+            Embedding model name.
+        max_concurrent : int
+            Deprecated, kept for API compatibility. No longer used since
+            the API now supports native batching.
+
+        Returns
+        -------
+        list[EmbeddingResponse]
+            Embeddings in the same order as input texts.
+
+        Examples
+        --------
+        >>> import trio
+        >>> embeddings = trio.run(provider.embed_batch_async, texts, "mxbai-embed-large")
+        """
+        self._current_embed_model = model
+        self._current_dimensions = self.EMBEDDING_DIMENSIONS.get(model, 768)
+
+        # Truncate oversized inputs
+        truncated_texts = [text[: self.MAX_EMBED_CHARS] if len(text) > self.MAX_EMBED_CHARS else text for text in texts]
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.post(
+                    f"{self.embedding_url}/api/embed",
+                    json={"model": model, "input": truncated_texts},
                     timeout=self.timeout,
                 )
                 response.raise_for_status()
                 data = response.json()

-
+                embeddings_list = data.get("embeddings", [])
+                if not embeddings_list:
+                    raise ValueError("Empty embeddings returned from Ollama")
+
+                results = []
+                for embedding_data in embeddings_list:
+                    embedding = tuple(embedding_data) if embedding_data else ()
                     if embedding:
                         self._current_dimensions = len(embedding)

                     results.append(
                         EmbeddingResponse(
-                            embedding=
+                            embedding=embedding,
                             model=model,
                             provider=self.provider_name,
                             dimensions=len(embedding),
                         )
                     )
                 return results
-        except
-            raise ConnectionError(f"Ollama batch embed failed: {e}") from e
+        except httpx.HTTPError as e:
+            raise ConnectionError(f"Ollama async batch embed failed: {e}") from e

     def chat(
         self,
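Because `embed_batch_async()` now issues a single `/api/embed` request per call, one way to overlap work is to run several batch calls concurrently under a trio nursery. A hedged sketch only (the corpus names and contents are illustrative, and it assumes the Ollama server can serve the parallel requests):

```python
import trio
from ragit.providers import OllamaProvider

provider = OllamaProvider()

async def embed_two_corpora(corpus_a: list[str], corpus_b: list[str]) -> dict[str, list]:
    results: dict[str, list] = {}

    async def worker(name: str, texts: list[str]) -> None:
        # Each call is one batched /api/embed request.
        results[name] = await provider.embed_batch_async(texts, "mxbai-embed-large")

    # Both batch requests run concurrently; the nursery waits for both to finish.
    async with trio.open_nursery() as nursery:
        nursery.start_soon(worker, "a", corpus_a)
        nursery.start_soon(worker, "b", corpus_b)
    return results

embeddings = trio.run(embed_two_corpora, ["doc one", "doc two"], ["doc three"])
print({name: len(batch) for name, batch in embeddings.items()})
```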
@@ -273,9 +405,8 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
         }

         try:
-            response =
+            response = self.session.post(
                 f"{self.base_url}/api/chat",
-                headers=self._get_headers(),
                 json=payload,
                 timeout=self.timeout,
             )
@@ -293,3 +424,23 @@ class OllamaProvider(BaseLLMProvider, BaseEmbeddingProvider):
             )
         except requests.RequestException as e:
             raise ConnectionError(f"Ollama chat failed: {e}") from e
+
+    @staticmethod
+    def clear_embedding_cache() -> None:
+        """Clear the embedding cache."""
+        _cached_embedding.cache_clear()
+
+    @staticmethod
+    def embedding_cache_info() -> dict[str, int]:
+        """Get embedding cache statistics."""
+        info = _cached_embedding.cache_info()
+        return {
+            "hits": info.hits,
+            "misses": info.misses,
+            "maxsize": info.maxsize or 0,
+            "currsize": info.currsize,
+        }
+
+
+# Export the EMBEDDING_DIMENSIONS for external use
+EMBEDDING_DIMENSIONS = OllamaProvider.EMBEDDING_DIMENSIONS
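`embedding_cache_info()` above exposes the raw LRU counters, so a hit rate can be derived from them. A small sketch (the texts and model name are placeholders):

```python
from ragit.providers import OllamaProvider

provider = OllamaProvider()  # use_cache=True by default

for text in ["alpha", "beta", "alpha"]:  # the repeated "alpha" should hit the cache
    provider.embed(text, model="mxbai-embed-large")

stats = OllamaProvider.embedding_cache_info()
lookups = stats["hits"] + stats["misses"]
print(f"hit rate: {stats['hits'] / lookups:.0%}" if lookups else "no lookups yet")
```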
{ragit-0.7.3 → ragit-0.7.5}/ragit.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragit
-Version: 0.7.3
+Version: 0.7.5
 Summary: Automatic RAG Pattern Optimization Engine
 Author: RODMENA LIMITED
 Maintainer-email: RODMENA LIMITED <info@rodmena.co.uk>
@@ -26,6 +26,8 @@ Requires-Dist: pydantic>=2.0.0
 Requires-Dist: python-dotenv>=1.0.0
 Requires-Dist: scikit-learn>=1.5.0
 Requires-Dist: tqdm>=4.66.0
+Requires-Dist: trio>=0.24.0
+Requires-Dist: httpx>=0.27.0
 Provides-Extra: dev
 Requires-Dist: ragit[test]; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
@@ -443,6 +445,77 @@ print(f"Score: {best.score:.3f}")

 The experiment tests different combinations of chunk sizes, overlaps, and retrieval parameters to find what works best for your content.

+## Performance Features
+
+Ragit includes several optimizations for production workloads:
+
+### Connection Pooling
+
+`OllamaProvider` uses HTTP connection pooling via `requests.Session()` for faster sequential requests:
+
+```python
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider()
+
+# All requests reuse the same connection pool
+for text in texts:
+    provider.embed(text, model="mxbai-embed-large")
+
+# Explicitly close when done (optional, auto-closes on garbage collection)
+provider.close()
+```
+
+### Async Parallel Embedding
+
+For large batches, use `embed_batch_async()` with trio for 5-10x faster embedding:
+
+```python
+import trio
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider()
+
+async def embed_documents():
+    texts = ["doc1...", "doc2...", "doc3...", ...]  # hundreds of texts
+    embeddings = await provider.embed_batch_async(
+        texts,
+        model="mxbai-embed-large",
+        max_concurrent=10  # Adjust based on server capacity
+    )
+    return embeddings
+
+# Run with trio
+results = trio.run(embed_documents)
+```
+
+### Embedding Cache
+
+Repeated embedding calls are cached automatically (2048 entries LRU):
+
+```python
+from ragit.providers import OllamaProvider
+
+provider = OllamaProvider(use_cache=True)  # Default
+
+# First call hits the API
+provider.embed("Hello world", model="mxbai-embed-large")
+
+# Second call returns cached result instantly
+provider.embed("Hello world", model="mxbai-embed-large")
+
+# View cache statistics
+print(OllamaProvider.embedding_cache_info())
+# {'hits': 1, 'misses': 1, 'maxsize': 2048, 'currsize': 1}
+
+# Clear cache if needed
+OllamaProvider.clear_embedding_cache()
+```
+
+### Pre-normalized Embeddings
+
+Vector similarity uses pre-normalized embeddings, making cosine similarity a simple dot product (O(1) per comparison).
+
 ## API Reference

 ### Document Loading