ragbits-core 0.16.0__py3-none-any.whl → 1.4.0.dev202512021005__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/core/__init__.py +21 -2
- ragbits/core/audit/__init__.py +15 -157
- ragbits/core/audit/metrics/__init__.py +83 -0
- ragbits/core/audit/metrics/base.py +198 -0
- ragbits/core/audit/metrics/logfire.py +19 -0
- ragbits/core/audit/metrics/otel.py +65 -0
- ragbits/core/audit/traces/__init__.py +171 -0
- ragbits/core/audit/{base.py → traces/base.py} +9 -5
- ragbits/core/audit/{cli.py → traces/cli.py} +8 -4
- ragbits/core/audit/traces/logfire.py +18 -0
- ragbits/core/audit/{otel.py → traces/otel.py} +5 -8
- ragbits/core/config.py +15 -0
- ragbits/core/embeddings/__init__.py +2 -1
- ragbits/core/embeddings/base.py +19 -0
- ragbits/core/embeddings/dense/base.py +10 -1
- ragbits/core/embeddings/dense/fastembed.py +22 -1
- ragbits/core/embeddings/dense/litellm.py +37 -10
- ragbits/core/embeddings/dense/local.py +15 -1
- ragbits/core/embeddings/dense/noop.py +11 -1
- ragbits/core/embeddings/dense/vertex_multimodal.py +14 -1
- ragbits/core/embeddings/sparse/bag_of_tokens.py +47 -17
- ragbits/core/embeddings/sparse/base.py +10 -1
- ragbits/core/embeddings/sparse/fastembed.py +25 -2
- ragbits/core/llms/__init__.py +3 -3
- ragbits/core/llms/base.py +612 -88
- ragbits/core/llms/exceptions.py +27 -0
- ragbits/core/llms/litellm.py +408 -83
- ragbits/core/llms/local.py +180 -41
- ragbits/core/llms/mock.py +88 -23
- ragbits/core/prompt/__init__.py +2 -2
- ragbits/core/prompt/_cli.py +32 -19
- ragbits/core/prompt/base.py +105 -19
- ragbits/core/prompt/{discovery/prompt_discovery.py → discovery.py} +1 -1
- ragbits/core/prompt/exceptions.py +22 -6
- ragbits/core/prompt/prompt.py +180 -98
- ragbits/core/sources/__init__.py +2 -0
- ragbits/core/sources/azure.py +1 -1
- ragbits/core/sources/base.py +8 -1
- ragbits/core/sources/gcs.py +1 -1
- ragbits/core/sources/git.py +1 -1
- ragbits/core/sources/google_drive.py +595 -0
- ragbits/core/sources/hf.py +71 -31
- ragbits/core/sources/local.py +1 -1
- ragbits/core/sources/s3.py +1 -1
- ragbits/core/utils/config_handling.py +13 -2
- ragbits/core/utils/function_schema.py +220 -0
- ragbits/core/utils/helpers.py +22 -0
- ragbits/core/utils/lazy_litellm.py +44 -0
- ragbits/core/vector_stores/base.py +18 -1
- ragbits/core/vector_stores/chroma.py +28 -11
- ragbits/core/vector_stores/hybrid.py +1 -1
- ragbits/core/vector_stores/hybrid_strategies.py +21 -8
- ragbits/core/vector_stores/in_memory.py +13 -4
- ragbits/core/vector_stores/pgvector.py +123 -47
- ragbits/core/vector_stores/qdrant.py +15 -7
- ragbits/core/vector_stores/weaviate.py +440 -0
- {ragbits_core-0.16.0.dist-info → ragbits_core-1.4.0.dev202512021005.dist-info}/METADATA +22 -6
- ragbits_core-1.4.0.dev202512021005.dist-info/RECORD +79 -0
- {ragbits_core-0.16.0.dist-info → ragbits_core-1.4.0.dev202512021005.dist-info}/WHEEL +1 -1
- ragbits/core/prompt/discovery/__init__.py +0 -3
- ragbits/core/prompt/lab/__init__.py +0 -0
- ragbits/core/prompt/lab/app.py +0 -262
- ragbits_core-0.16.0.dist-info/RECORD +0 -72
|
@@ -2,8 +2,8 @@ from collections import Counter
|
|
|
2
2
|
|
|
3
3
|
import tiktoken
|
|
4
4
|
|
|
5
|
-
from ragbits.core.audit import trace
|
|
6
|
-
from ragbits.core.embeddings.base import SparseVector
|
|
5
|
+
from ragbits.core.audit.traces import trace
|
|
6
|
+
from ragbits.core.embeddings.base import SparseVector, VectorSize
|
|
7
7
|
from ragbits.core.embeddings.sparse.base import SparseEmbedder
|
|
8
8
|
from ragbits.core.options import Options
|
|
9
9
|
from ragbits.core.types import NOT_GIVEN, NotGiven
|
|
@@ -12,8 +12,6 @@ from ragbits.core.types import NOT_GIVEN, NotGiven
|
|
|
12
12
|
class BagOfTokensOptions(Options):
|
|
13
13
|
"""A dataclass with definition of BOT options"""
|
|
14
14
|
|
|
15
|
-
model_name: str | None | NotGiven = "gpt-4o"
|
|
16
|
-
encoding_name: str | None | NotGiven = NOT_GIVEN
|
|
17
15
|
min_token_count: int | None | NotGiven = NOT_GIVEN
|
|
18
16
|
|
|
19
17
|
|
|
@@ -22,6 +20,50 @@ class BagOfTokens(SparseEmbedder[BagOfTokensOptions]):
|
|
|
22
20
|
|
|
23
21
|
options_cls = BagOfTokensOptions
|
|
24
22
|
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
model_name: str | None = None,
|
|
26
|
+
encoding_name: str | None = None,
|
|
27
|
+
default_options: BagOfTokensOptions | None = None,
|
|
28
|
+
) -> None:
|
|
29
|
+
"""
|
|
30
|
+
Initialize the BagOfTokens embedder.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
model_name: Name of the model to use for tokenization (e.g., "gpt-4o").
|
|
34
|
+
encoding_name: Name of the encoding to use for tokenization.
|
|
35
|
+
default_options: Default options for the embedder.
|
|
36
|
+
|
|
37
|
+
Raises:
|
|
38
|
+
ValueError: If both model_name and encoding_name are provided, or if neither is provided.
|
|
39
|
+
"""
|
|
40
|
+
super().__init__(default_options=default_options)
|
|
41
|
+
|
|
42
|
+
if encoding_name and model_name:
|
|
43
|
+
raise ValueError("Please specify only one of encoding_name or model_name")
|
|
44
|
+
if not (encoding_name or model_name):
|
|
45
|
+
# Default to gpt-4o if neither is specified
|
|
46
|
+
model_name = "gpt-4o"
|
|
47
|
+
|
|
48
|
+
if encoding_name:
|
|
49
|
+
self._encoder = tiktoken.get_encoding(encoding_name=encoding_name)
|
|
50
|
+
elif model_name:
|
|
51
|
+
self._encoder = tiktoken.encoding_for_model(model_name=model_name)
|
|
52
|
+
else:
|
|
53
|
+
raise ValueError("Either encoding_name or model_name needs to be specified")
|
|
54
|
+
|
|
55
|
+
async def get_vector_size(self) -> VectorSize:
|
|
56
|
+
"""
|
|
57
|
+
Get the vector size for this BagOfTokens model.
|
|
58
|
+
|
|
59
|
+
For BagOfTokens, this returns the tokenizer vocabulary size.
|
|
60
|
+
|
|
61
|
+
Returns:
|
|
62
|
+
VectorSize object with is_sparse=True and the vocabulary size.
|
|
63
|
+
"""
|
|
64
|
+
vocab_size = self._encoder.n_vocab
|
|
65
|
+
return VectorSize(size=vocab_size, is_sparse=True)
|
|
66
|
+
|
|
25
67
|
async def embed_text(self, texts: list[str], options: BagOfTokensOptions | None = None) -> list[SparseVector]:
|
|
26
68
|
"""
|
|
27
69
|
Transforms a list of texts into sparse vectors using bag-of-tokens representation.
|
|
@@ -36,21 +78,9 @@ class BagOfTokens(SparseEmbedder[BagOfTokensOptions]):
|
|
|
36
78
|
vectors = []
|
|
37
79
|
merged_options = self.default_options | options if options else self.default_options
|
|
38
80
|
with trace(data=texts, options=merged_options.dict()) as outputs:
|
|
39
|
-
if merged_options.encoding_name and merged_options.model_name:
|
|
40
|
-
raise ValueError("Please specify only one of encoding_name or model_name")
|
|
41
|
-
if not (merged_options.encoding_name or merged_options.model_name):
|
|
42
|
-
raise ValueError("Either encoding_name or model_name needs to be specified")
|
|
43
|
-
|
|
44
|
-
if merged_options.encoding_name:
|
|
45
|
-
encoder = tiktoken.get_encoding(encoding_name=merged_options.encoding_name)
|
|
46
|
-
elif merged_options.model_name:
|
|
47
|
-
encoder = tiktoken.encoding_for_model(model_name=merged_options.model_name)
|
|
48
|
-
else:
|
|
49
|
-
raise ValueError("Either encoding_name or model_name needs to be specified")
|
|
50
|
-
|
|
51
81
|
min_token_count = merged_options.min_token_count or float("-inf")
|
|
52
82
|
for text in texts:
|
|
53
|
-
tokens = encoder.encode(text)
|
|
83
|
+
tokens = self._encoder.encode(text)
|
|
54
84
|
token_counts = Counter(tokens)
|
|
55
85
|
non_zero_dims = []
|
|
56
86
|
non_zero_vals = []
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from typing import TypeVar
|
|
3
3
|
|
|
4
|
-
from ragbits.core.embeddings.base import Embedder, SparseVector
|
|
4
|
+
from ragbits.core.embeddings.base import Embedder, SparseVector, VectorSize
|
|
5
5
|
from ragbits.core.options import Options
|
|
6
6
|
|
|
7
7
|
SparseEmbedderOptionsT = TypeVar("SparseEmbedderOptionsT", bound=Options)
|
|
@@ -23,6 +23,15 @@ class SparseEmbedder(Embedder[SparseEmbedderOptionsT], ABC):
|
|
|
23
23
|
list of sparse embeddings.
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
+
@abstractmethod
|
|
27
|
+
async def get_vector_size(self) -> VectorSize:
|
|
28
|
+
"""
|
|
29
|
+
Get information about the sparse vector size/dimensions returned by this embedder.
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
VectorSize object with is_sparse=True and the vocabulary size.
|
|
33
|
+
"""
|
|
34
|
+
|
|
26
35
|
async def embed_image(
|
|
27
36
|
self, images: list[bytes], options: SparseEmbedderOptionsT | None = None
|
|
28
37
|
) -> list[SparseVector]:
|
|
@@ -2,8 +2,8 @@ from collections.abc import Callable
|
|
|
2
2
|
|
|
3
3
|
from fastembed import SparseTextEmbedding
|
|
4
4
|
|
|
5
|
-
from ragbits.core.audit import trace
|
|
6
|
-
from ragbits.core.embeddings.base import EmbedderOptionsT, SparseVector
|
|
5
|
+
from ragbits.core.audit.traces import trace
|
|
6
|
+
from ragbits.core.embeddings.base import EmbedderOptionsT, SparseVector, VectorSize
|
|
7
7
|
from ragbits.core.embeddings.dense.fastembed import FastEmbedOptions
|
|
8
8
|
from ragbits.core.embeddings.sparse.base import SparseEmbedder
|
|
9
9
|
|
|
@@ -35,6 +35,29 @@ class FastEmbedSparseEmbedder(SparseEmbedder[FastEmbedOptions]):
|
|
|
35
35
|
"""
|
|
36
36
|
return (self.__class__, (self.model_name, self.use_gpu, self.default_options))
|
|
37
37
|
|
|
38
|
+
async def get_vector_size(self) -> VectorSize:
|
|
39
|
+
"""
|
|
40
|
+
Get the vector size for this FastEmbed sparse model.
|
|
41
|
+
|
|
42
|
+
For sparse models, this returns the vocabulary size.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
VectorSize object with is_sparse=True and the vocabulary size.
|
|
46
|
+
"""
|
|
47
|
+
# Get model info from FastEmbed's supported models list
|
|
48
|
+
supported_models = self._model.list_supported_models()
|
|
49
|
+
model_info = next((model for model in supported_models if model["model"] == self.model_name), None)
|
|
50
|
+
|
|
51
|
+
if model_info and "vocab_size" in model_info:
|
|
52
|
+
vocab_size = model_info["vocab_size"]
|
|
53
|
+
else:
|
|
54
|
+
sample_embedding = await self.embed_text(["sample text with various tokens"])
|
|
55
|
+
vocab_size = (
|
|
56
|
+
max(sample_embedding[0].indices) + 1 if sample_embedding and sample_embedding[0].indices else 30000
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
return VectorSize(size=vocab_size, is_sparse=True)
|
|
60
|
+
|
|
38
61
|
async def embed_text(self, data: list[str], options: EmbedderOptionsT | None = None) -> list[SparseVector]:
|
|
39
62
|
"""
|
|
40
63
|
Embeds a list of strings into a list of sparse embeddings.
|
ragbits/core/llms/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from .base import LLM
|
|
1
|
+
from .base import LLM, ToolCall, Usage
|
|
2
2
|
from .litellm import LiteLLM, LiteLLMOptions
|
|
3
|
-
from .local import LocalLLMOptions
|
|
3
|
+
from .local import LocalLLM, LocalLLMOptions
|
|
4
4
|
|
|
5
|
-
__all__ = ["LLM", "LiteLLM", "LiteLLMOptions", "LocalLLMOptions"]
|
|
5
|
+
__all__ = ["LLM", "LiteLLM", "LiteLLMOptions", "LocalLLM", "LocalLLMOptions", "ToolCall", "Usage"]
|