amsdal_ml 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md +617 -0
- amsdal_ml/__about__.py +1 -1
- amsdal_ml/agents/__init__.py +13 -0
- amsdal_ml/agents/agent.py +5 -7
- amsdal_ml/agents/default_qa_agent.py +108 -143
- amsdal_ml/agents/functional_calling_agent.py +233 -0
- amsdal_ml/agents/mcp_client_tool.py +46 -0
- amsdal_ml/agents/python_tool.py +86 -0
- amsdal_ml/agents/retriever_tool.py +17 -8
- amsdal_ml/agents/tool_adapters.py +98 -0
- amsdal_ml/fileio/base_loader.py +7 -5
- amsdal_ml/fileio/openai_loader.py +16 -17
- amsdal_ml/mcp_client/base.py +2 -0
- amsdal_ml/mcp_client/http_client.py +7 -1
- amsdal_ml/mcp_client/stdio_client.py +21 -18
- amsdal_ml/mcp_server/server_retriever_stdio.py +8 -11
- amsdal_ml/ml_ingesting/__init__.py +29 -0
- amsdal_ml/ml_ingesting/default_ingesting.py +49 -51
- amsdal_ml/ml_ingesting/embedders/__init__.py +4 -0
- amsdal_ml/ml_ingesting/embedders/embedder.py +12 -0
- amsdal_ml/ml_ingesting/embedders/openai_embedder.py +30 -0
- amsdal_ml/ml_ingesting/embedding_data.py +3 -0
- amsdal_ml/ml_ingesting/loaders/__init__.py +6 -0
- amsdal_ml/ml_ingesting/loaders/folder_loader.py +52 -0
- amsdal_ml/ml_ingesting/loaders/loader.py +28 -0
- amsdal_ml/ml_ingesting/loaders/pdf_loader.py +136 -0
- amsdal_ml/ml_ingesting/loaders/text_loader.py +44 -0
- amsdal_ml/ml_ingesting/model_ingester.py +278 -0
- amsdal_ml/ml_ingesting/pipeline.py +131 -0
- amsdal_ml/ml_ingesting/pipeline_interface.py +31 -0
- amsdal_ml/ml_ingesting/processors/__init__.py +4 -0
- amsdal_ml/ml_ingesting/processors/cleaner.py +14 -0
- amsdal_ml/ml_ingesting/processors/text_cleaner.py +42 -0
- amsdal_ml/ml_ingesting/splitters/__init__.py +4 -0
- amsdal_ml/ml_ingesting/splitters/splitter.py +15 -0
- amsdal_ml/ml_ingesting/splitters/token_splitter.py +85 -0
- amsdal_ml/ml_ingesting/stores/__init__.py +4 -0
- amsdal_ml/ml_ingesting/stores/embedding_data.py +63 -0
- amsdal_ml/ml_ingesting/stores/store.py +22 -0
- amsdal_ml/ml_ingesting/types.py +40 -0
- amsdal_ml/ml_models/models.py +96 -4
- amsdal_ml/ml_models/openai_model.py +430 -122
- amsdal_ml/ml_models/utils.py +7 -0
- amsdal_ml/ml_retrievers/__init__.py +17 -0
- amsdal_ml/ml_retrievers/adapters.py +93 -0
- amsdal_ml/ml_retrievers/default_retriever.py +11 -1
- amsdal_ml/ml_retrievers/openai_retriever.py +27 -7
- amsdal_ml/ml_retrievers/query_retriever.py +487 -0
- amsdal_ml/ml_retrievers/retriever.py +12 -0
- amsdal_ml/models/embedding_model.py +7 -7
- amsdal_ml/prompts/__init__.py +77 -0
- amsdal_ml/prompts/database_query_agent.prompt +14 -0
- amsdal_ml/prompts/functional_calling_agent_base.prompt +9 -0
- amsdal_ml/prompts/nl_query_filter.prompt +318 -0
- amsdal_ml/{agents/promts → prompts}/react_chat.prompt +17 -8
- amsdal_ml/utils/__init__.py +5 -0
- amsdal_ml/utils/query_utils.py +189 -0
- amsdal_ml-0.2.0.dist-info/METADATA +293 -0
- amsdal_ml-0.2.0.dist-info/RECORD +72 -0
- {amsdal_ml-0.1.3.dist-info → amsdal_ml-0.2.0.dist-info}/WHEEL +1 -1
- amsdal_ml/agents/promts/__init__.py +0 -58
- amsdal_ml-0.1.3.dist-info/METADATA +0 -69
- amsdal_ml-0.1.3.dist-info/RECORD +0 -39
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from amsdal_ml.ml_retrievers.default_retriever import DefaultRetriever
|
|
2
|
+
from amsdal_ml.ml_retrievers.openai_retriever import OpenAIRetriever
|
|
3
|
+
from amsdal_ml.ml_retrievers.query_retriever import NLQueryExecutor
|
|
4
|
+
from amsdal_ml.ml_retrievers.query_retriever import NLQueryRetriever
|
|
5
|
+
from amsdal_ml.ml_retrievers.retriever import MLRetriever
|
|
6
|
+
from amsdal_ml.ml_retrievers.retriever import RetrievalChunk
|
|
7
|
+
from amsdal_ml.ml_retrievers.retriever import Retriever
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"DefaultRetriever",
|
|
11
|
+
"MLRetriever",
|
|
12
|
+
"NLQueryExecutor",
|
|
13
|
+
"NLQueryRetriever",
|
|
14
|
+
"OpenAIRetriever",
|
|
15
|
+
"RetrievalChunk",
|
|
16
|
+
"Retriever",
|
|
17
|
+
]
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from abc import ABC
|
|
5
|
+
from abc import abstractmethod
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from amsdal_ml.ml_models.models import MLModel
|
|
11
|
+
from amsdal_ml.ml_retrievers.query_retriever import FilterCondition
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RetrieverAdapter(ABC):
    """Contract between a model and the filter-query layer.

    Implementations decide how the JSON schema is presented to a model and
    how the model's raw JSON answer is converted into filter conditions.
    """

    @abstractmethod
    def get_response_schema(self, base_schema: dict[str, Any]) -> dict[str, Any]:
        """Return ``base_schema`` in the form the target model expects."""
        raise NotImplementedError

    @abstractmethod
    def parse_response(self, raw_json: str, *, is_schema_based: bool) -> list[FilterCondition]:
        """Convert the model's raw JSON text into ``FilterCondition`` objects.

        ``is_schema_based`` is a keyword-only flag that lets the
        implementation choose how the decoded payload is interpreted.
        """
        raise NotImplementedError
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DefaultRetrieverAdapter(RetrieverAdapter):
    """Adapter for models that need no schema changes.

    The schema is passed through as-is and the model answer is parsed
    leniently: anything that cannot be turned into filter conditions yields
    an empty list instead of an exception.
    """

    def get_response_schema(self, base_schema: dict[str, Any]) -> dict[str, Any]:
        """Return ``base_schema`` unchanged (the same object, not a copy)."""
        return base_schema

    def parse_response(
        self,
        raw_json: str,
        *,
        is_schema_based: bool,
    ) -> list[FilterCondition]:
        """Parse the model's raw JSON answer into filter conditions.

        Args:
            raw_json: JSON text produced by the model.
            is_schema_based: When true, the decoded payload is validated with
                ``FilterResponse.model_validate`` and its ``filters`` are
                returned. Otherwise a ``{"filters": [...]}`` object or a bare
                list of condition mappings is accepted.

        Returns:
            The parsed conditions, or an empty list when the payload cannot be
            decoded, has an unsupported shape, or fails validation.
        """
        import logging

        # Imported at call time; the module-level import only exists under
        # TYPE_CHECKING (presumably to avoid a circular import - confirm).
        from amsdal_ml.ml_retrievers.query_retriever import FilterCondition
        from amsdal_ml.ml_retrievers.query_retriever import FilterResponse

        try:
            filter_data = json.loads(raw_json)

            if is_schema_based:
                return FilterResponse.model_validate(filter_data).filters

            if isinstance(filter_data, dict) and "filters" in filter_data:
                raw_conditions = filter_data["filters"]
            elif isinstance(filter_data, list):
                raw_conditions = filter_data
            else:
                return []

            return [FilterCondition(**cond) for cond in raw_conditions]
        except Exception as exc:
            # Parsing stays best-effort (callers get an empty list), but the
            # failure is logged so a malformed model answer can be told apart
            # from a genuine "no filters" result. ``Exception`` already covers
            # ``json.JSONDecodeError``, so a single clause is enough.
            logging.getLogger(__name__).warning("Could not parse filter response: %s", exc)
            return []
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class OpenAIRetrieverAdapter(DefaultRetrieverAdapter):
    """Adapter that wraps the schema in a named, strict envelope."""

    def get_response_schema(self, base_schema: dict[str, Any]) -> dict[str, Any]:
        """Return a strict envelope around a closed copy of ``base_schema``.

        Every node with ``"type": "object"`` that does not already declare
        ``additionalProperties`` gets ``additionalProperties: False``
        (presumably what OpenAI strict structured outputs expect - confirm
        against the API documentation). The work is done on a deep copy, so
        the caller's ``base_schema`` is never modified.

        Args:
            base_schema: JSON schema to adapt.

        Returns:
            ``{"name": "data", "strict": True, "schema": <closed copy>}``.
        """
        from copy import deepcopy

        def add_additional_properties_recursively(schema_node: Any) -> None:
            # Walk dicts and lists; scalars end the recursion.
            if isinstance(schema_node, dict):
                if schema_node.get("type") == "object":
                    # An explicit value set by the schema author is kept.
                    schema_node.setdefault("additionalProperties", False)

                for value in schema_node.values():
                    add_additional_properties_recursively(value)

            elif isinstance(schema_node, list):
                for item in schema_node:
                    add_additional_properties_recursively(item)

        # Copy first: the original mutated the argument in place, which leaks
        # the change into any schema object the caller keeps or reuses.
        closed_schema = deepcopy(base_schema)
        add_additional_properties_recursively(closed_schema)

        return {
            "name": "data",
            "strict": True,
            "schema": closed_schema,
        }
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_retriever_adapter(model: MLModel) -> RetrieverAdapter:
    """Pick the adapter that matches the class name of ``model``.

    A class whose lower-cased name contains ``"openai"`` gets an
    ``OpenAIRetrieverAdapter``; every other model gets a
    ``DefaultRetrieverAdapter``.
    """
    is_openai_model = "openai" in model.__class__.__name__.lower()
    adapter_cls = OpenAIRetrieverAdapter if is_openai_model else DefaultRetrieverAdapter
    return adapter_cls()
|
|
@@ -6,6 +6,7 @@ from abc import abstractmethod
|
|
|
6
6
|
|
|
7
7
|
from amsdal_models.classes.annotations import CosineDistance
|
|
8
8
|
|
|
9
|
+
from amsdal_ml.ml_ingesting.embedders.embedder import Embedder
|
|
9
10
|
from amsdal_ml.models.embedding_model import EmbeddingModel
|
|
10
11
|
|
|
11
12
|
from .retriever import MLRetriever
|
|
@@ -18,14 +19,21 @@ def _default_num_tokens(text: str) -> int:
|
|
|
18
19
|
|
|
19
20
|
class DefaultRetriever(MLRetriever, ABC):
|
|
20
21
|
def __init__(
    self,
    *,
    embedding_model_cls=EmbeddingModel,
    max_context_tokens: int = 1800,
    num_tokens_fn=_default_num_tokens,
    embedder: Embedder | None = None,
):
    """Store the retriever configuration on the instance.

    All arguments are keyword-only and are kept as-is under attributes of
    the same name.

    Args:
        embedding_model_cls: Model class kept for later use; defaults to
            ``EmbeddingModel``.
        max_context_tokens: Integer limit, 1800 by default (presumably the
            token budget for assembled context - confirm in the methods
            that read it).
        num_tokens_fn: Callable used to count tokens in a text; defaults to
            ``_default_num_tokens``.
        embedder: Optional ``Embedder``; ``None`` when not supplied.
    """
    self.embedder = embedder
    self.num_tokens_fn = num_tokens_fn
    self.max_context_tokens = max_context_tokens
    self.embedding_model_cls = embedding_model_cls
|
|
26
33
|
|
|
27
34
|
@abstractmethod
def _embed_query(self, text: str) -> list[float]:
    """Return the embedding vector for ``text`` (synchronous hook for subclasses)."""
|
|
36
|
+
|
|
29
37
|
@abstractmethod
async def _aembed_query(self, text: str) -> list[float]:
    """Return the embedding vector for ``text`` (asynchronous hook for subclasses)."""
|
|
31
39
|
|
|
@@ -66,6 +74,7 @@ class DefaultRetriever(MLRetriever, ABC):
|
|
|
66
74
|
raw_text=(r.raw_text or '').strip(),
|
|
67
75
|
distance=float(getattr(r, 'distance', math.inf)),
|
|
68
76
|
tags=list(r.tags or []),
|
|
77
|
+
metadata=dict(getattr(r, 'ml_metadata', {}) or {}),
|
|
69
78
|
)
|
|
70
79
|
for r in rows
|
|
71
80
|
]
|
|
@@ -100,6 +109,7 @@ class DefaultRetriever(MLRetriever, ABC):
|
|
|
100
109
|
raw_text=(r.raw_text or '').strip(),
|
|
101
110
|
distance=float(getattr(r, 'distance', math.inf)),
|
|
102
111
|
tags=list(r.tags or []),
|
|
112
|
+
metadata=dict(getattr(r, 'ml_metadata', {}) or {}),
|
|
103
113
|
)
|
|
104
114
|
for r in rows
|
|
105
115
|
]
|
|
@@ -7,6 +7,7 @@ from openai import AsyncOpenAI
|
|
|
7
7
|
from openai import OpenAI
|
|
8
8
|
|
|
9
9
|
from amsdal_ml.ml_config import ml_config
|
|
10
|
+
from amsdal_ml.ml_ingesting.embedders.embedder import Embedder
|
|
10
11
|
|
|
11
12
|
from .default_retriever import DefaultRetriever
|
|
12
13
|
|
|
@@ -14,26 +15,45 @@ DEFAULT_EMBED_MODEL = ml_config.embed_model_name
|
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
class OpenAIRetriever(DefaultRetriever):
    """Retriever that embeds queries with OpenAI unless an embedder is supplied."""

    # Both clients remain ``None`` when a custom embedder is injected.
    client: OpenAI | None = None
    aclient: AsyncOpenAI | None = None

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        embed_model: Optional[str] = None,
        embedder: Embedder | None = None,
    ):
        """Configure the retriever.

        The API key is resolved from ``api_key``, then
        ``ml_config.resolved_openai_key``, then the ``OPENAI_API_KEY``
        environment variable. OpenAI clients are only created (and the key
        only required) when no ``embedder`` is passed.

        Raises:
            RuntimeError: If no embedder is given and no API key is found.
        """
        super().__init__(embedder=embedder)

        self.api_key = api_key or ml_config.resolved_openai_key or os.getenv('OPENAI_API_KEY')
        self.embed_model = embed_model or DEFAULT_EMBED_MODEL

        if embedder is not None:
            # A custom embedder handles embedding, so no clients are built.
            self.client = None
            self.aclient = None
            return

        if not self.api_key:
            msg = 'OPENAI_API_KEY is required for OpenAIRetriever'
            raise RuntimeError(msg)

        self.client = OpenAI(api_key=self.api_key)
        self.aclient = AsyncOpenAI(api_key=self.api_key)

    def _embed_query(self, text: str) -> list[float]:
        """Embed ``text`` via the injected embedder, else via the sync OpenAI client."""
        embedder = self.embedder
        if embedder is not None:
            return embedder.embed(text)

        client = self.client
        if not client:
            msg = 'OpenAI client is not configured'
            raise RuntimeError(msg)

        response = client.embeddings.create(model=self.embed_model, input=text)
        return response.data[0].embedding

    async def _aembed_query(self, text: str) -> list[float]:
        """Embed ``text`` via the injected embedder, else via the async OpenAI client."""
        embedder = self.embedder
        if embedder is not None:
            return await embedder.aembed(text)

        aclient = self.aclient
        if not aclient:
            msg = 'Async OpenAI client is not configured'
            raise RuntimeError(msg)

        response = await aclient.embeddings.create(model=self.embed_model, input=text)
        return response.data[0].embedding
|