amsdal_ml 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md +617 -0
  2. amsdal_ml/__about__.py +1 -1
  3. amsdal_ml/agents/__init__.py +13 -0
  4. amsdal_ml/agents/agent.py +5 -7
  5. amsdal_ml/agents/default_qa_agent.py +108 -143
  6. amsdal_ml/agents/functional_calling_agent.py +233 -0
  7. amsdal_ml/agents/mcp_client_tool.py +46 -0
  8. amsdal_ml/agents/python_tool.py +86 -0
  9. amsdal_ml/agents/retriever_tool.py +5 -6
  10. amsdal_ml/agents/tool_adapters.py +98 -0
  11. amsdal_ml/fileio/base_loader.py +7 -5
  12. amsdal_ml/fileio/openai_loader.py +16 -17
  13. amsdal_ml/mcp_client/base.py +2 -0
  14. amsdal_ml/mcp_client/http_client.py +7 -1
  15. amsdal_ml/mcp_client/stdio_client.py +19 -16
  16. amsdal_ml/mcp_server/server_retriever_stdio.py +8 -11
  17. amsdal_ml/ml_ingesting/__init__.py +29 -0
  18. amsdal_ml/ml_ingesting/default_ingesting.py +49 -51
  19. amsdal_ml/ml_ingesting/embedders/__init__.py +4 -0
  20. amsdal_ml/ml_ingesting/embedders/embedder.py +12 -0
  21. amsdal_ml/ml_ingesting/embedders/openai_embedder.py +30 -0
  22. amsdal_ml/ml_ingesting/embedding_data.py +3 -0
  23. amsdal_ml/ml_ingesting/loaders/__init__.py +6 -0
  24. amsdal_ml/ml_ingesting/loaders/folder_loader.py +52 -0
  25. amsdal_ml/ml_ingesting/loaders/loader.py +28 -0
  26. amsdal_ml/ml_ingesting/loaders/pdf_loader.py +136 -0
  27. amsdal_ml/ml_ingesting/loaders/text_loader.py +44 -0
  28. amsdal_ml/ml_ingesting/model_ingester.py +278 -0
  29. amsdal_ml/ml_ingesting/pipeline.py +131 -0
  30. amsdal_ml/ml_ingesting/pipeline_interface.py +31 -0
  31. amsdal_ml/ml_ingesting/processors/__init__.py +4 -0
  32. amsdal_ml/ml_ingesting/processors/cleaner.py +14 -0
  33. amsdal_ml/ml_ingesting/processors/text_cleaner.py +42 -0
  34. amsdal_ml/ml_ingesting/splitters/__init__.py +4 -0
  35. amsdal_ml/ml_ingesting/splitters/splitter.py +15 -0
  36. amsdal_ml/ml_ingesting/splitters/token_splitter.py +85 -0
  37. amsdal_ml/ml_ingesting/stores/__init__.py +4 -0
  38. amsdal_ml/ml_ingesting/stores/embedding_data.py +63 -0
  39. amsdal_ml/ml_ingesting/stores/store.py +22 -0
  40. amsdal_ml/ml_ingesting/types.py +40 -0
  41. amsdal_ml/ml_models/models.py +96 -4
  42. amsdal_ml/ml_models/openai_model.py +430 -122
  43. amsdal_ml/ml_models/utils.py +7 -0
  44. amsdal_ml/ml_retrievers/__init__.py +17 -0
  45. amsdal_ml/ml_retrievers/adapters.py +93 -0
  46. amsdal_ml/ml_retrievers/default_retriever.py +11 -1
  47. amsdal_ml/ml_retrievers/openai_retriever.py +27 -7
  48. amsdal_ml/ml_retrievers/query_retriever.py +487 -0
  49. amsdal_ml/ml_retrievers/retriever.py +12 -0
  50. amsdal_ml/models/embedding_model.py +7 -7
  51. amsdal_ml/prompts/__init__.py +77 -0
  52. amsdal_ml/prompts/database_query_agent.prompt +14 -0
  53. amsdal_ml/prompts/functional_calling_agent_base.prompt +9 -0
  54. amsdal_ml/prompts/nl_query_filter.prompt +318 -0
  55. amsdal_ml/{agents/promts → prompts}/react_chat.prompt +17 -8
  56. amsdal_ml/utils/__init__.py +5 -0
  57. amsdal_ml/utils/query_utils.py +189 -0
  58. {amsdal_ml-0.1.4.dist-info → amsdal_ml-0.2.0.dist-info}/METADATA +59 -1
  59. amsdal_ml-0.2.0.dist-info/RECORD +72 -0
  60. {amsdal_ml-0.1.4.dist-info → amsdal_ml-0.2.0.dist-info}/WHEEL +1 -1
  61. amsdal_ml/agents/promts/__init__.py +0 -58
  62. amsdal_ml-0.1.4.dist-info/RECORD +0 -39
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC
4
+ from abc import abstractmethod
5
+
6
+ from amsdal_ml.ml_ingesting.types import LoadedDocument
7
+ from amsdal_ml.ml_ingesting.types import TextChunk
8
+
9
+
10
class Splitter(ABC):
    """Interface for splitting a loaded document into embeddable text chunks."""

    @abstractmethod
    def split(self, doc: LoadedDocument) -> list[TextChunk]:
        """Split *doc* into ordered text chunks (synchronous)."""
        ...

    @abstractmethod
    async def asplit(self, doc: LoadedDocument) -> list[TextChunk]:
        """Split *doc* into ordered text chunks (asynchronous)."""
        ...
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from collections.abc import Callable
5
+
6
+ from amsdal_ml.ml_ingesting.splitters.splitter import Splitter
7
+ from amsdal_ml.ml_ingesting.types import LoadedDocument
8
+ from amsdal_ml.ml_ingesting.types import TextChunk
9
+
10
+
11
+ def _default_token_len(text: str) -> int:
12
+ return max(1, len(text) // 4)
13
+
14
+
15
+ def _words_with_token_counts(text: str, token_fn: Callable[[str], int]) -> list[tuple[str, int]]:
16
+ out: list[tuple[str, int]] = []
17
+ for word in text.split():
18
+ out.append((word, token_fn(word)))
19
+ return out
20
+
21
+
22
+ def _compute_overlap_step(chunk: list[tuple[str, int]], overlap_tokens: int) -> int:
23
+ tokens = 0
24
+ step = 0
25
+ for _word, tk in reversed(chunk):
26
+ tokens += tk
27
+ step += 1
28
+ if tokens >= overlap_tokens:
29
+ break
30
+ return step
31
+
32
+
33
class TokenSplitter(Splitter):
    """Split each page of a document into word-aligned chunks bounded by an
    approximate token budget, with a token overlap between consecutive chunks.
    """

    def __init__(
        self,
        *,
        max_tokens: int = 600,
        overlap_tokens: int = 50,
        token_len_fn: Callable[[str], int] = _default_token_len,
    ) -> None:
        # max_tokens: upper bound on estimated tokens per chunk.
        # overlap_tokens: target number of tokens repeated between adjacent chunks.
        # token_len_fn: callable estimating the token count of a string.
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens
        self.token_len_fn = token_len_fn

    def _split_page(self, text: str) -> list[str]:
        """Greedily pack whole words into chunks of at most ``max_tokens``
        estimated tokens, stepping the next window back by roughly
        ``overlap_tokens`` so adjacent chunks share trailing context.
        """
        words = _words_with_token_counts(text, self.token_len_fn)
        chunks: list[str] = []
        start = 0
        n = len(words)
        while start < n:
            tokens = 0
            end = start
            # `tokens == 0` guarantees progress: a single word larger than the
            # budget still forms its own chunk instead of looping forever.
            while end < n and (tokens + words[end][1] <= self.max_tokens or tokens == 0):
                tokens += words[end][1]
                end += 1
            chunk_words = [w for w, _ in words[start:end]]
            chunk_text = ' '.join(chunk_words).strip()
            if chunk_text:
                chunks.append(chunk_text)
            if end >= n:
                break
            # Restart the next window `back` words before the current end, but
            # always advance by at least one word so the loop terminates.
            back = _compute_overlap_step(words[start:end], self.overlap_tokens)
            start = max(start + 1, end - back)
        return chunks

    def split(self, doc: LoadedDocument) -> list[TextChunk]:
        """Split every page of *doc* into chunks.

        Chunk indices are document-global; chunk metadata merges
        document-level then page-level entries (page-level wins on key
        collisions), and `page_number` is added when not already present.
        """
        chunks: list[TextChunk] = []
        for page in doc.pages:
            page_chunks = self._split_page(page.text)
            for chunk_text in page_chunks:
                metadata = dict(doc.metadata)
                metadata.update(page.metadata)
                metadata.setdefault('page_number', page.page_number)
                chunks.append(
                    TextChunk(
                        index=len(chunks),
                        text=chunk_text,
                        metadata=metadata,
                        tags=[],
                    )
                )
        return chunks

    async def asplit(self, doc: LoadedDocument) -> list[TextChunk]:
        """Async variant: runs the synchronous ``split`` in a worker thread."""
        return await asyncio.to_thread(self.split, doc)
@@ -0,0 +1,4 @@
1
+ from amsdal_ml.ml_ingesting.stores.embedding_data import EmbeddingDataStore
2
+ from amsdal_ml.ml_ingesting.stores.store import EmbeddingStore
3
+
4
+ __all__ = ['EmbeddingDataStore', 'EmbeddingStore']
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Sequence
4
+ from typing import Any
5
+ from typing import cast
6
+
7
+ from amsdal_ml.ml_ingesting.embedding_data import EmbeddingData
8
+ from amsdal_ml.ml_ingesting.stores.store import EmbeddingStore
9
+ from amsdal_ml.ml_ingesting.types import IngestionSource
10
+ from amsdal_ml.models.embedding_model import EmbeddingModel
11
+
12
+
13
class EmbeddingDataStore(EmbeddingStore):
    """Persist chunk embeddings as model records linked to a source object,
    merging source-level and chunk-level tags and metadata.
    """

    def __init__(self, *, model_cls=EmbeddingModel) -> None:
        # model_cls: model class used to build and persist embedding records;
        # must expose an `objects` manager with bulk_create / bulk_acreate.
        self.model_cls = model_cls

    @staticmethod
    def _require_source(source: IngestionSource | None) -> IngestionSource:
        """Return *source*, raising if it is missing.

        Shared guard for ``save`` / ``asave`` (previously duplicated).

        Raises:
            RuntimeError: if *source* is None.
        """
        if source is None:
            msg = 'source is required to save embeddings'
            raise RuntimeError(msg)
        return source

    def _merge_tags(self, base: list[str], extra: list[str]) -> list[str]:
        """Append tags from *extra* not already in *base*, preserving order."""
        merged = list(base)
        for tag in extra:
            if tag not in merged:
                merged.append(tag)
        return merged

    def _build_objects(self, embeddings: Sequence[EmbeddingData], source: IngestionSource) -> list[Any]:
        """Build unsaved model instances, one per embedding record.

        Chunk-level metadata overrides source-level metadata on key
        collisions; tags are the source tags followed by any new chunk tags.
        """
        base_tags = list(source.tags)
        base_meta = dict(source.metadata)
        objs = []
        for record in embeddings:
            tags = self._merge_tags(base_tags, list(record.tags))
            meta = dict(base_meta)
            meta.update(record.metadata or {})
            objs.append(
                self.model_cls(
                    data_object_class=source.object_class,
                    data_object_id=source.object_id,
                    chunk_index=record.chunk_index,
                    raw_text=record.raw_text,
                    embedding=record.embedding,
                    tags=tags,
                    ml_metadata=meta,
                )
            )
        return objs

    def save(self, embeddings: Sequence[EmbeddingData], *, source: IngestionSource | None = None) -> list[Any]:
        """Synchronously bulk-create embedding records and return them.

        Raises:
            RuntimeError: if *source* is None.
        """
        objs = self._build_objects(embeddings, self._require_source(source))
        manager = cast(Any, self.model_cls.objects)
        manager.bulk_create(objs)
        return objs

    async def asave(
        self, embeddings: Sequence[EmbeddingData], *, source: IngestionSource | None = None
    ) -> list[Any]:
        """Asynchronously bulk-create embedding records and return them.

        Raises:
            RuntimeError: if *source* is None.
        """
        objs = self._build_objects(embeddings, self._require_source(source))
        manager = cast(Any, self.model_cls.objects)
        await manager.bulk_acreate(objs)
        return objs
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC
4
+ from abc import abstractmethod
5
+ from collections.abc import Sequence
6
+ from typing import Any
7
+
8
+ from amsdal_ml.ml_ingesting.embedding_data import EmbeddingData
9
+ from amsdal_ml.ml_ingesting.types import IngestionSource
10
+
11
+
12
class EmbeddingStore(ABC):
    """Abstract sink that persists embedding records for an ingestion source."""

    @abstractmethod
    def save(
        self,
        embeddings: Sequence[EmbeddingData],
        *,
        source: IngestionSource | None = None,
    ) -> list[Any]:
        """Persist *embeddings* synchronously and return the stored objects."""
        ...

    @abstractmethod
    async def asave(
        self,
        embeddings: Sequence[EmbeddingData],
        *,
        source: IngestionSource | None = None,
    ) -> list[Any]:
        """Persist *embeddings* asynchronously and return the stored objects."""
        ...
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel
6
+ from pydantic import Field
7
+
8
+
9
class LoadedPage(BaseModel):
    """A single page of extracted text with its page-level metadata."""

    page_number: int | None = Field(default=None, title='1-based page number if available')
    text: str = Field(..., min_length=1, title='Extracted page text')
    metadata: dict[str, Any] = Field(default_factory=dict, title='Page-level metadata')

    def as_text(self) -> str:
        """Return the raw extracted text of this page."""
        return self.text
16
+
17
+
18
class LoadedDocument(BaseModel):
    """An ordered collection of loaded pages plus document-level metadata."""

    pages: list[LoadedPage] = Field(default_factory=list, title='Pages in original order')
    metadata: dict[str, Any] = Field(default_factory=dict, title='Document-level metadata')

    def join(self, *, separator: str = '\n\n') -> str:
        """Concatenate the text of all pages, separated by *separator*."""
        return separator.join(page.text for page in self.pages)
24
+
25
+
26
class TextChunk(BaseModel):
    """A chunk of document text prepared for embedding."""

    index: int = Field(..., title='Chunk order in document')
    text: str = Field(..., min_length=1, title='Chunk text destined for embedding')
    tags: list[str] = Field(default_factory=list, title='Tags to persist with embeddings')
    metadata: dict[str, Any] = Field(default_factory=dict, title='Arbitrary chunk metadata')
31
+
32
+
33
class IngestionSource(BaseModel):
    """Identifies the object that produced the chunks being ingested, plus
    base tags/metadata applied to every resulting embedding record.
    """

    object_class: str = Field(..., title='Linked object class for embeddings')
    object_id: str = Field(..., title='Linked object ID for embeddings')
    tags: list[str] = Field(default_factory=list, title='Base tags applied to all chunks')
    metadata: dict[str, Any] = Field(default_factory=dict, title='Arbitrary source metadata')
38
+
39
+
40
+ __all__ = ['IngestionSource', 'LoadedDocument', 'LoadedPage', 'TextChunk']
@@ -3,9 +3,31 @@ from __future__ import annotations
3
3
  from abc import ABC
4
4
  from abc import abstractmethod
5
5
  from collections.abc import AsyncIterator
6
+ from typing import Any
7
+ from typing import Required
8
+ from typing import TypedDict
6
9
 
7
10
  from amsdal_ml.fileio.base_loader import PLAIN_TEXT
8
11
  from amsdal_ml.fileio.base_loader import FileAttachment
12
+ from amsdal_ml.ml_models.utils import ResponseFormat
13
+
14
+
15
class StructuredMessage(TypedDict, total=False):
    """Base structure for a message in LLM conversations.

    Only ``role`` and ``content`` are required; the tool-related keys appear
    on tool-result messages only.

    Attributes:
        role: The role of the message sender (e.g., 'user', 'assistant', 'system').
        content: The content of the message, can be str or list of multimodal parts.
        tool_call_id: ID of the tool call (for tool messages).
        name: Name of the tool (for tool messages).
    """
    role: Required[str]
    content: Required[Any]
    tool_call_id: str
    name: str
28
+
29
+
30
+ LLModelInput = str | list[StructuredMessage]
9
31
 
10
32
 
11
33
  class ModelError(Exception):
@@ -25,6 +47,12 @@ class ModelAPIError(ModelError):
25
47
 
26
48
 
27
49
  class MLModel(ABC):
50
+ @property
51
+ @abstractmethod
52
+ def supported_formats(self) -> set[ResponseFormat]:
53
+ """Return a set of supported response formats for this model."""
54
+ raise NotImplementedError
55
+
28
56
  @abstractmethod
29
57
  def setup(self) -> None:
30
58
  """Initialize any clients or resources needed before inference."""
@@ -39,12 +67,64 @@ class MLModel(ABC):
39
67
  """Return a set of universal attachment kinds, e.g. {PLAIN_TEXT, FILE_ID}."""
40
68
  return {PLAIN_TEXT}
41
69
 
70
+ @property
71
+ @abstractmethod
72
+ def input_role(self) -> str:
73
+ """Return the role for user input messages."""
74
+ raise NotImplementedError
75
+
76
+ @property
77
+ @abstractmethod
78
+ def output_role(self) -> str:
79
+ """Return the role for model output messages."""
80
+ raise NotImplementedError
81
+
82
+ @property
83
+ @abstractmethod
84
+ def tool_role(self) -> str:
85
+ """Return the role for tool result messages."""
86
+ raise NotImplementedError
87
+
88
+ @property
89
+ @abstractmethod
90
+ def system_role(self) -> str:
91
+ """Return the role for system messages."""
92
+ raise NotImplementedError
93
+
94
+ @property
95
+ @abstractmethod
96
+ def content_field(self) -> str:
97
+ """Return the field name for message content (e.g., 'content' for OpenAI, 'parts' for Gemini)."""
98
+ raise NotImplementedError
99
+
100
+ @property
101
+ @abstractmethod
102
+ def role_field(self) -> str:
103
+ """Return the field name for message role (e.g., 'role' for most models)."""
104
+ raise NotImplementedError
105
+
106
+ @property
107
+ @abstractmethod
108
+ def tool_call_id_field(self) -> str:
109
+ """Return the field name for tool call ID (e.g., 'tool_call_id' for OpenAI)."""
110
+ raise NotImplementedError
111
+
112
+ @property
113
+ @abstractmethod
114
+ def tool_name_field(self) -> str:
115
+ """Return the field name for tool name (e.g., 'name' for OpenAI)."""
116
+ raise NotImplementedError
117
+
42
118
  @abstractmethod
43
119
  def invoke(
44
120
  self,
45
- prompt: str,
121
+ input: LLModelInput, # noqa: A002
46
122
  *,
47
123
  attachments: list[FileAttachment] | None = None,
124
+ response_format: ResponseFormat | None = None,
125
+ schema: dict[str, Any] | None = None,
126
+ tools: list[dict[str, Any]] | None = None,
127
+ tool_choice: str | dict[str, Any] | None = None,
48
128
  ) -> str:
49
129
  """Run synchronous inference with the model."""
50
130
  raise NotImplementedError
@@ -52,9 +132,13 @@ class MLModel(ABC):
52
132
  @abstractmethod
53
133
  async def ainvoke(
54
134
  self,
55
- prompt: str,
135
+ input: LLModelInput, # noqa: A002
56
136
  *,
57
137
  attachments: list[FileAttachment] | None = None,
138
+ response_format: ResponseFormat | None = None,
139
+ schema: dict[str, Any] | None = None,
140
+ tools: list[dict[str, Any]] | None = None,
141
+ tool_choice: str | dict[str, Any] | None = None,
58
142
  ) -> str:
59
143
  """Run asynchronous inference with the model."""
60
144
  raise NotImplementedError
@@ -62,9 +146,13 @@ class MLModel(ABC):
62
146
  @abstractmethod
63
147
  def stream(
64
148
  self,
65
- prompt: str,
149
+ input: LLModelInput, # noqa: A002
66
150
  *,
67
151
  attachments: list[FileAttachment] | None = None,
152
+ response_format: ResponseFormat | None = None,
153
+ schema: dict[str, Any] | None = None,
154
+ tools: list[dict[str, Any]] | None = None,
155
+ tool_choice: str | dict[str, Any] | None = None,
68
156
  ):
69
157
  """Stream synchronous inference results from the model."""
70
158
  raise NotImplementedError
@@ -72,9 +160,13 @@ class MLModel(ABC):
72
160
  @abstractmethod
73
161
  def astream(
74
162
  self,
75
- prompt: str,
163
+ input: LLModelInput, # noqa: A002
76
164
  *,
77
165
  attachments: list[FileAttachment] | None = None,
166
+ response_format: ResponseFormat | None = None,
167
+ schema: dict[str, Any] | None = None,
168
+ tools: list[dict[str, Any]] | None = None,
169
+ tool_choice: str | dict[str, Any] | None = None,
78
170
  ) -> AsyncIterator[str]:
79
171
  """
80
172
  Stream asynchronous inference results as an async generator.