amsdal_ml 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md +617 -0
  2. amsdal_ml/__about__.py +1 -1
  3. amsdal_ml/agents/__init__.py +13 -0
  4. amsdal_ml/agents/agent.py +5 -7
  5. amsdal_ml/agents/default_qa_agent.py +108 -143
  6. amsdal_ml/agents/functional_calling_agent.py +233 -0
  7. amsdal_ml/agents/mcp_client_tool.py +46 -0
  8. amsdal_ml/agents/python_tool.py +86 -0
  9. amsdal_ml/agents/retriever_tool.py +5 -6
  10. amsdal_ml/agents/tool_adapters.py +98 -0
  11. amsdal_ml/fileio/base_loader.py +7 -5
  12. amsdal_ml/fileio/openai_loader.py +16 -17
  13. amsdal_ml/mcp_client/base.py +2 -0
  14. amsdal_ml/mcp_client/http_client.py +7 -1
  15. amsdal_ml/mcp_client/stdio_client.py +19 -16
  16. amsdal_ml/mcp_server/server_retriever_stdio.py +8 -11
  17. amsdal_ml/ml_ingesting/__init__.py +29 -0
  18. amsdal_ml/ml_ingesting/default_ingesting.py +49 -51
  19. amsdal_ml/ml_ingesting/embedders/__init__.py +4 -0
  20. amsdal_ml/ml_ingesting/embedders/embedder.py +12 -0
  21. amsdal_ml/ml_ingesting/embedders/openai_embedder.py +30 -0
  22. amsdal_ml/ml_ingesting/embedding_data.py +3 -0
  23. amsdal_ml/ml_ingesting/loaders/__init__.py +6 -0
  24. amsdal_ml/ml_ingesting/loaders/folder_loader.py +52 -0
  25. amsdal_ml/ml_ingesting/loaders/loader.py +28 -0
  26. amsdal_ml/ml_ingesting/loaders/pdf_loader.py +136 -0
  27. amsdal_ml/ml_ingesting/loaders/text_loader.py +44 -0
  28. amsdal_ml/ml_ingesting/model_ingester.py +278 -0
  29. amsdal_ml/ml_ingesting/pipeline.py +131 -0
  30. amsdal_ml/ml_ingesting/pipeline_interface.py +31 -0
  31. amsdal_ml/ml_ingesting/processors/__init__.py +4 -0
  32. amsdal_ml/ml_ingesting/processors/cleaner.py +14 -0
  33. amsdal_ml/ml_ingesting/processors/text_cleaner.py +42 -0
  34. amsdal_ml/ml_ingesting/splitters/__init__.py +4 -0
  35. amsdal_ml/ml_ingesting/splitters/splitter.py +15 -0
  36. amsdal_ml/ml_ingesting/splitters/token_splitter.py +85 -0
  37. amsdal_ml/ml_ingesting/stores/__init__.py +4 -0
  38. amsdal_ml/ml_ingesting/stores/embedding_data.py +63 -0
  39. amsdal_ml/ml_ingesting/stores/store.py +22 -0
  40. amsdal_ml/ml_ingesting/types.py +40 -0
  41. amsdal_ml/ml_models/models.py +96 -4
  42. amsdal_ml/ml_models/openai_model.py +430 -122
  43. amsdal_ml/ml_models/utils.py +7 -0
  44. amsdal_ml/ml_retrievers/__init__.py +17 -0
  45. amsdal_ml/ml_retrievers/adapters.py +93 -0
  46. amsdal_ml/ml_retrievers/default_retriever.py +11 -1
  47. amsdal_ml/ml_retrievers/openai_retriever.py +27 -7
  48. amsdal_ml/ml_retrievers/query_retriever.py +487 -0
  49. amsdal_ml/ml_retrievers/retriever.py +12 -0
  50. amsdal_ml/models/embedding_model.py +7 -7
  51. amsdal_ml/prompts/__init__.py +77 -0
  52. amsdal_ml/prompts/database_query_agent.prompt +14 -0
  53. amsdal_ml/prompts/functional_calling_agent_base.prompt +9 -0
  54. amsdal_ml/prompts/nl_query_filter.prompt +318 -0
  55. amsdal_ml/{agents/promts → prompts}/react_chat.prompt +17 -8
  56. amsdal_ml/utils/__init__.py +5 -0
  57. amsdal_ml/utils/query_utils.py +189 -0
  58. {amsdal_ml-0.1.4.dist-info → amsdal_ml-0.2.0.dist-info}/METADATA +59 -1
  59. amsdal_ml-0.2.0.dist-info/RECORD +72 -0
  60. {amsdal_ml-0.1.4.dist-info → amsdal_ml-0.2.0.dist-info}/WHEEL +1 -1
  61. amsdal_ml/agents/promts/__init__.py +0 -58
  62. amsdal_ml-0.1.4.dist-info/RECORD +0 -39
@@ -0,0 +1,136 @@
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ from typing import IO
+ from typing import Any
+
+ import pymupdf  # type: ignore
+
+ from amsdal_ml.ml_ingesting.loaders.loader import Loader
+ from amsdal_ml.ml_ingesting.types import LoadedDocument
+ from amsdal_ml.ml_ingesting.types import LoadedPage
+
+
+ def _merge_spaced_characters(text: str) -> str:
+     """Collapse sequences like "s h o r t - t e r m" into "short-term".
+
+     Some PDFs return every character as a separate token. We merge contiguous
+     single-character tokens (letters/digits and simple inline punctuation) so
+     downstream cleaning does not preserve the artificial spaces.
+     """
+
+     tokens = text.replace("\n", " ").split()
+     merged: list[str] = []
+     buffer = ""
+
+     def flush_buffer() -> None:
+         nonlocal buffer
+         if buffer:
+             merged.append(buffer)
+         buffer = ""
+
+     for tok in tokens:
+         if len(tok) == 1 and (tok.isalnum() or tok in {"'", "-", "&"}):
+             buffer += tok
+             continue
+         flush_buffer()
+         merged.append(tok)
+
+     flush_buffer()
+     return " ".join(merged)
+
+
+ def _is_noise(
+     text: str,
+     *,
+     min_tokens: int = 20,
+     single_char_ratio: float = 0.6,
+     single_char_min_count: int = 5,
+ ) -> bool:
+     """Heuristic: drop text dominated by single-character tokens.
+
+     We treat text as noise when:
+     - most tokens are single characters (ratio >= single_char_ratio), AND
+     - either the sample is long enough (>= min_tokens) or it contains at least
+       a few single-character tokens (>= single_char_min_count) even in short text.
+     """
+
+     tokens = text.split()
+     if not tokens:
+         return True
+
+     single_chars = sum(1 for t in tokens if len(t) == 1)
+     ratio = single_chars / len(tokens)
+
+     if ratio < single_char_ratio:
+         return False
+
+     return len(tokens) >= min_tokens or single_chars >= single_char_min_count
+
+
+ class PdfLoader(Loader):
+     def __init__(self, *, extract_metadata: bool = True) -> None:
+         self.extract_metadata = extract_metadata
+         self.logger = logging.getLogger(__name__)
+
+     def _read_sync(
+         self,
+         file: IO[Any],
+         filename: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> LoadedDocument:
+         peek = file.read(4)
+         file.seek(0)
+         if not (isinstance(peek, bytes) and peek.startswith(b"%PDF")):
+             msg = "Not a PDF file"
+             raise ValueError(msg)
+
+         data = file.read()
+
+         with pymupdf.open(stream=data, filetype="pdf") as doc:
+             pages: list[LoadedPage] = []
+             for idx, page in enumerate(doc):
+                 text_raw = page.get_text("text") or ''
+                 text = _merge_spaced_characters(text_raw)
+                 if not text:
+                     continue
+                 if _is_noise(text):
+                     self.logger.debug("Skipping page %s as noise (single-char heavy)", idx + 1)
+                     continue
+                 pages.append(
+                     LoadedPage(
+                         page_number=idx + 1,
+                         text=text,
+                         metadata={'page_number': idx + 1},
+                     )
+                 )
+
+             doc_meta: dict[str, Any] = {}
+             if self.extract_metadata:
+                 raw_meta = doc.metadata or {}
+                 for key, value in raw_meta.items():
+                     if value is None:
+                         continue
+                     doc_meta[str(key)] = str(value)
+
+         if metadata:
+             doc_meta.update(metadata)
+         if filename:
+             doc_meta.setdefault('filename', filename)
+
+         return LoadedDocument(pages=pages, metadata=doc_meta)
+
+     def load(
+         self,
+         file: IO[Any],
+         *,
+         filename: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> LoadedDocument:
+         return self._read_sync(file, filename=filename, metadata=metadata)
+
+     async def aload(
+         self, file: IO[Any], *, filename: str | None = None, metadata: dict[str, Any] | None = None
+     ) -> LoadedDocument:
+         return await asyncio.to_thread(self._read_sync, file, filename, metadata)
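
For orientation, a minimal usage sketch for the new PdfLoader (not part of the diff; the sample path is an assumption):

import logging

from amsdal_ml.ml_ingesting.loaders.pdf_loader import PdfLoader

logging.basicConfig(level=logging.DEBUG)  # surfaces the per-page 'skipped as noise' messages

loader = PdfLoader(extract_metadata=True)
with open('sample.pdf', 'rb') as fh:  # hypothetical local file
    doc = loader.load(fh, filename='sample.pdf', metadata={'origin': 'manual-test'})

print(doc.metadata)  # PDF metadata merged with the caller-supplied dict
for page in doc.pages:
    print(page.page_number, page.text[:80])

As a concrete check on the de-spacing heuristic: _merge_spaced_characters('s h o r t - t e r m') returns 'short-term', while multi-character tokens pass through untouched; a page whose tokens remain mostly single characters is then dropped entirely by _is_noise.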
@@ -0,0 +1,44 @@
+ from __future__ import annotations
+
+ import asyncio
+ from typing import IO
+ from typing import Any
+
+ from amsdal_ml.ml_ingesting.loaders.loader import Loader
+ from amsdal_ml.ml_ingesting.types import LoadedDocument
+ from amsdal_ml.ml_ingesting.types import LoadedPage
+
+
+ class TextLoader(Loader):
+     def __init__(self, *, encoding: str = 'utf-8', errors: str = 'ignore') -> None:
+         self.encoding = encoding
+         self.errors = errors
+
+     def _read_text(self, file: IO[Any]) -> str:
+         data = file.read()
+         if isinstance(data, bytes):
+             return data.decode(self.encoding, errors=self.errors)
+         return str(data)
+
+     def load(
+         self,
+         file: IO[Any],
+         *,
+         filename: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> LoadedDocument:
+         text = self._read_text(file)
+         doc_meta = dict(metadata or {})
+         if filename:
+             doc_meta.setdefault('filename', filename)
+         page = LoadedPage(page_number=None, text=text, metadata={})
+         return LoadedDocument(pages=[page], metadata=doc_meta)
+
+     async def aload(
+         self,
+         file: IO[Any],
+         *,
+         filename: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> LoadedDocument:
+         return await asyncio.to_thread(self.load, file, filename=filename, metadata=metadata)
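
A quick sketch of the TextLoader contract (not part of the diff) — the whole stream becomes a single page with page_number=None:

import io

from amsdal_ml.ml_ingesting.loaders.text_loader import TextLoader

loader = TextLoader(encoding='utf-8', errors='ignore')
doc = loader.load(io.BytesIO('héllo wörld'.encode('utf-8')), filename='note.txt')

assert doc.pages[0].page_number is None
assert doc.pages[0].text == 'héllo wörld'
assert doc.metadata['filename'] == 'note.txt'  # filled via setdefault when absent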
@@ -0,0 +1,278 @@
+ from __future__ import annotations
+
+ import io
+ import logging
+ from collections.abc import AsyncIterator
+ from collections.abc import Iterable
+ from typing import IO
+ from typing import Any
+ from typing import ClassVar
+ from typing import Generic
+ from typing import TypeVar
+
+ from amsdal.models.core.file import File
+ from amsdal_models.querysets.base_queryset import QuerySetBase
+
+ from amsdal_ml.ml_ingesting.loaders.loader import Loader
+ from amsdal_ml.ml_ingesting.pipeline import DefaultIngestionPipeline
+ from amsdal_ml.ml_ingesting.types import IngestionSource
+
+ LoaderT = TypeVar('LoaderT', bound=Loader)
+
+
+ class ModelIngester(Generic[LoaderT]):
+     MIN_OBJECTS_FOR_WARNING: ClassVar[int] = 3
+
+     def __init__(
+         self,
+         *,
+         pipeline: DefaultIngestionPipeline[LoaderT],
+         base_tags: Iterable[str] | None = None,
+         base_metadata: dict[str, Any] | None = None,
+         logger: logging.Logger | None = None,
+     ) -> None:
+         self.pipeline = pipeline
+         self.base_tags = list(base_tags or [])
+         self.base_metadata = dict(base_metadata or {})
+         self.logger = logger or logging.getLogger(__name__)
+
+     async def aingest(
+         self,
+         objects: Iterable[Any] | AsyncIterator[Any],
+         *,
+         fields: Iterable[str] | None = None,
+         tags: Iterable[str] | None = None,
+     ) -> list[Any]:
+         results: list[Any] = []
+         count_objects = 0
+         fields_set = set(fields) if fields else None
+
+         self.logger.debug("Starting async ingest; fields=%s", fields_set)
+
+         async for instance in self._amaterialize(objects):
+             count_objects += 1
+             object_class, object_id = self._resolve_link(instance)
+             self.logger.debug(
+                 "Async ingest instance %s:%s; model_fields=%s",
+                 object_class,
+                 object_id,
+                 getattr(instance.__class__, "model_fields", None),
+             )
+
+             found_any = False
+             async for payload in self._aiter_values(instance, fields_set):
+                 found_any = True
+                 field_name, field_title, stream, filename, kind = payload
+                 source = self._build_source(object_class, object_id, field_name, field_title, filename)
+                 run_tags = self._build_tags(tags, field_name, kind)
+                 try:
+                     chunk_res = await self.pipeline.arun(
+                         stream,
+                         filename=filename,
+                         tags=run_tags,
+                         source=source,
+                     )
+                     results.extend(chunk_res)
+                 except Exception as exc:  # noqa: BLE001
+                     self._warn_skip(object_class, field_name, exc)
+
+             if not found_any and count_objects <= self.MIN_OBJECTS_FOR_WARNING:
+                 self.logger.info(" -> No ingestible fields found in %s:%s", object_class, object_id)
+
+         if count_objects == 0:
+             self.logger.warning("ModelIngester received 0 objects to process!")
+
+         self.logger.debug("Async ingest finished; processed=%s results=%s", count_objects, len(results))
+         return results
+
+     def ingest(
+         self,
+         objects: Iterable[Any],
+         *,
+         fields: Iterable[str] | None = None,
+         tags: Iterable[str] | None = None,
+     ) -> list[Any]:
+         rows = list(self._materialize(objects))
+         fields_set = set(fields) if fields else None
+         results: list[Any] = []
+         self.logger.debug("Starting sync ingest; rows=%s fields=%s", len(rows), fields_set)
+         for instance in rows:
+             object_class, object_id = self._resolve_link(instance)
+             self.logger.debug(
+                 "Sync ingest instance %s:%s; model_fields=%s",
+                 object_class,
+                 object_id,
+                 getattr(instance.__class__, "model_fields", None),
+             )
+             found_any = False
+             for payload in self._iter_values(instance, fields_set):
+                 found_any = True
+                 field_name, field_title, stream, filename, kind = payload
+                 source = self._build_source(object_class, object_id, field_name, field_title, filename)
+                 run_tags = self._build_tags(tags, field_name, kind)
+                 try:
+                     results.extend(
+                         self.pipeline.run(
+                             stream,
+                             filename=filename,
+                             tags=run_tags,
+                             source=source,
+                         )
+                     )
+                 except Exception as exc:  # noqa: BLE001
+                     self._warn_skip(object_class, field_name, exc)
+             if not found_any:
+                 self.logger.debug("No ingestible fields in %s:%s", object_class, object_id)
+
+         self.logger.debug("Sync ingest finished; processed=%s results=%s", len(rows), len(results))
+         return results
+
+     async def _aiter_values(
+         self,
+         instance: Any,
+         fields_set: set[str] | None,
+     ) -> AsyncIterator[tuple[str, str | None, IO[Any], str | None, str]]:
+         if isinstance(instance, File):
+             try:
+                 stream, filename = await self._afile_to_stream(instance)
+                 yield "file", "file", stream, filename, "file"
+             except Exception as exc:
+                 self._warn_skip("File", str(getattr(instance, "filename", instance)), exc)
+             return
+
+         model_fields = getattr(instance.__class__, "model_fields", None) or {}
+         if not model_fields:
+             return
+
+         for name, info in model_fields.items():
+             if fields_set is not None and name not in fields_set:
+                 continue
+             val = getattr(instance, name, None)
+             try:
+                 payload = await self._avalue_to_stream(val)
+                 if payload is None:
+                     continue
+                 stream, filename, kind = payload
+                 title = getattr(info, "title", None) or name
+                 yield name, title, stream, filename, kind
+             except Exception as exc:
+                 self._warn_skip(instance.__class__.__name__, name, exc)
+
+     def _iter_values(
+         self,
+         instance: Any,
+         fields_set: set[str] | None,
+     ) -> Iterable[tuple[str, str | None, IO[Any], str | None, str]]:
+         if isinstance(instance, File):
+             try:
+                 stream, filename = self._file_to_stream(instance)
+                 return [("file", "file", stream, filename, "file")]
+             except Exception as exc:
+                 self._warn_skip("File", str(getattr(instance, "filename", instance)), exc)
+                 return []
+
+         model_fields = getattr(instance.__class__, "model_fields", None) or {}
+         if not model_fields:
+             return []
+
+         items: list[tuple[str, str | None, IO[Any], str | None, str]] = []
+         for name, info in model_fields.items():
+             if fields_set is not None and name not in fields_set:
+                 continue
+             val = getattr(instance, name, None)
+             try:
+                 payload = self._value_to_stream(val)
+                 if payload is None:
+                     continue
+                 stream, filename, kind = payload
+                 title = getattr(info, "title", None) or name
+                 items.append((name, title, stream, filename, kind))
+             except Exception as exc:
+                 self._warn_skip(instance.__class__.__name__, name, exc)
+         return items
+
+     def _build_source(
+         self,
+         object_class: str,
+         object_id: str,
+         field_name: str,
+         field_title: str | None,
+         filename: str | None,
+     ) -> IngestionSource:
+         meta = dict(self.base_metadata)
+         meta.setdefault("field", field_name)
+         if field_title:
+             meta.setdefault("field_title", field_title)
+         if filename:
+             meta.setdefault("filename", filename)
+         return IngestionSource(
+             object_class=object_class,
+             object_id=object_id,
+             tags=list(self.base_tags),
+             metadata=meta,
+         )
+
+     def _build_tags(self, extra: Iterable[str] | None, field_name: str, kind: str) -> list[str]:
+         tags = list(extra or [])
+         tags.append(kind)
+         tags.append(f"field:{field_name}")
+         return tags
+
+     def _resolve_link(self, instance: Any) -> tuple[str, str]:
+         cls = instance.__class__.__name__
+         oid = getattr(instance, "object_id", None)
+         if oid is None:
+             oid = getattr(instance, "id", None)
+         if oid is None:
+             oid = id(instance)
+         return cls, str(oid)
+
+     def _warn_skip(self, object_class: str, field_name: str, exc: Exception) -> None:
+         self.logger.warning("Skipping %s.%s: %s", object_class, field_name, exc)
+
+     def _materialize(self, objects: Any) -> Iterable[Any]:
+         if isinstance(objects, QuerySetBase):
+             return objects.execute()  # type: ignore[attr-defined]
+         return list(objects)
+
+     async def _amaterialize(self, objects: Any) -> AsyncIterator[Any]:
+         if isinstance(objects, QuerySetBase):
+             result = await objects.aexecute()  # type: ignore[attr-defined]
+             for item in result:
+                 yield item
+             return
+
+         for item in objects:
+             yield item
+
+     def _file_to_stream(self, file_obj: File) -> tuple[IO[Any], str | None]:
+         content = file_obj.read_bytes()
+         return io.BytesIO(content), getattr(file_obj, "filename", None)
+
+     async def _afile_to_stream(self, file_obj: File) -> tuple[IO[Any], str | None]:
+         content = await file_obj.aread_bytes()
+         return io.BytesIO(content), getattr(file_obj, "filename", None)
+
+     def _value_to_stream(self, val: Any) -> tuple[IO[Any], str | None, str] | None:
+         if val is None:
+             return None
+         if isinstance(val, File):
+             stream, filename = self._file_to_stream(val)
+             return stream, filename, "file"
+         if isinstance(val, (bytes, bytearray)):
+             return io.BytesIO(val), None, "file"
+         if isinstance(val, str):
+             return io.StringIO(val), None, "text"
+         return None
+
+     async def _avalue_to_stream(self, val: Any) -> tuple[IO[Any], str | None, str] | None:
+         if val is None:
+             return None
+         if isinstance(val, File):
+             stream, filename = await self._afile_to_stream(val)
+             return stream, filename, "file"
+         if isinstance(val, (bytes, bytearray)):
+             return io.BytesIO(val), None, "file"
+         if isinstance(val, str):
+             return io.StringIO(val), None, "text"
+         return None
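
To show how ModelIngester is driven, a hedged sketch (not part of the diff). `Article` is a hypothetical Pydantic model standing in for an AMSDAL model — anything whose class exposes `model_fields` works, and both plain iterables and querysets are accepted. The `pipeline` value is the stub-assembled DefaultIngestionPipeline from the sketch after the pipeline.py hunk below:

from pydantic import BaseModel

from amsdal_ml.ml_ingesting.model_ingester import ModelIngester

class Article(BaseModel):  # hypothetical stand-in; any class with `model_fields` is iterated
    title: str
    body: str

ingester = ModelIngester(
    pipeline=pipeline,  # a configured DefaultIngestionPipeline (see the sketch below)
    base_tags=['kb'],
    base_metadata={'source_system': 'demo'},
)

results = ingester.ingest(
    [Article(title='Hello', body='Body text to embed.')],
    fields=['body'],  # restrict ingestion to the `body` field
    tags=['demo'],    # per-run tags; the kind ('text') and 'field:body' are appended
)

String values are wrapped in io.StringIO and tagged 'text'; File instances and raw bytes become io.BytesIO streams tagged 'file'; other values are skipped.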
@@ -0,0 +1,131 @@
+ from __future__ import annotations
+
+ from collections.abc import Iterable
+ from typing import IO
+ from typing import Any
+ from typing import Generic
+ from typing import TypeVar
+
+ from amsdal_ml.ml_ingesting.embedders.embedder import Embedder
+ from amsdal_ml.ml_ingesting.embedding_data import EmbeddingData
+ from amsdal_ml.ml_ingesting.loaders.loader import Loader
+ from amsdal_ml.ml_ingesting.pipeline_interface import IngestionPipeline
+ from amsdal_ml.ml_ingesting.processors.cleaner import Cleaner
+ from amsdal_ml.ml_ingesting.splitters.splitter import Splitter
+ from amsdal_ml.ml_ingesting.stores.store import EmbeddingStore
+ from amsdal_ml.ml_ingesting.types import IngestionSource
+ from amsdal_ml.ml_ingesting.types import LoadedDocument
+
+ LoaderT = TypeVar('LoaderT', bound=Loader)
+
+
+ class DefaultIngestionPipeline(IngestionPipeline, Generic[LoaderT]):
+     loader: LoaderT
+
+     def __init__(
+         self,
+         *,
+         loader: LoaderT,
+         cleaner: Cleaner,
+         splitter: Splitter,
+         embedder: Embedder,
+         store: EmbeddingStore,
+     ) -> None:
+         self.loader = loader
+         self.cleaner = cleaner
+         self.splitter = splitter
+         self.embedder = embedder
+         self.store = store
+
+     def _combine_tags(self, base: Iterable[str] | None, extra: Iterable[str] | None) -> list[str]:
+         out: list[str] = []
+         for tag in list(base or []) + list(extra or []):
+             if tag not in out:
+                 out.append(tag)
+         return out
+
+     def _ensure_source(self, source: IngestionSource | None) -> IngestionSource:
+         if source is None:
+             msg = 'source is required for ingestion pipeline'
+             raise RuntimeError(msg)
+         return source
+
+     def _merge_metadata(
+         self,
+         source_meta: dict[str, Any],
+         doc_meta: dict[str, Any],
+         filename: str | None = None,
+     ) -> dict[str, Any]:
+         merged = {**source_meta, **doc_meta}
+         if filename and 'filename' not in merged:
+             merged['filename'] = filename
+         return merged
+
+     def _embed_chunks(self, chunks, tags: list[str], base_metadata: dict[str, Any]) -> list[EmbeddingData]:
+         embeddings: list[EmbeddingData] = []
+         for idx, chunk in enumerate(chunks):
+             vector = self.embedder.embed(chunk.text)
+             merged_tags = self._combine_tags(tags, chunk.tags)
+             metadata = {**base_metadata, **dict(chunk.metadata)}
+             embeddings.append(
+                 EmbeddingData(
+                     chunk_index=idx,
+                     raw_text=chunk.text,
+                     embedding=vector,
+                     tags=merged_tags,
+                     metadata=metadata,
+                 )
+             )
+         return embeddings
+
+     async def _aembed_chunks(self, chunks, tags: list[str], base_metadata: dict[str, Any]) -> list[EmbeddingData]:
+         embeddings: list[EmbeddingData] = []
+         for idx, chunk in enumerate(chunks):
+             vector = await self.embedder.aembed(chunk.text)
+             merged_tags = self._combine_tags(tags, chunk.tags)
+             metadata = {**base_metadata, **dict(chunk.metadata)}
+             embeddings.append(
+                 EmbeddingData(
+                     chunk_index=idx,
+                     raw_text=chunk.text,
+                     embedding=vector,
+                     tags=merged_tags,
+                     metadata=metadata,
+                 )
+             )
+         return embeddings
+
+     def run(
+         self,
+         file: IO[Any],
+         *,
+         filename: str | None = None,
+         tags: Iterable[str] | None = None,
+         source: IngestionSource | None = None,
+     ) -> list[Any]:
+         src = self._ensure_source(source)
+         doc = self.loader.load(file, filename=filename, metadata=src.metadata)
+         base_metadata = self._merge_metadata(src.metadata, doc.metadata, filename)
+         cleaned = self.cleaner.clean(LoadedDocument(pages=doc.pages, metadata=base_metadata))
+         chunks = self.splitter.split(cleaned)
+         merged_tags = self._combine_tags(src.tags, tags)
+         embeddings = self._embed_chunks(chunks, merged_tags, base_metadata)
+         return self.store.save(embeddings, source=src)
+
+     async def arun(
+         self,
+         file: IO[Any],
+         *,
+         filename: str | None = None,
+         tags: Iterable[str] | None = None,
+         source: IngestionSource | None = None,
+     ) -> list[Any]:
+         src = self._ensure_source(source)
+         doc = await self.loader.aload(file, filename=filename, metadata=src.metadata)
+         base_metadata = self._merge_metadata(src.metadata, doc.metadata, filename)
+         cleaned = await self.cleaner.aclean(LoadedDocument(pages=doc.pages, metadata=base_metadata))
+         chunks = await self.splitter.asplit(cleaned)
+         merged_tags = self._combine_tags(src.tags, tags)
+         embeddings = await self._aembed_chunks(chunks, merged_tags, base_metadata)
+         return await self.store.asave(embeddings, source=src)
+
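
To make the wiring concrete, here is a sketch assembling DefaultIngestionPipeline from the interfaces above (not part of the diff). The embedder and store are in-memory stand-ins — the concrete OpenAI embedder and embedding-data store added elsewhere in this release are not shown in these hunks — and the no-argument TokenSplitter() call plus the stand-ins' method signatures are assumptions inferred from how the pipeline calls them:

import io

from amsdal_ml.ml_ingesting.embedders.embedder import Embedder
from amsdal_ml.ml_ingesting.loaders.text_loader import TextLoader
from amsdal_ml.ml_ingesting.pipeline import DefaultIngestionPipeline
from amsdal_ml.ml_ingesting.processors.text_cleaner import TextCleaner
from amsdal_ml.ml_ingesting.splitters.token_splitter import TokenSplitter
from amsdal_ml.ml_ingesting.stores.store import EmbeddingStore
from amsdal_ml.ml_ingesting.types import IngestionSource

class FakeEmbedder(Embedder):  # stand-in: fixed vector instead of a model call
    def embed(self, text):
        return [0.0]

    async def aembed(self, text):
        return [0.0]

class MemoryStore(EmbeddingStore):  # stand-in: returns embeddings instead of persisting
    def save(self, embeddings, *, source):
        return list(embeddings)

    async def asave(self, embeddings, *, source):
        return list(embeddings)

pipeline = DefaultIngestionPipeline(
    loader=TextLoader(),
    cleaner=TextCleaner(),
    splitter=TokenSplitter(),
    embedder=FakeEmbedder(),
    store=MemoryStore(),
)

source = IngestionSource(object_class='Note', object_id='1', tags=['kb'], metadata={})
saved = pipeline.run(io.StringIO('Some text to index.'), filename='note.txt', source=source)

Note that run() and arun() refuse to proceed without a source; the IngestionSource keyword arguments mirror the ones ModelIngester builds in _build_source.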
@@ -0,0 +1,31 @@
+ from __future__ import annotations
+
+ from abc import ABC
+ from abc import abstractmethod
+ from collections.abc import Iterable
+ from typing import IO
+ from typing import Any
+
+ from amsdal_ml.ml_ingesting.types import IngestionSource
+
+
+ class IngestionPipeline(ABC):
+     @abstractmethod
+     def run(
+         self,
+         file: IO[Any],
+         *,
+         filename: str | None = None,
+         tags: Iterable[str] | None = None,
+         source: IngestionSource | None = None,
+     ) -> list[Any]: ...
+
+     @abstractmethod
+     async def arun(
+         self,
+         file: IO[Any],
+         *,
+         filename: str | None = None,
+         tags: Iterable[str] | None = None,
+         source: IngestionSource | None = None,
+     ) -> list[Any]: ...
@@ -0,0 +1,4 @@
+ from amsdal_ml.ml_ingesting.processors.cleaner import Cleaner
+ from amsdal_ml.ml_ingesting.processors.text_cleaner import TextCleaner
+
+ __all__ = ['Cleaner', 'TextCleaner']
@@ -0,0 +1,14 @@
+ from __future__ import annotations
+
+ from abc import ABC
+ from abc import abstractmethod
+
+ from amsdal_ml.ml_ingesting.types import LoadedDocument
+
+
+ class Cleaner(ABC):
+     @abstractmethod
+     def clean(self, doc: LoadedDocument) -> LoadedDocument: ...
+
+     @abstractmethod
+     async def aclean(self, doc: LoadedDocument) -> LoadedDocument: ...
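
A cleaner just maps LoadedDocument to LoadedDocument. A minimal custom implementation (hypothetical, not part of the diff) that lowercases page text:

import asyncio

from amsdal_ml.ml_ingesting.processors.cleaner import Cleaner
from amsdal_ml.ml_ingesting.types import LoadedDocument
from amsdal_ml.ml_ingesting.types import LoadedPage

class LowercaseCleaner(Cleaner):  # hypothetical example implementation
    def clean(self, doc: LoadedDocument) -> LoadedDocument:
        pages = [
            LoadedPage(page_number=p.page_number, text=p.text.lower(), metadata=dict(p.metadata))
            for p in doc.pages
        ]
        return LoadedDocument(pages=pages, metadata=dict(doc.metadata))

    async def aclean(self, doc: LoadedDocument) -> LoadedDocument:
        # offload the sync path to a thread, mirroring TextCleaner below
        return await asyncio.to_thread(self.clean, doc)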
@@ -0,0 +1,42 @@
+ from __future__ import annotations
+
+ import asyncio
+ import re
+
+ from amsdal_ml.ml_ingesting.processors.cleaner import Cleaner
+ from amsdal_ml.ml_ingesting.types import LoadedDocument
+ from amsdal_ml.ml_ingesting.types import LoadedPage
+
+ _CONTROL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
+ _MULTI_SPACE = re.compile(r'\s{2,}')
+
+
+ def _normalize_text(text: str) -> str:
+     text = _CONTROL_CHARS.sub(' ', text)
+     text = text.replace('\r', ' ').replace('\t', ' ')
+     text = _MULTI_SPACE.sub(' ', text)
+     return text.strip()
+
+
+ class TextCleaner(Cleaner):
+     def __init__(self, *, drop_empty_pages: bool = True) -> None:
+         self.drop_empty_pages = drop_empty_pages
+
+     def clean(self, doc: LoadedDocument) -> LoadedDocument:
+         pages: list[LoadedPage] = []
+         for page in doc.pages:
+             cleaned = _normalize_text(page.text)
+             if not cleaned and self.drop_empty_pages:
+                 continue
+             pages.append(
+                 LoadedPage(
+                     page_number=page.page_number,
+                     text=cleaned,
+                     metadata=dict(page.metadata),
+                 )
+             )
+         meta = dict(doc.metadata)
+         return LoadedDocument(pages=pages, metadata=meta)
+
+     async def aclean(self, doc: LoadedDocument) -> LoadedDocument:
+         return await asyncio.to_thread(self.clean, doc)
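
A small worked example of the normalization rules (not part of the diff): control characters, tabs, and carriage returns become spaces, whitespace runs collapse, and pages that end up empty are dropped by default:

from amsdal_ml.ml_ingesting.processors.text_cleaner import TextCleaner
from amsdal_ml.ml_ingesting.types import LoadedDocument
from amsdal_ml.ml_ingesting.types import LoadedPage

doc = LoadedDocument(
    pages=[
        LoadedPage(page_number=1, text='a\tb\r\x00c   d ', metadata={}),
        LoadedPage(page_number=2, text='   ', metadata={}),  # normalizes to '' and is dropped
    ],
    metadata={},
)

cleaned = TextCleaner().clean(doc)
assert cleaned.pages[0].text == 'a b c d'
assert len(cleaned.pages) == 1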
@@ -0,0 +1,4 @@
+ from amsdal_ml.ml_ingesting.splitters.splitter import Splitter
+ from amsdal_ml.ml_ingesting.splitters.token_splitter import TokenSplitter
+
+ __all__ = ['Splitter', 'TokenSplitter']