amsdal_ml 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amsdal_ml/Third-Party Materials - AMSDAL Dependencies - License Notices.md +617 -0
- amsdal_ml/__about__.py +1 -1
- amsdal_ml/agents/__init__.py +13 -0
- amsdal_ml/agents/agent.py +5 -7
- amsdal_ml/agents/default_qa_agent.py +108 -143
- amsdal_ml/agents/functional_calling_agent.py +233 -0
- amsdal_ml/agents/mcp_client_tool.py +46 -0
- amsdal_ml/agents/python_tool.py +86 -0
- amsdal_ml/agents/retriever_tool.py +5 -6
- amsdal_ml/agents/tool_adapters.py +98 -0
- amsdal_ml/fileio/base_loader.py +7 -5
- amsdal_ml/fileio/openai_loader.py +16 -17
- amsdal_ml/mcp_client/base.py +2 -0
- amsdal_ml/mcp_client/http_client.py +7 -1
- amsdal_ml/mcp_client/stdio_client.py +19 -16
- amsdal_ml/mcp_server/server_retriever_stdio.py +8 -11
- amsdal_ml/ml_ingesting/__init__.py +29 -0
- amsdal_ml/ml_ingesting/default_ingesting.py +49 -51
- amsdal_ml/ml_ingesting/embedders/__init__.py +4 -0
- amsdal_ml/ml_ingesting/embedders/embedder.py +12 -0
- amsdal_ml/ml_ingesting/embedders/openai_embedder.py +30 -0
- amsdal_ml/ml_ingesting/embedding_data.py +3 -0
- amsdal_ml/ml_ingesting/loaders/__init__.py +6 -0
- amsdal_ml/ml_ingesting/loaders/folder_loader.py +52 -0
- amsdal_ml/ml_ingesting/loaders/loader.py +28 -0
- amsdal_ml/ml_ingesting/loaders/pdf_loader.py +136 -0
- amsdal_ml/ml_ingesting/loaders/text_loader.py +44 -0
- amsdal_ml/ml_ingesting/model_ingester.py +278 -0
- amsdal_ml/ml_ingesting/pipeline.py +131 -0
- amsdal_ml/ml_ingesting/pipeline_interface.py +31 -0
- amsdal_ml/ml_ingesting/processors/__init__.py +4 -0
- amsdal_ml/ml_ingesting/processors/cleaner.py +14 -0
- amsdal_ml/ml_ingesting/processors/text_cleaner.py +42 -0
- amsdal_ml/ml_ingesting/splitters/__init__.py +4 -0
- amsdal_ml/ml_ingesting/splitters/splitter.py +15 -0
- amsdal_ml/ml_ingesting/splitters/token_splitter.py +85 -0
- amsdal_ml/ml_ingesting/stores/__init__.py +4 -0
- amsdal_ml/ml_ingesting/stores/embedding_data.py +63 -0
- amsdal_ml/ml_ingesting/stores/store.py +22 -0
- amsdal_ml/ml_ingesting/types.py +40 -0
- amsdal_ml/ml_models/models.py +96 -4
- amsdal_ml/ml_models/openai_model.py +430 -122
- amsdal_ml/ml_models/utils.py +7 -0
- amsdal_ml/ml_retrievers/__init__.py +17 -0
- amsdal_ml/ml_retrievers/adapters.py +93 -0
- amsdal_ml/ml_retrievers/default_retriever.py +11 -1
- amsdal_ml/ml_retrievers/openai_retriever.py +27 -7
- amsdal_ml/ml_retrievers/query_retriever.py +487 -0
- amsdal_ml/ml_retrievers/retriever.py +12 -0
- amsdal_ml/models/embedding_model.py +7 -7
- amsdal_ml/prompts/__init__.py +77 -0
- amsdal_ml/prompts/database_query_agent.prompt +14 -0
- amsdal_ml/prompts/functional_calling_agent_base.prompt +9 -0
- amsdal_ml/prompts/nl_query_filter.prompt +318 -0
- amsdal_ml/{agents/promts → prompts}/react_chat.prompt +17 -8
- amsdal_ml/utils/__init__.py +5 -0
- amsdal_ml/utils/query_utils.py +189 -0
- {amsdal_ml-0.1.4.dist-info → amsdal_ml-0.2.1.dist-info}/METADATA +61 -3
- amsdal_ml-0.2.1.dist-info/RECORD +72 -0
- {amsdal_ml-0.1.4.dist-info → amsdal_ml-0.2.1.dist-info}/WHEEL +1 -1
- amsdal_ml/agents/promts/__init__.py +0 -58
- amsdal_ml-0.1.4.dist-info/RECORD +0 -39
amsdal_ml/ml_ingesting/loaders/pdf_loader.py (new file, +136):

```diff
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import IO
+from typing import Any
+
+import pymupdf  # type: ignore
+
+from amsdal_ml.ml_ingesting.loaders.loader import Loader
+from amsdal_ml.ml_ingesting.types import LoadedDocument
+from amsdal_ml.ml_ingesting.types import LoadedPage
+
+
+def _merge_spaced_characters(text: str) -> str:
+    """Collapse sequences like "s h o r t - t e r m" into "short-term".
+
+    Some PDFs return every character as a separate token. We merge contiguous
+    single-character tokens (letters/digits and simple inline punctuation) so
+    downstream cleaning does not preserve the artificial spaces.
+    """
+
+    tokens = text.replace("\n", " ").split()
+    merged: list[str] = []
+    buffer = ""
+
+    def flush_buffer() -> None:
+        nonlocal buffer
+        if buffer:
+            merged.append(buffer)
+            buffer = ""
+
+    for tok in tokens:
+        if len(tok) == 1 and (tok.isalnum() or tok in {"'", "-", "&"}):
+            buffer += tok
+            continue
+        flush_buffer()
+        merged.append(tok)
+
+    flush_buffer()
+    return " ".join(merged)
+
+
+def _is_noise(
+    text: str,
+    *,
+    min_tokens: int = 20,
+    single_char_ratio: float = 0.6,
+    single_char_min_count: int = 5,
+) -> bool:
+    """Heuristic: drop text dominated by single-character tokens.
+
+    We treat as noise when:
+    - Most tokens are single characters (ratio >= single_char_ratio), AND
+      either the sample is long enough (>= min_tokens) or we have at least a
+      few single-character tokens (>= single_char_min_count) even in short text.
+    """
+
+    tokens = text.split()
+    if not tokens:
+        return True
+
+    single_chars = sum(1 for t in tokens if len(t) == 1)
+    ratio = single_chars / len(tokens)
+
+    if ratio < single_char_ratio:
+        return False
+
+    return len(tokens) >= min_tokens or single_chars >= single_char_min_count
+
+
+class PdfLoader(Loader):
+    def __init__(self, *, extract_metadata: bool = True) -> None:
+        self.extract_metadata = extract_metadata
+        self.logger = logging.getLogger(__name__)
+
+    def _read_sync(
+        self,
+        file: IO[Any],
+        filename: str | None = None,
+        metadata: dict[str, Any] | None = None
+    ) -> LoadedDocument:
+        peek = file.read(4)
+        file.seek(0)
+        if not (isinstance(peek, bytes) and peek.startswith(b"%PDF")):
+            msg = "Not a PDF file"
+            raise ValueError(msg)
+
+        data = file.read()
+
+        with pymupdf.open(stream=data, filetype="pdf") as doc:
+            pages: list[LoadedPage] = []
+            for idx, page in enumerate(doc):
+                text_raw = page.get_text("text") or ''
+                text = _merge_spaced_characters(text_raw)
+                if not text:
+                    continue
+                if _is_noise(text):
+                    self.logger.debug("Skipping page %s as noise (single-char heavy)", idx + 1)
+                    continue
+                pages.append(
+                    LoadedPage(
+                        page_number=idx + 1,
+                        text=text,
+                        metadata={'page_number': idx + 1},
+                    )
+                )
+
+            doc_meta: dict[str, Any] = {}
+            if self.extract_metadata:
+                raw_meta = doc.metadata or {}
+                for key, value in raw_meta.items():
+                    if value is None:
+                        continue
+                    doc_meta[str(key)] = str(value)
+
+            if metadata:
+                doc_meta.update(metadata)
+            if filename:
+                doc_meta.setdefault('filename', filename)
+
+            return LoadedDocument(pages=pages, metadata=doc_meta)
+
+    def load(
+        self,
+        file: IO[Any],
+        *,
+        filename: str | None = None,
+        metadata: dict[str, Any] | None = None
+    ) -> LoadedDocument:
+        return self._read_sync(file, filename=filename, metadata=metadata)
+
+    async def aload(
+        self, file: IO[Any], *, filename: str | None = None, metadata: dict[str, Any] | None = None
+    ) -> LoadedDocument:
+        return await asyncio.to_thread(self._read_sync, file, filename, metadata)
```
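PdfLoader first validates the `%PDF` magic bytes, then extracts per-page text with PyMuPDF, repairs character-spaced runs via `_merge_spaced_characters`, and drops pages that `_is_noise` flags as dominated by single-character tokens. A minimal usage sketch (the `sample.pdf` path is hypothetical):

```python
# Minimal sketch, assuming a local "sample.pdf" exists (hypothetical path).
from amsdal_ml.ml_ingesting.loaders.pdf_loader import PdfLoader

loader = PdfLoader(extract_metadata=True)
with open("sample.pdf", "rb") as fh:
    doc = loader.load(fh, filename="sample.pdf", metadata={"origin": "manual-upload"})

for page in doc.pages:
    # Each LoadedPage keeps its 1-based page number and the merged text.
    print(page.page_number, page.text[:80])
```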
amsdal_ml/ml_ingesting/loaders/text_loader.py (new file, +44):

```diff
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+import asyncio
+from typing import IO
+from typing import Any
+
+from amsdal_ml.ml_ingesting.loaders.loader import Loader
+from amsdal_ml.ml_ingesting.types import LoadedDocument
+from amsdal_ml.ml_ingesting.types import LoadedPage
+
+
+class TextLoader(Loader):
+    def __init__(self, *, encoding: str = 'utf-8', errors: str = 'ignore') -> None:
+        self.encoding = encoding
+        self.errors = errors
+
+    def _read_text(self, file: IO[Any]) -> str:
+        data = file.read()
+        if isinstance(data, bytes):
+            return data.decode(self.encoding, errors=self.errors)
+        return str(data)
+
+    def load(
+        self,
+        file: IO[Any],
+        *,
+        filename: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> LoadedDocument:
+        text = self._read_text(file)
+        doc_meta = dict(metadata or {})
+        if filename:
+            doc_meta.setdefault('filename', filename)
+        page = LoadedPage(page_number=None, text=text, metadata={})
+        return LoadedDocument(pages=[page], metadata=doc_meta)
+
+    async def aload(
+        self,
+        file: IO[Any],
+        *,
+        filename: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> LoadedDocument:
+        return await asyncio.to_thread(self.load, file, filename=filename, metadata=metadata)
```
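TextLoader accepts either text or binary streams: bytes are decoded with the configured encoding and errors policy, and the whole input becomes a single page with `page_number=None`. A quick sketch:

```python
import io

from amsdal_ml.ml_ingesting.loaders.text_loader import TextLoader

loader = TextLoader(encoding="utf-8", errors="ignore")
doc = loader.load(io.BytesIO("héllo world".encode("utf-8")), filename="note.txt")

assert doc.pages[0].page_number is None
print(doc.metadata["filename"], doc.pages[0].text)  # note.txt héllo world
```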
amsdal_ml/ml_ingesting/model_ingester.py (new file, +278):

```diff
@@ -0,0 +1,278 @@
+from __future__ import annotations
+
+import io
+import logging
+from collections.abc import AsyncIterator
+from collections.abc import Iterable
+from typing import IO
+from typing import Any
+from typing import ClassVar
+from typing import Generic
+from typing import TypeVar
+
+from amsdal.models.core.file import File
+from amsdal_models.querysets.base_queryset import QuerySetBase
+
+from amsdal_ml.ml_ingesting.loaders.loader import Loader
+from amsdal_ml.ml_ingesting.pipeline import DefaultIngestionPipeline
+from amsdal_ml.ml_ingesting.types import IngestionSource
+
+LoaderT = TypeVar('LoaderT', bound=Loader)
+
+
+class ModelIngester(Generic[LoaderT]):
+    MIN_OBJECTS_FOR_WARNING: ClassVar[int] = 3
+
+    def __init__(
+        self,
+        *,
+        pipeline: DefaultIngestionPipeline[LoaderT],
+        base_tags: Iterable[str] | None = None,
+        base_metadata: dict[str, Any] | None = None,
+        logger: logging.Logger | None = None,
+    ) -> None:
+        self.pipeline = pipeline
+        self.base_tags = list(base_tags or [])
+        self.base_metadata = dict(base_metadata or {})
+        self.logger = logger or logging.getLogger(__name__)
+
+    async def aingest(
+        self,
+        objects: Iterable[Any] | AsyncIterator[Any],
+        *,
+        fields: Iterable[str] | None = None,
+        tags: Iterable[str] | None = None,
+    ) -> list[Any]:
+        results: list[Any] = []
+        count_objects = 0
+        fields_set = set(fields) if fields else None
+
+        self.logger.debug("Starting async ingest; fields=%s", fields_set)
+
+        async for instance in self._amaterialize(objects):
+            count_objects += 1
+            object_class, object_id = self._resolve_link(instance)
+            self.logger.debug(
+                "Async ingest instance %s:%s; model_fields=%s",
+                object_class,
+                object_id,
+                getattr(instance.__class__, "model_fields", None),
+            )
+
+            found_any = False
+            async for payload in self._aiter_values(instance, fields_set):
+                found_any = True
+                field_name, field_title, stream, filename, kind = payload
+                source = self._build_source(object_class, object_id, field_name, field_title, filename)
+                run_tags = self._build_tags(tags, field_name, kind)
+                try:
+                    chunk_res = await self.pipeline.arun(
+                        stream,
+                        filename=filename,
+                        tags=run_tags,
+                        source=source,
+                    )
+                    results.extend(chunk_res)
+                except Exception as exc:  # noqa: BLE001
+                    self._warn_skip(object_class, field_name, exc)
+
+            if not found_any and count_objects <= self.MIN_OBJECTS_FOR_WARNING:
+                self.logger.info(" -> No ingestible fields found in %s:%s", object_class, object_id)
+
+        if count_objects == 0:
+            self.logger.warning("ModelIngester received 0 objects to process!")
+
+        self.logger.debug("Async ingest finished; processed=%s results=%s", count_objects, len(results))
+        return results
+
+    def ingest(
+        self,
+        objects: Iterable[Any],
+        *,
+        fields: Iterable[str] | None = None,
+        tags: Iterable[str] | None = None,
+    ) -> list[Any]:
+        rows = list(self._materialize(objects))
+        fields_set = set(fields) if fields else None
+        results: list[Any] = []
+        self.logger.debug("Starting sync ingest; rows=%s fields=%s", len(rows), fields_set)
+        for instance in rows:
+            object_class, object_id = self._resolve_link(instance)
+            self.logger.debug(
+                "Sync ingest instance %s:%s; model_fields=%s",
+                object_class,
+                object_id,
+                getattr(instance.__class__, "model_fields", None),
+            )
+            found_any = False
+            for payload in self._iter_values(instance, fields_set):
+                found_any = True
+                field_name, field_title, stream, filename, kind = payload
+                source = self._build_source(object_class, object_id, field_name, field_title, filename)
+                run_tags = self._build_tags(tags, field_name, kind)
+                try:
+                    results.extend(
+                        self.pipeline.run(
+                            stream,
+                            filename=filename,
+                            tags=run_tags,
+                            source=source,
+                        )
+                    )
+                except Exception as exc:  # noqa: BLE001
+                    self._warn_skip(object_class, field_name, exc)
+            if not found_any:
+                self.logger.debug("No ingestible fields in %s:%s", object_class, object_id)
+
+        self.logger.debug("Sync ingest finished; processed=%s results=%s", len(rows), len(results))
+        return results
+
+    async def _aiter_values(
+        self,
+        instance: Any,
+        fields_set: set[str] | None,
+    ) -> AsyncIterator[tuple[str, str | None, IO[Any], str | None, str]]:
+        if isinstance(instance, File):
+            try:
+                stream, filename = await self._afile_to_stream(instance)
+                yield "file", "file", stream, filename, "file"
+            except Exception as exc:
+                self._warn_skip("File", str(getattr(instance, "filename", instance)), exc)
+            return
+
+        model_fields = getattr(instance.__class__, "model_fields", None) or {}
+        if not model_fields:
+            return
+
+        for name, info in model_fields.items():
+            if fields_set is not None and name not in fields_set:
+                continue
+            val = getattr(instance, name, None)
+            try:
+                payload = await self._avalue_to_stream(val)
+                if payload is None:
+                    continue
+                stream, filename, kind = payload
+                title = getattr(info, "title", None) or name
+                yield name, title, stream, filename, kind
+            except Exception as exc:
+                self._warn_skip(instance.__class__.__name__, name, exc)
+
+    def _iter_values(
+        self,
+        instance: Any,
+        fields_set: set[str] | None,
+    ) -> Iterable[tuple[str, str | None, IO[Any], str | None, str]]:
+        if isinstance(instance, File):
+            try:
+                stream, filename = self._file_to_stream(instance)
+                return [("file", "file", stream, filename, "file")]
+            except Exception as exc:
+                self._warn_skip("File", str(getattr(instance, "filename", instance)), exc)
+                return []
+
+        model_fields = getattr(instance.__class__, "model_fields", None) or {}
+        if not model_fields:
+            return []
+
+        items: list[tuple[str, str | None, IO[Any], str | None, str]] = []
+        for name, info in model_fields.items():
+            if fields_set is not None and name not in fields_set:
+                continue
+            val = getattr(instance, name, None)
+            try:
+                payload = self._value_to_stream(val)
+                if payload is None:
+                    continue
+                stream, filename, kind = payload
+                title = getattr(info, "title", None) or name
+                items.append((name, title, stream, filename, kind))
+            except Exception as exc:
+                self._warn_skip(instance.__class__.__name__, name, exc)
+        return items
+
+    def _build_source(
+        self,
+        object_class: str,
+        object_id: str,
+        field_name: str,
+        field_title: str | None,
+        filename: str | None,
+    ) -> IngestionSource:
+        meta = dict(self.base_metadata)
+        meta.setdefault("field", field_name)
+        if field_title:
+            meta.setdefault("field_title", field_title)
+        if filename:
+            meta.setdefault("filename", filename)
+        return IngestionSource(
+            object_class=object_class,
+            object_id=object_id,
+            tags=list(self.base_tags),
+            metadata=meta,
+        )
+
+    def _build_tags(self, extra: Iterable[str] | None, field_name: str, kind: str) -> list[str]:
+        tags = list(extra or [])
+        tags.append(kind)
+        tags.append(f"field:{field_name}")
+        return tags
+
+    def _resolve_link(self, instance: Any) -> tuple[str, str]:
+        cls = instance.__class__.__name__
+        oid = getattr(instance, "object_id", None)
+        if oid is None:
+            oid = getattr(instance, "id", None)
+        if oid is None:
+            oid = id(instance)
+        return cls, str(oid)
+
+    def _warn_skip(self, object_class: str, field_name: str, exc: Exception) -> None:
+        self.logger.warning("Skipping %s.%s: %s", object_class, field_name, exc)
+
+    def _materialize(self, objects: Any) -> Iterable[Any]:
+        if isinstance(objects, QuerySetBase):
+            return objects.execute()  # type: ignore[attr-defined]
+        return list(objects)
+
+    async def _amaterialize(self, objects: Any) -> AsyncIterator[Any]:
+        if isinstance(objects, QuerySetBase):
+            result = await objects.aexecute()  # type: ignore[attr-defined]
+            for item in result:
+                yield item
+            return
+
+        for item in objects:
+            yield item
+
+    def _file_to_stream(self, file_obj: File) -> tuple[IO[Any], str | None]:
+        content = file_obj.read_bytes()
+        return io.BytesIO(content), getattr(file_obj, "filename", None)
+
+    async def _afile_to_stream(self, file_obj: File) -> tuple[IO[Any], str | None]:
+        content = await file_obj.aread_bytes()
+        return io.BytesIO(content), getattr(file_obj, "filename", None)
+
+    def _value_to_stream(self, val: Any) -> tuple[IO[Any], str | None, str] | None:
+        if val is None:
+            return None
+        if isinstance(val, File):
+            stream, filename = self._file_to_stream(val)
+            return stream, filename, "file"
+        if isinstance(val, (bytes, bytearray)):
+            return io.BytesIO(val), None, "file"
+        if isinstance(val, str):
+            return io.StringIO(val), None, "text"
+        return None
+
+    async def _avalue_to_stream(self, val: Any) -> tuple[IO[Any], str | None, str] | None:
+        if val is None:
+            return None
+        if isinstance(val, File):
+            stream, filename = await self._afile_to_stream(val)
+            return stream, filename, "file"
+        if isinstance(val, (bytes, bytearray)):
+            return io.BytesIO(val), None, "file"
+        if isinstance(val, str):
+            return io.StringIO(val), None, "text"
+        return None
```
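ModelIngester walks model instances (or a `QuerySetBase`, which it executes), converts `File`, `bytes`/`bytearray`, and `str` field values into streams, and runs each through the pipeline tagged with the value kind plus `field:<name>`. A hedged sketch; `pipeline` is a `DefaultIngestionPipeline` wired as in the pipeline example further below, and the `note_a`/`note_b` instances stand in for hypothetical models with a `body` text field:

```python
# Hedged sketch: ingest string fields of a few hypothetical model instances.
from amsdal_ml.ml_ingesting.model_ingester import ModelIngester

ingester = ModelIngester(
    pipeline=pipeline,                 # a DefaultIngestionPipeline (see below)
    base_tags=["knowledge-base"],
    base_metadata={"project": "demo"},
)
# Plain iterables work too; QuerySetBase inputs are executed automatically.
results = ingester.ingest([note_a, note_b], fields=["body"], tags=["v1"])
```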
amsdal_ml/ml_ingesting/pipeline.py (new file, +131):

```diff
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import IO
+from typing import Any
+from typing import Generic
+from typing import TypeVar
+
+from amsdal_ml.ml_ingesting.embedders.embedder import Embedder
+from amsdal_ml.ml_ingesting.embedding_data import EmbeddingData
+from amsdal_ml.ml_ingesting.loaders.loader import Loader
+from amsdal_ml.ml_ingesting.pipeline_interface import IngestionPipeline
+from amsdal_ml.ml_ingesting.processors.cleaner import Cleaner
+from amsdal_ml.ml_ingesting.splitters.splitter import Splitter
+from amsdal_ml.ml_ingesting.stores.store import EmbeddingStore
+from amsdal_ml.ml_ingesting.types import IngestionSource
+from amsdal_ml.ml_ingesting.types import LoadedDocument
+
+LoaderT = TypeVar('LoaderT', bound=Loader)
+
+
+class DefaultIngestionPipeline(IngestionPipeline, Generic[LoaderT]):
+    loader: LoaderT
+
+    def __init__(
+        self,
+        *,
+        loader: LoaderT,
+        cleaner: Cleaner,
+        splitter: Splitter,
+        embedder: Embedder,
+        store: EmbeddingStore,
+    ) -> None:
+        self.loader = loader
+        self.cleaner = cleaner
+        self.splitter = splitter
+        self.embedder = embedder
+        self.store = store
+
+    def _combine_tags(self, base: Iterable[str] | None, extra: Iterable[str] | None) -> list[str]:
+        out: list[str] = []
+        for tag in list(base or []) + list(extra or []):
+            if tag not in out:
+                out.append(tag)
+        return out
+
+    def _ensure_source(self, source: IngestionSource | None) -> IngestionSource:
+        if source is None:
+            msg = 'source is required for ingestion pipeline'
+            raise RuntimeError(msg)
+        return source
+
+    def _merge_metadata(
+        self,
+        source_meta: dict[str, Any],
+        doc_meta: dict[str, Any],
+        filename: str | None = None,
+    ) -> dict[str, Any]:
+        merged = {**source_meta, **doc_meta}
+        if filename and 'filename' not in merged:
+            merged['filename'] = filename
+        return merged
+
+    def _embed_chunks(self, chunks, tags: list[str], base_metadata: dict[str, Any]) -> list[EmbeddingData]:
+        embeddings: list[EmbeddingData] = []
+        for idx, chunk in enumerate(chunks):
+            vector = self.embedder.embed(chunk.text)
+            merged_tags = self._combine_tags(tags, chunk.tags)
+            metadata = {**base_metadata, **dict(chunk.metadata)}
+            embeddings.append(
+                EmbeddingData(
+                    chunk_index=idx,
+                    raw_text=chunk.text,
+                    embedding=vector,
+                    tags=merged_tags,
+                    metadata=metadata,
+                )
+            )
+        return embeddings
+
+    async def _aembed_chunks(self, chunks, tags: list[str], base_metadata: dict[str, Any]) -> list[EmbeddingData]:
+        embeddings: list[EmbeddingData] = []
+        for idx, chunk in enumerate(chunks):
+            vector = await self.embedder.aembed(chunk.text)
+            merged_tags = self._combine_tags(tags, chunk.tags)
+            metadata = {**base_metadata, **dict(chunk.metadata)}
+            embeddings.append(
+                EmbeddingData(
+                    chunk_index=idx,
+                    raw_text=chunk.text,
+                    embedding=vector,
+                    tags=merged_tags,
+                    metadata=metadata,
+                )
+            )
+        return embeddings
+
+    def run(
+        self,
+        file: IO[Any],
+        *,
+        filename: str | None = None,
+        tags: Iterable[str] | None = None,
+        source: IngestionSource | None = None,
+    ) -> list[Any]:
+        src = self._ensure_source(source)
+        doc = self.loader.load(file, filename=filename, metadata=src.metadata)
+        base_metadata = self._merge_metadata(src.metadata, doc.metadata, filename)
+        cleaned = self.cleaner.clean(LoadedDocument(pages=doc.pages, metadata=base_metadata))
+        chunks = self.splitter.split(cleaned)
+        merged_tags = self._combine_tags(src.tags, tags)
+        embeddings = self._embed_chunks(chunks, merged_tags, base_metadata)
+        return self.store.save(embeddings, source=src)
+
+    async def arun(
+        self,
+        file: IO[Any],
+        *,
+        filename: str | None = None,
+        tags: Iterable[str] | None = None,
+        source: IngestionSource | None = None,
+    ) -> list[Any]:
+        src = self._ensure_source(source)
+        doc = await self.loader.aload(file, filename=filename, metadata=src.metadata)
+        base_metadata = self._merge_metadata(src.metadata, doc.metadata, filename)
+        cleaned = await self.cleaner.aclean(LoadedDocument(pages=doc.pages, metadata=base_metadata))
+        chunks = await self.splitter.asplit(cleaned)
+        merged_tags = self._combine_tags(src.tags, tags)
+        embeddings = await self._aembed_chunks(chunks, merged_tags, base_metadata)
+        return await self.store.asave(embeddings, source=src)
+
```
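The pipeline is load → clean → split → embed → store, with source and document metadata merged into every chunk and duplicate tags removed. A hedged wiring sketch: the `TextLoader` and `TextCleaner` constructors appear in this diff, but the splitter, embedder, and store only appear in the file list, so they are left as pre-built placeholder variables here rather than guessed constructors:

```python
import io

from amsdal_ml.ml_ingesting.loaders.text_loader import TextLoader
from amsdal_ml.ml_ingesting.pipeline import DefaultIngestionPipeline
from amsdal_ml.ml_ingesting.processors.text_cleaner import TextCleaner
from amsdal_ml.ml_ingesting.types import IngestionSource

pipeline = DefaultIngestionPipeline(
    loader=TextLoader(),
    cleaner=TextCleaner(),
    splitter=token_splitter,  # a Splitter, e.g. from splitters/token_splitter.py (assumed)
    embedder=embedder,        # an Embedder, e.g. from embedders/openai_embedder.py (assumed)
    store=store,              # any EmbeddingStore implementation (assumed)
)

# run() raises RuntimeError without a source, so one is always provided.
source = IngestionSource(object_class="Note", object_id="1", tags=["demo"], metadata={})
records = pipeline.run(io.StringIO("some text to embed"), filename="note.txt", source=source)
```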
amsdal_ml/ml_ingesting/pipeline_interface.py (new file, +31):

```diff
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from abc import ABC
+from abc import abstractmethod
+from collections.abc import Iterable
+from typing import IO
+from typing import Any
+
+from amsdal_ml.ml_ingesting.types import IngestionSource
+
+
+class IngestionPipeline(ABC):
+    @abstractmethod
+    def run(
+        self,
+        file: IO[Any],
+        *,
+        filename: str | None = None,
+        tags: Iterable[str] | None = None,
+        source: IngestionSource | None = None,
+    ) -> list[Any]: ...
+
+    @abstractmethod
+    async def arun(
+        self,
+        file: IO[Any],
+        *,
+        filename: str | None = None,
+        tags: Iterable[str] | None = None,
+        source: IngestionSource | None = None,
+    ) -> list[Any]: ...
```
amsdal_ml/ml_ingesting/processors/cleaner.py (new file, +14):

```diff
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from abc import ABC
+from abc import abstractmethod
+
+from amsdal_ml.ml_ingesting.types import LoadedDocument
+
+
+class Cleaner(ABC):
+    @abstractmethod
+    def clean(self, doc: LoadedDocument) -> LoadedDocument: ...
+
+    @abstractmethod
+    async def aclean(self, doc: LoadedDocument) -> LoadedDocument: ...
```
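Custom cleaners only need to implement the `clean`/`aclean` pair. A minimal hypothetical implementation, offloading the sync path to a thread the same way the bundled cleaner does:

```python
import asyncio

from amsdal_ml.ml_ingesting.processors.cleaner import Cleaner
from amsdal_ml.ml_ingesting.types import LoadedDocument
from amsdal_ml.ml_ingesting.types import LoadedPage


class LowercaseCleaner(Cleaner):
    """Hypothetical cleaner that lowercases all page text."""

    def clean(self, doc: LoadedDocument) -> LoadedDocument:
        pages = [
            LoadedPage(page_number=p.page_number, text=p.text.lower(), metadata=dict(p.metadata))
            for p in doc.pages
        ]
        return LoadedDocument(pages=pages, metadata=dict(doc.metadata))

    async def aclean(self, doc: LoadedDocument) -> LoadedDocument:
        return await asyncio.to_thread(self.clean, doc)
```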
amsdal_ml/ml_ingesting/processors/text_cleaner.py (new file, +42):

```diff
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import asyncio
+import re
+
+from amsdal_ml.ml_ingesting.processors.cleaner import Cleaner
+from amsdal_ml.ml_ingesting.types import LoadedDocument
+from amsdal_ml.ml_ingesting.types import LoadedPage
+
+_CONTROL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
+_MULTI_SPACE = re.compile(r'\s{2,}')
+
+
+def _normalize_text(text: str) -> str:
+    text = _CONTROL_CHARS.sub(' ', text)
+    text = text.replace('\r', ' ').replace('\t', ' ')
+    text = _MULTI_SPACE.sub(' ', text)
+    return text.strip()
+
+
+class TextCleaner(Cleaner):
+    def __init__(self, *, drop_empty_pages: bool = True) -> None:
+        self.drop_empty_pages = drop_empty_pages
+
+    def clean(self, doc: LoadedDocument) -> LoadedDocument:
+        pages: list[LoadedPage] = []
+        for page in doc.pages:
+            cleaned = _normalize_text(page.text)
+            if not cleaned and self.drop_empty_pages:
+                continue
+            pages.append(
+                LoadedPage(
+                    page_number=page.page_number,
+                    text=cleaned,
+                    metadata=dict(page.metadata),
+                )
+            )
+        meta = dict(doc.metadata)
+        return LoadedDocument(pages=pages, metadata=meta)
+
+    async def aclean(self, doc: LoadedDocument) -> LoadedDocument:
+        return await asyncio.to_thread(self.clean, doc)
```
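TextCleaner replaces control characters, tabs, and carriage returns with spaces, collapses whitespace runs, and by default drops pages that clean down to nothing. A quick check of the behavior:

```python
from amsdal_ml.ml_ingesting.processors.text_cleaner import TextCleaner
from amsdal_ml.ml_ingesting.types import LoadedDocument
from amsdal_ml.ml_ingesting.types import LoadedPage

cleaner = TextCleaner(drop_empty_pages=True)
doc = LoadedDocument(
    pages=[
        LoadedPage(page_number=1, text="a\tb\r\nc\x00d   e", metadata={}),
        LoadedPage(page_number=2, text="\x01\x02  ", metadata={}),  # cleans to empty, dropped
    ],
    metadata={},
)

cleaned = cleaner.clean(doc)
print([p.text for p in cleaned.pages])  # ['a b c d e']
```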