MindsDB 25.9.2.0a1__py3-none-any.whl → 25.9.3rc1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +39 -20
- mindsdb/api/a2a/agent.py +7 -9
- mindsdb/api/a2a/common/server/server.py +3 -3
- mindsdb/api/a2a/common/server/task_manager.py +4 -4
- mindsdb/api/a2a/task_manager.py +15 -17
- mindsdb/api/common/middleware.py +9 -11
- mindsdb/api/executor/command_executor.py +2 -4
- mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +100 -48
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/exceptions.py +29 -10
- mindsdb/api/executor/planner/plan_join.py +17 -3
- mindsdb/api/executor/sql_query/sql_query.py +74 -74
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
- mindsdb/api/executor/utilities/functions.py +6 -6
- mindsdb/api/executor/utilities/sql.py +32 -16
- mindsdb/api/http/gui.py +5 -11
- mindsdb/api/http/initialize.py +8 -10
- mindsdb/api/http/namespaces/agents.py +10 -12
- mindsdb/api/http/namespaces/analysis.py +13 -20
- mindsdb/api/http/namespaces/auth.py +1 -1
- mindsdb/api/http/namespaces/config.py +15 -11
- mindsdb/api/http/namespaces/databases.py +140 -201
- mindsdb/api/http/namespaces/file.py +15 -4
- mindsdb/api/http/namespaces/handlers.py +7 -2
- mindsdb/api/http/namespaces/knowledge_bases.py +8 -7
- mindsdb/api/http/namespaces/models.py +94 -126
- mindsdb/api/http/namespaces/projects.py +13 -22
- mindsdb/api/http/namespaces/sql.py +33 -25
- mindsdb/api/http/namespaces/tab.py +27 -37
- mindsdb/api/http/namespaces/views.py +1 -1
- mindsdb/api/http/start.py +14 -8
- mindsdb/api/mcp/__init__.py +2 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
- mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +6 -13
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +40 -28
- mindsdb/integrations/handlers/byom_handler/byom_handler.py +168 -185
- mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
- mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +13 -1
- mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
- mindsdb/integrations/libs/api_handler.py +10 -10
- mindsdb/integrations/libs/base.py +4 -4
- mindsdb/integrations/libs/llm/utils.py +2 -2
- mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
- mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
- mindsdb/integrations/libs/process_cache.py +132 -140
- mindsdb/integrations/libs/response.py +18 -12
- mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
- mindsdb/integrations/utilities/files/file_reader.py +6 -7
- mindsdb/integrations/utilities/rag/config_loader.py +37 -26
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +59 -9
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
- mindsdb/integrations/utilities/rag/settings.py +58 -133
- mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
- mindsdb/interfaces/agents/agents_controller.py +2 -1
- mindsdb/interfaces/agents/constants.py +0 -2
- mindsdb/interfaces/agents/litellm_server.py +34 -58
- mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
- mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
- mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
- mindsdb/interfaces/chatbot/polling.py +30 -18
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +10 -10
- mindsdb/interfaces/database/integrations.py +19 -2
- mindsdb/interfaces/file/file_controller.py +6 -6
- mindsdb/interfaces/functions/controller.py +1 -1
- mindsdb/interfaces/functions/to_markdown.py +2 -2
- mindsdb/interfaces/jobs/jobs_controller.py +5 -5
- mindsdb/interfaces/jobs/scheduler.py +3 -8
- mindsdb/interfaces/knowledge_base/controller.py +50 -23
- mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
- mindsdb/interfaces/model/model_controller.py +170 -166
- mindsdb/interfaces/query_context/context_controller.py +14 -2
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +6 -4
- mindsdb/interfaces/skills/retrieval_tool.py +43 -50
- mindsdb/interfaces/skills/skill_tool.py +2 -2
- mindsdb/interfaces/skills/sql_agent.py +25 -19
- mindsdb/interfaces/storage/fs.py +114 -169
- mindsdb/interfaces/storage/json.py +19 -18
- mindsdb/interfaces/tabs/tabs_controller.py +49 -72
- mindsdb/interfaces/tasks/task_monitor.py +3 -9
- mindsdb/interfaces/tasks/task_thread.py +7 -9
- mindsdb/interfaces/triggers/trigger_task.py +7 -13
- mindsdb/interfaces/triggers/triggers_controller.py +47 -50
- mindsdb/migrations/migrate.py +16 -16
- mindsdb/utilities/api_status.py +58 -0
- mindsdb/utilities/config.py +49 -0
- mindsdb/utilities/exception.py +40 -1
- mindsdb/utilities/fs.py +0 -1
- mindsdb/utilities/hooks/profiling.py +17 -14
- mindsdb/utilities/langfuse.py +40 -45
- mindsdb/utilities/log.py +272 -0
- mindsdb/utilities/ml_task_queue/consumer.py +52 -58
- mindsdb/utilities/ml_task_queue/producer.py +26 -30
- mindsdb/utilities/render/sqlalchemy_render.py +7 -6
- mindsdb/utilities/utils.py +2 -2
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/METADATA +269 -264
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/RECORD +115 -115
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/WHEEL +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.9.2.0a1.dist-info → mindsdb-25.9.3rc1.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/byom_handler/byom_handler.py
@@ -1,4 +1,4 @@
-"""
+"""BYOM: Bring Your Own Model

 env vars to contloll BYOM:
 - MINDSDB_BYOM_ENABLED - can BYOM be uysed or not. Locally enabled by default.
@@ -7,7 +7,6 @@ env vars to contloll BYOM:
 - MINDSDB_BYOM_TYPE - [safe|unsafe] - obsolete, same as above.
 """

-
 import os
 import re
 import sys
@@ -15,7 +14,6 @@ import shutil
 import pickle
 import tarfile
 import tempfile
-import traceback
 import subprocess
 from enum import Enum
 from pathlib import Path
@@ -36,28 +34,33 @@ import mindsdb.utilities.profiler as profiler


 from .proc_wrapper import (
-    pd_decode,
-
+    pd_decode,
+    pd_encode,
+    encode,
+    decode,
+    BYOM_METHOD,
+    import_string,
+    find_model_class,
+    check_module,
 )
 from .__about__ import __version__


-BYOM_TYPE = Enum(
+BYOM_TYPE = Enum("BYOM_TYPE", ["INHOUSE", "VENV"])

 logger = log.getLogger(__name__)


 class BYOMHandler(BaseMLEngine):
-
-    name = 'byom'
+    name = "byom"

     def __init__(self, model_storage, engine_storage, **kwargs) -> None:
         # region check availability
-        is_cloud = Config().get(
+        is_cloud = Config().get("cloud", False)
         if is_cloud is True:
-            byom_enabled = os.environ.get(
-            if byom_enabled not in (
-                raise RuntimeError(
+            byom_enabled = os.environ.get("MINDSDB_BYOM_ENABLED", "false").lower()
+            if byom_enabled not in ("true", "1"):
+                raise RuntimeError("BYOM is disabled on cloud")
         # endregion

         self.model_wrapper = None
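Aside: the rewritten BYOM_TYPE line above uses Python's functional Enum API. A minimal standalone sketch (not code from the package) of the lookup-by-name behavior the handler relies on further down in this file:

from enum import Enum

# Same declaration style as in the hunk above: members INHOUSE and VENV.
BYOM_TYPE = Enum("BYOM_TYPE", ["INHOUSE", "VENV"])

# Lookup by member name, as done with env-var values elsewhere in this diff.
assert BYOM_TYPE["VENV"] is BYOM_TYPE.VENV
assert BYOM_TYPE["venv".upper()].name == "VENV"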
@@ -68,16 +71,14 @@ class BYOMHandler(BaseMLEngine):
         # region read and save set default byom type
         try:
             self._default_byom_type = BYOM_TYPE.VENV
-            if os.environ.get(
-                self._default_byom_type = BYOM_TYPE[
-                    os.environ.get('MINDSDB_BYOM_DEFAULT_TYPE').upper()
-                ]
+            if os.environ.get("MINDSDB_BYOM_DEFAULT_TYPE") is not None:
+                self._default_byom_type = BYOM_TYPE[os.environ.get("MINDSDB_BYOM_DEFAULT_TYPE").upper()]
             else:
-                env_var = os.environ.get(
-                if env_var ==
-                    self._default_byom_type = BYOM_TYPE[
-                elif env_var ==
-                    self._default_byom_type = BYOM_TYPE[
+                env_var = os.environ.get("MINDSDB_BYOM_DEFAULT_TYPE")
+                if env_var == "SAVE":
+                    self._default_byom_type = BYOM_TYPE["VENV"]
+                elif env_var == "UNSAVE":
+                    self._default_byom_type = BYOM_TYPE["INHOUSE"]
                 else:
                     raise KeyError
         except KeyError:
@@ -85,11 +86,11 @@ class BYOMHandler(BaseMLEngine):
         # endregion

         # region check if 'inhouse' BYOM is enabled
-        env_var = os.environ.get(
+        env_var = os.environ.get("MINDSDB_BYOM_INHOUSE_ENABLED")
         if env_var is None:
             self._inhouse_enabled = False if is_cloud else True
         else:
-            self._inhouse_enabled = env_var.lower() in (
+            self._inhouse_enabled = env_var.lower() in ("true", "1")
         # endregion

         super().__init__(model_storage, engine_storage, **kwargs)
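Aside: the two hunks above read the BYOM environment flags at handler start-up. A minimal sketch of the same checks outside MindsDB (variable names and fallback values here are illustrative assumptions, not taken from the package):

import os

# Mirrors __init__ above: on cloud, BYOM is opt-in via MINDSDB_BYOM_ENABLED.
byom_enabled = os.environ.get("MINDSDB_BYOM_ENABLED", "false").lower() in ("true", "1")

# MINDSDB_BYOM_DEFAULT_TYPE accepts an enum name ("VENV"/"INHOUSE") or the legacy
# "SAVE"/"UNSAVE" values handled in the else-branch above.
default_type = os.environ.get("MINDSDB_BYOM_DEFAULT_TYPE", "VENV").upper()

# MINDSDB_BYOM_INHOUSE_ENABLED toggles the in-process (unsafe) wrapper.
inhouse_enabled = os.environ.get("MINDSDB_BYOM_INHOUSE_ENABLED", "").lower() in ("true", "1")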
@@ -117,18 +118,18 @@ class BYOMHandler(BaseMLEngine):
     def create_validation(target: str, args: dict = None, **kwargs) -> None:
         if isinstance(args, dict) is False:
             return
-        using_args = args.get(
-        engine_version = using_args.get(
+        using_args = args.get("using", {})
+        engine_version = using_args.get("engine_version")
         if engine_version is not None:
             engine_version = BYOMHandler.normalize_engine_version(engine_version)
         else:
-            connection_args = kwargs[
-            versions = connection_args.get(
+            connection_args = kwargs["handler_storage"].get_connection_args()
+            versions = connection_args.get("versions")
             if isinstance(versions, dict):
                 engine_version = max([int(x) for x in versions.keys()])
             else:
                 engine_version = 1
-        using_args[
+        using_args["engine_version"] = engine_version

     def get_model_engine_version(self) -> int:
         """Return current model engine version
@@ -136,7 +137,7 @@ class BYOMHandler(BaseMLEngine):
         Returns:
             int: engine version
         """
-        engine_version = self.model_storage.get_info()[
+        engine_version = self.model_storage.get_info()["learn_args"].get("using", {}).get("engine_version")
         engine_version = BYOMHandler.normalize_engine_version(engine_version)
         return engine_version

@@ -154,28 +155,26 @@ class BYOMHandler(BaseMLEngine):
             version = 1
         if isinstance(version, str):
             version = int(version)
-        version_mark =
+        version_mark = ""
         if version > 1:
-            version_mark = f
+            version_mark = f"_{version}"
         version_str = str(version)

         self.engine_storage.fileStorage.pull()
         try:
-            code = self.engine_storage.fileStorage.file_get(f
-            modules_str = self.engine_storage.fileStorage.file_get(f
+            code = self.engine_storage.fileStorage.file_get(f"code{version_mark}")
+            modules_str = self.engine_storage.fileStorage.file_get(f"modules{version_mark}")
         except FileNotFoundError:
             raise Exception(f"Engine version '{version}' does not exists")

         if version_str not in self.model_wrappers:
             connection_args = self.engine_storage.get_connection_args()
-            version_meta = connection_args[
+            version_meta = connection_args["versions"][version_str]

             try:
-                engine_version_type = BYOM_TYPE[
-                    version_meta.get('type', self._default_byom_type.name).upper()
-                ]
+                engine_version_type = BYOM_TYPE[version_meta.get("type", self._default_byom_type.name).upper()]
             except KeyError:
-                raise Exception(
+                raise Exception("Unknown BYOM engine type")

             if engine_version_type == BYOM_TYPE.INHOUSE:
                 if self._inhouse_enabled is False:
@@ -185,20 +184,20 @@ class BYOMHandler(BaseMLEngine):
                     code=code,
                     modules_str=modules_str,
                     engine_id=self.engine_storage.integration_id,
-                    engine_version=version
+                    engine_version=version,
                 )
                 self.model_wrappers[version_str] = self.inhouse_model_wrapper
             elif engine_version_type == BYOM_TYPE.VENV:
-                if version_meta.get(
-                    version_meta[
+                if version_meta.get("venv_status") != "ready":
+                    version_meta["venv_status"] = "creating"
                     self.engine_storage.update_connection_args(connection_args)
                 self.model_wrappers[version_str] = ModelWrapperSafe(
                     code=code,
                     modules_str=modules_str,
                     engine_id=self.engine_storage.integration_id,
-                    engine_version=version
+                    engine_version=version,
                 )
-                version_meta[
+                version_meta["venv_status"] = "ready"
                 self.engine_storage.update_connection_args(connection_args)

         return self.model_wrappers[version_str]
@@ -206,130 +205,114 @@ class BYOMHandler(BaseMLEngine):
     def describe(self, attribute: Optional[str] = None) -> pd.DataFrame:
         engine_version = self.get_model_engine_version()
         mp = self._get_model_proxy(engine_version)
-        model_state = self.model_storage.file_get(
+        model_state = self.model_storage.file_get("model")
         return mp.describe(model_state, attribute)

     def create(self, target, df=None, args=None, **kwargs):
-        using_args = args.get(
-        engine_version = using_args.get(
+        using_args = args.get("using", {})
+        engine_version = using_args.get("engine_version")

         model_proxy = self._get_model_proxy(engine_version)
         model_state = model_proxy.train(df, target, args)

-        self.model_storage.file_set(
+        self.model_storage.file_set("model", model_state)

         # TODO return columns?

         def convert_type(field_type):
             if pd_types.is_integer_dtype(field_type):
-                return
+                return "integer"
             elif pd_types.is_numeric_dtype(field_type):
-                return
+                return "float"
             elif pd_types.is_datetime64_any_dtype(field_type):
-                return
+                return "datetime"
             else:
-                return
+                return "categorical"

-        columns = {
-            target: convert_type(object)
-        }
+        columns = {target: convert_type(object)}

         self.model_storage.columns_set(columns)

     def predict(self, df, args=None):
-        pred_args = args.get(
+        pred_args = args.get("predict_params", {})

-        engine_version = pred_args.get(
+        engine_version = pred_args.get("engine_version")
         if engine_version is not None:
             engine_version = int(engine_version)
         else:
             engine_version = self.get_model_engine_version()

         model_proxy = self._get_model_proxy(engine_version)
-        model_state = self.model_storage.file_get(
+        model_state = self.model_storage.file_get("model")
         pred_df = model_proxy.predict(df, model_state, pred_args)

         return pred_df

     def create_engine(self, connection_args):
-        code_path = Path(connection_args[
-        self.engine_storage.fileStorage.file_set(
-            'code',
-            code_path.read_bytes()
-        )
+        code_path = Path(connection_args["code"])
+        self.engine_storage.fileStorage.file_set("code", code_path.read_bytes())

-        requirements_path = Path(connection_args[
-        self.engine_storage.fileStorage.file_set(
-            'modules',
-            requirements_path.read_bytes()
-        )
+        requirements_path = Path(connection_args["modules"])
+        self.engine_storage.fileStorage.file_set("modules", requirements_path.read_bytes())

         self.engine_storage.fileStorage.push()

-        self.engine_storage.update_connection_args(
-
-
-
-
-
-
-
-                    connection_args.get(
-
-            }
+        self.engine_storage.update_connection_args(
+            {
+                "handler_version": __version__,
+                "mode": connection_args.get("mode"),
+                "versions": {
+                    "1": {
+                        "code": code_path.name,
+                        "requirements": requirements_path.name,
+                        "type": self.normalize_byom_type(connection_args.get("type")).name.lower(),
+                    }
+                },
             }
-
+        )

         model_proxy = self._get_model_proxy()
         try:
-            info = model_proxy.check(connection_args.get(
-            self.engine_storage.json_set(
+            info = model_proxy.check(connection_args.get("mode"))
+            self.engine_storage.json_set("methods", info["methods"])

         except Exception as e:
-            if hasattr(model_proxy,
+            if hasattr(model_proxy, "remove_venv"):
                 model_proxy.remove_venv()
             raise e

     def update_engine(self, connection_args: dict) -> None:
         """Add new version of engine

-
-
+        Args:
+            connection_args (dict): paths to code and requirements
         """
-        code_path = Path(connection_args[
-        requirements_path = Path(connection_args[
+        code_path = Path(connection_args["code"])
+        requirements_path = Path(connection_args["modules"])

         engine_connection_args = self.engine_storage.get_connection_args()
-        if isinstance(engine_connection_args, dict) is False or
+        if isinstance(engine_connection_args, dict) is False or "handler_version" not in engine_connection_args:
             engine_connection_args = {
-
-
-
-
-
-
+                "handler_version": __version__,
+                "versions": {
+                    "1": {
+                        "code": "code.py",
+                        "requirements": "requirements.txt",
+                        "type": self._default_byom_type.name.lower(),
                     }
-                }
+                },
             }
-        new_version = str(max([int(x) for x in engine_connection_args[
-
-        engine_connection_args[
-
-
-
-                connection_args.get('type')
-            ).name.lower()
+        new_version = str(max([int(x) for x in engine_connection_args["versions"].keys()]) + 1)
+
+        engine_connection_args["versions"][new_version] = {
+            "code": code_path.name,
+            "requirements": requirements_path.name,
+            "type": self.normalize_byom_type(connection_args.get("type")).name.lower(),
         }

-        self.engine_storage.fileStorage.file_set(
-            f'code_{new_version}',
-            code_path.read_bytes()
-        )
+        self.engine_storage.fileStorage.file_set(f"code_{new_version}", code_path.read_bytes())

-        self.engine_storage.fileStorage.file_set(
-            f'modules_{new_version}',
-            requirements_path.read_bytes()
-        )
+        self.engine_storage.fileStorage.file_set(f"modules_{new_version}", requirements_path.read_bytes())
         self.engine_storage.fileStorage.push()

         self.engine_storage.update_connection_args(engine_connection_args)
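Aside: create_engine and update_engine above keep per-version metadata in the engine's connection args. A sketch of the structure they build, with illustrative field values (file names, version string and types here are assumptions, only the keys and the new_version computation come from the hunk above):

# Illustrative shape of the metadata written by create_engine and extended by update_engine.
engine_connection_args = {
    "handler_version": "25.9.3rc1",
    "mode": None,
    "versions": {
        "1": {"code": "model.py", "requirements": "requirements.txt", "type": "venv"},
        # update_engine appends the next integer key as a string:
        "2": {"code": "model_v2.py", "requirements": "requirements.txt", "type": "venv"},
    },
}

# Same computation as in the hunk above.
new_version = str(max(int(v) for v in engine_connection_args["versions"]) + 1)
assert new_version == "3"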
@@ -337,28 +320,28 @@ class BYOMHandler(BaseMLEngine):
         model_proxy = self._get_model_proxy(new_version)
         try:
             methods = model_proxy.check()
-            self.engine_storage.json_set(
+            self.engine_storage.json_set("methods", methods)

         except Exception as e:
-            if hasattr(model_proxy,
+            if hasattr(model_proxy, "remove_venv"):
                 model_proxy.remove_venv()
             raise e

     def function_list(self):
-        return self.engine_storage.json_get(
+        return self.engine_storage.json_get("methods")

     def function_call(self, name, args):
         mp = self._get_model_proxy()
         return mp.func_call(name, args)

     def finetune(self, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = None) -> None:
-        using_args = args.get(
-        engine_version = using_args.get(
+        using_args = args.get("using", {})
+        engine_version = using_args.get("engine_version")

         model_storage = self.model_storage
         # TODO: should probably refactor at some point, as a bit of the logic is shared with lightwood's finetune logic
         try:
-            base_predictor_id = args[
+            base_predictor_id = args["base_model_id"]
             base_predictor_record = db.Predictor.query.get(base_predictor_id)
             if base_predictor_record.status != PREDICTOR_STATUS.COMPLETE:
                 raise Exception("Base model must be in status 'complete'")
@@ -366,33 +349,34 @@ class BYOMHandler(BaseMLEngine):
             predictor_id = model_storage.predictor_id
             predictor_record = db.Predictor.query.get(predictor_id)

-            predictor_record.data = {
+            predictor_record.data = {
+                "training_log": "training"
+            } # TODO move to ModelStorage (don't work w/ db directly)
             predictor_record.training_start_at = datetime.now()
             predictor_record.status = PREDICTOR_STATUS.FINETUNING # TODO: parallel execution block
             db.session.commit()

             model_proxy = self._get_model_proxy(engine_version)
-            model_state = self.base_model_storage.file_get(
-            model_state = model_proxy.finetune(df, model_state, args=args.get(
+            model_state = self.base_model_storage.file_get("model")
+            model_state = model_proxy.finetune(df, model_state, args=args.get("using", {}))

             # region hack to speedup file saving
-            with profiler.Context(
-                dest_abs_path = model_storage.fileStorage.folder_path /
-                with open(dest_abs_path,
+            with profiler.Context("finetune-byom-write-file"):
+                dest_abs_path = model_storage.fileStorage.folder_path / "model"
+                with open(dest_abs_path, "wb") as fd:
                     fd.write(model_state)
                 model_storage.fileStorage.push(compression_level=0)
             # endregion

-            predictor_record.update_status =
+            predictor_record.update_status = "up_to_date"
             predictor_record.status = PREDICTOR_STATUS.COMPLETE
             predictor_record.training_stop_at = datetime.now()
             db.session.commit()

         except Exception as e:
-            logger.error(
+            logger.error("Unexpected error during BYOM finetune:", exc_info=True)
             predictor_id = model_storage.predictor_id
             predictor_record = db.Predictor.query.with_for_update().get(predictor_id)
-            logger.error(traceback.format_exc())
             error_message = format_exception_error(e)
             predictor_record.data = {"error": error_message}
             predictor_record.status = PREDICTOR_STATUS.ERROR
@@ -406,8 +390,7 @@ class BYOMHandler(BaseMLEngine):


 class ModelWrapperUnsafe:
-    """
-    """
+    """Model wrapper that executes learn/predict in current process"""

     def __init__(self, code, modules_str, engine_id, engine_version: int):
         self.module = import_string(code)
@@ -444,7 +427,7 @@ class ModelWrapperUnsafe:
         return pickle.dumps(self.model_instance.__dict__, protocol=5)

     def describe(self, model_state, attribute: Optional[str] = None) -> pd.DataFrame:
-        if hasattr(self.model_instance,
+        if hasattr(self.model_instance, "describe"):
             model_state = pickle.loads(model_state)
             self.model_instance.__dict__ = model_state
             return self.model_instance.describe(attribute)
@@ -460,15 +443,14 @@ class ModelWrapperUnsafe:


 class ModelWrapperSafe:
-    """
-    """
+    """Model wrapper that executes learn/predict in venv"""

     def __init__(self, code, modules_str, engine_id, engine_version: int):
         self.code = code
         modules = self.parse_requirements(modules_str)

         self.config = Config()
-        self.is_cloud = Config().get(
+        self.is_cloud = Config().get("cloud", False)

         self.env_path = None
         self.env_storage_path = None
@@ -478,37 +460,37 @@ class ModelWrapperSafe:
         try:
             import virtualenv

-            base_path = self.config.get(
+            base_path = self.config.get("byom", {}).get("venv_path")
             if base_path is None:
                 # create in root path
-                base_path = Path(self.config.paths[
+                base_path = Path(self.config.paths["root"]) / "venvs"
             else:
                 base_path = Path(base_path)
             base_path.mkdir(parents=True, exist_ok=True)

-            env_folder_name = f
+            env_folder_name = f"env_{engine_id}"
             if isinstance(engine_version, int) and engine_version > 1:
-                env_folder_name = f
+                env_folder_name = f"{env_folder_name}_{engine_version}"

             self.env_storage_path = base_path / env_folder_name
             if self.is_cloud:
-                bese_env_path = Path(tempfile.gettempdir()) /
+                bese_env_path = Path(tempfile.gettempdir()) / "mindsdb" / "venv"
                 bese_env_path.mkdir(parents=True, exist_ok=True)
                 self.env_path = bese_env_path / env_folder_name
-                tar_path = self.env_storage_path.with_suffix(
+                tar_path = self.env_storage_path.with_suffix(".tar")
                 if self.env_path.exists() is False and tar_path.exists() is True:
                     with tarfile.open(tar_path) as tar:
                         safe_extract(tar, path=bese_env_path)
             else:
                 self.env_path = self.env_storage_path

-            if sys.platform in (
-                exectable_folder_name =
+            if sys.platform in ("win32", "cygwin"):
+                exectable_folder_name = "Scripts"
             else:
-                exectable_folder_name =
+                exectable_folder_name = "bin"

-            pip_cmd = self.env_path / exectable_folder_name /
-            self.python_path = self.env_path / exectable_folder_name /
+            pip_cmd = self.env_path / exectable_folder_name / "pip"
+            self.python_path = self.env_path / exectable_folder_name / "python"

             if self.env_path.exists():
                 # already exists. it means requirements are already installed
@@ -516,7 +498,7 @@ class ModelWrapperSafe:

             # create
             logger.info(f"Creating new environment: {self.env_path}")
-            virtualenv.cli_run([
+            virtualenv.cli_run(["-p", sys.executable, str(self.env_path)])
             logger.info(f"Created new environment: {self.env_path}")

             if len(modules) > 0:
@@ -537,13 +519,14 @@ class ModelWrapperSafe:
         if self.is_cloud and self.env_storage_path != self.env_path:
             old_cwd = os.getcwd()
             os.chdir(str(bese_env_path))
-            tar_path = self.env_path.with_suffix(
-            with tarfile.open(name=str(tar_path), mode=
+            tar_path = self.env_path.with_suffix(".tar")
+            with tarfile.open(name=str(tar_path), mode="w") as tar:
                 tar.add(str(self.env_path.name))
             os.chdir(old_cwd)
             subprocess.run(
-                [
-                check=True,
+                ["cp", "-R", "--no-preserve=mode,ownership", str(tar_path), str(base_path / tar_path.name)],
+                check=True,
+                shell=False,
             )
             tar_path.unlink()

@@ -552,46 +535,46 @@ class ModelWrapperSafe:
             shutil.rmtree(str(self.env_path))

         if self.is_cloud:
-            tar_path = self.env_storage_path.with_suffix(
+            tar_path = self.env_storage_path.with_suffix(".tar")
             tar_path.unlink()

     def parse_requirements(self, requirements):
         # get requirements from string
         # they should be located at the top of the file, before code

-        pattern =
+        pattern = "^[\w\\[\\]-]+[=!<>\s]*[\d\.]*[,=!<>\s]*[\d\.]*$" # noqa
         modules = []
-        for line in requirements.split(b
+        for line in requirements.split(b"\n"):
             line = line.decode().strip()
             if line:
                 if re.match(pattern, line):
                     modules.append(line)
                 else:
-                    raise Exception(f
+                    raise Exception(f"Wrong requirement: {line}")

-        is_pandas = any([m.lower().startswith(
+        is_pandas = any([m.lower().startswith("pandas") for m in modules])
         if not is_pandas:
-            modules.append(
-            modules.append(
+            modules.append("pandas>=2.0.0,<2.1.0")
+            modules.append("numpy<2.0.0")

         # for dataframe serialization
-        modules.append(
+        modules.append("pyarrow==19.0.0")
         return modules

     def install_modules(self, modules, pip_cmd):
         # install in current environment using pip
         for module in modules:
             logger.debug(f"BYOM install module: {module}")
-            p = subprocess.Popen([pip_cmd,
+            p = subprocess.Popen([pip_cmd, "install", module], stderr=subprocess.PIPE)
             p.wait()
             if p.returncode != 0:
-                raise Exception(f
+                raise Exception(f"Problem with installing module {module}: {p.stderr.read()}")

     def _run_command(self, params):
         logger.debug(f"BYOM run command: {params.get('method')}")
         params_enc = encode(params)

-        wrapper_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+        wrapper_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "proc_wrapper.py")
         p = subprocess.Popen(
             [str(self.python_path), wrapper_path],
             stdin=subprocess.PIPE,
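Aside: parse_requirements above filters each line of the uploaded requirements file against a regex before handing it to pip. A small standalone check of that pattern, re-typed here as an equivalent raw string (the test strings are illustrative):

import re

# Same regex as in the hunk above, written as a raw string.
pattern = r"^[\w\[\]-]+[=!<>\s]*[\d\.]*[,=!<>\s]*[\d\.]*$"

assert re.match(pattern, "pandas>=2.0.0,<2.1.0")   # pinned range: accepted
assert re.match(pattern, "scikit-learn")           # bare package name: accepted
assert re.match(pattern, "xgboost == 2.0")         # spaces around operator: accepted
assert not re.match(pattern, "requests @ git+https://example.com/repo.git")  # URL specs rejected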
@@ -613,44 +596,44 @@ class ModelWrapperSafe:

     def check(self, mode: str = None):
         params = {
-
-
-
+            "method": BYOM_METHOD.CHECK.value,
+            "code": self.code,
+            "mode": mode,
         }
         return self._run_command(params)

     def train(self, df, target, args):
         params = {
-
-
-
-
-
+            "method": BYOM_METHOD.TRAIN.value,
+            "code": self.code,
+            "df": None,
+            "to_predict": target,
+            "args": args,
         }
         if df is not None:
-            params[
+            params["df"] = pd_encode(df)

         model_state = self._run_command(params)
         return model_state

     def predict(self, df, model_state, args):
         params = {
-
-
-
-
-
+            "method": BYOM_METHOD.PREDICT.value,
+            "code": self.code,
+            "model_state": model_state,
+            "df": pd_encode(df),
+            "args": args,
         }
         pred_df = self._run_command(params)
         return pd_decode(pred_df)

     def finetune(self, df, model_state, args):
         params = {
-
-
-
-
-
+            "method": BYOM_METHOD.FINETUNE.value,
+            "code": self.code,
+            "model_state": model_state,
+            "df": pd_encode(df),
+            "args": args,
         }

         model_state = self._run_command(params)
@@ -658,10 +641,10 @@ class ModelWrapperSafe:

     def describe(self, model_state, attribute: Optional[str] = None) -> pd.DataFrame:
         params = {
-
-
-
-
+            "method": BYOM_METHOD.DESCRIBE.value,
+            "code": self.code,
+            "model_state": model_state,
+            "attribute": attribute,
         }
         enc_df = self._run_command(params)
         df = pd_decode(enc_df)
@@ -669,10 +652,10 @@ class ModelWrapperSafe:

     def func_call(self, func_name, args):
         params = {
-
-
-
-
+            "method": BYOM_METHOD.FUNC_CALL.value,
+            "code": self.code,
+            "func_name": func_name,
+            "args": args,
         }
         result = self._run_command(params)
         return result