MindsDB 25.5.4.2__py3-none-any.whl → 25.6.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +28 -25
- mindsdb/api/a2a/common/server/server.py +32 -26
- mindsdb/api/executor/command_executor.py +69 -14
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
- mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
- mindsdb/api/executor/planner/plan_join.py +67 -77
- mindsdb/api/executor/planner/query_planner.py +176 -155
- mindsdb/api/executor/planner/steps.py +37 -12
- mindsdb/api/executor/sql_query/result_set.py +45 -64
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
- mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
- mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
- mindsdb/api/executor/utilities/sql.py +42 -48
- mindsdb/api/http/namespaces/config.py +1 -1
- mindsdb/api/http/namespaces/file.py +14 -23
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
- mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
- mindsdb/integrations/libs/api_handler.py +261 -57
- mindsdb/integrations/libs/base.py +100 -29
- mindsdb/integrations/utilities/files/file_reader.py +99 -73
- mindsdb/integrations/utilities/handler_utils.py +23 -8
- mindsdb/integrations/utilities/sql_utils.py +35 -40
- mindsdb/interfaces/agents/agents_controller.py +196 -192
- mindsdb/interfaces/agents/constants.py +7 -1
- mindsdb/interfaces/agents/langchain_agent.py +42 -11
- mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
- mindsdb/interfaces/data_catalog/__init__.py +0 -0
- mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
- mindsdb/interfaces/database/database.py +81 -57
- mindsdb/interfaces/database/integrations.py +220 -234
- mindsdb/interfaces/database/log.py +72 -104
- mindsdb/interfaces/database/projects.py +156 -193
- mindsdb/interfaces/file/file_controller.py +21 -65
- mindsdb/interfaces/knowledge_base/controller.py +63 -10
- mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
- mindsdb/interfaces/skills/skills_controller.py +54 -36
- mindsdb/interfaces/skills/sql_agent.py +109 -86
- mindsdb/interfaces/storage/db.py +223 -79
- mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
- mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
- mindsdb/utilities/config.py +9 -2
- mindsdb/utilities/log.py +35 -26
- mindsdb/utilities/ml_task_queue/task.py +19 -22
- mindsdb/utilities/render/sqlalchemy_render.py +129 -181
- mindsdb/utilities/starters.py +40 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +253 -253
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +69 -61
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/agents/constants.py

@@ -15,7 +15,10 @@ OPEN_AI_CHAT_MODELS = (
     "gpt-4-32k",
     "gpt-4-1106-preview",
     "gpt-4-0125-preview",
+    "gpt-4.1",
+    "gpt-4.1-mini",
     "gpt-4o",
+    "o4-mini",
     "o3-mini",
     "o1-mini",
 )
@@ -216,8 +219,11 @@ You are an AI assistant powered by MindsDB. When answering questions, follow the
    - Finally use kb_query_tool to query the knowledge base for specific information
 
 2. For questions about database tables and their contents:
-   - Use the
+   - Use the sql_db_query to query the tables directly
    - You can join tables if needed to get comprehensive information
+   - **Important Rule for SQL Queries:** If you formulate an SQL query as part of answering a user's question, you *must* then use the `sql_db_query` tool to execute that query and get its results. The SQL query string itself is NOT the final answer to the user unless the user has specifically asked for the query. Your final AI response should be based on the *results* obtained from executing the query.
+
 
 For factual questions, ALWAYS use the available tools to look up information rather than relying on your internal knowledge.
+
 """
mindsdb/interfaces/agents/langchain_agent.py

@@ -226,7 +226,7 @@ def process_chunk(chunk):
 
 
 class LangchainAgent:
-    def __init__(self, agent: db.Agents, model: dict = None):
+    def __init__(self, agent: db.Agents, model: dict = None, params: dict = None):
         self.agent = agent
         self.model = model
 
@@ -239,16 +239,35 @@ class LangchainAgent:
         self.mdb_langfuse_callback_handler: Optional[object] = None  # custom (see langfuse_callback_handler.py)
 
         self.langfuse_client_wrapper = LangfuseClientWrapper()
-        self.args = self._initialize_args()
+        self.args = self._initialize_args(params)
 
         # Back compatibility for old models
         self.provider = self.args.get("provider", get_llm_provider(self.args))
 
-    def _initialize_args(self) -> dict:
-        """
-
-
-
+    def _initialize_args(self, params: dict = None) -> dict:
+        """
+        Initialize the arguments for agent execution.
+
+        Takes the parameters passed during execution and sets necessary defaults.
+        The params are already merged with defaults by AgentsController.get_agent_llm_params.
+
+        Args:
+            params: Parameters for agent execution (already merged with defaults)
+
+        Returns:
+            dict: Final parameters for agent execution
+        """
+        # Use the parameters passed to the method (already merged with defaults by AgentsController)
+        # No fallback needed as AgentsController.get_agent_llm_params already handles this
+        args = params.copy() if params else {}
+
+        # Set model name and provider if given in create agent otherwise use global llm defaults
+        # AgentsController.get_agent_llm_params
+        if self.agent.model_name is not None:
+            args["model_name"] = self.agent.model_name
+        if self.agent.provider is not None:
+            args["provider"] = self.agent.provider
+
         args["embedding_model_provider"] = args.get("embedding_model", get_embedding_model_provider(args))
 
         # agent is using current langchain model
@@ -261,11 +280,20 @@ class LangchainAgent:
             # only update prompt_template if it is set on the model
             args["prompt_template"] = prompt_template
 
+        # Set default prompt template if not provided
         if args.get("prompt_template") is None:
+            # Default prompt template depends on agent mode
            if args.get("mode") == "retrieval":
                args["prompt_template"] = DEFAULT_RAG_PROMPT_TEMPLATE
+                logger.info(f"Using default retrieval prompt template: {DEFAULT_RAG_PROMPT_TEMPLATE[:50]}...")
            else:
-
+                # Set a default prompt template for non-retrieval mode
+                default_prompt = "you are an assistant, answer using the tables connected"
+                args["prompt_template"] = default_prompt
+                logger.info(f"Using default prompt template: {default_prompt}")
+
+        if "prompt_template" in args:
+            logger.info(f"Using prompt template: {args['prompt_template'][:50]}...")
 
         return args
 
@@ -318,7 +346,7 @@ class LangchainAgent:
         self.provider = args.get("provider", get_llm_provider(args))
 
         df = df.reset_index(drop=True)
-        agent = self.create_agent(df
+        agent = self.create_agent(df)
         # Use last message as prompt, remove other questions.
         user_column = args.get("user_column", USER_COLUMN)
         df.iloc[:-1, df.columns.get_loc(user_column)] = None
@@ -348,14 +376,17 @@ class LangchainAgent:
         self.provider = args.get("provider", get_llm_provider(args))
 
         df = df.reset_index(drop=True)
-        agent = self.create_agent(df
+        agent = self.create_agent(df)
         # Use last message as prompt, remove other questions.
         user_column = args.get("user_column", USER_COLUMN)
         df.iloc[:-1, df.columns.get_loc(user_column)] = None
         return self.stream_agent(df, agent, args)
 
-    def create_agent(self, df: pd.DataFrame
+    def create_agent(self, df: pd.DataFrame) -> AgentExecutor:
         # Set up tools.
+
+        args = self.args
+
         llm = create_chat_model(args)
         self.llm = llm
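The net effect of these `langchain_agent.py` changes is that parameter merging now happens upstream in `AgentsController.get_agent_llm_params`, and `LangchainAgent` simply receives the merged dict. A minimal sketch of the new flow, assuming a MindsDB server context and an existing `db.Agents` row named `agent_db` (both placeholders; the controller call mirrors the one in the `mcp_client_agent.py` diff below):

```python
# Sketch only: assumes MindsDB is initialized and `agent_db` is a db.Agents row.
from mindsdb.interfaces.agents.agents_controller import AgentsController
from mindsdb.interfaces.agents.langchain_agent import LangchainAgent

agent_controller = AgentsController()

# Defaults and per-agent params are merged by the controller, not the agent.
merged_params = agent_controller.get_agent_llm_params(agent_db.params)

# The constructor now accepts the merged params; _initialize_args(params) only
# layers agent.model_name / agent.provider and prompt-template defaults on top.
agent = LangchainAgent(agent_db, params=merged_params)
```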
mindsdb/interfaces/agents/mcp_client_agent.py

@@ -63,11 +63,19 @@ class MCPQueryTool(BaseTool):
         return loop.run_until_complete(self._arun(query))
 
 
+# todo move instantiation to agent controller
 class MCPLangchainAgent(LangchainAgent):
     """Extension of LangchainAgent that delegates to MCP server"""
 
-    def __init__(
-
+    def __init__(
+        self,
+        agent: db.Agents,
+        model: dict = None,
+        params: dict = None,
+        mcp_host: str = "127.0.0.1",
+        mcp_port: int = 47337,
+    ):
+        super().__init__(agent, model, params)
         self.mcp_host = mcp_host
         self.mcp_port = mcp_port
         self.exit_stack = AsyncExitStack()
@@ -85,7 +93,7 @@ class MCPLangchainAgent(LangchainAgent):
         server_params = StdioServerParameters(
             command="python",
             args=["-m", "mindsdb", "--api=mcp"],
-            env={"MCP_HOST": self.mcp_host, "MCP_PORT": str(self.mcp_port)}
+            env={"MCP_HOST": self.mcp_host, "MCP_PORT": str(self.mcp_port)},
         )
 
         logger.info(f"Connecting to MCP server at {self.mcp_host}:{self.mcp_port}")
@@ -99,7 +107,9 @@ class MCPLangchainAgent(LangchainAgent):
 
             # Test the connection by listing tools
             tools_response = await self.session.list_tools()
-            logger.info(
+            logger.info(
+                f"Successfully connected to MCP server. Available tools: {[tool.name for tool in tools_response.tools]}"
+            )
 
         except Exception as e:
             logger.error(f"Failed to connect to MCP server: {str(e)}")
@@ -141,7 +151,7 @@ class MCPLangchainAgent(LangchainAgent):
         response = super().get_completion(messages, stream)
 
         # Ensure response is a string (not a DataFrame)
-        if hasattr(response,
+        if hasattr(response, "to_string"):  # It's a DataFrame
             return response.to_string()
 
         return response
@@ -167,7 +177,7 @@ class LiteLLMAgentWrapper:
         formatted_messages = [
             {
                 "question": msg["content"] if msg["role"] == "user" else "",
-                "answer": msg["content"] if msg["role"] == "assistant" else ""
+                "answer": msg["content"] if msg["role"] == "assistant" else "",
             }
             for msg in messages
         ]
@@ -177,23 +187,16 @@ class LiteLLMAgentWrapper:
 
         # Ensure response is a string
         if not isinstance(response, str):
-            if hasattr(response,
+            if hasattr(response, "to_string"):  # It's a DataFrame
                 response = response.to_string()
             else:
                 response = str(response)
 
         # Format response in LiteLLM expected format
         return {
-            "choices": [
-                {
-                    "message": {
-                        "role": "assistant",
-                        "content": response
-                    }
-                }
-            ],
+            "choices": [{"message": {"role": "assistant", "content": response}}],
             "model": self.agent.args["model_name"],
-            "object": "chat.completion"
+            "object": "chat.completion",
         }
 
     async def acompletion_stream(self, messages: List[Dict[str, str]], **kwargs) -> Iterator[Dict[str, Any]]:
@@ -202,7 +205,7 @@ class LiteLLMAgentWrapper:
         formatted_messages = [
             {
                 "question": msg["content"] if msg["role"] == "user" else "",
-                "answer": msg["content"] if msg["role"] == "assistant" else ""
+                "answer": msg["content"] if msg["role"] == "assistant" else "",
             }
             for msg in messages
         ]
@@ -217,7 +220,7 @@ class LiteLLMAgentWrapper:
                 yield {
                     "choices": [{"delta": {"role": "assistant", "content": content}}],
                     "model": model_name,
-                    "object": "chat.completion.chunk"
+                    "object": "chat.completion.chunk",
                 }
                 # Allow async context switch
                 await asyncio.sleep(0)
@@ -230,7 +233,9 @@ class LiteLLMAgentWrapper:
         await self.agent.cleanup()
 
 
-def create_mcp_agent(
+def create_mcp_agent(
+    agent_name: str, project_name: str, mcp_host: str = "127.0.0.1", mcp_port: int = 47337
+) -> LiteLLMAgentWrapper:
     """Create an MCP agent and wrap it for LiteLLM compatibility"""
     from mindsdb.interfaces.agents.agents_controller import AgentsController
     from mindsdb.interfaces.storage import db
@@ -245,8 +250,11 @@ def create_mcp_agent(agent_name: str, project_name: str, mcp_host: str = "127.0.
     if agent_db is None:
         raise ValueError(f"Agent {agent_name} not found in project {project_name}")
 
-    #
-
+    # Get merged parameters (defaults + agent params)
+    merged_params = agent_controller.get_agent_llm_params(agent_db.params)
+
+    # Create MCP agent with merged parameters
+    mcp_agent = MCPLangchainAgent(agent_db, params=merged_params, mcp_host=mcp_host, mcp_port=mcp_port)
 
     # Wrap for LiteLLM compatibility
    return LiteLLMAgentWrapper(mcp_agent)
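Putting the `mcp_client_agent.py` pieces together, `create_mcp_agent` now resolves the merged LLM params before building the MCP-backed agent, and returns a LiteLLM-compatible wrapper. A hedged usage sketch (the agent and project names are placeholders; `acompletion_stream` and its chunk shape are taken from the diff above):

```python
# Illustrative only: "my_agent" / "mindsdb" must exist on the server.
import asyncio

from mindsdb.interfaces.agents.mcp_client_agent import create_mcp_agent


async def main():
    wrapper = create_mcp_agent("my_agent", "mindsdb", mcp_host="127.0.0.1", mcp_port=47337)
    messages = [{"role": "user", "content": "How many rows are in the sales table?"}]
    async for chunk in wrapper.acompletion_stream(messages):
        # Chunks follow the LiteLLM format built above:
        # {"choices": [{"delta": {...}}], "model": ..., "object": "chat.completion.chunk"}
        print(chunk["choices"][0]["delta"].get("content", ""), end="")


asyncio.run(main())
```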
mindsdb/interfaces/data_catalog/__init__.py (file without changes)
mindsdb/interfaces/data_catalog/base_data_catalog.py (new file)

@@ -0,0 +1,54 @@
+from typing import List, Optional, Union
+
+from mindsdb.integrations.libs.api_handler import MetaAPIHandler
+from mindsdb.integrations.libs.base import MetaDatabaseHandler
+from mindsdb.utilities import log
+
+
+logger = log.getLogger("mindsdb")
+
+
+class BaseDataCatalog:
+    """
+    This is the base class for the Data Catalog interface.
+    """
+
+    def __init__(self, database_name: str, table_names: Optional[List[str]] = None) -> None:
+        """
+        Initialize the DataCatalogReader.
+
+        Args:
+            database_name (str): The data source to read/write metadata from.
+            table_names (Optional[List[str]]): The list of table names to read or write metadata for. If None, all tables will be read or written.
+        """
+        from mindsdb.api.executor.controllers.session_controller import (
+            SessionController,
+        )
+
+        session = SessionController()
+
+        self.database_name = database_name
+        self.data_handler: Union[MetaDatabaseHandler, MetaAPIHandler] = session.integration_controller.get_data_handler(
+            database_name
+        )
+        integration = session.integration_controller.get(database_name)
+        self.integration_id = integration["id"]
+        self.integration_engine = integration["engine"]
+        # TODO: Handle situations where a schema is provided along with the database name, e.g., 'schema.table'.
+        # TODO: Handle situations where a file path is provided with integrations like S3, e.g., 'dir/file.csv'.
+        self.table_names = table_names
+
+        self.logger = logger
+
+    def is_data_catalog_supported(self) -> bool:
+        """
+        Check if the data catalog is supported for the given database.
+
+        Returns:
+            bool: True if the data catalog is supported, False otherwise.
+        """
+        if not isinstance(self.data_handler, (MetaDatabaseHandler, MetaAPIHandler)):
+            self.logger.warning(f"Data catalog is not supported for the '{self.integration_engine}' integration'. ")
+            return False
+
+        return True
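Since `BaseDataCatalog` resolves the handler and integration record itself through a fresh `SessionController`, callers only supply the integration name. A minimal sketch, assuming an integration named `my_postgres` exists (the name and table list are placeholders):

```python
# Sketch only: "my_postgres" stands in for an existing integration.
from mindsdb.interfaces.data_catalog.base_data_catalog import BaseDataCatalog

catalog = BaseDataCatalog("my_postgres", table_names=["orders", "customers"])

# Only handlers that subclass MetaDatabaseHandler or MetaAPIHandler qualify;
# anything else logs a warning and returns False.
if catalog.is_data_catalog_supported():
    print(f"Catalog enabled for integration id {catalog.integration_id}")
```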
mindsdb/interfaces/data_catalog/data_catalog_loader.py (new file)

@@ -0,0 +1,359 @@
+from typing import List, Union
+
+import pandas as pd
+
+from mindsdb.integrations.libs.response import RESPONSE_TYPE
+from mindsdb.interfaces.data_catalog.base_data_catalog import BaseDataCatalog
+from mindsdb.interfaces.storage import db
+
+
+class DataCatalogLoader(BaseDataCatalog):
+    """
+    This class is responsible for loading the metadata from a data source (via the handler) and storing it in the data catalog.
+    """
+
+    def load_metadata(self) -> None:
+        """
+        Load the metadata from the handler and store it in the database.
+        """
+        if not self.is_data_catalog_supported():
+            return
+
+        loaded_table_names = self._get_loaded_table_names()
+
+        tables = self._load_table_metadata(loaded_table_names)
+
+        if tables:
+            columns = self._load_column_metadata(tables)
+
+            self._load_column_statistics(tables, columns)
+
+            self._load_primary_keys(tables, columns)
+
+            self._load_foreign_keys(tables, columns)
+
+        self.logger.info(f"Metadata loading completed for {self.database_name}.")
+
+    def _get_loaded_table_names(self) -> List[str]:
+        """
+        Retrieve the names of tables that are already present in the data catalog for the current integration.
+        If table_names are provided, only those tables will be checked.
+
+        Returns:
+            List[str]: Names of tables already loaded in the data catalog.
+        """
+        query = db.session.query(db.MetaTables).filter_by(integration_id=self.integration_id)
+        if self.table_names:
+            query = query.filter(db.MetaTables.name.in_(self.table_names))
+
+        tables = query.all()
+        table_names = [table.name for table in tables]
+
+        if table_names:
+            self.logger.info(f"Tables already loaded in the data catalog: {', '.join(table_names)}.")
+
+        return table_names
+
+    def _load_table_metadata(self, loaded_table_names: List[str] = None) -> List[Union[db.MetaTables, None]]:
+        """
+        Load the table metadata from the handler.
+        """
+        self.logger.info(f"Loading tables for {self.database_name}")
+        response = self.data_handler.meta_get_tables(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load tables for {self.database_name}: {response.error_message}")
+            return []
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No tables to add for {self.database_name}.")
+            return []
+
+        # Filter out tables that are already loaded in the data catalog
+        if loaded_table_names:
+            df = df[~df["table_name"].isin(loaded_table_names)]
+
+        if df.empty:
+            self.logger.info(f"No new tables to load for {self.database_name}.")
+            return []
+
+        df.columns = df.columns.str.lower()
+        tables = self._add_table_metadata(df)
+        self.logger.info(f"Tables loaded for {self.database_name}.")
+        return tables
+
+    def _add_table_metadata(self, df: pd.DataFrame) -> List[db.MetaTables]:
+        """
+        Add the table metadata to the database.
+        """
+        tables = []
+        try:
+            for row in df.to_dict(orient="records"):
+                # Convert the distinct_values_count to an integer if it is not NaN, otherwise set it to None.
+                val = row.get("row_count")
+                row_count = int(val) if pd.notna(val) else None
+
+                record = db.MetaTables(
+                    integration_id=self.integration_id,
+                    name=row.get("table_name") or row.get("name"),
+                    schema=row.get("table_schema"),
+                    description=row.get("table_description"),
+                    type=row.get("table_type"),
+                    row_count=row_count,
+                )
+                tables.append(record)
+
+            db.session.add_all(tables)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add tables: {e}")
+            db.session.rollback()
+            raise
+        return tables
+
+    def _load_column_metadata(self, tables: db.MetaTables) -> List[db.MetaColumns]:
+        """
+        Load the column metadata from the handler.
+        """
+        self.logger.info(f"Loading columns for {self.database_name}")
+        response = self.data_handler.meta_get_columns(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load columns for {self.database_name}: {response.error_message}")
+            return []
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No columns to load for {self.database_name}.")
+            return []
+
+        df.columns = df.columns.str.lower()
+        columns = self._add_column_metadata(df, tables)
+        self.logger.info(f"Columns loaded for {self.database_name}.")
+        return columns
+
+    def _add_column_metadata(self, df: pd.DataFrame, tables: db.MetaTables) -> List[db.MetaColumns]:
+        """
+        Add the column metadata to the database.
+        """
+        columns = []
+        try:
+            for row in df.to_dict(orient="records"):
+                record = db.MetaColumns(
+                    table_id=next((table.id for table in tables if table.name == row.get("table_name"))),
+                    name=row.get("column_name"),
+                    data_type=row.get("data_type"),
+                    default_value=row.get("column_default"),
+                    description=row.get("description"),
+                    is_nullable=row.get("is_nullable"),
+                )
+                columns.append(record)
+
+            db.session.add_all(columns)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add columns: {e}")
+            db.session.rollback()
+            raise
+        return columns
+
+    def _load_column_statistics(self, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Load the column statistics metadata from the handler.
+        """
+        self.logger.info(f"Loading column statistics for {self.database_name}")
+        response = self.data_handler.meta_get_column_statistics(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load column statistics for {self.database_name}: {response.error_message}")
+            return
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No column statistics to load for {self.database_name}.")
+            return
+
+        df.columns = df.columns.str.lower()
+        self._add_column_statistics(df, tables, columns)
+        self.logger.info(f"Column statistics loaded for {self.database_name}.")
+
+    def _add_column_statistics(self, df: pd.DataFrame, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Add the column statistics metadata to the database.
+        """
+        column_statistics = []
+        try:
+            for row in df.to_dict(orient="records"):
+                table_id = next((table.id for table in tables if table.name == row.get("table_name")))
+                column_id = next(
+                    (
+                        column.id
+                        for column in columns
+                        if column.name == row.get("column_name") and column.table_id == table_id
+                    )
+                )
+
+                # Convert the distinct_values_count to an integer if it is not NaN, otherwise set it to None.
+                val = row.get("distinct_values_count")
+                distinct_values_count = int(val) if pd.notna(val) else None
+
+                # Convert the most_common_frequencies to a list of strings.
+                most_common_frequencies = [str(val) for val in row.get("most_common_frequencies") or []]
+
+                record = db.MetaColumnStatistics(
+                    column_id=column_id,
+                    most_common_values=row.get("most_common_values"),
+                    most_common_frequencies=most_common_frequencies,
+                    null_percentage=row.get("null_percentage"),
+                    distinct_values_count=distinct_values_count,
+                    minimum_value=row.get("minimum_value"),
+                    maximum_value=row.get("maximum_value"),
+                )
+                column_statistics.append(record)
+
+            db.session.add_all(column_statistics)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add column statistics: {e}")
+            db.session.rollback()
+            raise
+
+    def _load_primary_keys(self, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Load the primary keys metadata from the handler.
+        """
+        self.logger.info(f"Loading primary keys for {self.database_name}")
+        response = self.data_handler.meta_get_primary_keys(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load primary keys for {self.database_name}: {response.error_message}")
+            return
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No primary keys to load for {self.database_name}.")
+            return
+
+        df.columns = df.columns.str.lower()
+        self._add_primary_keys(df, tables, columns)
+        self.logger.info(f"Primary keys loaded for {self.database_name}.")
+
+    def _add_primary_keys(self, df: pd.DataFrame, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Add the primary keys metadata to the database.
+        """
+        primary_keys = []
+        try:
+            for row in df.to_dict(orient="records"):
+                table_id = next((table.id for table in tables if table.name == row.get("table_name")))
+                column_id = next(
+                    (
+                        column.id
+                        for column in columns
+                        if column.name == row.get("column_name") and column.table_id == table_id
+                    )
+                )
+
+                record = db.MetaPrimaryKeys(
+                    table_id=table_id,
+                    column_id=column_id,
+                    constraint_name=row.get("constraint_name"),
+                )
+                primary_keys.append(record)
+
+            db.session.add_all(primary_keys)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add primary keys: {e}")
+            db.session.rollback()
+            raise
+
+    def _load_foreign_keys(self, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Load the foreign keys metadata from the handler.
+        """
+        self.logger.info(f"Loading foreign keys for {self.database_name}")
+        response = self.data_handler.meta_get_foreign_keys(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to foreign keys for {self.database_name}: {response.error_message}")
+            return
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No foreign keys to load for {self.database_name}.")
+            return
+
+        df.columns = df.columns.str.lower()
+        self._add_foreign_keys(df, tables, columns)
+        self.logger.info(f"Foreign keys loaded for {self.database_name}.")
+
+    def _add_foreign_keys(self, df: pd.DataFrame, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Add the foreign keys metadata to the database.
+        """
+        foreign_keys = []
+        try:
+            for row in df.to_dict(orient="records"):
+                try:
+                    parent_table_id = next((table.id for table in tables if table.name == row.get("parent_table_name")))
+                    parent_column_id = next(
+                        (
+                            column.id
+                            for column in columns
+                            if column.name == row.get("parent_column_name") and column.table_id == parent_table_id
+                        )
+                    )
+                    child_table_id = next((table.id for table in tables if table.name == row.get("child_table_name")))
+                    child_column_id = next(
+                        (
+                            column.id
+                            for column in columns
+                            if column.name == row.get("child_column_name") and column.table_id == child_table_id
+                        )
+                    )
+                except StopIteration:
+                    self.logger.warning(
+                        f"The foreign key relationship for {row.get('parent_table_name')} -> {row.get('child_table_name')} "
+                        f"could not be established. One or more tables or columns may not exist in the metadata."
+                    )
+                    continue
+
+                record = db.MetaForeignKeys(
+                    parent_table_id=parent_table_id,
+                    parent_column_id=parent_column_id,
+                    child_table_id=child_table_id,
+                    child_column_id=child_column_id,
+                    constraint_name=row.get("constraint_name"),
+                )
+                foreign_keys.append(record)
+
+            db.session.add_all(foreign_keys)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add foreign keys: {e}")
+            db.session.rollback()
+            raise
+
+    def unload_metadata(self) -> None:
+        """
+        Remove the metadata for the specified database from the data catalog.
+        """
+        if not self.is_data_catalog_supported():
+            return
+
+        meta_tables = db.session.query(db.MetaTables).filter_by(integration_id=self.integration_id).all()
+
+        if not meta_tables:
+            self.logger.info(f"No metadata found for {self.database_name}. Nothing to remove.")
+            return
+
+        for table in meta_tables:
+            db.session.query(db.MetaPrimaryKeys).filter_by(table_id=table.id).delete()
+            db.session.query(db.MetaForeignKeys).filter(
+                (db.MetaForeignKeys.parent_table_id == table.id) | (db.MetaForeignKeys.child_table_id == table.id)
+            ).delete()
+            meta_columns = db.session.query(db.MetaColumns).filter_by(table_id=table.id).all()
+            for col in meta_columns:
+                db.session.query(db.MetaColumnStatistics).filter_by(column_id=col.id).delete()
+                db.session.delete(col)
+
+            db.session.delete(table)
+        db.session.commit()
+        self.logger.info(f"Metadata for {self.database_name} removed successfully.")
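End to end, the loader walks tables, columns, column statistics, and key constraints in order, skipping tables already present in the catalog (via `_get_loaded_table_names`). A hedged sketch of the public API, again assuming a placeholder `my_postgres` integration:

```python
# Sketch only: "my_postgres" is a placeholder integration name.
from mindsdb.interfaces.data_catalog.data_catalog_loader import DataCatalogLoader

loader = DataCatalogLoader("my_postgres")  # pass table_names=[...] to narrow the scan
loader.load_metadata()    # tables -> columns -> column statistics -> primary/foreign keys

# Removal deletes keys and statistics first, then columns, then the table rows.
loader.unload_metadata()
```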