MindsDB 25.5.4.2__py3-none-any.whl → 25.6.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (69):
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/agent.py +28 -25
  3. mindsdb/api/a2a/common/server/server.py +32 -26
  4. mindsdb/api/executor/command_executor.py +69 -14
  5. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
  6. mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
  7. mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
  8. mindsdb/api/executor/planner/plan_join.py +67 -77
  9. mindsdb/api/executor/planner/query_planner.py +176 -155
  10. mindsdb/api/executor/planner/steps.py +37 -12
  11. mindsdb/api/executor/sql_query/result_set.py +45 -64
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
  13. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
  14. mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
  15. mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
  16. mindsdb/api/executor/utilities/sql.py +42 -48
  17. mindsdb/api/http/namespaces/config.py +1 -1
  18. mindsdb/api/http/namespaces/file.py +14 -23
  19. mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
  20. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
  21. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
  22. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
  23. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
  24. mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
  25. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
  27. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
  28. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
  29. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
  31. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
  32. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
  33. mindsdb/integrations/libs/api_handler.py +261 -57
  34. mindsdb/integrations/libs/base.py +100 -29
  35. mindsdb/integrations/utilities/files/file_reader.py +99 -73
  36. mindsdb/integrations/utilities/handler_utils.py +23 -8
  37. mindsdb/integrations/utilities/sql_utils.py +35 -40
  38. mindsdb/interfaces/agents/agents_controller.py +196 -192
  39. mindsdb/interfaces/agents/constants.py +7 -1
  40. mindsdb/interfaces/agents/langchain_agent.py +42 -11
  41. mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
  42. mindsdb/interfaces/data_catalog/__init__.py +0 -0
  43. mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
  44. mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
  45. mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
  46. mindsdb/interfaces/database/database.py +81 -57
  47. mindsdb/interfaces/database/integrations.py +220 -234
  48. mindsdb/interfaces/database/log.py +72 -104
  49. mindsdb/interfaces/database/projects.py +156 -193
  50. mindsdb/interfaces/file/file_controller.py +21 -65
  51. mindsdb/interfaces/knowledge_base/controller.py +63 -10
  52. mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
  53. mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
  54. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
  55. mindsdb/interfaces/skills/skills_controller.py +54 -36
  56. mindsdb/interfaces/skills/sql_agent.py +109 -86
  57. mindsdb/interfaces/storage/db.py +223 -79
  58. mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
  59. mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
  60. mindsdb/utilities/config.py +9 -2
  61. mindsdb/utilities/log.py +35 -26
  62. mindsdb/utilities/ml_task_queue/task.py +19 -22
  63. mindsdb/utilities/render/sqlalchemy_render.py +129 -181
  64. mindsdb/utilities/starters.py +40 -0
  65. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +253 -253
  66. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +69 -61
  67. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
  68. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
  69. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/agents/constants.py
@@ -15,7 +15,10 @@ OPEN_AI_CHAT_MODELS = (
     "gpt-4-32k",
     "gpt-4-1106-preview",
     "gpt-4-0125-preview",
+    "gpt-4.1",
+    "gpt-4.1-mini",
     "gpt-4o",
+    "o4-mini",
     "o3-mini",
     "o1-mini",
 )
@@ -216,8 +219,11 @@ You are an AI assistant powered by MindsDB. When answering questions, follow the
    - Finally use kb_query_tool to query the knowledge base for specific information

 2. For questions about database tables and their contents:
-   - Use the sql_tool to query the tables directly
+   - Use the sql_db_query to query the tables directly
    - You can join tables if needed to get comprehensive information
+   - **Important Rule for SQL Queries:** If you formulate an SQL query as part of answering a user's question, you *must* then use the `sql_db_query` tool to execute that query and get its results. The SQL query string itself is NOT the final answer to the user unless the user has specifically asked for the query. Your final AI response should be based on the *results* obtained from executing the query.
+

 For factual questions, ALWAYS use the available tools to look up information rather than relying on your internal knowledge.
+
 """
mindsdb/interfaces/agents/langchain_agent.py
@@ -226,7 +226,7 @@ def process_chunk(chunk):


 class LangchainAgent:
-    def __init__(self, agent: db.Agents, model: dict = None):
+    def __init__(self, agent: db.Agents, model: dict = None, params: dict = None):
         self.agent = agent
         self.model = model

@@ -239,16 +239,35 @@ class LangchainAgent:
         self.mdb_langfuse_callback_handler: Optional[object] = None  # custom (see langfuse_callback_handler.py)

         self.langfuse_client_wrapper = LangfuseClientWrapper()
-        self.args = self._initialize_args()
+        self.args = self._initialize_args(params)

         # Back compatibility for old models
         self.provider = self.args.get("provider", get_llm_provider(self.args))

-    def _initialize_args(self) -> dict:
-        """Initialize the arguments based on the agent's parameters."""
-        args = self.agent.params.copy()
-        args["model_name"] = self.agent.model_name
-        args["provider"] = self.agent.provider
+    def _initialize_args(self, params: dict = None) -> dict:
+        """
+        Initialize the arguments for agent execution.
+
+        Takes the parameters passed during execution and sets necessary defaults.
+        The params are already merged with defaults by AgentsController.get_agent_llm_params.
+
+        Args:
+            params: Parameters for agent execution (already merged with defaults)
+
+        Returns:
+            dict: Final parameters for agent execution
+        """
+        # Use the parameters passed to the method (already merged with defaults by AgentsController)
+        # No fallback needed as AgentsController.get_agent_llm_params already handles this
+        args = params.copy() if params else {}
+
+        # Set model name and provider if given in create agent, otherwise use global llm defaults
+        # (AgentsController.get_agent_llm_params)
+        if self.agent.model_name is not None:
+            args["model_name"] = self.agent.model_name
+        if self.agent.provider is not None:
+            args["provider"] = self.agent.provider
+
         args["embedding_model_provider"] = args.get("embedding_model", get_embedding_model_provider(args))

         # agent is using current langchain model
@@ -261,11 +280,20 @@ class LangchainAgent:
         # only update prompt_template if it is set on the model
         args["prompt_template"] = prompt_template

+        # Set default prompt template if not provided
         if args.get("prompt_template") is None:
+            # Default prompt template depends on agent mode
             if args.get("mode") == "retrieval":
                 args["prompt_template"] = DEFAULT_RAG_PROMPT_TEMPLATE
+                logger.info(f"Using default retrieval prompt template: {DEFAULT_RAG_PROMPT_TEMPLATE[:50]}...")
             else:
-                raise ValueError("Please provide a `prompt_template` or set `mode=retrieval`")
+                # Set a default prompt template for non-retrieval mode
+                default_prompt = "you are an assistant, answer using the tables connected"
+                args["prompt_template"] = default_prompt
+                logger.info(f"Using default prompt template: {default_prompt}")
+
+        if "prompt_template" in args:
+            logger.info(f"Using prompt template: {args['prompt_template'][:50]}...")

         return args

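Taken together, the two hunks above change how agent arguments are resolved: `params` now arrives pre-merged by `AgentsController.get_agent_llm_params`, agent-level `model_name`/`provider` win only when explicitly set, and a missing `prompt_template` falls back to a default instead of raising. A minimal sketch of the new resolution order (names `resolve_agent_args` and `global_defaults` are illustrative stand-ins; the real merge lives in the controller):

# Sketch only: `agent` is assumed to expose .model_name/.provider, and
# `global_defaults` stands in for the controller-merged LLM defaults.
def resolve_agent_args(agent, params: dict = None, global_defaults: dict = None) -> dict:
    # Controller-merged params take precedence over global defaults.
    args = dict(global_defaults or {}, **(params or {}))
    # Explicit agent-level settings override everything else.
    if agent.model_name is not None:
        args["model_name"] = agent.model_name
    if agent.provider is not None:
        args["provider"] = agent.provider
    # A missing prompt_template no longer raises ValueError.
    if args.get("prompt_template") is None:
        args["prompt_template"] = (
            "<DEFAULT_RAG_PROMPT_TEMPLATE>"
            if args.get("mode") == "retrieval"
            else "you are an assistant, answer using the tables connected"
        )
    return args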
@@ -318,7 +346,7 @@ class LangchainAgent:
         self.provider = args.get("provider", get_llm_provider(args))

         df = df.reset_index(drop=True)
-        agent = self.create_agent(df, args)
+        agent = self.create_agent(df)
         # Use last message as prompt, remove other questions.
         user_column = args.get("user_column", USER_COLUMN)
         df.iloc[:-1, df.columns.get_loc(user_column)] = None
@@ -348,14 +376,17 @@ class LangchainAgent:
         self.provider = args.get("provider", get_llm_provider(args))

         df = df.reset_index(drop=True)
-        agent = self.create_agent(df, args)
+        agent = self.create_agent(df)
         # Use last message as prompt, remove other questions.
         user_column = args.get("user_column", USER_COLUMN)
         df.iloc[:-1, df.columns.get_loc(user_column)] = None
         return self.stream_agent(df, agent, args)

-    def create_agent(self, df: pd.DataFrame, args: Dict = None) -> AgentExecutor:
+    def create_agent(self, df: pd.DataFrame) -> AgentExecutor:
         # Set up tools.
+
+        args = self.args
+
         llm = create_chat_model(args)
         self.llm = llm

mindsdb/interfaces/agents/mcp_client_agent.py
@@ -63,11 +63,19 @@ class MCPQueryTool(BaseTool):
         return loop.run_until_complete(self._arun(query))


+# todo move instantiation to agent controller
 class MCPLangchainAgent(LangchainAgent):
     """Extension of LangchainAgent that delegates to MCP server"""

-    def __init__(self, agent: db.Agents, model: dict = None, mcp_host: str = "127.0.0.1", mcp_port: int = 47337):
-        super().__init__(agent, model)
+    def __init__(
+        self,
+        agent: db.Agents,
+        model: dict = None,
+        params: dict = None,
+        mcp_host: str = "127.0.0.1",
+        mcp_port: int = 47337,
+    ):
+        super().__init__(agent, model, params)
         self.mcp_host = mcp_host
         self.mcp_port = mcp_port
         self.exit_stack = AsyncExitStack()
@@ -85,7 +93,7 @@ class MCPLangchainAgent(LangchainAgent):
         server_params = StdioServerParameters(
             command="python",
             args=["-m", "mindsdb", "--api=mcp"],
-            env={"MCP_HOST": self.mcp_host, "MCP_PORT": str(self.mcp_port)}
+            env={"MCP_HOST": self.mcp_host, "MCP_PORT": str(self.mcp_port)},
         )

         logger.info(f"Connecting to MCP server at {self.mcp_host}:{self.mcp_port}")
@@ -99,7 +107,9 @@ class MCPLangchainAgent(LangchainAgent):

             # Test the connection by listing tools
             tools_response = await self.session.list_tools()
-            logger.info(f"Successfully connected to MCP server. Available tools: {[tool.name for tool in tools_response.tools]}")
+            logger.info(
+                f"Successfully connected to MCP server. Available tools: {[tool.name for tool in tools_response.tools]}"
+            )

         except Exception as e:
             logger.error(f"Failed to connect to MCP server: {str(e)}")
@@ -141,7 +151,7 @@ class MCPLangchainAgent(LangchainAgent):
         response = super().get_completion(messages, stream)

         # Ensure response is a string (not a DataFrame)
-        if hasattr(response, 'to_string'):  # It's a DataFrame
+        if hasattr(response, "to_string"):  # It's a DataFrame
             return response.to_string()

         return response
@@ -167,7 +177,7 @@ class LiteLLMAgentWrapper:
         formatted_messages = [
             {
                 "question": msg["content"] if msg["role"] == "user" else "",
-                "answer": msg["content"] if msg["role"] == "assistant" else ""
+                "answer": msg["content"] if msg["role"] == "assistant" else "",
             }
             for msg in messages
         ]
@@ -177,23 +187,16 @@
         # Ensure response is a string
         if not isinstance(response, str):
-            if hasattr(response, 'to_string'):  # It's a DataFrame
+            if hasattr(response, "to_string"):  # It's a DataFrame
                 response = response.to_string()
             else:
                 response = str(response)

         # Format response in LiteLLM expected format
         return {
-            "choices": [
-                {
-                    "message": {
-                        "role": "assistant",
-                        "content": response
-                    }
-                }
-            ],
+            "choices": [{"message": {"role": "assistant", "content": response}}],
             "model": self.agent.args["model_name"],
-            "object": "chat.completion"
+            "object": "chat.completion",
         }

     async def acompletion_stream(self, messages: List[Dict[str, str]], **kwargs) -> Iterator[Dict[str, Any]]:
@@ -202,7 +205,7 @@
         formatted_messages = [
             {
                 "question": msg["content"] if msg["role"] == "user" else "",
-                "answer": msg["content"] if msg["role"] == "assistant" else ""
+                "answer": msg["content"] if msg["role"] == "assistant" else "",
             }
             for msg in messages
         ]
@@ -217,7 +220,7 @@
                 yield {
                     "choices": [{"delta": {"role": "assistant", "content": content}}],
                     "model": model_name,
-                    "object": "chat.completion.chunk"
+                    "object": "chat.completion.chunk",
                 }
                 # Allow async context switch
                 await asyncio.sleep(0)
@@ -230,7 +233,9 @@
         await self.agent.cleanup()


-def create_mcp_agent(agent_name: str, project_name: str, mcp_host: str = "127.0.0.1", mcp_port: int = 47337) -> LiteLLMAgentWrapper:
+def create_mcp_agent(
+    agent_name: str, project_name: str, mcp_host: str = "127.0.0.1", mcp_port: int = 47337
+) -> LiteLLMAgentWrapper:
     """Create an MCP agent and wrap it for LiteLLM compatibility"""
     from mindsdb.interfaces.agents.agents_controller import AgentsController
     from mindsdb.interfaces.storage import db
@@ -245,8 +250,11 @@ def create_mcp_agent(agent_name: str, project_name: str, mcp_host: str = "127.0.
     if agent_db is None:
         raise ValueError(f"Agent {agent_name} not found in project {project_name}")

-    # Create MCP agent
-    mcp_agent = MCPLangchainAgent(agent_db, mcp_host=mcp_host, mcp_port=mcp_port)
+    # Get merged parameters (defaults + agent params)
+    merged_params = agent_controller.get_agent_llm_params(agent_db.params)
+
+    # Create MCP agent with merged parameters
+    mcp_agent = MCPLangchainAgent(agent_db, params=merged_params, mcp_host=mcp_host, mcp_port=mcp_port)

     # Wrap for LiteLLM compatibility
     return LiteLLMAgentWrapper(mcp_agent)
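With the merge now handled inside `create_mcp_agent`, calling code just names the agent and project. A hedged usage sketch (agent/project names are placeholders; the streaming call and chunk shape are taken from the hunks above, where `acompletion_stream` is defined as an async generator):

import asyncio

from mindsdb.interfaces.agents.mcp_client_agent import create_mcp_agent


async def main():
    # "my_agent" / "mindsdb" are placeholder names.
    wrapper = create_mcp_agent("my_agent", "mindsdb")
    messages = [{"role": "user", "content": "Summarize the orders table"}]
    # Chunks follow the LiteLLM shape shown above: choices[0]["delta"]["content"].
    async for chunk in wrapper.acompletion_stream(messages):
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)


asyncio.run(main())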
mindsdb/interfaces/data_catalog/__init__.py (new, empty file)
mindsdb/interfaces/data_catalog/base_data_catalog.py
@@ -0,0 +1,54 @@
+from typing import List, Optional, Union
+
+from mindsdb.integrations.libs.api_handler import MetaAPIHandler
+from mindsdb.integrations.libs.base import MetaDatabaseHandler
+from mindsdb.utilities import log
+
+
+logger = log.getLogger("mindsdb")
+
+
+class BaseDataCatalog:
+    """
+    This is the base class for the Data Catalog interface.
+    """
+
+    def __init__(self, database_name: str, table_names: Optional[List[str]] = None) -> None:
+        """
+        Initialize the data catalog for a given data source.
+
+        Args:
+            database_name (str): The data source to read/write metadata from.
+            table_names (Optional[List[str]]): The list of table names to read or write metadata for. If None, all tables will be read or written.
+        """
+        from mindsdb.api.executor.controllers.session_controller import (
+            SessionController,
+        )
+
+        session = SessionController()
+
+        self.database_name = database_name
+        self.data_handler: Union[MetaDatabaseHandler, MetaAPIHandler] = session.integration_controller.get_data_handler(
+            database_name
+        )
+        integration = session.integration_controller.get(database_name)
+        self.integration_id = integration["id"]
+        self.integration_engine = integration["engine"]
+        # TODO: Handle situations where a schema is provided along with the database name, e.g., 'schema.table'.
+        # TODO: Handle situations where a file path is provided with integrations like S3, e.g., 'dir/file.csv'.
+        self.table_names = table_names
+
+        self.logger = logger
+
+    def is_data_catalog_supported(self) -> bool:
+        """
+        Check if the data catalog is supported for the given database.
+
+        Returns:
+            bool: True if the data catalog is supported, False otherwise.
+        """
+        if not isinstance(self.data_handler, (MetaDatabaseHandler, MetaAPIHandler)):
+            self.logger.warning(f"Data catalog is not supported for the '{self.integration_engine}' integration.")
+            return False
+
+        return True
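Everything in the catalog is gated on the handler exposing the new meta interface. A minimal sketch of that gate in use (assumes an integration named "my_postgres" already exists; the name is a placeholder):

from mindsdb.interfaces.data_catalog.base_data_catalog import BaseDataCatalog

# "my_postgres" is a placeholder integration name.
catalog = BaseDataCatalog("my_postgres")
if catalog.is_data_catalog_supported():
    # Handler subclasses MetaDatabaseHandler/MetaAPIHandler, so the
    # loader below can pull tables, columns, statistics, and keys.
    print(f"'{catalog.integration_engine}' supports the data catalog")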
mindsdb/interfaces/data_catalog/data_catalog_loader.py
@@ -0,0 +1,359 @@
+from typing import List, Union
+
+import pandas as pd
+
+from mindsdb.integrations.libs.response import RESPONSE_TYPE
+from mindsdb.interfaces.data_catalog.base_data_catalog import BaseDataCatalog
+from mindsdb.interfaces.storage import db
+
+
+class DataCatalogLoader(BaseDataCatalog):
+    """
+    This class is responsible for loading the metadata from a data source (via the handler) and storing it in the data catalog.
+    """
+
+    def load_metadata(self) -> None:
+        """
+        Load the metadata from the handler and store it in the database.
+        """
+        if not self.is_data_catalog_supported():
+            return
+
+        loaded_table_names = self._get_loaded_table_names()
+
+        tables = self._load_table_metadata(loaded_table_names)
+
+        if tables:
+            columns = self._load_column_metadata(tables)
+
+            self._load_column_statistics(tables, columns)
+
+            self._load_primary_keys(tables, columns)
+
+            self._load_foreign_keys(tables, columns)
+
+        self.logger.info(f"Metadata loading completed for {self.database_name}.")
+
+    def _get_loaded_table_names(self) -> List[str]:
+        """
+        Retrieve the names of tables that are already present in the data catalog for the current integration.
+        If table_names are provided, only those tables will be checked.
+
+        Returns:
+            List[str]: Names of tables already loaded in the data catalog.
+        """
+        query = db.session.query(db.MetaTables).filter_by(integration_id=self.integration_id)
+        if self.table_names:
+            query = query.filter(db.MetaTables.name.in_(self.table_names))
+
+        tables = query.all()
+        table_names = [table.name for table in tables]
+
+        if table_names:
+            self.logger.info(f"Tables already loaded in the data catalog: {', '.join(table_names)}.")
+
+        return table_names
+
+    def _load_table_metadata(self, loaded_table_names: List[str] = None) -> List[Union[db.MetaTables, None]]:
+        """
+        Load the table metadata from the handler.
+        """
+        self.logger.info(f"Loading tables for {self.database_name}")
+        response = self.data_handler.meta_get_tables(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load tables for {self.database_name}: {response.error_message}")
+            return []
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No tables to add for {self.database_name}.")
+            return []
+
+        # Filter out tables that are already loaded in the data catalog
+        if loaded_table_names:
+            df = df[~df["table_name"].isin(loaded_table_names)]
+
+        if df.empty:
+            self.logger.info(f"No new tables to load for {self.database_name}.")
+            return []
+
+        df.columns = df.columns.str.lower()
+        tables = self._add_table_metadata(df)
+        self.logger.info(f"Tables loaded for {self.database_name}.")
+        return tables
+
+    def _add_table_metadata(self, df: pd.DataFrame) -> List[db.MetaTables]:
+        """
+        Add the table metadata to the database.
+        """
+        tables = []
+        try:
+            for row in df.to_dict(orient="records"):
+                # Convert the row_count to an integer if it is not NaN, otherwise set it to None.
+                val = row.get("row_count")
+                row_count = int(val) if pd.notna(val) else None
+
+                record = db.MetaTables(
+                    integration_id=self.integration_id,
+                    name=row.get("table_name") or row.get("name"),
+                    schema=row.get("table_schema"),
+                    description=row.get("table_description"),
+                    type=row.get("table_type"),
+                    row_count=row_count,
+                )
+                tables.append(record)
+
+            db.session.add_all(tables)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add tables: {e}")
+            db.session.rollback()
+            raise
+        return tables
+
+    def _load_column_metadata(self, tables: db.MetaTables) -> List[db.MetaColumns]:
+        """
+        Load the column metadata from the handler.
+        """
+        self.logger.info(f"Loading columns for {self.database_name}")
+        response = self.data_handler.meta_get_columns(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load columns for {self.database_name}: {response.error_message}")
+            return []
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No columns to load for {self.database_name}.")
+            return []
+
+        df.columns = df.columns.str.lower()
+        columns = self._add_column_metadata(df, tables)
+        self.logger.info(f"Columns loaded for {self.database_name}.")
+        return columns
+
+    def _add_column_metadata(self, df: pd.DataFrame, tables: db.MetaTables) -> List[db.MetaColumns]:
+        """
+        Add the column metadata to the database.
+        """
+        columns = []
+        try:
+            for row in df.to_dict(orient="records"):
+                record = db.MetaColumns(
+                    table_id=next((table.id for table in tables if table.name == row.get("table_name"))),
+                    name=row.get("column_name"),
+                    data_type=row.get("data_type"),
+                    default_value=row.get("column_default"),
+                    description=row.get("description"),
+                    is_nullable=row.get("is_nullable"),
+                )
+                columns.append(record)
+
+            db.session.add_all(columns)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add columns: {e}")
+            db.session.rollback()
+            raise
+        return columns
+
+    def _load_column_statistics(self, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Load the column statistics metadata from the handler.
+        """
+        self.logger.info(f"Loading column statistics for {self.database_name}")
+        response = self.data_handler.meta_get_column_statistics(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load column statistics for {self.database_name}: {response.error_message}")
+            return
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No column statistics to load for {self.database_name}.")
+            return
+
+        df.columns = df.columns.str.lower()
+        self._add_column_statistics(df, tables, columns)
+        self.logger.info(f"Column statistics loaded for {self.database_name}.")
+
+    def _add_column_statistics(self, df: pd.DataFrame, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Add the column statistics metadata to the database.
+        """
+        column_statistics = []
+        try:
+            for row in df.to_dict(orient="records"):
+                table_id = next((table.id for table in tables if table.name == row.get("table_name")))
+                column_id = next(
+                    (
+                        column.id
+                        for column in columns
+                        if column.name == row.get("column_name") and column.table_id == table_id
+                    )
+                )
+
+                # Convert the distinct_values_count to an integer if it is not NaN, otherwise set it to None.
+                val = row.get("distinct_values_count")
+                distinct_values_count = int(val) if pd.notna(val) else None
+
+                # Convert the most_common_frequencies to a list of strings.
+                most_common_frequencies = [str(val) for val in row.get("most_common_frequencies") or []]
+
+                record = db.MetaColumnStatistics(
+                    column_id=column_id,
+                    most_common_values=row.get("most_common_values"),
+                    most_common_frequencies=most_common_frequencies,
+                    null_percentage=row.get("null_percentage"),
+                    distinct_values_count=distinct_values_count,
+                    minimum_value=row.get("minimum_value"),
+                    maximum_value=row.get("maximum_value"),
+                )
+                column_statistics.append(record)
+
+            db.session.add_all(column_statistics)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add column statistics: {e}")
+            db.session.rollback()
+            raise
+
+    def _load_primary_keys(self, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Load the primary keys metadata from the handler.
+        """
+        self.logger.info(f"Loading primary keys for {self.database_name}")
+        response = self.data_handler.meta_get_primary_keys(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load primary keys for {self.database_name}: {response.error_message}")
+            return
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No primary keys to load for {self.database_name}.")
+            return
+
+        df.columns = df.columns.str.lower()
+        self._add_primary_keys(df, tables, columns)
+        self.logger.info(f"Primary keys loaded for {self.database_name}.")
+
+    def _add_primary_keys(self, df: pd.DataFrame, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Add the primary keys metadata to the database.
+        """
+        primary_keys = []
+        try:
+            for row in df.to_dict(orient="records"):
+                table_id = next((table.id for table in tables if table.name == row.get("table_name")))
+                column_id = next(
+                    (
+                        column.id
+                        for column in columns
+                        if column.name == row.get("column_name") and column.table_id == table_id
+                    )
+                )
+
+                record = db.MetaPrimaryKeys(
+                    table_id=table_id,
+                    column_id=column_id,
+                    constraint_name=row.get("constraint_name"),
+                )
+                primary_keys.append(record)
+
+            db.session.add_all(primary_keys)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add primary keys: {e}")
+            db.session.rollback()
+            raise
+
+    def _load_foreign_keys(self, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Load the foreign keys metadata from the handler.
+        """
+        self.logger.info(f"Loading foreign keys for {self.database_name}")
+        response = self.data_handler.meta_get_foreign_keys(self.table_names)
+        if response.resp_type != RESPONSE_TYPE.TABLE:
+            self.logger.error(f"Failed to load foreign keys for {self.database_name}: {response.error_message}")
+            return
+
+        df = response.data_frame
+        if df.empty:
+            self.logger.info(f"No foreign keys to load for {self.database_name}.")
+            return
+
+        df.columns = df.columns.str.lower()
+        self._add_foreign_keys(df, tables, columns)
+        self.logger.info(f"Foreign keys loaded for {self.database_name}.")
+
+    def _add_foreign_keys(self, df: pd.DataFrame, tables: db.MetaTables, columns: db.MetaColumns) -> None:
+        """
+        Add the foreign keys metadata to the database.
+        """
+        foreign_keys = []
+        try:
+            for row in df.to_dict(orient="records"):
+                try:
+                    parent_table_id = next((table.id for table in tables if table.name == row.get("parent_table_name")))
+                    parent_column_id = next(
+                        (
+                            column.id
+                            for column in columns
+                            if column.name == row.get("parent_column_name") and column.table_id == parent_table_id
+                        )
+                    )
+                    child_table_id = next((table.id for table in tables if table.name == row.get("child_table_name")))
+                    child_column_id = next(
+                        (
+                            column.id
+                            for column in columns
+                            if column.name == row.get("child_column_name") and column.table_id == child_table_id
+                        )
+                    )
+                except StopIteration:
+                    self.logger.warning(
+                        f"The foreign key relationship for {row.get('parent_table_name')} -> {row.get('child_table_name')} "
+                        f"could not be established. One or more tables or columns may not exist in the metadata."
+                    )
+                    continue
+
+                record = db.MetaForeignKeys(
+                    parent_table_id=parent_table_id,
+                    parent_column_id=parent_column_id,
+                    child_table_id=child_table_id,
+                    child_column_id=child_column_id,
+                    constraint_name=row.get("constraint_name"),
+                )
+                foreign_keys.append(record)
+
+            db.session.add_all(foreign_keys)
+            db.session.commit()
+        except Exception as e:
+            self.logger.error(f"Failed to add foreign keys: {e}")
+            db.session.rollback()
+            raise
+
+    def unload_metadata(self) -> None:
+        """
+        Remove the metadata for the specified database from the data catalog.
+        """
+        if not self.is_data_catalog_supported():
+            return
+
+        meta_tables = db.session.query(db.MetaTables).filter_by(integration_id=self.integration_id).all()
+
+        if not meta_tables:
+            self.logger.info(f"No metadata found for {self.database_name}. Nothing to remove.")
+            return
+
+        for table in meta_tables:
+            db.session.query(db.MetaPrimaryKeys).filter_by(table_id=table.id).delete()
+            db.session.query(db.MetaForeignKeys).filter(
+                (db.MetaForeignKeys.parent_table_id == table.id) | (db.MetaForeignKeys.child_table_id == table.id)
+            ).delete()
+            meta_columns = db.session.query(db.MetaColumns).filter_by(table_id=table.id).all()
+            for col in meta_columns:
+                db.session.query(db.MetaColumnStatistics).filter_by(column_id=col.id).delete()
+                db.session.delete(col)
+
+            db.session.delete(table)
+        db.session.commit()
+        self.logger.info(f"Metadata for {self.database_name} removed successfully.")