MindsDB 25.5.4.2__py3-none-any.whl → 25.6.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (69)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/agent.py +28 -25
  3. mindsdb/api/a2a/common/server/server.py +32 -26
  4. mindsdb/api/executor/command_executor.py +69 -14
  5. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
  6. mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
  7. mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
  8. mindsdb/api/executor/planner/plan_join.py +67 -77
  9. mindsdb/api/executor/planner/query_planner.py +176 -155
  10. mindsdb/api/executor/planner/steps.py +37 -12
  11. mindsdb/api/executor/sql_query/result_set.py +45 -64
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
  13. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
  14. mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
  15. mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
  16. mindsdb/api/executor/utilities/sql.py +42 -48
  17. mindsdb/api/http/namespaces/config.py +1 -1
  18. mindsdb/api/http/namespaces/file.py +14 -23
  19. mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
  20. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
  21. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
  22. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
  23. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
  24. mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
  25. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
  27. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
  28. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
  29. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
  31. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
  32. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
  33. mindsdb/integrations/libs/api_handler.py +261 -57
  34. mindsdb/integrations/libs/base.py +100 -29
  35. mindsdb/integrations/utilities/files/file_reader.py +99 -73
  36. mindsdb/integrations/utilities/handler_utils.py +23 -8
  37. mindsdb/integrations/utilities/sql_utils.py +35 -40
  38. mindsdb/interfaces/agents/agents_controller.py +196 -192
  39. mindsdb/interfaces/agents/constants.py +7 -1
  40. mindsdb/interfaces/agents/langchain_agent.py +42 -11
  41. mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
  42. mindsdb/interfaces/data_catalog/__init__.py +0 -0
  43. mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
  44. mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
  45. mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
  46. mindsdb/interfaces/database/database.py +81 -57
  47. mindsdb/interfaces/database/integrations.py +220 -234
  48. mindsdb/interfaces/database/log.py +72 -104
  49. mindsdb/interfaces/database/projects.py +156 -193
  50. mindsdb/interfaces/file/file_controller.py +21 -65
  51. mindsdb/interfaces/knowledge_base/controller.py +63 -10
  52. mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
  53. mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
  54. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
  55. mindsdb/interfaces/skills/skills_controller.py +54 -36
  56. mindsdb/interfaces/skills/sql_agent.py +109 -86
  57. mindsdb/interfaces/storage/db.py +223 -79
  58. mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
  59. mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
  60. mindsdb/utilities/config.py +9 -2
  61. mindsdb/utilities/log.py +35 -26
  62. mindsdb/utilities/ml_task_queue/task.py +19 -22
  63. mindsdb/utilities/render/sqlalchemy_render.py +129 -181
  64. mindsdb/utilities/starters.py +40 -0
  65. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +253 -253
  66. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +69 -61
  67. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
  68. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
  69. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0

mindsdb/interfaces/storage/db.py (+223 -79)

@@ -5,6 +5,7 @@ from typing import Dict, List
 import numpy as np
 from sqlalchemy import (
     JSON,
+    BigInteger,
     Boolean,
     Column,
     DateTime,
@@ -16,7 +17,7 @@ from sqlalchemy import (
     UniqueConstraint,
     create_engine,
     text,
-    types
+    types,
 )
 from sqlalchemy.exc import OperationalError
 from sqlalchemy.orm import (
@@ -45,7 +46,7 @@ session, engine = None, None
 def init(connection_str: str = None):
     global Base, session, engine
     if connection_str is None:
-        connection_str = config['storage_db']
+        connection_str = config["storage_db"]
     base_args = {
         "pool_size": 30,
         "max_overflow": 200,
@@ -144,15 +145,11 @@ class Predictor(Base):
     __tablename__ = "predictor"

     id = Column(Integer, primary_key=True)
-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at = Column(DateTime, default=datetime.datetime.now)
     deleted_at = Column(DateTime)
     name = Column(String)
-    data = Column(
-        Json
-    )  # A JSON -- should be everything returned by `get_model_data`, I think
+    data = Column(Json)  # A JSON -- should be everything returned by `get_model_data`, I think
     to_predict = Column(Array)
     company_id = Column(Integer)
     mindsdb_version = Column(String)
@@ -173,9 +170,7 @@ class Predictor(Base):
     code = Column(String, nullable=True)
     lightwood_version = Column(String, nullable=True)
     dtype_dict = Column(Json, nullable=True)
-    project_id = Column(
-        Integer, ForeignKey("project.id", name="fk_project_id"), nullable=False
-    )
+    project_id = Column(Integer, ForeignKey("project.id", name="fk_project_id"), nullable=False)
     training_phase_current = Column(Integer)
     training_phase_total = Column(Integer)
     training_phase_name = Column(String)
@@ -199,7 +194,7 @@ Index(
     Predictor.version,
     Predictor.active,
     Predictor.deleted_at,  # would be good to have here nullsfirst(Predictor.deleted_at)
-    unique=True
+    unique=True,
 )


@@ -208,34 +203,27 @@ class Project(Base):

     id = Column(Integer, primary_key=True)
     created_at = Column(DateTime, default=datetime.datetime.now)
-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     deleted_at = Column(DateTime)
     name = Column(String, nullable=False)
     company_id = Column(Integer, default=0)
     metadata_: dict = Column("metadata", JSON, nullable=True)
-    __table_args__ = (
-        UniqueConstraint("name", "company_id", name="unique_project_name_company_id"),
-    )
+    __table_args__ = (UniqueConstraint("name", "company_id", name="unique_project_name_company_id"),)


 class Integration(Base):
     __tablename__ = "integration"
     id = Column(Integer, primary_key=True)
-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at = Column(DateTime, default=datetime.datetime.now)
     name = Column(String, nullable=False)
     engine = Column(String, nullable=False)
     data = Column(Json)
     company_id = Column(Integer)
-    __table_args__ = (
-        UniqueConstraint(
-            "name", "company_id", name="unique_integration_name_company_id"
-        ),
-    )
+
+    meta_tables = relationship("MetaTables", back_populates="integration")
+
+    __table_args__ = (UniqueConstraint("name", "company_id", name="unique_integration_name_company_id"),)


 class File(Base):
@@ -249,12 +237,8 @@ class File(Base):
     columns = Column(Json, nullable=False)
     created_at = Column(DateTime, default=datetime.datetime.now)
     metadata_: dict = Column("metadata", JSON, nullable=True)
-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
-    __table_args__ = (
-        UniqueConstraint("name", "company_id", name="unique_file_name_company_id"),
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
+    __table_args__ = (UniqueConstraint("name", "company_id", name="unique_file_name_company_id"),)


 class View(Base):
@@ -263,12 +247,8 @@ class View(Base):
     name = Column(String, nullable=False)
     company_id = Column(Integer)
     query = Column(String, nullable=False)
-    project_id = Column(
-        Integer, ForeignKey("project.id", name="fk_project_id"), nullable=False
-    )
-    __table_args__ = (
-        UniqueConstraint("name", "company_id", name="unique_view_name_company_id"),
-    )
+    project_id = Column(Integer, ForeignKey("project.id", name="fk_project_id"), nullable=False)
+    __table_args__ = (UniqueConstraint("name", "company_id", name="unique_view_name_company_id"),)


 class JsonStorage(Base):
@@ -310,9 +290,7 @@ class Jobs(Base):
     schedule_str = Column(String)

     deleted_at = Column(DateTime)
-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at = Column(DateTime, default=datetime.datetime.now)


@@ -331,9 +309,7 @@ class JobsHistory(Base):
     created_at = Column(DateTime, default=datetime.datetime.now)
     updated_at = Column(DateTime, default=datetime.datetime.now)

-    __table_args__ = (
-        UniqueConstraint("job_id", "start_at", name="uniq_job_history_job_id_start"),
-    )
+    __table_args__ = (UniqueConstraint("job_id", "start_at", name="uniq_job_history_job_id_start"),)


 class ChatBots(Base):
@@ -349,9 +325,7 @@ class ChatBots(Base):
     database_id = Column(Integer)
     params = Column(JSON)

-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at = Column(DateTime, default=datetime.datetime.now)
     webhook_token = Column(String)

@@ -393,9 +367,7 @@ class Triggers(Base):
     query_str = Column(String, nullable=False)
     columns = Column(String)  # list of columns separated by delimiter

-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at = Column(DateTime, default=datetime.datetime.now)


@@ -417,9 +389,7 @@ class Tasks(Base):
     run_by = Column(String)
     alive_time = Column(DateTime(timezone=True))

-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at = Column(DateTime, default=datetime.datetime.now)


@@ -444,9 +414,7 @@ class Skills(Base):
     params = Column(JSON)

     created_at = Column(DateTime, default=datetime.datetime.now)
-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     deleted_at = Column(DateTime)

     def as_dict(self) -> Dict:
@@ -475,9 +443,7 @@ class Agents(Base):
     provider = Column(String, nullable=True)
     params = Column(JSON)

-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at = Column(DateTime, default=datetime.datetime.now)
     deleted_at = Column(DateTime)

@@ -520,21 +486,13 @@ class KnowledgeBase(Base):
         doc="fk to the embedding model",
     )

-    embedding_model = relationship(
-        "Predictor", foreign_keys=[embedding_model_id], doc="embedding model"
-    )
+    embedding_model = relationship("Predictor", foreign_keys=[embedding_model_id], doc="embedding model")
     query_id = Column(Integer, nullable=True)

     created_at = Column(DateTime, default=datetime.datetime.now)
-    updated_at = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)

-    __table_args__ = (
-        UniqueConstraint(
-            "name", "project_id", name="unique_knowledge_base_name_project_id"
-        ),
-    )
+    __table_args__ = (UniqueConstraint("name", "project_id", name="unique_knowledge_base_name_project_id"),)

     def as_dict(self) -> Dict:
         return {
@@ -546,7 +504,7 @@ class KnowledgeBase(Base):
             "vector_database_table": self.vector_database_table,
             "updated_at": self.updated_at,
             "created_at": self.created_at,
-            "params": self.params
+            "params": self.params,
         }


@@ -559,9 +517,7 @@ class QueryContext(Base):
     context_name: str = Column(String, nullable=False)
     values: dict = Column(JSON)

-    updated_at: datetime.datetime = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)


@@ -581,9 +537,7 @@ class Queries(Base):
     processed_rows = Column(Integer, default=0)
     error: str = Column(String, nullable=True)

-    updated_at: datetime.datetime = Column(
-        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
-    )
+    updated_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now)
     created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)


@@ -610,10 +564,11 @@ class LLMLog(Base):


 class LLMData(Base):
-    '''
+    """
     Stores the question/answer pairs of an LLM call so examples can be used
     for self improvement with DSPy
-    '''
+    """
+
     __tablename__ = "llm_data"
     id: int = Column(Integer, primary_key=True)
     input: str = Column(String, nullable=False)
@@ -621,3 +576,192 @@ class LLMData(Base):
     model_id: int = Column(Integer, nullable=False)
     created_at: datetime = Column(DateTime, default=datetime.datetime.now)
     updated_at: datetime = Column(DateTime, onupdate=datetime.datetime.now)
+
+
+# Data Catalog
+class MetaTables(Base):
+    __tablename__ = "meta_tables"
+    id: int = Column(Integer, primary_key=True)
+
+    integration_id: int = Column(Integer, ForeignKey("integration.id"))
+    integration = relationship("Integration", back_populates="meta_tables")
+
+    name: str = Column(String, nullable=False)
+    schema: str = Column(String, nullable=True)
+    description: str = Column(String, nullable=True)
+    type: str = Column(String, nullable=True)
+    row_count: int = Column(BigInteger, nullable=True)
+
+    meta_columns: Mapped[List["MetaColumns"]] = relationship("MetaColumns", back_populates="meta_tables")
+    meta_primary_keys: Mapped[List["MetaPrimaryKeys"]] = relationship("MetaPrimaryKeys", back_populates="meta_tables")
+    meta_foreign_keys_parents: Mapped[List["MetaForeignKeys"]] = relationship(
+        "MetaForeignKeys", foreign_keys="MetaForeignKeys.parent_table_id", back_populates="parent_table"
+    )
+    meta_foreign_keys_children: Mapped[List["MetaForeignKeys"]] = relationship(
+        "MetaForeignKeys", foreign_keys="MetaForeignKeys.child_table_id", back_populates="child_table"
+    )
+
+    def as_string(self, indent: int = 0) -> str:
+        pad = " " * indent
+
+        table_info = f"`{self.integration.name}`.`{self.name}` ({self.type})"
+
+        if self.description:
+            table_info += f" : {self.description}"
+
+        if self.schema:
+            table_info += f"\n{pad}Schema: {self.schema}"
+
+        if self.row_count and self.row_count > 0:
+            table_info += f"\n{pad}Estimated Row Count: {self.row_count}"
+
+        if self.meta_primary_keys:
+            table_info += f"\n{pad}Primary Keys (in defined order): {', '.join([pk.as_string() for pk in self.meta_primary_keys])}"
+
+        if self.meta_columns:
+            table_info += f"\n\n{pad}Columns:"
+            for index, column in enumerate(self.meta_columns, start=1):
+                table_info += f"\n{index}. {column.as_string(indent + 4)}\n"
+
+        if self.meta_foreign_keys_children:
+            table_info += f"\n\n{pad}Key Relationships:"
+            for fk in self.meta_foreign_keys_children:
+                table_info += f"\n{pad} {fk.as_string()}"
+
+        return table_info
+
+
+class MetaColumns(Base):
+    __tablename__ = "meta_columns"
+    id: int = Column(Integer, primary_key=True)
+
+    table_id: int = Column(Integer, ForeignKey("meta_tables.id"))
+    meta_tables = relationship("MetaTables", back_populates="meta_columns")
+
+    name: str = Column(String, nullable=False)
+    data_type: str = Column(String, nullable=False)
+    description: str = Column(String, nullable=True)
+    default_value: str = Column(String, nullable=True)
+    is_nullable: bool = Column(Boolean, nullable=True)
+
+    meta_column_statistics: Mapped[List["MetaColumnStatistics"]] = relationship(
+        "MetaColumnStatistics", back_populates="meta_columns"
+    )
+    meta_primary_keys: Mapped[List["MetaPrimaryKeys"]] = relationship("MetaPrimaryKeys", back_populates="meta_columns")
+    meta_foreign_keys_parents: Mapped[List["MetaForeignKeys"]] = relationship(
+        "MetaForeignKeys", foreign_keys="MetaForeignKeys.parent_column_id", back_populates="parent_column"
+    )
+    meta_foreign_keys_children: Mapped[List["MetaForeignKeys"]] = relationship(
+        "MetaForeignKeys", foreign_keys="MetaForeignKeys.child_column_id", back_populates="child_column"
+    )
+
+    def as_string(self, indent: int = 0) -> str:
+        pad = " " * indent
+
+        column_info = f"{self.name} ({self.data_type}):"
+        if self.description:
+            column_info += f"\n{pad}Description: {self.description}"
+
+        if self.is_nullable:
+            column_info += f"\n{pad}- Nullable: Yes"
+
+        if self.default_value:
+            column_info += f"\n{pad}- Default Value: {self.default_value}"
+
+        if self.meta_column_statistics:
+            column_info += f"\n\n{pad}- Column Statistics:"
+            column_info += f"\n{self.meta_column_statistics[0].as_string(indent + 4)}"
+
+        return column_info
+
+
+class MetaColumnStatistics(Base):
+    __tablename__ = "meta_column_statistics"
+    column_id: int = Column(Integer, ForeignKey("meta_columns.id"), primary_key=True)
+    meta_columns = relationship("MetaColumns", back_populates="meta_column_statistics")
+
+    most_common_values: str = Column(Array, nullable=True)
+    most_common_frequencies: str = Column(Array, nullable=True)
+    null_percentage: float = Column(Numeric(5, 2), nullable=True)
+    distinct_values_count: int = Column(BigInteger, nullable=True)
+    minimum_value: str = Column(String, nullable=True)
+    maximum_value: str = Column(String, nullable=True)
+
+    def as_string(self, indent: int = 0) -> str:
+        pad = " " * indent
+        inner_pad = " " * (indent + 4)
+
+        column_statistics = ""
+
+        if any(self.most_common_values) and any(self.most_common_frequencies):
+            column_statistics += f"{pad}- Top 10 Most Common Values and Frequencies:"
+            for i in range(min(10, len(self.most_common_values))):
+                freq = self.most_common_frequencies[i]
+                try:
+                    percent = float(freq) * 100
+                    freq_str = f"{percent:.2f}%"
+                except (ValueError, TypeError):
+                    freq_str = str(freq)
+
+                column_statistics += f"\n{inner_pad}- {self.most_common_values[i]}: {freq_str}"
+            column_statistics += "\n"
+
+        if self.null_percentage:
+            column_statistics += f"{pad}- Null Percentage: {self.null_percentage}\n"
+
+        if self.distinct_values_count:
+            column_statistics += f"{pad}- No. of Distinct Values: {self.distinct_values_count}\n"
+
+        if self.minimum_value:
+            column_statistics += f"{pad}- Minimum Value: {self.minimum_value}\n"
+
+        if self.maximum_value:
+            column_statistics += f"{pad}- Maximum Value: {self.maximum_value}"
+
+        return column_statistics
+
+
+class MetaPrimaryKeys(Base):
+    __tablename__ = "meta_primary_keys"
+    table_id: int = Column(Integer, ForeignKey("meta_tables.id"), primary_key=True)
+    meta_tables = relationship("MetaTables", back_populates="meta_primary_keys")
+
+    column_id: int = Column(Integer, ForeignKey("meta_columns.id"), primary_key=True)
+    meta_columns = relationship("MetaColumns", back_populates="meta_primary_keys")
+
+    ordinal_position: int = Column(Integer, nullable=True)
+    constraint_name: str = Column(String, nullable=True)
+
+    def as_string(self) -> str:
+        pk_list = sorted(
+            self.meta_tables.meta_primary_keys,
+            key=lambda pk: pk.ordinal_position if pk.ordinal_position is not None else 0,
+        )
+
+        return ", ".join(f"{pk.meta_columns.name} ({pk.meta_columns.data_type})" for pk in pk_list)
+
+
+class MetaForeignKeys(Base):
+    __tablename__ = "meta_foreign_keys"
+    parent_table_id: int = Column(Integer, ForeignKey("meta_tables.id"), primary_key=True)
+    parent_table = relationship(
+        "MetaTables", back_populates="meta_foreign_keys_parents", foreign_keys=[parent_table_id]
+    )
+
+    parent_column_id: int = Column(Integer, ForeignKey("meta_columns.id"), primary_key=True)
+    parent_column = relationship(
+        "MetaColumns", back_populates="meta_foreign_keys_parents", foreign_keys=[parent_column_id]
+    )
+
+    child_table_id: int = Column(Integer, ForeignKey("meta_tables.id"), primary_key=True)
+    child_table = relationship("MetaTables", back_populates="meta_foreign_keys_children", foreign_keys=[child_table_id])
+
+    child_column_id: int = Column(Integer, ForeignKey("meta_columns.id"), primary_key=True)
+    child_column = relationship(
+        "MetaColumns", back_populates="meta_foreign_keys_children", foreign_keys=[child_column_id]
+    )
+
+    constraint_name: str = Column(String, nullable=True)
+
+    def as_string(self) -> str:
+        return f"{self.child_column.name} in {self.child_table.name} references {self.parent_column.name} in {self.parent_table.name}"

mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py (+118 -0)

@@ -0,0 +1,118 @@
+"""added data catalog tables
+
+Revision ID: a44643042fe8
+Revises: 9f150e4f9a05
+Create Date: 2025-05-28 17:20:57.300313
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+from mindsdb.interfaces.storage.db import Array
+
+
+# revision identifiers, used by Alembic.
+revision = "a44643042fe8"
+down_revision = "9f150e4f9a05"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.create_table(
+        "meta_tables",
+        sa.Column("id", sa.Integer(), primary_key=True),
+        sa.Column(
+            "integration_id",
+            sa.Integer(),
+            sa.ForeignKey("integration.id"),
+            nullable=False,
+        ),
+        sa.Column("name", sa.String(), nullable=False),
+        sa.Column("schema", sa.String(), nullable=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column("type", sa.String(), nullable=True),
+        sa.Column("row_count", sa.Integer(), nullable=True),
+    )
+
+    op.create_table(
+        "meta_columns",
+        sa.Column("id", sa.Integer(), primary_key=True),
+        sa.Column("table_id", sa.Integer(), sa.ForeignKey("meta_tables.id"), nullable=False),
+        sa.Column("name", sa.String(), nullable=False),
+        sa.Column("data_type", sa.String(), nullable=False),
+        sa.Column("default_value", sa.String(), nullable=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column("is_nullable", sa.Boolean(), nullable=True),
+    )
+
+    op.create_table(
+        "meta_column_statistics",
+        sa.Column(
+            "column_id",
+            sa.Integer(),
+            sa.ForeignKey("meta_columns.id"),
+            primary_key=True,
+        ),
+        sa.Column("most_common_values", Array(), nullable=True),
+        sa.Column("most_common_frequencies", Array(), nullable=True),
+        sa.Column("null_percentage", sa.Numeric(5, 2), nullable=True),
+        sa.Column("distinct_values_count", sa.Integer(), nullable=True),
+        sa.Column("minimum_value", sa.String(), nullable=True),
+        sa.Column("maximum_value", sa.String(), nullable=True),
+    )
+
+    op.create_table(
+        "meta_primary_keys",
+        sa.Column("table_id", sa.Integer(), sa.ForeignKey("meta_tables.id"), primary_key=True),
+        sa.Column(
+            "column_id",
+            sa.Integer(),
+            sa.ForeignKey("meta_columns.id"),
+            primary_key=True,
+        ),
+        sa.Column("ordinal_position", sa.Integer(), nullable=True),
+        sa.Column("constraint_name", sa.String(), nullable=True),
+    )
+
+    op.create_table(
+        "meta_foreign_keys",
+        sa.Column(
+            "parent_table_id",
+            sa.Integer(),
+            sa.ForeignKey("meta_tables.id"),
+            primary_key=True,
+        ),
+        sa.Column(
+            "parent_column_id",
+            sa.Integer(),
+            sa.ForeignKey("meta_columns.id"),
+            primary_key=True,
+        ),
+        sa.Column(
+            "child_table_id",
+            sa.Integer(),
+            sa.ForeignKey("meta_tables.id"),
+            primary_key=True,
+        ),
+        sa.Column(
+            "child_column_id",
+            sa.Integer(),
+            sa.ForeignKey("meta_columns.id"),
+            primary_key=True,
+        ),
+        sa.Column("constraint_name", sa.String(), nullable=True),
+    )
+
+
+def downgrade():
+    op.drop_table("meta_tables")
+
+    op.drop_table("meta_columns")
+
+    op.drop_table("meta_column_statistics")
+
+    op.drop_table("meta_primary_keys")
+
+    op.drop_table("meta_foreign_keys")

mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py (+58 -0)

@@ -0,0 +1,58 @@
+"""updated data catalog data types
+
+Revision ID: 608e376c19a7
+Revises: a44643042fe8
+Create Date: 2025-06-09 23:20:34.739735
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+
+
+# revision identifiers, used by Alembic.
+revision = "608e376c19a7"
+down_revision = "a44643042fe8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    with op.batch_alter_table("meta_tables", schema=None) as batch_op:
+        batch_op.alter_column(
+            "row_count",
+            type_=sa.BigInteger(),
+            existing_type=sa.Integer(),
+            existing_nullable=True,
+            existing_server_default=None,
+        )
+
+    with op.batch_alter_table("meta_column_statistics", schema=None) as batch_op:
+        batch_op.alter_column(
+            "distinct_values_count",
+            type_=sa.BigInteger(),
+            existing_type=sa.Integer(),
+            existing_nullable=True,
+            existing_server_default=None,
+        )
+
+
+def downgrade():
+    with op.batch_alter_table("meta_tables", schema=None) as batch_op:
+        batch_op.alter_column(
+            "row_count",
+            type_=sa.Integer(),
+            existing_type=sa.BigInteger(),
+            existing_nullable=True,
+            existing_server_default=None,
+        )
+
+    with op.batch_alter_table("meta_column_statistics", schema=None) as batch_op:
+        batch_op.alter_column(
+            "distinct_values_count",
+            type_=sa.Integer(),
+            existing_type=sa.BigInteger(),
+            existing_nullable=True,
+            existing_server_default=None,
+        )
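
This second revision widens `meta_tables.row_count` and `meta_column_statistics.distinct_values_count` to `BigInteger` (using `batch_alter_table`, so the change also works on SQLite). A hedged sketch of verifying the result with SQLAlchemy's inspector; the storage URL is a placeholder.

```python
# Hedged sketch: confirm the two columns were widened after both migrations ran.
import sqlalchemy as sa

engine = sa.create_engine("sqlite:///mindsdb.sqlite3.db")  # placeholder storage URL
inspector = sa.inspect(engine)

for table, column in (("meta_tables", "row_count"),
                      ("meta_column_statistics", "distinct_values_count")):
    for col in inspector.get_columns(table):
        if col["name"] == column:
            print(table, column, col["type"])  # expected to report BIGINT
```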

mindsdb/utilities/config.py (+9 -2)

@@ -217,6 +217,9 @@ class Config:
            "project_name": "mindsdb",
            "enabled": False,
        },
+        "data_catalog": {
+            "enabled": False,
+        },
    }
    # endregion

@@ -360,6 +363,8 @@ class Config:
            self._env_config["default_reranking_model"] = {
                "api_key": os.environ["MINDSDB_DEFAULT_RERANKING_MODEL_API_KEY"]
            }
+        if os.environ.get("MINDSDB_DATA_CATALOG_ENABLED", "").lower() in ("1", "true"):
+            self._env_config["data_catalog"] = {"enabled": True}

        # region vars: a2a configuration
        a2a_config = {}
@@ -395,11 +400,13 @@ class Config:
            bool: True if config was loaded or updated
        """

-        if self.auto_config_mtime != self.auto_config_path.stat().st_mtime:
+        if self.auto_config_path.is_file() and self.auto_config_mtime != self.auto_config_path.stat().st_mtime:
            try:
                self._auto_config = json.loads(self.auto_config_path.read_text())
            except json.JSONDecodeError as e:
-                raise ValueError(f"The 'auto' configuration file ({self.auto_config_path}) contains invalid JSON: {e}")
+                raise ValueError(
+                    f"The 'auto' configuration file ({self.auto_config_path}) contains invalid JSON: {e}\nFile content: {self.auto_config_path.read_text()}"
+                )
            self.auto_config_mtime = self.auto_config_path.stat().st_mtime
            return True
        return False
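
The data catalog is off by default; the config change above exposes two ways to switch it on. A hedged sketch follows (assumes a configured MindsDB environment; the nested item access is an assumption modeled on the `config["storage_db"]` lookup seen in db.py).

```python
# Hedged sketch: enabling the new data catalog flag.
import os

# Option 1: the environment variable checked in Config ("1" or "true" enables it).
os.environ["MINDSDB_DATA_CATALOG_ENABLED"] = "true"

# Option 2: set {"data_catalog": {"enabled": true}} in the MindsDB config JSON file.

from mindsdb.utilities.config import Config

config = Config()
print(config["data_catalog"]["enabled"])  # expected: True when either option is used
```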