MindsDB 25.5.4.0__py3-none-any.whl → 25.5.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

@@ -5,15 +5,7 @@ from typing import Dict, List, Optional
5
5
  import pandas as pd
6
6
  import numpy as np
7
7
 
8
- from mindsdb_sql_parser.ast import (
9
- BinaryOperation,
10
- Constant,
11
- Identifier,
12
- Select,
13
- Update,
14
- Delete,
15
- Star
16
- )
8
+ from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
17
9
  from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
18
10
 
19
11
  from mindsdb.integrations.utilities.query_traversal import query_traversal
@@ -27,7 +19,9 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
27
19
  from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
28
20
  from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
29
21
  from mindsdb.integrations.utilities.handler_utils import get_api_key
30
- from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import construct_model_from_args
22
+ from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
23
+ construct_model_from_args,
24
+ )
31
25
 
32
26
  from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
33
27
  from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -48,11 +42,7 @@ from mindsdb.integrations.utilities.rag.rerankers.base_reranker import BaseLLMRe
48
42
 
49
43
  logger = log.getLogger(__name__)
50
44
 
51
- KB_TO_VECTORDB_COLUMNS = {
52
- 'id': 'original_doc_id',
53
- 'chunk_id': 'id',
54
- 'chunk_content': 'content'
55
- }
45
+ KB_TO_VECTORDB_COLUMNS = {"id": "original_doc_id", "chunk_id": "id", "chunk_content": "content"}
56
46
 
57
47
 
58
48
  def get_model_params(model_params: dict, default_config_key: str):
@@ -72,23 +62,23 @@ def get_embedding_model_from_params(embedding_model_params: dict):
72
62
  Create embedding model from parameters.
73
63
  """
74
64
  params_copy = copy.deepcopy(embedding_model_params)
75
- provider = params_copy.pop('provider', None).lower()
76
- api_key = get_api_key(provider, params_copy, strict=False) or params_copy.get('api_key')
65
+ provider = params_copy.pop("provider", None).lower()
66
+ api_key = get_api_key(provider, params_copy, strict=False) or params_copy.get("api_key")
77
67
  # Underscores are replaced because the provider name ultimately gets mapped to a class name.
78
68
  # This is mostly to support Azure OpenAI (azure_openai); the mapped class name is 'AzureOpenAIEmbeddings'.
79
- params_copy['class'] = provider.replace('_', '')
80
- if provider == 'azure_openai':
69
+ params_copy["class"] = provider.replace("_", "")
70
+ if provider == "azure_openai":
81
71
  # Azure OpenAI expects the api_key to be passed as 'openai_api_key'.
82
- params_copy['openai_api_key'] = api_key
83
- params_copy['azure_endpoint'] = params_copy.pop('base_url')
84
- if 'chunk_size' not in params_copy:
85
- params_copy['chunk_size'] = 2048
86
- if 'api_version' in params_copy:
87
- params_copy['openai_api_version'] = params_copy['api_version']
72
+ params_copy["openai_api_key"] = api_key
73
+ params_copy["azure_endpoint"] = params_copy.pop("base_url")
74
+ if "chunk_size" not in params_copy:
75
+ params_copy["chunk_size"] = 2048
76
+ if "api_version" in params_copy:
77
+ params_copy["openai_api_version"] = params_copy["api_version"]
88
78
  else:
89
79
  params_copy[f"{provider}_api_key"] = api_key
90
- params_copy.pop('api_key', None)
91
- params_copy['model'] = params_copy.pop('model_name', None)
80
+ params_copy.pop("api_key", None)
81
+ params_copy["model"] = params_copy.pop("model_name", None)
92
82
 
93
83
  return construct_model_from_args(params_copy)
94
84
 
@@ -98,15 +88,26 @@ def get_reranking_model_from_params(reranking_model_params: dict):
98
88
  Create reranking model from parameters.
99
89
  """
100
90
  params_copy = copy.deepcopy(reranking_model_params)
101
- provider = params_copy.get('provider', "openai").lower()
91
+ provider = params_copy.get("provider", "openai").lower()
102
92
 
103
93
  if "api_key" not in params_copy:
104
94
  params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
105
- params_copy['model'] = params_copy.pop('model_name', None)
95
+ params_copy["model"] = params_copy.pop("model_name", None)
106
96
 
107
97
  return BaseLLMReranker(**params_copy)
108
98
 
109
99
 
100
def safe_pandas_is_datetime(value) -> bool:
    """
    Check whether ``value`` is, or carries, a datetime64 dtype without raising.

    Thin wrapper around ``pd.api.types.is_datetime64_any_dtype`` for callers that
    pass arbitrary values (for example individual cells of a row). pandas tries to
    interpret such a value as a dtype specification, and for some inputs that
    lookup raises instead of returning ``False``.

    Note: this does NOT try to parse ``value`` as a date; it only inspects dtypes.

    :param value: array-like, dtype, or arbitrary scalar to inspect
    :return: True if pandas reports a datetime64 (naive or tz-aware) dtype, False
        otherwise, including when no dtype can be determined for ``value``
    """
    try:
        # bool() normalises numpy boolean results to a plain Python bool.
        return bool(pd.api.types.is_datetime64_any_dtype(value))
    except (TypeError, ValueError):
        # The value could not be interpreted as a dtype at all, so it is not a datetime dtype.
        return False
109
+
110
+
110
111
  class KnowledgeBaseTable:
111
112
  """
112
113
  Knowledge base table interface
@@ -127,9 +128,9 @@ class KnowledgeBaseTable:
127
128
  self.document_preprocessor = None # Reset existing preprocessor
128
129
  if config is not None:
129
130
  # Ensure content_column is set for JSON chunking if not already specified
130
- if config.get('type') == 'json_chunking' and config.get('json_chunking_config'):
131
- if 'content_column' not in config['json_chunking_config']:
132
- config['json_chunking_config']['content_column'] = 'content'
131
+ if config.get("type") == "json_chunking" and config.get("json_chunking_config"):
132
+ if "content_column" not in config["json_chunking_config"]:
133
+ config["json_chunking_config"]["content_column"] = "content"
133
134
 
134
135
  preprocessing_config = PreprocessingConfig(**config)
135
136
  self.document_preprocessor = PreprocessorFactory.create_preprocessor(preprocessing_config)
@@ -192,11 +193,13 @@ class KnowledgeBaseTable:
192
193
  query_text = item.value
193
194
 
194
195
  # replace content with embeddings
195
- conditions.append(FilterCondition(
196
- column=TableField.EMBEDDINGS.value,
197
- value=self._content_to_embeddings(item.value),
198
- op=FilterOperator.EQUAL,
199
- ))
196
+ conditions.append(
197
+ FilterCondition(
198
+ column=TableField.EMBEDDINGS.value,
199
+ value=self._content_to_embeddings(item.value),
200
+ op=FilterOperator.EQUAL,
201
+ )
202
+ )
200
203
  else:
201
204
  conditions.append(item)
202
205
 
@@ -238,7 +241,7 @@ class KnowledgeBaseTable:
238
241
  def add_relevance(self, df, query_text, relevance_threshold=None):
239
242
  relevance_column = TableField.RELEVANCE.value
240
243
 
241
- reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_llm")
244
+ reranking_model_params = get_model_params(self._kb.params.get("reranking_model"), "default_reranking_model")
242
245
  if reranking_model_params and query_text and len(df) > 0:
243
246
  # Use reranker for relevance score
244
247
  try:
@@ -250,7 +253,7 @@ class KnowledgeBaseTable:
250
253
 
251
254
  reranker = get_reranking_model_from_params(reranking_model_params)
252
255
  # Get documents to rerank
253
- documents = df['chunk_content'].tolist()
256
+ documents = df["chunk_content"].tolist()
254
257
  # Use the get_scores method with disable_events=True
255
258
  scores = reranker.get_scores(query_text, documents)
256
259
  # Add scores as the relevance column
@@ -263,21 +266,21 @@ class KnowledgeBaseTable:
263
266
  except Exception as e:
264
267
  logger.error(f"Error during reranking: {str(e)}")
265
268
  # Fallback to distance-based relevance
266
- if 'distance' in df.columns:
267
- df[relevance_column] = 1 / (1 + df['distance'])
269
+ if "distance" in df.columns:
270
+ df[relevance_column] = 1 / (1 + df["distance"])
268
271
  else:
269
272
  logger.info("No distance or reranker available")
270
273
 
271
- elif 'distance' in df.columns:
274
+ elif "distance" in df.columns:
272
275
  # Calculate relevance from distance
273
276
  logger.info("Calculating relevance from vector distance")
274
- df[relevance_column] = 1 / (1 + df['distance'])
277
+ df[relevance_column] = 1 / (1 + df["distance"])
275
278
  if relevance_threshold is not None:
276
279
  df = df[df[relevance_column] > relevance_threshold]
277
280
 
278
281
  else:
279
282
  df[relevance_column] = None
280
- df['distance'] = None
283
+ df["distance"] = None
281
284
  # Sort by relevance
282
285
  df = df.sort_values(by=relevance_column, ascending=False)
283
286
  return df
@@ -300,7 +303,7 @@ class KnowledgeBaseTable:
300
303
  columns = list(df.columns)
301
304
  # update id, get from metadata
302
305
  df[TableField.ID.value] = df[TableField.METADATA.value].apply(
303
- lambda m: None if m is None else m.get('original_doc_id')
306
+ lambda m: None if m is None else m.get("original_doc_id")
304
307
  )
305
308
 
306
309
  # id on first place
@@ -315,23 +318,14 @@ class KnowledgeBaseTable:
315
318
  if documents:
316
319
  self.insert_documents(documents)
317
320
 
318
- def insert_web_pages(
319
- self,
320
- urls: List[str],
321
- crawl_depth: int,
322
- limit: int,
323
- filters: List[str] = None
324
- ):
321
+ def insert_web_pages(self, urls: List[str], crawl_depth: int, limit: int, filters: List[str] = None):
325
322
  """Process and insert web pages"""
326
323
  if not self.document_loader:
327
324
  raise ValueError("Document loader not configured")
328
325
 
329
- documents = list(self.document_loader.load_web_pages(
330
- urls,
331
- limit=limit,
332
- crawl_depth=crawl_depth,
333
- filters=filters
334
- ))
326
+ documents = list(
327
+ self.document_loader.load_web_pages(urls, limit=limit, crawl_depth=crawl_depth, filters=filters)
328
+ )
335
329
  if documents:
336
330
  self.insert_documents(documents)
337
331
 
@@ -349,11 +343,9 @@ class KnowledgeBaseTable:
349
343
  if not rows:
350
344
  return
351
345
 
352
- documents = [Document(
353
- content=row.get('content', ''),
354
- id=row.get('id'),
355
- metadata=row.get('metadata', {})
356
- ) for row in rows]
346
+ documents = [
347
+ Document(content=row.get("content", ""), id=row.get("id"), metadata=row.get("metadata", {})) for row in rows
348
+ ]
357
349
 
358
350
  self.insert_documents(documents)
359
351
 
@@ -374,7 +366,7 @@ class KnowledgeBaseTable:
374
366
  conditions = db_handler.extract_conditions(query.where)
375
367
  doc_id = None
376
368
  for condition in conditions:
377
- if condition.column == 'chunk_id' and condition.op == FilterOperator.EQUAL:
369
+ if condition.column == "chunk_id" and condition.op == FilterOperator.EQUAL:
378
370
  doc_id = condition.value
379
371
 
380
372
  if cont_col in query.update_columns:
@@ -385,7 +377,7 @@ class KnowledgeBaseTable:
385
377
  doc = Document(
386
378
  id=doc_id,
387
379
  content=content.value,
388
- metadata={} # Empty metadata for content-only updates
380
+ metadata={}, # Empty metadata for content-only updates
389
381
  )
390
382
  processed_chunks = self.document_preprocessor.process_documents([doc])
391
383
  if processed_chunks:
@@ -424,7 +416,7 @@ class KnowledgeBaseTable:
424
416
  query: str,
425
417
  keywords: List[str] = None,
426
418
  metadata: Dict[str, str] = None,
427
- distance_function=DistanceFunction.COSINE_DISTANCE
419
+ distance_function=DistanceFunction.COSINE_DISTANCE,
428
420
  ) -> pd.DataFrame:
429
421
  query_df = pd.DataFrame.from_records([{TableField.CONTENT.value: query}])
430
422
  embeddings_df = self._df_to_embeddings(query_df)
@@ -433,14 +425,14 @@ class KnowledgeBaseTable:
433
425
  embeddings = embeddings_df.iloc[0][TableField.EMBEDDINGS.value]
434
426
  keywords_query = None
435
427
  if keywords is not None:
436
- keywords_query = ' '.join(keywords)
428
+ keywords_query = " ".join(keywords)
437
429
  db_handler = self.get_vector_db()
438
430
  return db_handler.hybrid_search(
439
431
  self._kb.vector_database_table,
440
432
  embeddings,
441
433
  query=keywords_query,
442
434
  metadata=metadata,
443
- distance_function=distance_function
435
+ distance_function=distance_function,
444
436
  )
445
437
 
446
438
  def clear(self):
@@ -473,7 +465,7 @@ class KnowledgeBaseTable:
473
465
 
474
466
  # First adapt column names to identify content and metadata columns
475
467
  adapted_df = self._adapt_column_names(df)
476
- content_columns = self._kb.params.get('content_columns', [TableField.CONTENT.value])
468
+ content_columns = self._kb.params.get("content_columns", [TableField.CONTENT.value])
477
469
 
478
470
  # Convert DataFrame rows to documents, creating separate documents for each content column
479
471
  raw_documents = []
@@ -491,15 +483,11 @@ class KnowledgeBaseTable:
491
483
 
492
484
  metadata = {
493
485
  **base_metadata,
494
- 'original_row_index': str(idx), # provide link to original row index
495
- 'content_column': col,
486
+ "original_row_index": str(idx), # provide link to original row index
487
+ "content_column": col,
496
488
  }
497
489
 
498
- raw_documents.append(Document(
499
- content=content_str,
500
- id=doc_id,
501
- metadata=metadata
502
- ))
490
+ raw_documents.append(Document(content=content_str, id=doc_id, metadata=metadata))
503
491
 
504
492
  # Apply preprocessing to all documents if preprocessor exists
505
493
  if self.document_preprocessor:
@@ -508,11 +496,16 @@ class KnowledgeBaseTable:
508
496
  processed_chunks = raw_documents # Use raw documents if no preprocessing
509
497
 
510
498
  # Convert processed chunks back to DataFrame with standard structure
511
- df = pd.DataFrame([{
512
- TableField.CONTENT.value: chunk.content,
513
- TableField.ID.value: chunk.id,
514
- TableField.METADATA.value: chunk.metadata
515
- } for chunk in processed_chunks])
499
+ df = pd.DataFrame(
500
+ [
501
+ {
502
+ TableField.CONTENT.value: chunk.content,
503
+ TableField.ID.value: chunk.id,
504
+ TableField.METADATA.value: chunk.metadata,
505
+ }
506
+ for chunk in processed_chunks
507
+ ]
508
+ )
516
509
 
517
510
  if df.empty:
518
511
  logger.warning("No valid content found in any content columns")
@@ -523,17 +516,17 @@ class KnowledgeBaseTable:
523
516
  df = pd.concat([df, df_emb], axis=1)
524
517
  db_handler = self.get_vector_db()
525
518
 
526
- if params is not None and params.get('kb_no_upsert', False):
519
+ if params is not None and params.get("kb_no_upsert", False):
527
520
  # speed up inserting by disable checking existing records
528
521
  db_handler.insert(self._kb.vector_database_table, df)
529
522
  else:
530
523
  db_handler.do_upsert(self._kb.vector_database_table, df)
531
524
 
532
525
  def _adapt_column_names(self, df: pd.DataFrame) -> pd.DataFrame:
533
- '''
526
+ """
534
527
  Convert input columns for vector db input
535
528
  - id, content and metadata
536
- '''
529
+ """
537
530
  # Debug incoming data
538
531
  logger.debug(f"Input DataFrame columns: {df.columns}")
539
532
  logger.debug(f"Input DataFrame first row: {df.iloc[0].to_dict()}")
@@ -542,7 +535,7 @@ class KnowledgeBaseTable:
542
535
  columns = list(df.columns)
543
536
 
544
537
  # -- prepare id --
545
- id_column = params.get('id_column')
538
+ id_column = params.get("id_column")
546
539
  if id_column is not None and id_column not in columns:
547
540
  id_column = None
548
541
 
@@ -552,8 +545,8 @@ class KnowledgeBaseTable:
552
545
  # Also check for case-insensitive 'id' column
553
546
  if id_column is None:
554
547
  column_map = {col.lower(): col for col in columns}
555
- if 'id' in column_map:
556
- id_column = column_map['id']
548
+ if "id" in column_map:
549
+ id_column = column_map["id"]
557
550
 
558
551
  if id_column is not None:
559
552
  columns.remove(id_column)
@@ -568,8 +561,8 @@ class KnowledgeBaseTable:
568
561
  logger.debug(f"Added IDs: {df_out[TableField.ID.value].tolist()}")
569
562
 
570
563
  # -- prepare content and metadata --
571
- content_columns = params.get('content_columns', [TableField.CONTENT.value])
572
- metadata_columns = params.get('metadata_columns')
564
+ content_columns = params.get("content_columns", [TableField.CONTENT.value])
565
+ metadata_columns = params.get("metadata_columns")
573
566
 
574
567
  logger.debug(f"Processing with: content_columns={content_columns}, metadata_columns={metadata_columns}")
575
568
 
@@ -577,25 +570,19 @@ class KnowledgeBaseTable:
577
570
  if content_columns:
578
571
  # Ensure content columns are case-insensitive
579
572
  column_map = {col.lower(): col for col in columns}
580
- content_columns = [
581
- column_map.get(col.lower(), col)
582
- for col in content_columns
583
- ]
573
+ content_columns = [column_map.get(col.lower(), col) for col in content_columns]
584
574
  logger.debug(f"Mapped content columns: {content_columns}")
585
575
 
586
576
  if metadata_columns:
587
577
  # Ensure metadata columns are case-insensitive
588
578
  column_map = {col.lower(): col for col in columns}
589
- metadata_columns = [
590
- column_map.get(col.lower(), col)
591
- for col in metadata_columns
592
- ]
579
+ metadata_columns = [column_map.get(col.lower(), col) for col in metadata_columns]
593
580
  logger.debug(f"Mapped metadata columns: {metadata_columns}")
594
581
 
595
582
  if content_columns is not None:
596
583
  content_columns = list(set(content_columns).intersection(columns))
597
584
  if len(content_columns) == 0:
598
- raise ValueError(f'Content columns {params.get("content_columns")} not found in dataset: {columns}')
585
+ raise ValueError(f"Content columns {params.get('content_columns')} not found in dataset: {columns}")
599
586
 
600
587
  if metadata_columns is not None:
601
588
  metadata_columns = list(set(metadata_columns).intersection(columns))
@@ -609,12 +596,13 @@ class KnowledgeBaseTable:
609
596
 
610
597
  # Add metadata
611
598
  if metadata_columns and len(metadata_columns) > 0:
599
+
612
600
  def convert_row_to_metadata(row):
613
601
  metadata = {}
614
602
  for col in metadata_columns:
615
603
  value = row[col]
616
604
  # Convert numpy/pandas types to Python native types
617
- if pd.api.types.is_datetime64_any_dtype(value) or isinstance(value, pd.Timestamp):
605
+ if safe_pandas_is_datetime(value) or isinstance(value, pd.Timestamp):
618
606
  value = str(value)
619
607
  elif pd.api.types.is_integer_dtype(value):
620
608
  value = int(value)
@@ -654,7 +642,7 @@ class KnowledgeBaseTable:
654
642
  if self._vector_db is None:
655
643
  database = db.Integration.query.get(self._kb.vector_database_id)
656
644
  if database is None:
657
- raise ValueError('Vector database not found. Is it deleted?')
645
+ raise ValueError("Vector database not found. Is it deleted?")
658
646
  database_name = database.name
659
647
  self._vector_db = self.session.integration_controller.get_data_handler(database_name)
660
648
  return self._vector_db
@@ -679,6 +667,15 @@ class KnowledgeBaseTable:
679
667
 
680
668
  model_id = self._kb.embedding_model_id
681
669
 
670
+ if model_id is None:
671
+ # call litellm handler
672
+ messages = list(df[TableField.CONTENT.value])
673
+ embedding_params = copy.deepcopy(config.get("default_embedding_model", {}))
674
+ embedding_params.update(self._kb.params["embedding_model"])
675
+ results = self.call_litellm_embedding(self.session, embedding_params, messages)
676
+ results = [[val] for val in results]
677
+ return pd.DataFrame(results, columns=[TableField.EMBEDDINGS.value])
678
+
682
679
  # get the input columns
683
680
  model_rec = db.session.query(db.Predictor).filter_by(id=model_id).first()
684
681
 
@@ -687,19 +684,15 @@ class KnowledgeBaseTable:
687
684
 
688
685
  project_datanode = self.session.datahub.get(model_project.name)
689
686
 
690
- model_using = model_rec.learn_args.get('using', {})
691
- input_col = model_using.get('question_column')
687
+ model_using = model_rec.learn_args.get("using", {})
688
+ input_col = model_using.get("question_column")
692
689
  if input_col is None:
693
- input_col = model_using.get('input_column')
690
+ input_col = model_using.get("input_column")
694
691
 
695
692
  if input_col is not None and input_col != TableField.CONTENT.value:
696
693
  df = df.rename(columns={TableField.CONTENT.value: input_col})
697
694
 
698
- df_out = project_datanode.predict(
699
- model_name=model_rec.name,
700
- df=df,
701
- params=self.model_params
702
- )
695
+ df_out = project_datanode.predict(model_name=model_rec.name, df=df, params=self.model_params)
703
696
 
704
697
  target = model_rec.to_predict[0]
705
698
  if target != TableField.EMBEDDINGS.value:
@@ -720,6 +713,23 @@ class KnowledgeBaseTable:
720
713
  res = self._df_to_embeddings(df)
721
714
  return res[TableField.EMBEDDINGS.value][0]
722
715
 
716
@staticmethod
def call_litellm_embedding(session, model_params, messages):
    """
    Compute embeddings for ``messages`` through the litellm handler.

    ``model_params`` is not modified; a deep copy is taken first. The ``provider``
    and ``model_name`` entries are combined into the ``"<provider>/<model_name>"``
    model string, ``base_url`` (if present) is passed on under the ``api_base`` key,
    and every remaining entry is forwarded to the handler unchanged.

    :param session: object exposing ``integration_controller.get_handler_module``
    :param model_params: dict with at least ``provider`` and ``model_name``
    :param messages: texts to embed
    :return: whatever ``Handler.embeddings`` of the litellm module returns
    :raises ValueError: if a required parameter is missing or the litellm handler
        is not available
    """
    args = copy.deepcopy(model_params or {})

    # Fail with an actionable message instead of a bare KeyError from dict.pop.
    missing = [key for key in ("model_name", "provider") if key not in args]
    if missing:
        raise ValueError(f"Embedding model parameters are missing required keys: {', '.join(missing)}")

    llm_model = args.pop("model_name")
    engine = args.pop("provider")

    llm_model = f"{engine}/{llm_model}"

    if "base_url" in args:
        args["api_base"] = args.pop("base_url")

    module = session.integration_controller.get_handler_module("litellm")
    # getattr: a module without a Handler attribute is treated as "not installed".
    if module is None or getattr(module, "Handler", None) is None:
        raise ValueError(f'Unable to use "{engine}" provider. Litellm handler is not installed')
    return module.Handler.embeddings(llm_model, messages, args)
732
+
723
733
  def build_rag_pipeline(self, retrieval_config: dict):
724
734
  """
725
735
  Builds a RAG pipeline with returned sources
@@ -735,10 +745,10 @@ class KnowledgeBaseTable:
735
745
  """
736
746
  # Get embedding model from knowledge base
737
747
  embeddings_model = None
738
- embedding_model_params = get_model_params(self._kb.params.get('embedding_model', {}), 'default_embedding_model')
748
+ embedding_model_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
739
749
  if self._kb.embedding_model:
740
750
  # Extract embedding model args from knowledge base table
741
- embedding_args = self._kb.embedding_model.learn_args.get('using', {})
751
+ embedding_args = self._kb.embedding_model.learn_args.get("using", {})
742
752
  # Construct the embedding model directly
743
753
  embeddings_model = construct_model_from_args(embedding_args)
744
754
  logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
@@ -750,21 +760,17 @@ class KnowledgeBaseTable:
750
760
  logger.debug("Using default embedding model as knowledge base has no embedding model")
751
761
 
752
762
  # Update retrieval config with knowledge base parameters
753
- kb_params = {
754
- 'vector_store_config': {
755
- 'kb_table': self
756
- }
757
- }
763
+ kb_params = {"vector_store_config": {"kb_table": self}}
758
764
 
759
765
  # Load and validate config
760
766
  try:
761
767
  rag_config = load_rag_config(retrieval_config, kb_params, embeddings_model)
762
768
 
763
769
  # Build LLM if specified
764
- if 'llm_model_name' in rag_config:
770
+ if "llm_model_name" in rag_config:
765
771
  llm_args = {"model_name": rag_config.llm_model_name}
766
772
  if not rag_config.llm_provider:
767
- llm_args['provider'] = get_llm_provider(llm_args)
773
+ llm_args["provider"] = get_llm_provider(llm_args)
768
774
  else:
769
775
  llm_args["provider"] = rag_config.llm_provider
770
776
  rag_config.llm = create_chat_model(llm_args)
@@ -785,6 +791,7 @@ class KnowledgeBaseTable:
785
791
  if isinstance(base_metadata, str):
786
792
  try:
787
793
  import ast
794
+
788
795
  return ast.literal_eval(base_metadata)
789
796
  except (SyntaxError, ValueError):
790
797
  logger.warning(f"Could not parse metadata: {base_metadata}. Using empty dict.")
@@ -794,6 +801,7 @@ class KnowledgeBaseTable:
794
801
  def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
795
802
  """Generate a deterministic document ID using the utility function."""
796
803
  from mindsdb.interfaces.knowledge_base.utils import generate_document_id
804
+
797
805
  return generate_document_id(content=content, provided_id=provided_id)
798
806
 
799
807
  def _convert_metadata_value(self, value):
@@ -846,14 +854,14 @@ class KnowledgeBaseController:
846
854
  self.session = session
847
855
 
848
856
  def add(
849
- self,
850
- name: str,
851
- project_name: str,
852
- embedding_model: Identifier,
853
- storage: Identifier,
854
- params: dict,
855
- preprocessing_config: Optional[dict] = None,
856
- if_not_exists: bool = False
857
+ self,
858
+ name: str,
859
+ project_name: str,
860
+ storage: Identifier,
861
+ params: dict,
862
+ preprocessing_config: Optional[dict] = None,
863
+ if_not_exists: bool = False,
864
+ # embedding_model: Identifier = None, # Legacy: Allow MindsDB models to be passed as embedding_model.
857
865
  ) -> db.KnowledgeBase:
858
866
  """
859
867
  Add a new knowledge base to the database
@@ -868,11 +876,11 @@ class KnowledgeBaseController:
868
876
  if preprocessing_config is not None:
869
877
  PreprocessingConfig(**preprocessing_config) # Validate before storing
870
878
  params = params or {}
871
- params['preprocessing'] = preprocessing_config
879
+ params["preprocessing"] = preprocessing_config
872
880
 
873
881
  # Check if vector_size is provided when using sparse vectors
874
- is_sparse = params.get('is_sparse')
875
- vector_size = params.get('vector_size')
882
+ is_sparse = params.get("is_sparse")
883
+ vector_size = params.get("vector_size")
876
884
  if is_sparse and vector_size is None:
877
885
  raise ValueError("vector_size is required when is_sparse=True")
878
886
 
@@ -889,41 +897,45 @@ class KnowledgeBaseController:
889
897
  return kb
890
898
  raise EntityExistsError("Knowledge base already exists", name)
891
899
 
892
- embedding_params = copy.deepcopy(config.get('default_embedding_model', {}))
893
-
894
- model_name = None
895
- model_project = project
896
- if embedding_model:
897
- model_name = embedding_model.parts[-1]
898
- if len(embedding_model.parts) > 1:
899
- model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
900
-
901
- elif 'embedding_model' in params:
902
- if isinstance(params['embedding_model'], str):
903
- # it is model name
904
- model_name = params['embedding_model']
905
- else:
906
- # it is params for model
907
- embedding_params.update(params['embedding_model'])
908
-
909
- if model_name is None:
910
- model_name = self._create_embedding_model(
911
- project.name,
912
- params=embedding_params,
913
- kb_name=name,
914
- )
915
- params['created_embedding_model'] = model_name
900
+ embedding_params = copy.deepcopy(config.get("default_embedding_model", {}))
901
+
902
+ # Legacy
903
+ # model_name = None
904
+ # model_project = project
905
+ # if embedding_model:
906
+ # model_name = embedding_model.parts[-1]
907
+ # if len(embedding_model.parts) > 1:
908
+ # model_project = self.session.database_controller.get_project(embedding_model.parts[-2])
909
+
910
+ # elif "embedding_model" in params:
911
+ # if isinstance(params["embedding_model"], str):
912
+ # # it is model name
913
+ # model_name = params["embedding_model"]
914
+ # else:
915
+ # # it is params for model
916
+ # embedding_params.update(params["embedding_model"])
917
+
918
+ if "embedding_model" in params:
919
+ if not isinstance(params["embedding_model"], dict):
920
+ raise ValueError("embedding_model should be JSON object with model parameters.")
921
+ embedding_params.update(params["embedding_model"])
922
+
923
+ # if model_name is None: # Legacy
924
+ model_name = self._create_embedding_model(
925
+ project.name,
926
+ params=embedding_params,
927
+ kb_name=name,
928
+ )
929
+ if model_name is not None:
930
+ params["created_embedding_model"] = model_name
916
931
 
917
932
  embedding_model_id = None
918
933
  if model_name is not None:
919
- model = self.session.model_controller.get_model(
920
- name=model_name,
921
- project_name=model_project.name
922
- )
923
- model_record = db.Predictor.query.get(model['id'])
934
+ model = self.session.model_controller.get_model(name=model_name, project_name=project.name)
935
+ model_record = db.Predictor.query.get(model["id"])
924
936
  embedding_model_id = model_record.id
925
937
 
926
- reranking_model_params = get_model_params(params.get('reranking_model', {}), 'default_llm')
938
+ reranking_model_params = get_model_params(params.get("reranking_model", {}), "default_reranking_model")
927
939
  if reranking_model_params:
928
940
  # Get reranking model from params.
929
941
  # This is called here to check validity of the parameters.
@@ -931,17 +943,17 @@ class KnowledgeBaseController:
931
943
 
932
944
  # search for the vector database table
933
945
  if storage is None:
934
- cloud_pg_vector = os.environ.get('KB_PGVECTOR_URL')
946
+ cloud_pg_vector = os.environ.get("KB_PGVECTOR_URL")
935
947
  if cloud_pg_vector:
936
948
  vector_table_name = name
937
949
  # Add sparse vector support for pgvector
938
950
  vector_db_params = {}
939
951
  # Check both explicit parameter and model configuration
940
- is_sparse = is_sparse or model_record.learn_args.get('using', {}).get('sparse')
952
+ is_sparse = is_sparse or model_record.learn_args.get("using", {}).get("sparse")
941
953
  if is_sparse:
942
- vector_db_params['is_sparse'] = True
954
+ vector_db_params["is_sparse"] = True
943
955
  if vector_size is not None:
944
- vector_db_params['vector_size'] = vector_size
956
+ vector_db_params["vector_size"] = vector_size
945
957
  vector_db_name = self._create_persistent_pgvector(vector_db_params)
946
958
 
947
959
  else:
@@ -949,26 +961,22 @@ class KnowledgeBaseController:
949
961
  vector_table_name = "default_collection"
950
962
  vector_db_name = self._create_persistent_chroma(name)
951
963
  # memorize to remove it later
952
- params['default_vector_storage'] = vector_db_name
964
+ params["default_vector_storage"] = vector_db_name
953
965
  elif len(storage.parts) != 2:
954
- raise ValueError('Storage param has to be vector db with table')
966
+ raise ValueError("Storage param has to be vector db with table")
955
967
  else:
956
968
  vector_db_name, vector_table_name = storage.parts
957
969
 
958
970
  # create table in vectordb before creating KB
959
- self.session.datahub.get(vector_db_name).integration_handler.create_table(
960
- vector_table_name
961
- )
962
- vector_database_id = self.session.integration_controller.get(vector_db_name)['id']
971
+ self.session.datahub.get(vector_db_name).integration_handler.create_table(vector_table_name)
972
+ vector_database_id = self.session.integration_controller.get(vector_db_name)["id"]
963
973
 
964
974
  # Store sparse vector settings in params if specified
965
975
  if is_sparse:
966
976
  params = params or {}
967
- params['vector_config'] = {
968
- 'is_sparse': is_sparse
969
- }
977
+ params["vector_config"] = {"is_sparse": is_sparse}
970
978
  if vector_size is not None:
971
- params['vector_config']['vector_size'] = vector_size
979
+ params["vector_config"]["vector_size"] = vector_size
972
980
 
973
981
  kb = db.KnowledgeBase(
974
982
  name=name,
@@ -990,7 +998,7 @@ class KnowledgeBaseController:
990
998
  if self.session.integration_controller.get(vector_store_name):
991
999
  return vector_store_name
992
1000
 
993
- self.session.integration_controller.add(vector_store_name, 'pgvector', params or {})
1001
+ self.session.integration_controller.add(vector_store_name, "pgvector", params or {})
994
1002
  return vector_store_name
995
1003
 
996
1004
  def _create_persistent_chroma(self, kb_name, engine="chromadb"):
@@ -1008,7 +1016,7 @@ class KnowledgeBaseController:
1008
1016
  self.session.integration_controller.add(vector_store_name, engine, connection_args)
1009
1017
  return vector_store_name
1010
1018
 
1011
- def _create_embedding_model(self, project_name, engine="openai", params: dict = None, kb_name=''):
1019
+ def _create_embedding_model(self, project_name, engine="openai", params: dict = None, kb_name=""):
1012
1020
  """create a default embedding model for knowledge base, if not specified"""
1013
1021
  model_name = f"kb_embedding_{kb_name}"
1014
1022
 
@@ -1020,44 +1028,47 @@ class KnowledgeBaseController:
1020
1028
  except PredictorRecordNotFound:
1021
1029
  pass
1022
1030
 
1023
- if 'provider' in params:
1024
- engine = params.pop('provider').lower()
1031
+ if params.get("provider", None) not in ("openai", "azure"):
1032
+ # try use litellm
1033
+ KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
1034
+ return
1025
1035
 
1026
- api_key = get_api_key(engine, params, strict=False) or params.pop('api_key')
1036
+ if "provider" in params:
1037
+ engine = params.pop("provider").lower()
1027
1038
 
1028
- if engine == 'azure_openai':
1029
- engine = 'openai'
1030
- params['provider'] = 'azure'
1039
+ api_key = get_api_key(engine, params, strict=False) or params.pop("api_key")
1031
1040
 
1032
- if engine == 'openai':
1033
- if 'question_column' not in params:
1034
- params['question_column'] = 'content'
1041
+ if engine == "azure_openai":
1042
+ engine = "openai"
1043
+ params["provider"] = "azure"
1044
+
1045
+ if engine == "openai":
1046
+ if "question_column" not in params:
1047
+ params["question_column"] = "content"
1035
1048
  if api_key:
1036
1049
  params[f"{engine}_api_key"] = api_key
1037
- if 'api_key' in params:
1038
- params.pop('api_key')
1039
- if 'base_url' in params:
1040
- params['api_base'] = params.pop('base_url')
1050
+ if "api_key" in params:
1051
+ params.pop("api_key")
1052
+ if "base_url" in params:
1053
+ params["api_base"] = params.pop("base_url")
1041
1054
 
1042
- params['engine'] = engine
1043
- params['join_learn_process'] = True
1044
- params['mode'] = 'embedding'
1055
+ params["engine"] = engine
1056
+ params["join_learn_process"] = True
1057
+ params["mode"] = "embedding"
1045
1058
 
1046
1059
  # Include API key if provided.
1047
1060
  statement = CreatePredictor(
1048
1061
  name=Identifier(parts=[project_name, model_name]),
1049
1062
  using=params,
1050
- targets=[
1051
- Identifier(parts=[TableField.EMBEDDINGS.value])
1052
- ]
1063
+ targets=[Identifier(parts=[TableField.EMBEDDINGS.value])],
1053
1064
  )
1054
1065
 
1055
1066
  command_executor = ExecuteCommands(self.session)
1056
1067
  resp = command_executor.answer_create_predictor(statement, project_name)
1057
1068
  # check model status
1058
1069
  record = resp.data.records[0]
1059
- if record['STATUS'] == 'error':
1060
- raise ValueError('Embedding model error:' + record['ERROR'])
1070
+ if record["STATUS"] == "error":
1071
+ raise ValueError("Embedding model error:" + record["ERROR"])
1061
1072
  return model_name
1062
1073
 
1063
1074
  def delete(self, name: str, project_name: int, if_exists: bool = False) -> None:
@@ -1084,16 +1095,16 @@ class KnowledgeBaseController:
1084
1095
  db.session.commit()
1085
1096
 
1086
1097
  # drop objects if they were created automatically
1087
- if 'default_vector_storage' in kb.params:
1098
+ if "default_vector_storage" in kb.params:
1088
1099
  try:
1089
- handler = self.session.datahub.get(kb.params['default_vector_storage']).integration_handler
1100
+ handler = self.session.datahub.get(kb.params["default_vector_storage"]).integration_handler
1090
1101
  handler.drop_table(kb.vector_database_table)
1091
- self.session.integration_controller.delete(kb.params['default_vector_storage'])
1102
+ self.session.integration_controller.delete(kb.params["default_vector_storage"])
1092
1103
  except EntityNotExistsError:
1093
1104
  pass
1094
- if 'created_embedding_model' in kb.params:
1105
+ if "created_embedding_model" in kb.params:
1095
1106
  try:
1096
- self.session.model_controller.delete_model(kb.params['created_embedding_model'], project_name)
1107
+ self.session.model_controller.delete_model(kb.params["created_embedding_model"], project_name)
1097
1108
  except EntityNotExistsError:
1098
1109
  pass
1099
1110
 
@@ -1124,11 +1135,11 @@ class KnowledgeBaseController:
1124
1135
  if kb is not None:
1125
1136
  table = KnowledgeBaseTable(kb, self.session)
1126
1137
  if params:
1127
- table.model_params = params.get('model')
1138
+ table.model_params = params.get("model")
1128
1139
 
1129
1140
  # Always configure preprocessing - either from params or default
1130
- if kb.params and 'preprocessing' in kb.params:
1131
- table.configure_preprocessing(kb.params['preprocessing'])
1141
+ if kb.params and "preprocessing" in kb.params:
1142
+ table.configure_preprocessing(kb.params["preprocessing"])
1132
1143
  else:
1133
1144
  table.configure_preprocessing(None) # This ensures default preprocessor is created
1134
1145
 
@@ -1144,32 +1155,30 @@ class KnowledgeBaseController:
1144
1155
  if project_name is not None:
1145
1156
  projects = [p for p in projects if p.name == project_name]
1146
1157
 
1147
- query = (
1148
- db.session.query(db.KnowledgeBase)
1149
- .filter(db.KnowledgeBase.project_id.in_(list([p.id for p in projects])))
1158
+ query = db.session.query(db.KnowledgeBase).filter(
1159
+ db.KnowledgeBase.project_id.in_(list([p.id for p in projects]))
1150
1160
  )
1151
1161
 
1152
1162
  data = []
1153
- project_names = {
1154
- i.id: i.name
1155
- for i in project_controller.get_list()
1156
- }
1163
+ project_names = {i.id: i.name for i in project_controller.get_list()}
1157
1164
 
1158
1165
  for record in query:
1159
1166
  vector_database = record.vector_database
1160
1167
  embedding_model = record.embedding_model
1161
1168
 
1162
- data.append({
1163
- 'id': record.id,
1164
- 'name': record.name,
1165
- 'project_id': record.project_id,
1166
- 'project_name': project_names[record.project_id],
1167
- 'embedding_model': embedding_model.name if embedding_model is not None else None,
1168
- 'vector_database': None if vector_database is None else vector_database.name,
1169
- 'vector_database_table': record.vector_database_table,
1170
- 'query_id': record.query_id,
1171
- 'params': record.params
1172
- })
1169
+ data.append(
1170
+ {
1171
+ "id": record.id,
1172
+ "name": record.name,
1173
+ "project_id": record.project_id,
1174
+ "project_name": project_names[record.project_id],
1175
+ "embedding_model": embedding_model.name if embedding_model is not None else None,
1176
+ "vector_database": None if vector_database is None else vector_database.name,
1177
+ "vector_database_table": record.vector_database_table,
1178
+ "query_id": record.query_id,
1179
+ "params": record.params,
1180
+ }
1181
+ )
1173
1182
 
1174
1183
  return data
1175
1184