MindsDB 25.9.1.2 → 25.9.3rc1 (py3-none-any.whl)
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +39 -20
- mindsdb/api/a2a/agent.py +7 -9
- mindsdb/api/a2a/common/server/server.py +3 -3
- mindsdb/api/a2a/common/server/task_manager.py +4 -4
- mindsdb/api/a2a/task_manager.py +15 -17
- mindsdb/api/common/middleware.py +9 -11
- mindsdb/api/executor/command_executor.py +2 -4
- mindsdb/api/executor/datahub/datanodes/datanode.py +2 -2
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +100 -48
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -4
- mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
- mindsdb/api/executor/exceptions.py +29 -10
- mindsdb/api/executor/planner/plan_join.py +17 -3
- mindsdb/api/executor/sql_query/sql_query.py +74 -74
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +1 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +0 -1
- mindsdb/api/executor/utilities/functions.py +6 -6
- mindsdb/api/executor/utilities/sql.py +32 -16
- mindsdb/api/http/gui.py +5 -11
- mindsdb/api/http/initialize.py +8 -10
- mindsdb/api/http/namespaces/agents.py +10 -12
- mindsdb/api/http/namespaces/analysis.py +13 -20
- mindsdb/api/http/namespaces/auth.py +1 -1
- mindsdb/api/http/namespaces/config.py +15 -11
- mindsdb/api/http/namespaces/databases.py +140 -201
- mindsdb/api/http/namespaces/file.py +15 -4
- mindsdb/api/http/namespaces/handlers.py +7 -2
- mindsdb/api/http/namespaces/knowledge_bases.py +8 -7
- mindsdb/api/http/namespaces/models.py +94 -126
- mindsdb/api/http/namespaces/projects.py +13 -22
- mindsdb/api/http/namespaces/sql.py +33 -25
- mindsdb/api/http/namespaces/tab.py +27 -37
- mindsdb/api/http/namespaces/views.py +1 -1
- mindsdb/api/http/start.py +14 -8
- mindsdb/api/mcp/__init__.py +2 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +15 -20
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +26 -50
- mindsdb/api/mysql/mysql_proxy/utilities/__init__.py +0 -1
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +6 -13
- mindsdb/api/postgres/postgres_proxy/postgres_packets/postgres_packets.py +40 -28
- mindsdb/integrations/handlers/byom_handler/byom_handler.py +168 -185
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +11 -5
- mindsdb/integrations/handlers/file_handler/file_handler.py +7 -0
- mindsdb/integrations/handlers/lightwood_handler/functions.py +45 -79
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +20 -2
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +18 -3
- mindsdb/integrations/handlers/shopify_handler/shopify_handler.py +25 -12
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +2 -1
- mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
- mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +4 -4
- mindsdb/integrations/libs/api_handler.py +10 -10
- mindsdb/integrations/libs/base.py +4 -4
- mindsdb/integrations/libs/llm/utils.py +2 -2
- mindsdb/integrations/libs/ml_handler_process/create_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/func_call_process.py +2 -7
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +37 -47
- mindsdb/integrations/libs/ml_handler_process/update_engine_process.py +4 -7
- mindsdb/integrations/libs/ml_handler_process/update_process.py +2 -7
- mindsdb/integrations/libs/process_cache.py +132 -140
- mindsdb/integrations/libs/response.py +18 -12
- mindsdb/integrations/libs/vectordatabase_handler.py +26 -0
- mindsdb/integrations/utilities/files/file_reader.py +6 -7
- mindsdb/integrations/utilities/rag/config_loader.py +37 -26
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +59 -9
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +4 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +55 -133
- mindsdb/integrations/utilities/rag/settings.py +58 -133
- mindsdb/integrations/utilities/rag/splitters/file_splitter.py +5 -15
- mindsdb/interfaces/agents/agents_controller.py +2 -1
- mindsdb/interfaces/agents/constants.py +0 -2
- mindsdb/interfaces/agents/litellm_server.py +34 -58
- mindsdb/interfaces/agents/mcp_client_agent.py +10 -10
- mindsdb/interfaces/agents/mindsdb_database_agent.py +5 -5
- mindsdb/interfaces/agents/run_mcp_agent.py +12 -21
- mindsdb/interfaces/chatbot/chatbot_task.py +20 -23
- mindsdb/interfaces/chatbot/polling.py +30 -18
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +10 -10
- mindsdb/interfaces/database/integrations.py +19 -2
- mindsdb/interfaces/file/file_controller.py +6 -6
- mindsdb/interfaces/functions/controller.py +1 -1
- mindsdb/interfaces/functions/to_markdown.py +2 -2
- mindsdb/interfaces/jobs/jobs_controller.py +5 -5
- mindsdb/interfaces/jobs/scheduler.py +3 -8
- mindsdb/interfaces/knowledge_base/controller.py +54 -25
- mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61
- mindsdb/interfaces/model/model_controller.py +170 -166
- mindsdb/interfaces/query_context/context_controller.py +14 -2
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +6 -4
- mindsdb/interfaces/skills/retrieval_tool.py +43 -50
- mindsdb/interfaces/skills/skill_tool.py +2 -2
- mindsdb/interfaces/skills/sql_agent.py +25 -19
- mindsdb/interfaces/storage/fs.py +114 -169
- mindsdb/interfaces/storage/json.py +19 -18
- mindsdb/interfaces/storage/model_fs.py +54 -92
- mindsdb/interfaces/tabs/tabs_controller.py +49 -72
- mindsdb/interfaces/tasks/task_monitor.py +3 -9
- mindsdb/interfaces/tasks/task_thread.py +7 -9
- mindsdb/interfaces/triggers/trigger_task.py +7 -13
- mindsdb/interfaces/triggers/triggers_controller.py +47 -50
- mindsdb/migrations/migrate.py +16 -16
- mindsdb/utilities/api_status.py +58 -0
- mindsdb/utilities/config.py +49 -0
- mindsdb/utilities/exception.py +40 -1
- mindsdb/utilities/fs.py +0 -1
- mindsdb/utilities/hooks/profiling.py +17 -14
- mindsdb/utilities/langfuse.py +40 -45
- mindsdb/utilities/log.py +272 -0
- mindsdb/utilities/ml_task_queue/consumer.py +52 -58
- mindsdb/utilities/ml_task_queue/producer.py +26 -30
- mindsdb/utilities/render/sqlalchemy_render.py +8 -7
- mindsdb/utilities/utils.py +2 -2
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/METADATA +266 -261
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/RECORD +119 -119
- mindsdb/api/mysql/mysql_proxy/utilities/exceptions.py +0 -14
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/WHEEL +0 -0
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.9.1.2.dist-info → mindsdb-25.9.3rc1.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/jobs/scheduler.py +3 -8

@@ -14,7 +14,6 @@ logger = log.getLogger(__name__)
 
 
 def execute_async(q_in, q_out):
-
     while True:
         task = q_in.get()
 
@@ -44,7 +43,7 @@ class Scheduler:
         self.q_in = queue.Queue()
         self.q_out = queue.Queue()
         self.work_thread = threading.Thread(
-            target=execute_async, args=(self.q_in, self.q_out), name=
+            target=execute_async, args=(self.q_in, self.q_out), name="Scheduler.execute_async"
         )
         self.work_thread.start()
 
@@ -58,14 +57,13 @@ class Scheduler:
         check_interval = self.config.get("jobs", {}).get("check_interval", 30)
 
         while True:
-
             logger.debug("Scheduler check timetable")
             try:
                 self.check_timetable()
             except (SystemExit, KeyboardInterrupt):
                 raise
-            except Exception
-                logger.
+            except Exception:
+                logger.exception("Error in 'scheduler_monitor'")
 
         # different instances should start in not the same time
 
@@ -83,7 +81,6 @@ class Scheduler:
         db.session.remove()
 
     def execute_task(self, record_id, exec_method):
-
         executor = JobsExecutor()
         if exec_method == "local":
             history_id = executor.lock_record(record_id)
@@ -117,7 +114,6 @@ class Scheduler:
         raise NotImplementedError()
 
     def start(self):
-
         config = Config()
         db.init()
         self.config = config
@@ -127,7 +123,6 @@ class Scheduler:
         try:
             self.scheduler_monitor()
         except (KeyboardInterrupt, SystemExit):
-
             self.stop_thread()
             pass
 
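Most of the error-handling fixes in this release follow the pattern in the hunk above: a bare or truncated logger call inside an except block becomes logger.exception(...). A minimal standalone sketch of the pattern, where check_timetable is a stand-in for the scheduler's real work, not code from this package:

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def check_timetable():
    # Stand-in for the scheduler's real work.
    raise RuntimeError("simulated scheduler failure")


try:
    check_timetable()
except (SystemExit, KeyboardInterrupt):
    raise  # interpreter-level exits are re-raised, as in the diff
except Exception:
    # logger.exception() logs at ERROR level and appends the active
    # traceback -- equivalent to logger.error(..., exc_info=True).
    logger.exception("Error in 'scheduler_monitor'")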
mindsdb/interfaces/knowledge_base/controller.py +54 -25

@@ -22,12 +22,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
     TableField,
     VectorStoreHandler,
 )
-from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
-from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
 from mindsdb.integrations.utilities.handler_utils import get_api_key
-from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
-    construct_model_from_args,
-)
 
 from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS, MAX_INSERT_BATCH_SIZE
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
@@ -56,6 +51,7 @@ class KnowledgeBaseInputParams(BaseModel):
     content_columns: List[str] | None = None
     id_column: str | None = None
     kb_no_upsert: bool = False
+    kb_skip_existing: bool = False
     embedding_model: Dict[Text, Any] | None = None
     is_sparse: bool = False
     vector_size: int | None = None
@@ -83,9 +79,9 @@ def get_model_params(model_params: dict, default_config_key: str):
     return combined_model_params
 
 
-def
+def adapt_embedding_model_params(embedding_model_params: dict):
     """
-
+    Prepare parameters for embedding model.
     """
     params_copy = copy.deepcopy(embedding_model_params)
     provider = params_copy.pop("provider", None).lower()
@@ -106,7 +102,7 @@ def get_embedding_model_from_params(embedding_model_params: dict):
     params_copy.pop("api_key", None)
     params_copy["model"] = params_copy.pop("model_name", None)
 
-    return
+    return params_copy
 
 
 def get_reranking_model_from_params(reranking_model_params: dict):
@@ -265,9 +261,9 @@ class KnowledgeBaseTable:
                     gt_filtering = True
                    logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
                except (ValueError, TypeError) as e:
-                    error_msg = f"Invalid relevance_threshold value: {item.value}. {
+                    error_msg = f"Invalid relevance_threshold value: {item.value}. {e}"
                    logger.error(error_msg)
-                    raise ValueError(error_msg)
+                    raise ValueError(error_msg) from e
            elif (item.column == "relevance") and (item.op.value not in relevance_threshold_allowed_operators):
                raise ValueError(
                    f"Invalid operator for relevance: {item.op.value}. Only the following operators are allowed: "
@@ -318,13 +314,20 @@ class KnowledgeBaseTable:
         self.addapt_conditions_columns(conditions)
 
         # Set default limit if query is present
+        limit = query.limit.value if query.limit is not None else None
         if query_text is not None:
-            limit = query.limit.value if query.limit is not None else None
             if limit is None:
                 limit = 10
             elif limit > 100:
                 limit = 100
-
+
+            if not disable_reranking:
+                # expand limit, get more records before reranking usage:
+                # get twice size of input but not greater than 30
+                query_limit = min(limit * 2, limit + 30)
+            else:
+                query_limit = limit
+            query.limit = Constant(query_limit)
 
         allowed_metadata_columns = self._get_allowed_metadata_columns()
         df = db_handler.dispatch_select(query, conditions, allowed_metadata_columns=allowed_metadata_columns)
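The hunk above over-fetches candidates when reranking is enabled: the vector query limit is doubled, capped at 30 extra rows, and the result is trimmed back to the caller's limit after relevance scoring (see the df = df[:limit] hunk that follows). A small sketch of the arithmetic, with illustrative inputs:

def candidate_limit(limit: int, disable_reranking: bool) -> int:
    # Over-fetch for the reranker: twice the request, but never more
    # than 30 extra rows; with reranking disabled, fetch exactly `limit`.
    if not disable_reranking:
        return min(limit * 2, limit + 30)
    return limit


for limit in (5, 10, 40, 100):
    print(limit, "->", candidate_limit(limit, disable_reranking=False))
# 5 -> 10, 10 -> 20, 40 -> 70, 100 -> 130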
@@ -375,6 +378,8 @@ class KnowledgeBaseTable:
 
         # Check if we have a rerank_model configured in KB params
         df = self.add_relevance(df, query_text, relevance_threshold, disable_reranking)
+        if limit is not None:
+            df = df[:limit]
 
         # if relevance filtering method is strictly GREATER THAN we filter the df
         if gt_filtering:
@@ -407,7 +412,6 @@ class KnowledgeBaseTable:
         if reranking_model_params and query_text and len(df) > 0 and not disable_reranking:
             # Use reranker for relevance score
 
-            logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
             # Apply custom filtering threshold if provided
             if relevance_threshold is not None:
                 reranking_model_params["filtering_threshold"] = relevance_threshold
@@ -424,7 +428,6 @@ class KnowledgeBaseTable:
             # Filter by threshold
             scores_array = np.array(scores)
             df = df[scores_array >= reranker.filtering_threshold]
-            logger.debug(f"Applied reranking with params: {reranking_model_params}")
 
         elif "distance" in df.columns:
             # Calculate relevance from distance
@@ -547,7 +550,7 @@ class KnowledgeBaseTable:
             if processed_chunks:
                 content.value = processed_chunks[0].content
 
-        query.update_columns[emb_col] = Constant(self._content_to_embeddings(content))
+        query.update_columns[emb_col] = Constant(self._content_to_embeddings(content.value))
 
         if "metadata" not in query.update_columns:
             query.update_columns["metadata"] = Constant({})
@@ -678,6 +681,25 @@ class KnowledgeBaseTable:
             logger.warning("No valid content found in any content columns")
             return
 
+        # Check if we should skip existing items (before calculating embeddings)
+        if params is not None and params.get("kb_skip_existing", False):
+            logger.debug(f"Checking for existing items to skip before processing {len(df)} items")
+            db_handler = self.get_vector_db()
+
+            # Get list of IDs from current batch
+            current_ids = df[TableField.ID.value].dropna().astype(str).tolist()
+            if current_ids:
+                # Check which IDs already exist
+                existing_ids = db_handler.check_existing_ids(self._kb.vector_database_table, current_ids)
+                if existing_ids:
+                    # Filter out existing items
+                    df = df[~df[TableField.ID.value].astype(str).isin(existing_ids)]
+                    logger.info(f"Skipped {len(existing_ids)} existing items, processing {len(df)} new items")
+
+                if df.empty:
+                    logger.info("All items already exist, nothing to insert")
+                    return
+
         # add embeddings and send to vector db
         df_emb = self._df_to_embeddings(df)
         df = pd.concat([df, df_emb], axis=1)
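A hedged sketch of the kb_skip_existing flow added above: the batch is filtered against IDs already present in the vector store before any embeddings are computed. fake_check_existing_ids is a stand-in for the handler's check_existing_ids (added to vectordatabase_handler.py in this release); the data is illustrative:

import pandas as pd


def fake_check_existing_ids(table_name, ids):
    # Stand-in: pretend rows "1" and "3" are already stored.
    stored = {"1", "3"}
    return [i for i in ids if i in stored]


df = pd.DataFrame({"id": ["1", "2", "3", "4"], "content": ["a", "b", "c", "d"]})

current_ids = df["id"].dropna().astype(str).tolist()
existing_ids = fake_check_existing_ids("my_kb_table", current_ids)
if existing_ids:
    # Drop rows whose IDs already exist, before any embedding work.
    df = df[~df["id"].astype(str).isin(existing_ids)]

print(df)  # only ids "2" and "4" remain to be embedded and inserted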
@@ -915,7 +937,12 @@ class KnowledgeBaseTable:
             ValueError: If the configuration is invalid or required components are missing
         """
         # Get embedding model from knowledge base
-
+        from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
+            construct_model_from_args,
+        )
+        from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
+        from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
+
         embedding_model_params = get_model_params(self._kb.params.get("embedding_model", {}), "default_embedding_model")
         if self._kb.embedding_model:
             # Extract embedding model args from knowledge base table
@@ -924,7 +951,7 @@ class KnowledgeBaseTable:
             embeddings_model = construct_model_from_args(embedding_args)
             logger.debug(f"Using knowledge base embedding model with args: {embedding_args}")
         elif embedding_model_params:
-            embeddings_model =
+            embeddings_model = construct_model_from_args(adapt_embedding_model_params(embedding_model_params))
             logger.debug(f"Using knowledge base embedding model from params: {self._kb.params['embedding_model']}")
         else:
             embeddings_model = DEFAULT_EMBEDDINGS_MODEL_CLASS()
@@ -952,8 +979,8 @@ class KnowledgeBaseTable:
             return rag
 
         except Exception as e:
-            logger.
-            raise ValueError(f"Failed to build RAG pipeline: {str(e)}")
+            logger.exception("Error building RAG pipeline:")
+            raise ValueError(f"Failed to build RAG pipeline: {str(e)}") from e
 
     def _parse_metadata(self, base_metadata):
         """Helper function to robustly parse metadata string to dict"""
@@ -1065,7 +1092,7 @@ class KnowledgeBaseController:
             msg = "\n".join(problems)
             if len(problems) > 1:
                 msg = "\n" + msg
-            raise ValueError(f"Problem with knowledge base parameters: {msg}")
+            raise ValueError(f"Problem with knowledge base parameters: {msg}") from e
 
         # Validate preprocessing config first if provided
         if preprocessing_config is not None:
@@ -1110,6 +1137,9 @@ class KnowledgeBaseController:
             model_record = db.Predictor.query.get(model["id"])
             embedding_model_id = model_record.id
 
+            if model_record.learn_args.get("using", {}).get("sparse"):
+                is_sparse = True
+
         # if params.get("reranking_model", {}) is bool and False we evaluate it to empty dictionary
         reranking_model_params = params.get("reranking_model", {})
 
@@ -1128,7 +1158,7 @@ class KnowledgeBaseController:
                 reranker = get_reranking_model_from_params(reranking_model_params)
                 reranker.get_scores("test", ["test"])
             except (ValueError, RuntimeError) as e:
-                raise RuntimeError(f"Problem with reranker config: {e}")
+                raise RuntimeError(f"Problem with reranker config: {e}") from e
 
         # search for the vector database table
         if storage is None:
@@ -1138,7 +1168,6 @@ class KnowledgeBaseController:
             # Add sparse vector support for pgvector
             vector_db_params = {}
             # Check both explicit parameter and model configuration
-            is_sparse = is_sparse or model_record.learn_args.get("using", {}).get("sparse")
             if is_sparse:
                 vector_db_params["is_sparse"] = True
             if vector_size is not None:
@@ -1242,7 +1271,7 @@ class KnowledgeBaseController:
         try:
             KnowledgeBaseTable.call_litellm_embedding(self.session, params, ["test"])
         except Exception as e:
-            raise RuntimeError(f"Problem with embedding model config: {e}")
+            raise RuntimeError(f"Problem with embedding model config: {e}") from e
         return
 
         params = copy.deepcopy(params)
@@ -1295,8 +1324,8 @@ class KnowledgeBaseController:
         """
         try:
             project = self.session.database_controller.get_project(project_name)
-        except ValueError:
-            raise ValueError(f"Project not found: {project_name}")
+        except ValueError as e:
+            raise ValueError(f"Project not found: {project_name}") from e
         project_id = project.id
 
         # check if knowledge base exists
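Several hunks in this file append `from e` to re-raised errors. A minimal sketch of what that changes: explicit chaining records the original exception as __cause__, so tracebacks mark it as the direct cause instead of only implicit context (names below are illustrative):

def get_project(name):
    # Stand-in for the controller lookup that fails.
    raise ValueError("no such record")


try:
    try:
        get_project("my_project")
    except ValueError as e:
        raise ValueError("Project not found: my_project") from e
except ValueError as err:
    assert isinstance(err.__cause__, ValueError)
    print("cause:", err.__cause__)  # -> cause: no such record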
mindsdb/interfaces/knowledge_base/preprocessing/json_chunker.py +40 -61

@@ -1,13 +1,10 @@
-
+import ast
 import json
+from typing import List, Dict, Any, Optional
+
 import pandas as pd
-import ast
 
-from mindsdb.interfaces.knowledge_base.preprocessing.models import (
-    Document,
-    ProcessedChunk,
-    JSONChunkingConfig
-)
+from mindsdb.interfaces.knowledge_base.preprocessing.models import Document, ProcessedChunk, JSONChunkingConfig
 from mindsdb.interfaces.knowledge_base.preprocessing.document_preprocessor import DocumentPreprocessor
 from mindsdb.utilities import log
 
@@ -50,7 +47,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 chunks = self._process_json_data(json_data, doc)
                 all_chunks.extend(chunks)
             except Exception as e:
-                logger.
+                logger.exception(f"Error processing document {doc.id}:")
                 error_chunk = self._create_error_chunk(doc, str(e))
                 all_chunks.append(error_chunk)
 
@@ -76,8 +73,8 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
            # If JSON parsing fails, try as Python literal
            try:
                return ast.literal_eval(doc.content)
-            except (SyntaxError, ValueError)
-                logger.
+            except (SyntaxError, ValueError):
+                logger.exception(f"Error parsing content for document {doc.id}:")
                # We'll create the error chunk in the main process_documents method
                return None
 
@@ -117,7 +114,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         return ProcessedChunk(
             id=f"{doc.id}_error",
             content=f"Error processing document: {error_message}",
-            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata)
+            metadata=self._prepare_chunk_metadata(doc.id, 0, doc.metadata),
         )
 
     def _process_json_list(self, json_list: List, doc: Document) -> List[ProcessedChunk]:
@@ -132,20 +129,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             elif isinstance(item, list):
                 # Handle nested lists by converting to string representation
                 chunk = self._create_chunk_from_primitive(
-                    json.dumps(item),
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
+                    json.dumps(item), doc, chunk_index=i, total_chunks=total_objects
                 )
                 chunks.append(chunk)
             else:
                 # Handle primitive values
-                chunk = self._create_chunk_from_primitive(
-                    item,
-                    doc,
-                    chunk_index=i,
-                    total_chunks=total_objects
-                )
+                chunk = self._create_chunk_from_primitive(item, doc, chunk_index=i, total_chunks=total_objects)
                 chunks.append(chunk)
 
         return chunks
@@ -159,7 +148,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             try:
                 json_dict = json.loads(json_dict)
             except json.JSONDecodeError:
-                logger.
+                logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
                 return [self._create_error_chunk(doc, "Invalid JSON string")]
 
         # Filter fields based on include/exclude lists
@@ -190,31 +179,25 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 start_char=0,
                 end_char=len(field_content),
                 provided_id=doc.id,
-                content_column=self.config.content_column
+                content_column=self.config.content_column,
             )
 
             # Create and add the chunk
-            chunk = ProcessedChunk(
-                id=chunk_id,
-                content=field_content,
-                metadata=metadata
-            )
+            chunk = ProcessedChunk(id=chunk_id, content=field_content, metadata=metadata)
             chunks.append(chunk)
 
         return chunks
 
-    def _create_chunk_from_dict(
-
-
-        chunk_index: int,
-        total_chunks: int) -> ProcessedChunk:
+    def _create_chunk_from_dict(
+        self, json_dict: Dict, doc: Document, chunk_index: int, total_chunks: int
+    ) -> ProcessedChunk:
         """Create a chunk from a JSON dictionary"""
         # Ensure we're working with a dictionary
         if isinstance(json_dict, str):
             try:
                 json_dict = json.loads(json_dict)
             except json.JSONDecodeError:
-                logger.
+                logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
                 return self._create_error_chunk(doc, "Invalid JSON string")
 
         # Format the content
@@ -223,9 +206,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             filtered_dict = self._filter_fields(flattened)
             content = self._dict_to_text(filtered_dict)
         else:
-            filtered_dict = {
-
-
+            filtered_dict = {
+                k: v
+                for k, v in json_dict.items()
+                if (not self.config.include_fields or k in self.config.include_fields)
+                and k not in self.config.exclude_fields
+            }
             content = json.dumps(filtered_dict, indent=2)
 
         # Create metadata
@@ -241,22 +227,23 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
     def _filter_fields(self, flattened_dict: Dict) -> Dict:
         """Filter fields based on include/exclude configuration"""
         # If include_fields is specified, only keep those fields
         if self.config.include_fields:
-            filtered_dict = {
-
-
+            filtered_dict = {
+                k: v
+                for k, v in flattened_dict.items()
+                if any(
+                    k == field or k.startswith(field + self.config.nested_delimiter)
+                    for field in self.config.include_fields
+                )
+            }
         else:
             filtered_dict = flattened_dict.copy()
 
@@ -276,11 +263,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         return filtered_dict
 
     def _create_chunk_from_primitive(
-
-        value: Any,
-        doc: Document,
-        chunk_index: int = 0,
-        total_chunks: int = 1
+        self, value: Any, doc: Document, chunk_index: int = 0, total_chunks: int = 1
     ) -> ProcessedChunk:
         """Create a chunk from a primitive value"""
         content = str(value)
@@ -300,16 +283,12 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             start_char=0,
             end_char=len(content),
             provided_id=doc.id,
-            content_column=self.config.content_column
+            content_column=self.config.content_column,
         )
 
-        return ProcessedChunk(
-            id=chunk_id,
-            content=content,
-            metadata=metadata
-        )
+        return ProcessedChunk(id=chunk_id, content=content, metadata=metadata)
 
-    def _flatten_dict(self, d: Dict, delimiter: str =
+    def _flatten_dict(self, d: Dict, delimiter: str = ".", prefix: str = "") -> Dict:
         """Flatten a nested dictionary structure"""
         result = {}
         for k, v in d.items():
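The hunk above shows only the new _flatten_dict signature. A hedged sketch of what a flattener with that signature plausibly does -- the body below is an assumption, not code from the diff:

from typing import Any, Dict


def flatten_dict(d: Dict[str, Any], delimiter: str = ".", prefix: str = "") -> Dict[str, Any]:
    # Join nested keys with the delimiter; non-dict values are leaves.
    result: Dict[str, Any] = {}
    for k, v in d.items():
        key = f"{prefix}{delimiter}{k}" if prefix else k
        if isinstance(v, dict):
            result.update(flatten_dict(v, delimiter, key))
        else:
            result[key] = v
    return result


print(flatten_dict({"user": {"name": "ada", "tags": [1, 2]}, "id": 7}))
# {'user.name': 'ada', 'user.tags': [1, 2], 'id': 7}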
@@ -337,7 +316,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
                 # Format list of dictionaries
                 lines.append(f"{key}:")
                 for i, item in enumerate(value):
-                    lines.append(f" Item {i+1}:")
+                    lines.append(f" Item {i + 1}:")
                     for k, v in item.items():
                         lines.append(f" {k}: {v}")
             else:
@@ -362,7 +341,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
             # Format list of dictionaries
             lines = [f"{key}:"]
             for i, item in enumerate(value):
-                lines.append(f" Item {i+1}:")
+                lines.append(f" Item {i + 1}:")
                 for k, v in item.items():
                     lines.append(f" {k}: {v}")
             return "\n".join(lines)
@@ -380,7 +359,7 @@ class JSONChunkingPreprocessor(DocumentPreprocessor):
         try:
             json_dict = json.loads(json_dict)
         except json.JSONDecodeError:
-            logger.
+            logger.exception(f"Error parsing JSON string: {json_dict[:100]}...")
             return
 
         # Always flatten the dictionary for metadata extraction