MindsDB 25.4.2.1__py3-none-any.whl → 25.4.3.0__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of MindsDB might be problematic.

mindsdb/__about__.py CHANGED
@@ -1,6 +1,6 @@
  __title__ = 'MindsDB'
  __package_name__ = 'mindsdb'
- __version__ = '25.4.2.1'
+ __version__ = '25.4.3.0'
  __description__ = "MindsDB's AI SQL Server enables developers to build AI tools that need access to real-time data to perform their tasks"
  __email__ = "jorge@mindsdb.com"
  __author__ = 'MindsDB Inc'
mindsdb/__main__.py CHANGED
@@ -299,15 +299,38 @@ if __name__ == '__main__':
  logger.debug(f"Checking if default project {config.get('default_project')} exists")
  project_controller = ProjectController()

- current_default_project = project_controller.get(is_default=True)
- if current_default_project.record.name != config.get('default_project'):
+ try:
+ current_default_project = project_controller.get(is_default=True)
+ except EntityNotExistsError:
+ # In previous versions, the default project could be deleted. This is no longer possible.
+ current_default_project = None
+
+ if current_default_project:
+ if current_default_project.record.name != config.get('default_project'):
+ try:
+ project_controller.get(name=config.get('default_project'))
+ log.critical(f"A project with the name '{config.get('default_project')}' already exists")
+ sys.exit(1)
+ except EntityNotExistsError:
+ pass
+ project_controller.update(current_default_project.record.id, new_name=config.get('default_project'))
+
+ # Legacy: If the default project does not exist, mark the new one as default.
+ else:
  try:
- new_default_project = project_controller.get(name=config.get('default_project'))
- log.critical(f"A project with the name '{config.get('default_project')}' already exists")
- sys.exit(1)
+ project_controller.get(name=config.get('default_project'))
  except EntityNotExistsError:
- pass
- project_controller.update(current_default_project.record.id, new_name=config.get('default_project'))
+ log.critical(
+ f"A project with the name '{config.get('default_project')}' does not exist"
+ )
+ raise
+
+ project_controller.update(
+ name=config.get('default_project'),
+ new_metadata={
+ "is_default": True
+ }
+ )

  apis = os.getenv('MINDSDB_APIS') or config.cmd_args.api

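Read as straight code, the new startup logic above looks roughly like the sketch below (indentation reconstructed from the control flow; treat it as an illustration rather than the exact file contents):

try:
    current_default_project = project_controller.get(is_default=True)
except EntityNotExistsError:
    # In previous versions, the default project could be deleted. This is no longer possible.
    current_default_project = None

if current_default_project:
    # Rename the existing default project if the configured name differs,
    # refusing to proceed if that name is already taken.
    if current_default_project.record.name != config.get('default_project'):
        try:
            project_controller.get(name=config.get('default_project'))
            log.critical(f"A project with the name '{config.get('default_project')}' already exists")
            sys.exit(1)
        except EntityNotExistsError:
            pass
        project_controller.update(current_default_project.record.id, new_name=config.get('default_project'))
else:
    # Legacy path: no project is flagged as default yet, so mark the configured one.
    try:
        project_controller.get(name=config.get('default_project'))
    except EntityNotExistsError:
        log.critical(f"A project with the name '{config.get('default_project')}' does not exist")
        raise
    project_controller.update(name=config.get('default_project'), new_metadata={"is_default": True})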
@@ -1,6 +1,4 @@
  import pandas as pd
- import threading
- import queue
  from typing import List

  from mindsdb_sql_parser import ASTNode
@@ -11,9 +9,10 @@ from mindsdb.interfaces.query_context.context_controller import RunningQuery
  from mindsdb.api.executor.sql_query.result_set import ResultSet
  from mindsdb.utilities import log
  from mindsdb.utilities.config import Config
- from mindsdb.utilities.context import Context, context as ctx
  from mindsdb.utilities.partitioning import get_max_thread_count, split_data_frame
  from mindsdb.api.executor.sql_query.steps.fetch_dataframe import get_table_alias, get_fill_param_fnc
+ from mindsdb.utilities.context_executor import ContextThreadPoolExecutor
+

  from .base import BaseStepCall

@@ -178,9 +177,6 @@ class FetchDataframePartitionCall(BaseStepCall):
  """

  # create communication queues
- queue_in = queue.Queue()
- queue_out = queue.Queue()
- self.stop_event = threading.Event()

  if thread_count is None:
  thread_count = get_max_thread_count()
@@ -191,16 +187,9 @@ class FetchDataframePartitionCall(BaseStepCall):
  if partition_size < 10:
  partition_size = 10

- # create N workers pool
- workers = []
  results = []

- try:
- for i in range(thread_count):
- worker = threading.Thread(target=self._worker, daemon=True, args=(ctx.dump(), queue_in,
- queue_out, self.stop_event))
- worker.start()
- workers.append(worker)
+ with ContextThreadPoolExecutor(max_workers=thread_count) as executor:

  while True:
  # fetch batch
@@ -220,69 +209,23 @@ class FetchDataframePartitionCall(BaseStepCall):
  max_track_value = run_query.get_max_track_value(df)

  # split into chunks and send to workers
- sent_chunks = 0
+ futures = []
  for df2 in split_data_frame(df, partition_size):
- queue_in.put([sent_chunks, df2])
- sent_chunks += 1
+ futures.append(executor.submit(self.exec_sub_steps, df2))

- batch_results = []
- for i in range(sent_chunks):
- res = queue_out.get()
- if 'error' in res:
+ for future in futures:
+ try:
+ results.append(future.result())
+ except Exception as e:
  if on_error == 'skip':
- logger.error(res['error'])
+ logger.error(e)
  else:
- raise RuntimeError(res['error'])
-
- if res['data']:
- batch_results.append(res)
-
- # sort results
- batch_results.sort(key=lambda x: x['num'])
-
- results.append(self.concat_results(
- [item['data'] for item in batch_results]
- ))
+ executor.shutdown()
+ raise e

  # TODO
  # 1. get next batch without updating track_value:
  # it allows to keep queue_in filled with data between fetching batches
  run_query.set_progress(df, max_track_value)
- finally:
- self.close_workers(workers)

  return self.concat_results(results)
-
- def close_workers(self, workers: List[threading.Thread]):
- """
- Sent signal to workers to stop
- """
-
- self.stop_event.set()
- for worker in workers:
- if worker.is_alive():
- worker.join()
-
- def _worker(self, context: Context, queue_in: queue.Queue, queue_out: queue.Queue, stop_event: threading.Event):
- """
- Worker function. Execute incoming tasks unless stop_event is set
- """
- ctx.load(context)
- while True:
- if stop_event.is_set():
- break
-
- try:
- chunk_num, df = queue_in.get(timeout=1)
- if df is None:
- continue
-
- sub_data = self.exec_sub_steps(df)
-
- queue_out.put({'data': sub_data, 'num': chunk_num})
- except queue.Empty:
- continue
-
- except Exception as e:
- queue_out.put({'error': str(e)})
- stop_event.set()
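The hand-rolled worker threads and queues are replaced by ContextThreadPoolExecutor, whose implementation is not part of this diff. A minimal sketch of what such a context-propagating executor could look like, reusing the ctx.dump()/ctx.load() API that the removed _worker relied on (the real class in mindsdb.utilities.context_executor may differ):

from concurrent.futures import ThreadPoolExecutor

from mindsdb.utilities.context import context as ctx


class ContextThreadPoolExecutor(ThreadPoolExecutor):
    """ThreadPoolExecutor that copies the caller's MindsDB context into each task."""

    def __init__(self, max_workers=None):
        super().__init__(max_workers=max_workers)
        # Snapshot the submitting thread's context once, at construction time.
        self._ctx_dump = ctx.dump()

    def submit(self, fn, *args, **kwargs):
        ctx_dump = self._ctx_dump

        def wrapper(*a, **kw):
            # Restore the snapshot inside the worker thread before running the task.
            ctx.load(ctx_dump)
            return fn(*a, **kw)

        return super().submit(wrapper, *args, **kwargs)

Because futures are collected in submission order, the results list keeps the original chunk order without the explicit sort the old queue-based code needed.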
@@ -244,6 +244,7 @@ class ChromaDBHandler(VectorStoreHandler):
  offset: int = None,
  limit: int = None,
  ) -> pd.DataFrame:
+
  collection = self._client.get_collection(table_name)
  filters = self._translate_metadata_condition(conditions)

@@ -313,7 +314,7 @@ class ChromaDBHandler(VectorStoreHandler):
  TableField.ID.value: ids,
  TableField.CONTENT.value: documents,
  TableField.METADATA.value: metadatas,
- TableField.EMBEDDINGS.value: embeddings,
+ TableField.EMBEDDINGS.value: list(embeddings),
  }

  if columns is not None:
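Wrapping embeddings in list() matters when the ChromaDB client returns the embeddings as a 2-D NumPy array rather than a plain list: pandas rejects a 2-D array passed as a single dict value, while a list of per-row arrays is accepted. A small illustration with made-up data (not the handler's actual variables):

import numpy as np
import pandas as pd

embeddings = np.array([[0.1, 0.2], [0.3, 0.4]])  # array-of-rows, as newer clients may return

# pd.DataFrame({"embeddings": embeddings}) raises a "must be 1-dimensional" ValueError
df = pd.DataFrame({"embeddings": list(embeddings)})  # one array per row works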
@@ -278,8 +278,16 @@ class VectorStoreHandler(BaseHandler):
  return self.do_upsert(table_name, df)

  def do_upsert(self, table_name, df):
- # if handler supports it, call upsert method
+ """Upsert data into table, handling document updates and deletions.

+ Args:
+ table_name (str): Name of the table
+ df (pd.DataFrame): DataFrame containing the data to upsert
+
+ The function handles three cases:
+ 1. New documents: Insert them
+ 2. Updated documents: Delete old chunks and insert new ones
+ """
  id_col = TableField.ID.value
  content_col = TableField.CONTENT.value

@@ -18,7 +18,7 @@ log = logging.getLogger(__name__)


  class LLMReranker(BaseDocumentCompressor):
- filtering_threshold: float = 0.5 # Default threshold for filtering
+ filtering_threshold: float = 0.0 # Default threshold for filtering
  model: str = DEFAULT_RERANKING_MODEL # Model to use for reranking
  temperature: float = 0.0 # Temperature for the model
  openai_api_key: Optional[str] = None
@@ -69,6 +69,12 @@ class Project:
  self.id = record.id

  def delete(self):
+ if self.record.metadata_ and self.record.metadata_.get('is_default', False):
+ raise Exception(
+ f"Project '{self.name}' can not be deleted, because it is default project."
+ "The default project can be changed in the config file or by setting the environment variable MINDSDB_DEFAULT_PROJECT."
+ )
+
  tables = self.get_tables()
  tables = [key for key, val in tables.items() if val['type'] != 'table']
  if len(tables) > 0:
@@ -466,7 +472,7 @@ class ProjectController:

  if new_metadata is not None:
  project.metadata = new_metadata
- project.record.metadata = new_metadata
+ project.record.metadata_ = new_metadata
  flag_modified(project.record, 'metadata_')

  db.session.commit()
@@ -3,7 +3,6 @@ import copy
  from typing import Dict, List, Optional

  import pandas as pd
- import hashlib
  import numpy as np

  from mindsdb_sql_parser.ast import (
@@ -155,19 +154,19 @@ class KnowledgeBaseTable:
  # extract values from conditions and prepare for vectordb
  conditions = []
  query_text = None
- reranking_threshold = None
+ relevance_threshold = None
  query_conditions = db_handler.extract_conditions(query.where)
  if query_conditions is not None:
  for item in query_conditions:
- if item.column == "reranking_threshold" and item.op.value == "=":
+ if item.column == "relevance_threshold" and item.op.value == "=":
  try:
- reranking_threshold = float(item.value)
+ relevance_threshold = float(item.value)
  # Validate range: must be between 0 and 1
- if not (0 <= reranking_threshold <= 1):
- raise ValueError(f"reranking_threshold must be between 0 and 1, got: {reranking_threshold}")
- logger.debug(f"Found reranking_threshold in query: {reranking_threshold}")
+ if not (0 <= relevance_threshold <= 1):
+ raise ValueError(f"relevance_threshold must be between 0 and 1, got: {relevance_threshold}")
+ logger.debug(f"Found relevance_threshold in query: {relevance_threshold}")
  except (ValueError, TypeError) as e:
- error_msg = f"Invalid reranking_threshold value: {item.value}. {str(e)}"
+ error_msg = f"Invalid relevance_threshold value: {item.value}. {str(e)}"
  logger.error(error_msg)
  raise ValueError(error_msg)
  elif item.column == TableField.CONTENT.value:
@@ -185,6 +184,16 @@ class KnowledgeBaseTable:
  logger.debug(f"Extracted query text: {query_text}")

  self.addapt_conditions_columns(conditions)
+
+ # Set default limit if query is present
+ if query_text is not None:
+ limit = query.limit.value if query.limit is not None else None
+ if limit is None:
+ limit = 10
+ elif limit > 100:
+ limit = 100
+ query.limit = Constant(limit)
+
  df = db_handler.dispatch_select(query, conditions)
  df = self.addapt_result_columns(df)

@@ -192,14 +201,14 @@ class KnowledgeBaseTable:
  logger.debug(f"Columns in response: {df.columns.tolist()}")
  # Check if we have a rerank_model configured in KB params

- df = self.add_relevance(df, query_text, reranking_threshold)
+ df = self.add_relevance(df, query_text, relevance_threshold)

  # filter by targets
  if requested_kb_columns is not None:
  df = df[requested_kb_columns]
  return df

- def add_relevance(self, df, query_text, reranking_threshold=None):
+ def add_relevance(self, df, query_text, relevance_threshold=None):
  relevance_column = TableField.RELEVANCE.value

  reranking_model_params = self._kb.params.get("reranking_model")
@@ -208,9 +217,9 @@ class KnowledgeBaseTable:
  try:
  logger.info(f"Using knowledge reranking model from params: {reranking_model_params}")
  # Apply custom filtering threshold if provided
- if reranking_threshold is not None:
- reranking_model_params["filtering_threshold"] = reranking_threshold
- logger.info(f"Using custom filtering threshold: {reranking_threshold}")
+ if relevance_threshold is not None:
+ reranking_model_params["filtering_threshold"] = relevance_threshold
+ logger.info(f"Using custom filtering threshold: {relevance_threshold}")

  reranker = get_reranking_model_from_params(reranking_model_params)
  # Get documents to rerank
@@ -236,8 +245,8 @@ class KnowledgeBaseTable:
  # Calculate relevance from distance
  logger.info("Calculating relevance from vector distance")
  df[relevance_column] = 1 / (1 + df['distance'])
- if reranking_threshold is not None:
- df = df[df[relevance_column] > reranking_threshold]
+ if relevance_threshold is not None:
+ df = df[df[relevance_column] > relevance_threshold]

  else:
  df[relevance_column] = None
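When no reranker is configured, relevance is derived directly from the vector distance as 1 / (1 + distance): a distance of 0 maps to relevance 1.0, a distance of 1 to 0.5, and a distance of 3 to 0.25. A quick sketch with made-up distances:

import pandas as pd

df = pd.DataFrame({"distance": [0.0, 1.0, 3.0]})
df["relevance"] = 1 / (1 + df["distance"])  # -> 1.00, 0.50, 0.25

relevance_threshold = 0.4
print(df[df["relevance"] > relevance_threshold])  # keeps only the first two rows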
@@ -333,12 +342,21 @@ class KnowledgeBaseTable:

  emb_col = TableField.EMBEDDINGS.value
  cont_col = TableField.CONTENT.value
+
+ db_handler = self.get_vector_db()
+ conditions = db_handler.extract_conditions(query.where)
+ doc_id = None
+ for condition in conditions:
+ if condition.column == 'chunk_id' and condition.op == FilterOperator.EQUAL:
+ doc_id = condition.value
+
  if cont_col in query.update_columns:
  content = query.update_columns[cont_col]

  # Apply preprocessing to content if configured
  if self.document_preprocessor:
  doc = Document(
+ id=doc_id,
  content=content.value,
  metadata={} # Empty metadata for content-only updates
  )
@@ -354,8 +372,6 @@ class KnowledgeBaseTable:
  query.table = Identifier(parts=[self._kb.vector_database_table])

  # send to vectordb
- db_handler = self.get_vector_db()
- conditions = db_handler.extract_conditions(query.where)
  self.addapt_conditions_columns(conditions)
  db_handler.dispatch_update(query, conditions)

@@ -409,7 +425,11 @@ class KnowledgeBaseTable:
  db_handler.delete(self._kb.vector_database_table)

  def insert(self, df: pd.DataFrame):
- """Insert dataframe to KB table."""
+ """Insert dataframe to KB table.
+
+ Args:
+ df: DataFrame to insert
+ """
  if df.empty:
  return

@@ -754,22 +774,9 @@ class KnowledgeBaseTable:
  return {}

  def _generate_document_id(self, content: str, content_column: str, provided_id: str = None) -> str:
- """
- Generate a deterministic document ID from content and column name.
- If provided_id exists, combines it with content_column.
-
- Args:
- content: The content string
- content_column: Name of the content column
- provided_id: Optional user-provided ID
- Returns:
- Deterministic document ID
- """
- if provided_id is not None:
- return f"{provided_id}_{content_column}"
-
- id_string = f"content={content}_column={content_column}"
- return hashlib.sha256(id_string.encode()).hexdigest()
+ """Generate a deterministic document ID using the utility function."""
+ from mindsdb.interfaces.knowledge_base.utils import generate_document_id
+ return generate_document_id(content, content_column, provided_id)

  def _convert_metadata_value(self, value):
  """
@@ -1,7 +1,6 @@
  from typing import List, Dict, Optional, Any
  import pandas as pd
  from langchain_text_splitters import RecursiveCharacterTextSplitter
- import hashlib
  import asyncio


@@ -43,7 +42,11 @@ class DocumentPreprocessor:
  self.splitter = None # Will be set by child classes

  def process_documents(self, documents: List[Document]) -> List[ProcessedChunk]:
- """Base implementation - should be overridden by child classes"""
+ """Base implementation - should be overridden by child classes
+
+ Args:
+ documents: List of documents to process
+ """
  raise NotImplementedError("Subclasses must implement process_documents")

  def _split_document(self, doc: Document) -> List[Document]:
@@ -80,27 +83,22 @@ class DocumentPreprocessor:
  metadata=data.get("metadata", {}),
  )

- def _generate_deterministic_id(
- self, content: str, content_column: str = None, provided_id: str = None
- ) -> str:
- """Generate a deterministic ID based on content and column"""
- if provided_id is not None:
- return f"{provided_id}_{content_column}"
-
- id_string = f"content={content}_column={content_column}"
- return hashlib.sha256(id_string.encode()).hexdigest()
-
  def _generate_chunk_id(
  self,
  chunk_index: Optional[int] = None,
+ total_chunks: Optional[int] = None,
+ start_char: Optional[int] = None,
+ end_char: Optional[int] = None,
  provided_id: str = None,
  ) -> str:
- """Generate deterministic ID for a chunk"""
- base_id = provided_id
- chunk_id = (
- f"{base_id}_chunk_{chunk_index}" if chunk_index is not None else base_id
- )
- logger.debug(f"Generated chunk ID: {chunk_id} for content hash: {base_id}")
+ """Generate human-readable deterministic ID for a chunk
+ Format: <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>
+ """
+ if provided_id is None:
+ raise ValueError("Document ID must be provided for chunk ID generation")
+
+ chunk_id = f"{provided_id}:{chunk_index + 1}of{total_chunks}:{start_char}to{end_char}"
+ logger.debug(f"Generated chunk ID: {chunk_id}")
  return chunk_id

  def _prepare_chunk_metadata(
@@ -207,14 +205,10 @@ Please give a short succinct context to situate this chunk within the overall do
  processed_chunks = []

  for doc_index, doc in enumerate(documents):
- # Get content_column from metadata if available
- content_column = (
- doc.metadata.get("content_column") if doc.metadata else None
- )

- # Ensure document has an ID
+ # Document ID must be provided by this point
  if doc.id is None:
- doc.id = self._generate_deterministic_id(doc.content, content_column)
+ raise ValueError("Document ID must be provided before preprocessing")

  # Skip empty or whitespace-only content
  if not doc.content or not doc.content.strip():
@@ -298,68 +292,55 @@ class TextChunkingPreprocessor(DocumentPreprocessor):
  processed_chunks = []

  for doc in documents:
- # Get content_column from metadata if available
- content_column = (
- doc.metadata.get("content_column") if doc.metadata else None
- )

- # Ensure document has an ID
+ # Document ID must be provided by this point
  if doc.id is None:
- doc.id = self._generate_deterministic_id(doc.content, content_column)
+ raise ValueError("Document ID must be provided before preprocessing")

  # Skip empty or whitespace-only content
  if not doc.content or not doc.content.strip():
  continue

  chunk_docs = self._split_document(doc)
+ total_chunks = len(chunk_docs)

- # Single chunk case
- if len(chunk_docs) == 1:
- chunk_doc = chunk_docs[0]
+ # Track character positions
+ current_pos = 0
+ for i, chunk_doc in enumerate(chunk_docs):
  if not chunk_doc.content or not chunk_doc.content.strip():
  continue

+ # Calculate chunk positions
+ start_char = current_pos
+ end_char = start_char + len(chunk_doc.content)
+ current_pos = end_char + 1 # +1 for separator
+
  # Initialize metadata
  metadata = {}
  if doc.metadata:
  metadata.update(doc.metadata)

- # Pass through doc.id and content_column
- id = self._generate_chunk_id(
- chunk_index=0, provided_id=doc.id
+ # Add position metadata
+ metadata["start_char"] = start_char
+ metadata["end_char"] = end_char
+
+ # Generate chunk ID with total chunks
+ chunk_id = self._generate_chunk_id(
+ chunk_index=i,
+ total_chunks=total_chunks,
+ start_char=start_char,
+ end_char=end_char,
+ provided_id=doc.id
  )
+
  processed_chunks.append(
  ProcessedChunk(
- id=id,
+ id=chunk_id,
  content=chunk_doc.content,
  embeddings=doc.embeddings,
- metadata=self._prepare_chunk_metadata(doc.id, None, metadata),
+ metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
  )
  )
- else:
- # Multiple chunks case
- for i, chunk_doc in enumerate(chunk_docs):
- if not chunk_doc.content or not chunk_doc.content.strip():
- continue
-
- # Initialize metadata
- metadata = {}
- if doc.metadata:
- metadata.update(doc.metadata)
-
- # Pass through doc.id and content_column
- chunk_id = self._generate_chunk_id(
- chunk_index=i,
- provided_id=doc.id,
- )
- processed_chunks.append(
- ProcessedChunk(
- id=chunk_id,
- content=chunk_doc.content,
- embeddings=doc.embeddings,
- metadata=self._prepare_chunk_metadata(doc.id, i, metadata),
- )
- )

  return processed_chunks

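With this change every chunk gets a position-aware, human-readable ID of the form <doc_id>:<chunk_number>of<total_chunks>:<start_char>to<end_char>. A small illustration of how the IDs and character offsets line up (hypothetical document ID and chunk contents):

doc_id = "a1b2c3d4e5f60708_content"  # shape of an ID from generate_document_id; hash is a placeholder
chunks = ["MindsDB connects data", "sources to AI models"]  # pretend splitter output

current_pos = 0
for i, chunk in enumerate(chunks):
    start_char = current_pos
    end_char = start_char + len(chunk)
    current_pos = end_char + 1  # +1 for the separator, mirroring the diff
    print(f"{doc_id}:{i + 1}of{len(chunks)}:{start_char}to{end_char}")

# a1b2c3d4e5f60708_content:1of2:0to21
# a1b2c3d4e5f60708_content:2of2:22to42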
@@ -0,0 +1,28 @@
+ """Utilities for knowledge base operations."""
+ import hashlib
+
+
+ def generate_document_id(content: str, content_column: str, provided_id: str = None) -> str:
+ """
+ Generate a deterministic document ID from content and column name.
+ If provided_id exists, combines it with content_column.
+ For generated IDs, uses a short hash of just the content to ensure
+ same content gets same base ID across different columns.
+
+ Args:
+ content: The content string
+ content_column: Name of the content column
+ provided_id: Optional user-provided ID
+ Returns:
+ Deterministic document ID in format: <base_id>_<column>
+ where base_id is either the provided_id or a 16-char hash of content
+ """
+ if provided_id is not None:
+ base_id = provided_id
+ else:
+ # Generate a shorter 16-character hash based only on content
+ hash_obj = hashlib.md5(content.encode())
+ base_id = hash_obj.hexdigest()[:16]
+
+ # Append column name to maintain uniqueness across columns
+ return f"{base_id}_{content_column}"
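A quick sketch of the resulting ID shapes (hash values are placeholders, not real digests):

from mindsdb.interfaces.knowledge_base.utils import generate_document_id

generate_document_id("some text", "content", provided_id="42")
# -> "42_content"

generate_document_id("some text", "content")
# -> "<first 16 hex chars of md5('some text')>_content"

generate_document_id("some text", "summary")
# -> same 16-char base ID as above, suffixed "_summary" instead of "_content"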
mindsdb/utilities/auth.py CHANGED
@@ -15,9 +15,11 @@ def get_aws_meta_data() -> dict:
  'ami-id': None,
  'instance-id': None
  }
+ aws_token = requests.put("http://169.254.169.254/latest/api/token", headers={'X-aws-ec2-metadata-token-ttl-seconds': '30'}).text
  for key in aws_meta_data.keys():
  resp = requests.get(
  f'http://169.254.169.254/latest/meta-data/{key}',
+ headers={'X-aws-ec2-metadata-token': aws_token},
  timeout=1
  )
  if resp.status_code != 200:
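This switches the EC2 metadata lookups to IMDSv2, where a short-lived session token is requested first and then sent with each metadata call. A standalone sketch of the same flow (the 30-second TTL matches the hunk; the timeout on the token request is an added assumption):

import requests

IMDS = "http://169.254.169.254/latest"

# Step 1: obtain an IMDSv2 session token.
token = requests.put(
    f"{IMDS}/api/token",
    headers={"X-aws-ec2-metadata-token-ttl-seconds": "30"},
    timeout=1,
).text

# Step 2: pass the token with every metadata request.
instance_id = requests.get(
    f"{IMDS}/meta-data/instance-id",
    headers={"X-aws-ec2-metadata-token": token},
    timeout=1,
).text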
@@ -35,7 +37,9 @@ def register_oauth_client():
  aws_meta_data = get_aws_meta_data()

  current_aws_meta_data = config.get('aws_meta_data', {})
- oauth_meta = config.get('auth', {}).get('oauth', {})
+ oauth_meta = config.get('auth', {}).get('oauth')
+ if oauth_meta is None:
+ return

  public_hostname = aws_meta_data['public-hostname']
  if (
@@ -56,6 +56,7 @@ import os
  import time
  from abc import ABC
  from pathlib import Path
+ import re
  import hashlib
  import typing as t

@@ -154,7 +155,9 @@ class FileCache(BaseCache):
  pass

  def file_path(self, name):
- return self.path / name
+ # Sanitize the key to avoid table (file) names with backticks and slashes.
+ sanitized_name = re.sub(r'[^\w\-.]', '_', name)
+ return self.path / sanitized_name

  def set_df(self, name, df):
  path = self.file_path(name)
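The regex keeps word characters, hyphens, and dots and replaces everything else with an underscore, so a cache key derived from a quoted table name can no longer introduce path separators or backticks into the file name. For example (made-up key):

import re

name = "`my db`.`my table`/part"
print(re.sub(r'[^\w\-.]', '_', name))
# _my_db_._my_table__part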