MindsDB 25.7.4.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registries.

Potentially problematic release.

This version of MindsDB might be problematic.

Files changed (57):
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +11 -1
  3. mindsdb/api/executor/command_executor.py +9 -15
  4. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
  5. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
  6. mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
  7. mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
  8. mindsdb/api/executor/utilities/sql.py +30 -0
  9. mindsdb/api/http/initialize.py +2 -1
  10. mindsdb/api/http/namespaces/views.py +56 -72
  11. mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
  12. mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
  13. mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
  14. mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
  15. mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
  16. mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
  17. mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
  18. mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
  19. mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
  20. mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
  21. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
  22. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
  23. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
  24. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  25. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -2
  26. mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
  27. mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
  28. mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
  29. mindsdb/integrations/handlers/salesforce_handler/constants.py +9 -2
  30. mindsdb/integrations/libs/llm/config.py +0 -14
  31. mindsdb/integrations/libs/llm/utils.py +0 -15
  32. mindsdb/integrations/utilities/files/file_reader.py +5 -19
  33. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
  34. mindsdb/interfaces/agents/agents_controller.py +83 -45
  35. mindsdb/interfaces/agents/constants.py +0 -1
  36. mindsdb/interfaces/agents/langchain_agent.py +1 -3
  37. mindsdb/interfaces/database/projects.py +111 -7
  38. mindsdb/interfaces/knowledge_base/controller.py +7 -1
  39. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
  40. mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
  41. mindsdb/interfaces/query_context/context_controller.py +14 -15
  42. mindsdb/utilities/config.py +2 -0
  43. mindsdb/utilities/fs.py +54 -17
  44. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +278 -263
  45. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +49 -48
  46. mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
  47. mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
  48. mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
  49. mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
  50. mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
  51. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
  52. mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
  53. mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
  54. /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
  55. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
  56. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
  57. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/huggingface_handler/requirements.txt
@@ -1,7 +1,7 @@
-# # NOTE: Any changes made here need to be made to requirements_cpu.txt as well
-# datasets==2.16.1
-# evaluate==0.4.3
-# nltk==3.9.1
-# huggingface-hub==0.29.3
-# torch==2.7.1
-# transformers >= 4.42.4
+# NOTE: Any changes made here need to be made to requirements_cpu.txt as well
+datasets==2.16.1
+evaluate==0.4.3
+nltk==3.9.1
+huggingface-hub==0.29.3
+torch==2.8.0
+transformers >= 4.42.4

mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt
@@ -1,7 +1,7 @@
-# # Needs to be installed with `pip install --extra-index-url https://download.pytorch.org/whl/ .[huggingface_cpu]`
-# datasets==2.16.1
-# evaluate==0.4.3
-# nltk==3.9.1
-# huggingface-hub==0.29.3
-# torch==2.7.1+cpu
-# transformers >= 4.42.4
+# Needs to be installed with `pip install --extra-index-url https://download.pytorch.org/whl/ .[huggingface_cpu]`
+datasets==2.16.1
+evaluate==0.4.3
+nltk==3.9.1
+huggingface-hub==0.29.3
+torch==2.8.0+cpu
+transformers >= 4.42.4

mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py
@@ -1,27 +1,27 @@
-# from mindsdb.integrations.handlers.huggingface_handler.finetune import (
-#     _finetune_cls,
-#     _finetune_fill_mask,
-#     _finetune_question_answering,
-#     _finetune_summarization,
-#     _finetune_text_generation,
-#     _finetune_translate,
-# )
+from mindsdb.integrations.handlers.huggingface_handler.finetune import (
+    _finetune_cls,
+    _finetune_fill_mask,
+    _finetune_question_answering,
+    _finetune_summarization,
+    _finetune_text_generation,
+    _finetune_translate,
+)

-# # todo once we have moved predict tasks functions into a separate function
-# # PREDICT_MAP = {
-# #     'text-classification': self.predict_text_classification,
-# #     'zero-shot-classification': self.predict_zero_shot,
-# #     'translation': self.predict_translation,
-# #     'summarization': self.predict_summarization,
-# #     'fill-mask': self.predict_fill_mask
-# # }
+# todo once we have moved predict tasks functions into a separate function
+# PREDICT_MAP = {
+#     'text-classification': self.predict_text_classification,
+#     'zero-shot-classification': self.predict_zero_shot,
+#     'translation': self.predict_translation,
+#     'summarization': self.predict_summarization,
+#     'fill-mask': self.predict_fill_mask
+# }

-# FINETUNE_MAP = {
-#     "text-classification": _finetune_cls,
-#     "zero-shot-classification": _finetune_cls,
-#     "translation": _finetune_translate,
-#     "summarization": _finetune_summarization,
-#     "fill-mask": _finetune_fill_mask,
-#     "text-generation": _finetune_text_generation,
-#     "question-answering": _finetune_question_answering,
-# }
+FINETUNE_MAP = {
+    "text-classification": _finetune_cls,
+    "zero-shot-classification": _finetune_cls,
+    "translation": _finetune_translate,
+    "summarization": _finetune_summarization,
+    "fill-mask": _finetune_fill_mask,
+    "text-generation": _finetune_text_generation,
+    "question-answering": _finetune_question_answering,
+}

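This hunk re-enables the handler's fine-tuning dispatch table after a period of being commented out. For orientation, a task-keyed map like FINETUNE_MAP is typically consumed as below; the wrapper function and its arguments are illustrative assumptions, not the handler's exact call site:

    def finetune(task: str, df, args: dict):
        # Look up the task-specific routine; fail fast on unsupported tasks
        # rather than silently dispatching to the wrong function.
        finetune_fn = FINETUNE_MAP.get(task)
        if finetune_fn is None:
            raise ValueError(f"Fine-tuning is not supported for task: {task}")
        return finetune_fn(df, args)
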
mindsdb/integrations/handlers/langchain_handler/langchain_handler.py
@@ -36,7 +36,7 @@ from mindsdb.interfaces.storage.model_fs import HandlerStorage, ModelStorage
 from mindsdb.integrations.handlers.langchain_embedding_handler.langchain_embedding_handler import (
     construct_model_from_args,
 )
-from mindsdb.integrations.handlers.openai_handler.constants import CHAT_MODELS  # noqa: F401 - for dependency checker
+from mindsdb.integrations.handlers.openai_handler.constants import CHAT_MODELS_PREFIXES  # noqa: F401 - for dependency checker

 from mindsdb.utilities import log
 from mindsdb.utilities.context_executor import ContextThreadPoolExecutor

@@ -54,7 +54,6 @@ class LangChainHandler(BaseMLEngine):
     Supported LLM providers:
     - OpenAI
     - Anthropic
-    - Anyscale
     - Google
     - LiteLLM
     - Ollama

mindsdb/integrations/handlers/openai_handler/constants.py
@@ -1,38 +1,19 @@
-OPENAI_API_BASE = 'https://api.openai.com/v1'
+OPENAI_API_BASE = "https://api.openai.com/v1"

-CHAT_MODELS = (
-    'gpt-3.5-turbo',
-    'gpt-3.5-turbo-16k',
-    'gpt-3.5-turbo-instruct',
-    'gpt-4',
-    'gpt-4-32k',
-    'gpt-4-1106-preview',
-    'gpt-4-0125-preview',
-    'gpt-4o',
-    'o3-mini',
-    'o1-mini'
-)
-COMPLETION_MODELS = ('babbage-002', 'davinci-002')
-FINETUNING_MODELS = ('gpt-3.5-turbo', 'babbage-002', 'davinci-002', 'gpt-4')
-COMPLETION_LEGACY_BASE_MODELS = ('davinci', 'curie', 'babbage', 'ada')
-DEFAULT_CHAT_MODEL = 'gpt-3.5-turbo'
+CHAT_MODELS_PREFIXES = ("gpt-3.5", "gpt-3.5", "gpt-3.5", "gpt-4", "o3-mini", "o1-mini")
+COMPLETION_MODELS = ("babbage-002", "davinci-002")
+FINETUNING_MODELS = ("gpt-3.5-turbo", "babbage-002", "davinci-002", "gpt-4")
+COMPLETION_LEGACY_BASE_MODELS = ("davinci", "curie", "babbage", "ada")
+DEFAULT_CHAT_MODEL = "gpt-4o-mini"

 FINETUNING_LEGACY_MODELS = FINETUNING_MODELS
 COMPLETION_LEGACY_MODELS = (
     COMPLETION_LEGACY_BASE_MODELS
-    + tuple(f'text-{model}-001' for model in COMPLETION_LEGACY_BASE_MODELS)
-    + ('text-davinci-002', 'text-davinci-003')
+    + tuple(f"text-{model}-001" for model in COMPLETION_LEGACY_BASE_MODELS)
+    + ("text-davinci-002", "text-davinci-003")
 )

-EMBEDDING_MODELS = (
-    ('text-embedding-ada-002',)
-    + tuple(f'text-similarity-{model}-001' for model in COMPLETION_LEGACY_BASE_MODELS)
-    + tuple(f'text-search-{model}-query-001' for model in COMPLETION_LEGACY_BASE_MODELS)
-    + tuple(f'text-search-{model}-doc-001' for model in COMPLETION_LEGACY_BASE_MODELS)
-    + tuple(f'code-search-{model}-text-001' for model in COMPLETION_LEGACY_BASE_MODELS)
-    + tuple(f'code-search-{model}-code-001' for model in COMPLETION_LEGACY_BASE_MODELS)
-)
-DEFAULT_EMBEDDING_MODEL = 'text-embedding-ada-002'
+DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002"

-IMAGE_MODELS = ('dall-e-2', 'dall-e-3')
-DEFAULT_IMAGE_MODEL = 'dall-e-2'
+IMAGE_MODELS = ("dall-e-2", "dall-e-3")
+DEFAULT_IMAGE_MODEL = "dall-e-2"

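The old CHAT_MODELS tuple was an exact-name allowlist, so newly released variants (dated snapshots, -mini models) failed the chat-model check until the list was updated. The new tuple holds family prefixes instead; note it repeats "gpt-3.5" three times, which is redundant but harmless for prefix tests and presumably mirrors the three removed gpt-3.5 entries. A quick sketch of the behavioral difference, using an example model name:

    # Old: exact membership, so unlisted variants failed the check.
    "gpt-4o-mini" in CHAT_MODELS                                     # False under the removed tuple
    # New: any name starting with a known family prefix passes.
    any("gpt-4o-mini".startswith(p) for p in CHAT_MODELS_PREFIXES)   # True, matched by "gpt-4"
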
mindsdb/integrations/handlers/openai_handler/helpers.py
@@ -14,6 +14,7 @@ class PendingFT(openai.OpenAIError):
     """
     Custom exception to handle pending fine-tuning status.
     """
+
     message: str

     def __init__(self, message) -> None:

@@ -65,10 +66,7 @@ def retry_with_exponential_backoff(

     if isinstance(hour_budget, float) or isinstance(hour_budget, int):
         try:
-            max_retries = round(
-                (math.log((hour_budget * 3600) / initial_delay))
-                / math.log(exponential_base)
-            )
+            max_retries = round((math.log((hour_budget * 3600) / initial_delay)) / math.log(exponential_base))
         except ValueError:
             max_retries = 10
     else:

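The reformatted one-liner derives a retry budget from a wall-clock budget: with delays growing as initial_delay * exponential_base ** n, the largest n whose cumulative wait still fits in hour_budget hours satisfies exponential_base ** n ≈ hour_budget * 3600 / initial_delay. A worked example with assumed values (the defaults are not shown in this hunk):

    import math

    hour_budget, initial_delay, exponential_base = 1, 1.0, 2
    # 2 ** n ≈ 3600  =>  n ≈ log(3600) / log(2) ≈ 11.8, rounded to 12 retries
    round(math.log((hour_budget * 3600) / initial_delay) / math.log(exponential_base))  # 12
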
@@ -81,22 +79,20 @@ def retry_with_exponential_backoff(

         except status_errors as e:
             raise Exception(
-                f'Error status {e.status_code} raised by OpenAI API: {e.body.get("message", "Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.")}'  # noqa
+                f"Error status {e.status_code} raised by OpenAI API: {e.body.get('message', 'Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.')}"  # noqa
             )  # noqa

         except wait_errors:
             num_retries += 1
             if num_retries > max_retries:
-                raise Exception(
-                    f"Maximum number of retries ({max_retries}) exceeded."
-                )
+                raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")
             # Increment the delay and wait
             delay *= exponential_base * (1 + jitter * random.random())
             time.sleep(delay)

         except openai.OpenAIError as e:
             raise Exception(
-                f'General {str(e)} error raised by OpenAI. Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.'  # noqa
+                f"General {str(e)} error raised by OpenAI. Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information."  # noqa
             )

         except Exception as e:

@@ -107,7 +103,7 @@ def retry_with_exponential_backoff(
     return _retry_with_exponential_backoff


-def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_tokens: int, truncate: Text = 'first'):
+def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_tokens: int, truncate: Text = "first"):
     """
     Truncates message list to fit within the token limit.
     The first message for chat completion models are general directives with the system role, which will ideally be kept at all times.

@@ -129,20 +125,18 @@ def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_to
     except KeyError:
         # If the encoding is not found, defualt to cl100k_base.
         # This is applicable for handlers that extend the OpenAI handler such as Anyscale.
-        model_name = 'gpt-3.5-turbo-0301'
-        encoder = tiktoken.get_encoding('cl100k_base')
+        model_name = "gpt-3.5-turbo-0301"
+        encoder = tiktoken.get_encoding("cl100k_base")

     sys_priming = messages[0:1]
     n_tokens = count_tokens(messages, encoder, model_name)
     while n_tokens > max_tokens:
         if len(messages) == 2:
-            return messages[
-                :-1
-            ]  # edge case: if limit is surpassed by just one input, we remove initial instruction
+            return messages[:-1]  # edge case: if limit is surpassed by just one input, we remove initial instruction
         elif len(messages) == 1:
             return messages

-        if truncate == 'first':
+        if truncate == "first":
             messages = sys_priming + messages[2:]
         else:
             messages = sys_priming + messages[1:-1]

@@ -151,7 +145,7 @@ def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_to
     return messages


-def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_name: Text = 'gpt-3.5-turbo-0301'):
+def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_name: Text = "gpt-3.5-turbo-0301"):
     """
     Counts the number of tokens in a list of messages.

@@ -160,24 +154,23 @@ def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_na
         encoder: Tokenizer
         model_name: Model name
     """
-    if (
-        "gpt-3.5-turbo" in model_name
-    ):  # note: future models may deviate from this (only 0301 really complies)
-        num_tokens = 0
-        for message in messages:
-            num_tokens += (
-                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
-            )
-            for key, value in message.items():
-                num_tokens += len(encoder.encode(value))
-                if key == "name":  # if there's a name, the role is omitted
-                    num_tokens += -1  # role is always required and always 1 token
-        num_tokens += 2  # every reply is primed with <im_start>assistant
-        return num_tokens
+    if "gpt-3.5-turbo" in model_name:  # note: future models may deviate from this (only 0301 really complies)
+        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+        tokens_per_name = -1
     else:
-        raise NotImplementedError(
-            f"""_count_tokens() is not presently implemented for model {model_name}."""
-        )
+        tokens_per_message = 3
+        tokens_per_name = 1
+
+    num_tokens = 0
+    for message in messages:
+        num_tokens += tokens_per_message
+
+        for key, value in message.items():
+            num_tokens += len(encoder.encode(value))
+            if key == "name":  # if there's a name, the role is omitted
+                num_tokens += tokens_per_name
+    num_tokens += 2  # every reply is primed with <im_start>assistant
+    return num_tokens


 def get_available_models(client) -> List[Text]:

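The rewrite collapses both branches into one counting loop driven by two constants (tokens_per_message, tokens_per_name): 4 and -1 for gpt-3.5-turbo-0301, 3 and 1 for everything else, in line with OpenAI's published per-message counting recipe. Notably, unknown models now get a count instead of a NotImplementedError. A usage sketch, assuming tiktoken is installed:

    import tiktoken

    encoder = tiktoken.get_encoding("cl100k_base")
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]
    # "gpt-4" takes the else branch: 3 tokens per message, +1 when a
    # "name" key is present, plus the 2-token reply priming at the end.
    count_tokens(messages, encoder, model_name="gpt-4")
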
mindsdb/integrations/handlers/openai_handler/openai_handler.py
@@ -24,7 +24,7 @@ from mindsdb.integrations.handlers.openai_handler.helpers import (
     PendingFT,
 )
 from mindsdb.integrations.handlers.openai_handler.constants import (
-    CHAT_MODELS,
+    CHAT_MODELS_PREFIXES,
     IMAGE_MODELS,
     FINETUNING_MODELS,
     OPENAI_API_BASE,

@@ -62,7 +62,6 @@ class OpenAIHandler(BaseMLEngine):
         self.rate_limit = 60  # requests per minute
         self.max_batch_size = 20
         self.default_max_tokens = 100
-        self.chat_completion_models = CHAT_MODELS
         self.supported_ft_models = FINETUNING_MODELS  # base models compatible with finetuning
         # For now this are only used for handlers that inherits OpenAIHandler and don't need to override base methods
         self.api_key_name = getattr(self, "api_key_name", self.name)

@@ -89,6 +88,13 @@ class OpenAIHandler(BaseMLEngine):
         client = self._get_client(api_key=api_key, base_url=api_base, org=org, args=connection_args)
         OpenAIHandler._check_client_connection(client)

+    @staticmethod
+    def is_chat_model(model_name):
+        for prefix in CHAT_MODELS_PREFIXES:
+            if model_name.startswith(prefix):
+                return True
+        return False
+
     @staticmethod
     def _check_client_connection(client: OpenAI) -> None:
         """

@@ -350,11 +356,6 @@ class OpenAIHandler(BaseMLEngine):
             "user": pred_args.get("user", None),
         }

-        if args.get("mode", self.default_mode) != "default" and model_name not in self.chat_completion_models:
-            raise Exception(
-                f"Conversational modes are only available for the following models: {', '.join(self.chat_completion_models)}"
-            )  # noqa
-
         if args.get("prompt_template", False):
             prompts, empty_prompt_ids = get_completed_prompts(base_template, df, strict=strict_prompt_template)

@@ -515,7 +516,7 @@ class OpenAIHandler(BaseMLEngine):
             return _submit_image_completion(kwargs, prompts, api_args)
         elif model_name == "embedding":
             return _submit_embedding_completion(kwargs, prompts, api_args)
-        elif model_name in self.chat_completion_models:
+        elif self.is_chat_model(model_name):
             if model_name == "gpt-3.5-turbo-instruct":
                 return _submit_normal_completion(kwargs, prompts, api_args)
             else:

@@ -579,13 +580,14 @@ class OpenAIHandler(BaseMLEngine):
                    tidy_comps.append(c.text.strip("\n").strip(""))
                return tidy_comps

-            kwargs["prompt"] = prompts
            kwargs = {**kwargs, **api_args}

            before_openai_query(kwargs)
-            resp = _tidy(client.completions.create(**kwargs))
-            _log_api_call(kwargs, resp)
-            return resp
+            responses = []
+            for prompt in prompts:
+                responses.extend(_tidy(client.completions.create(prompt=prompt, **kwargs)))
+            _log_api_call(kwargs, responses)
+            return responses

        def _submit_embedding_completion(kwargs: Dict, prompts: List[Text], api_args: Dict) -> List[float]:
            """

mindsdb/integrations/handlers/salesforce_handler/constants.py
@@ -22,6 +22,7 @@ def get_soql_instructions(integration_name):
 - NO subqueries in FROM clause - only relationship-based subqueries allowed
   SQL: SELECT * FROM (SELECT Name FROM Account) AS AccountNames;
   SOQL: Not supported
+- Do not use fields that are not defined in the schema or data catalog. Always reference exact field names.

 **FIELD SELECTION:**
 - Always include Id field when querying

@@ -43,7 +44,10 @@ def get_soql_instructions(integration_name):
 - Special date literals: TODAY, YESTERDAY, LAST_WEEK, LAST_MONTH, LAST_QUARTER, LAST_YEAR, THIS_WEEK, THIS_MONTH, THIS_QUARTER, THIS_YEAR
   CORRECT: WHERE CreatedDate = TODAY
   CORRECT: WHERE LastModifiedDate >= LAST_MONTH
-  CORRECT: WHERE CloseDate = THIS_QUARTER
+  CORRECT: WHERE CloseDate >= THIS_QUARTER
+- Date arithmetic (e.g., TODAY - 10) is not supported. Use literals like LAST_N_DAYS:10 instead.
+  CORRECT: WHERE CloseDate >= LAST_N_DAYS:10
+  INCORRECT: WHERE CloseDate >= TODAY - 10
 - LIKE operator: Only supports % wildcard, NO underscore (_) wildcard
   CORRECT: WHERE Name LIKE '%Corp%'
   CORRECT: WHERE Name LIKE 'Acme%'

@@ -69,6 +73,9 @@ def get_soql_instructions(integration_name):
   CORRECT: WHERE Services__c INCLUDES ('Consulting;Support')
   CORRECT: WHERE Services__c EXCLUDES ('Training')
   INCORRECT: WHERE Services__c = 'Consulting'
+- Limited subquery support - only IN/NOT IN with non-correlated subqueries in WHERE clause
+  CORRECT: SELECT Id FROM Contact WHERE Id NOT IN (SELECT WhoId FROM Task)
+  INCORRECT: SELECT Id FROM Contact WHERE NOT EXISTS (SELECT 1 FROM Task WHERE WhoId = Contact.Id)

 **JOINS:**
 - NO explicit JOIN syntax supported

@@ -195,6 +202,7 @@ def get_soql_instructions(integration_name):
 - Multi-select picklist: SELECT Id, Name FROM Account WHERE Services__c INCLUDES ('Consulting;Support')
 - Sorting and limiting: SELECT Id, Name FROM Account ORDER BY Name ASC LIMIT 50

+
 ***EXECUTION INSTRUCTIONS. IMPORTANT!***
 After generating the core SOQL (and nothing else), always make sure you wrap it exactly as:

@@ -204,5 +212,4 @@ After generating the core SOQL (and nothing else), always make sure you wrap it
     )

 Return only that wrapper call.
-
 """

mindsdb/integrations/libs/llm/config.py
@@ -37,20 +37,6 @@ class AnthropicConfig(BaseLLMConfig):
     anthropic_api_url: Optional[str]


-# See https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.anyscale.ChatAnyscale.html
-# This config does not have to be exclusively used with Langchain.
-class AnyscaleConfig(BaseLLMConfig):
-    model_name: str
-    temperature: Optional[float]
-    max_retries: Optional[int]
-    max_tokens: Optional[int]
-    anyscale_api_base: Optional[str]
-    # Inferred from ANYSCALE_API_KEY if not provided.
-    anyscale_api_key: Optional[str]
-    anyscale_proxy: Optional[str]
-    request_timeout: Optional[float]
-
-
 # See https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.litellm.ChatLiteLLM.html
 # This config does not have to be exclusively used with Langchain.
 class LiteLLMConfig(BaseLLMConfig):

mindsdb/integrations/libs/llm/utils.py
@@ -8,7 +8,6 @@ import pandas as pd

 from mindsdb.integrations.libs.llm.config import (
     AnthropicConfig,
-    AnyscaleConfig,
     BaseLLMConfig,
     GoogleConfig,
     LiteLLMConfig,

@@ -30,9 +29,6 @@ DEFAULT_OPENAI_MAX_RETRIES = 3

 DEFAULT_ANTHROPIC_MODEL = "claude-3-haiku-20240307"

-DEFAULT_ANYSCALE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
-DEFAULT_ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1"
-
 DEFAULT_GOOGLE_MODEL = "gemini-2.5-pro-preview-03-25"

 DEFAULT_LITELLM_MODEL = "gpt-3.5-turbo"

@@ -135,17 +131,6 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
             anthropic_api_key=args["api_keys"].get("anthropic", None),
             anthropic_api_url=args.get("base_url", None),
         )
-    if provider == "anyscale":
-        return AnyscaleConfig(
-            model_name=args.get("model_name", DEFAULT_ANYSCALE_MODEL),
-            temperature=temperature,
-            max_retries=args.get("max_retries", DEFAULT_OPENAI_MAX_RETRIES),
-            max_tokens=args.get("max_tokens", DEFAULT_OPENAI_MAX_TOKENS),
-            anyscale_api_base=args.get("base_url", DEFAULT_ANYSCALE_BASE_URL),
-            anyscale_api_key=args["api_keys"].get("anyscale", None),
-            anyscale_proxy=args.get("proxy", None),
-            request_timeout=args.get("request_timeout", None),
-        )
     if provider == "litellm":
         model_kwargs = {
             "api_key": args["api_keys"].get("litellm", None),

mindsdb/integrations/utilities/files/file_reader.py
@@ -10,6 +10,7 @@ from typing import List, Generator
 import filetype
 import pandas as pd
 from charset_normalizer import from_bytes
+from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter

 from mindsdb.utilities import log

@@ -322,40 +323,25 @@ class FileReader(FormatDetector):
     @staticmethod
     def read_txt(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
         # the lib is heavy, so import it only when needed
-        from langchain_text_splitters import RecursiveCharacterTextSplitter

         file_obj = decode(file_obj)

-        try:
-            from langchain_core.documents import Document
-        except ImportError:
-            raise FileProcessingError(
-                "To import TXT document please install 'langchain-community':\n pip install langchain-community"
-            )
         text = file_obj.read()

-        metadata = {"source_file": name, "file_format": "txt"}
-        documents = [Document(page_content=text, metadata=metadata)]
+        text_splitter = TextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)

-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
-        )
-
-        docs = text_splitter.split_documents(documents)
-        return pd.DataFrame([{"content": doc.page_content, "metadata": doc.metadata} for doc in docs])
+        docs = text_splitter.split_text(text)
+        return pd.DataFrame([{"content": doc, "metadata": {"source_file": name, "file_format": "txt"}} for doc in docs])

     @staticmethod
     def read_pdf(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
         # the libs are heavy, so import it only when needed
         import fitz  # pymupdf
-        from langchain_text_splitters import RecursiveCharacterTextSplitter

         with fitz.open(stream=file_obj.read()) as pdf:  # open pdf
             text = chr(12).join([page.get_text() for page in pdf])

-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
-        )
+        text_splitter = TextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)

         split_text = text_splitter.split_text(text)

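Both readers now chunk text with the in-tree TextSplitter (from the new preprocessing/text_splitter.py, +73 lines) instead of importing langchain_text_splitters, removing the optional langchain packages from the TXT/PDF ingestion path. Only the call shape is visible in this diff; for orientation, a chunker with overlap behaves roughly like the simplified sketch below (the real TextSplitter likely prefers natural boundaries such as paragraphs before falling back to fixed windows):

    def naive_split(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
        # Advance by (size - overlap) so consecutive chunks share
        # chunk_overlap characters of context.
        step = chunk_size - chunk_overlap
        return [text[i:i + chunk_size] for i in range(0, max(len(text) - chunk_overlap, 1), step)]
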
mindsdb/integrations/utilities/rag/rerankers/base_reranker.py
@@ -33,7 +33,7 @@ class BaseLLMReranker(BaseModel, ABC):
     client: Optional[AsyncOpenAI | BaseMLEngine] = None
     _semaphore: Optional[asyncio.Semaphore] = None
     max_concurrent_requests: int = 20
-    max_retries: int = 2
+    max_retries: int = 4
     retry_delay: float = 1.0
     request_timeout: float = 20.0  # Timeout for API requests
     early_stop: bool = True  # Whether to enable early stopping

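Doubling max_retries from 2 to 4 raises the reranker's worst-case latency ceiling. Assuming the delay between attempts is the fixed retry_delay (the actual backoff policy lives elsewhere in base_reranker.py), the per-request upper bound works out to:

    attempts = 1 + 4             # initial call + max_retries
    attempts * 20.0 + 4 * 1.0    # request_timeout=20.0 each, retry_delay=1.0 between
    # -> 104.0 s, up from (1 + 2) * 20.0 + 2 * 1.0 = 62.0 s with max_retries=2
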