MindsDB 25.7.4.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic; further details are available on the package registry's advisory page (the original link was lost in this text export).

Files changed (57):
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +11 -1
  3. mindsdb/api/executor/command_executor.py +9 -15
  4. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
  5. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
  6. mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
  7. mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
  8. mindsdb/api/executor/utilities/sql.py +30 -0
  9. mindsdb/api/http/initialize.py +2 -1
  10. mindsdb/api/http/namespaces/views.py +56 -72
  11. mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
  12. mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
  13. mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
  14. mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
  15. mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
  16. mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
  17. mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
  18. mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
  19. mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
  20. mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
  21. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
  22. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
  23. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
  24. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  25. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -2
  26. mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
  27. mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
  28. mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
  29. mindsdb/integrations/handlers/salesforce_handler/constants.py +9 -2
  30. mindsdb/integrations/libs/llm/config.py +0 -14
  31. mindsdb/integrations/libs/llm/utils.py +0 -15
  32. mindsdb/integrations/utilities/files/file_reader.py +5 -19
  33. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
  34. mindsdb/interfaces/agents/agents_controller.py +83 -45
  35. mindsdb/interfaces/agents/constants.py +0 -1
  36. mindsdb/interfaces/agents/langchain_agent.py +1 -3
  37. mindsdb/interfaces/database/projects.py +111 -7
  38. mindsdb/interfaces/knowledge_base/controller.py +7 -1
  39. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
  40. mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
  41. mindsdb/interfaces/query_context/context_controller.py +14 -15
  42. mindsdb/utilities/config.py +2 -0
  43. mindsdb/utilities/fs.py +54 -17
  44. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +278 -263
  45. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +49 -48
  46. mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
  47. mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
  48. mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
  49. mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
  50. mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
  51. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
  52. mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
  53. mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
  54. /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
  55. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
  56. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
  57. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
@@ -1,223 +1,203 @@
1
- # import evaluate
2
- # import nltk
3
- # import numpy as np
4
- # from datasets import Dataset
5
- # from transformers import (
6
- # AutoConfig,
7
- # AutoModelForSeq2SeqLM,
8
- # AutoModelForSequenceClassification,
9
- # AutoTokenizer,
10
- # DataCollatorForSeq2Seq,
11
- # Seq2SeqTrainingArguments,
12
- # Trainer,
13
- # TrainingArguments,
14
- # )
15
-
16
- # # todo add support for question answering task
17
- # # todo add support for fill mask
18
- # # todo add support for text_generation (causal language model)
19
- # # todo add support for text_2_text generation
20
-
21
-
22
- # def _finetune_cls(df, args):
23
- # df = df.rename(columns={args["target"]: "labels", args["input_column"]: "text"})
24
- # tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
25
- # tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
26
- # dataset = Dataset.from_pandas(df)
27
-
28
- # def _tokenize_text_cls_fn(examples):
29
- # return tokenizer(examples["text"], padding="max_length", truncation=True)
30
-
31
- # tokenized_datasets = dataset.map(_tokenize_text_cls_fn, batched=True)
32
- # ds = tokenized_datasets.shuffle(seed=42).train_test_split(
33
- # test_size=args.get("eval_size", 0.1)
34
- # )
35
- # train_ds = ds["train"]
36
- # eval_ds = ds["test"]
37
-
38
- # ft_args = args.get("using", {}).get("trainer_args", {})
39
- # ft_args["output_dir"] = args["model_folder"]
40
-
41
- # n_labels = len(args["labels_map"])
42
- # # todo replace for prod
43
- # assert (
44
- # n_labels == df["labels"].nunique()
45
- # ), f'Label mismatch! Ensure labels match what the model was originally trained on. Found {df["labels"].nunique()} classes, expected {n_labels}.' # noqa
46
- # # TODO: ideally check that labels are a subset of the original ones, too.
47
- # config = AutoConfig.from_pretrained(args["model_name"])
48
- # model = AutoModelForSequenceClassification.from_pretrained(
49
- # args["model_name"], config=config
50
- # )
51
- # metric = evaluate.load("accuracy")
52
- # training_args = TrainingArguments(**ft_args)
53
-
54
- # def _compute_metrics(eval_pred):
55
- # logits, labels = eval_pred
56
- # predictions = np.argmax(logits, axis=-1)
57
- # return metric.compute(predictions=predictions, references=labels)
58
-
59
- # # generate trainer and finetune
60
- # trainer = Trainer(
61
- # model=model,
62
- # args=training_args,
63
- # train_dataset=train_ds,
64
- # eval_dataset=eval_ds,
65
- # compute_metrics=_compute_metrics,
66
- # )
67
-
68
- # return tokenizer, trainer
69
-
70
-
71
- # # TODO: merge with summarization?
72
- # def _finetune_translate(df, args):
73
- # config = AutoConfig.from_pretrained(args["model_name"])
74
- # df = df.rename(
75
- # columns={args["target"]: "translation", args["input_column"]: "text"}
76
- # )
77
- # tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
78
- # tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
79
- # dataset = Dataset.from_pandas(df)
80
-
81
- # def _tokenize_translate_fn(examples):
82
- # source_lang = args["lang_input"]
83
- # target_lang = args["lang_output"]
84
- # max_target_length = config.task_specific_params["summarization"]["max_length"]
85
- # prefix = f"translate {source_lang} to {target_lang}: "
86
- # inputs = [prefix + ex for ex in examples["text"]]
87
- # targets = [ex for ex in examples["translation"]]
88
- # model_inputs = tokenizer(inputs, max_length=config.n_positions, truncation=True)
89
-
90
- # # Setup the tokenizer for targets
91
- # with tokenizer.as_target_tokenizer():
92
- # labels = tokenizer(targets, max_length=max_target_length, truncation=True)
93
-
94
- # model_inputs["labels"] = labels["input_ids"]
95
- # return model_inputs
96
-
97
- # tokenized_datasets = dataset.map(_tokenize_translate_fn, batched=True)
98
- # ds = tokenized_datasets.shuffle(seed=42).train_test_split(
99
- # test_size=args.get("eval_size", 0.1)
100
- # )
101
- # train_ds = ds["train"]
102
- # eval_ds = ds["test"]
103
- # ft_args = args.get("using", {}).get("trainer_args", {})
104
- # ft_args["output_dir"] = args["model_folder"]
105
- # ft_args["predict_with_generate"] = True
106
-
107
- # model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
108
- # model.resize_token_embeddings(len(tokenizer))
109
- # training_args = Seq2SeqTrainingArguments(**ft_args)
110
- # data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
111
-
112
- # # generate trainer and finetune
113
- # trainer = Trainer(
114
- # model=model,
115
- # args=training_args,
116
- # train_dataset=train_ds,
117
- # eval_dataset=eval_ds,
118
- # data_collator=data_collator,
119
- # # compute_metrics=_compute_metrics,
120
- # )
121
-
122
- # return tokenizer, trainer
123
-
124
-
125
- # def _finetune_summarization(df, args):
126
- # df = df.rename(columns={args["target"]: "summary", args["input_column"]: "text"})
127
- # tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
128
- # tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
129
- # dataset = Dataset.from_pandas(df)
130
- # config = AutoConfig.from_pretrained(args["model_name"])
131
-
132
- # def _tokenize_summarize_fn(examples):
133
- # prefix = "summarize: " if "t5" in args["model_name"] else ""
134
- # inputs = [prefix + doc for doc in examples["text"]]
135
- # model_inputs = tokenizer(
136
- # inputs,
137
- # padding="max_length",
138
- # truncation=True,
139
- # max_length=config.max_position_embeddings,
140
- # pad_to_max_length=True,
141
- # ) # noqa
142
- # labels = tokenizer(
143
- # text_target=examples["summary"],
144
- # max_length=config.max_position_embeddings,
145
- # truncation=True,
146
- # ) # noqa
147
- # model_inputs["labels"] = labels["input_ids"]
148
- # return model_inputs
149
-
150
- # tokenized_datasets = dataset.map(_tokenize_summarize_fn, batched=True)
151
- # ds = tokenized_datasets.shuffle(seed=42).train_test_split(
152
- # test_size=args.get("eval_size", 0.1)
153
- # )
154
- # train_ds = ds["train"]
155
- # eval_ds = ds["test"]
156
-
157
- # ft_args = args.get("using", {}).get("trainer_args", {})
158
- # ft_args["output_dir"] = args["model_folder"]
159
- # ft_args["predict_with_generate"] = True
160
-
161
- # model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
162
- # metric = evaluate.load("rouge")
163
- # training_args = Seq2SeqTrainingArguments(**ft_args)
164
- # data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
165
-
166
- # def _compute_metrics(eval_pred):
167
- # # ref: github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb
168
- # predictions, labels = eval_pred
169
- # decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
170
- # decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
171
-
172
- # # Rogue expects a newline after each sentence
173
- # decoded_preds = [
174
- # "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
175
- # ]
176
- # decoded_labels = [
177
- # "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
178
- # ]
179
-
180
- # result = metric.compute(
181
- # predictions=decoded_preds,
182
- # references=decoded_labels,
183
- # use_stemmer=True,
184
- # use_aggregator=True,
185
- # )
186
- # result = {key: value * 100 for key, value in result.items()}
187
- # prediction_lens = [
188
- # np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
189
- # ]
190
- # result["gen_len"] = np.mean(prediction_lens) # todo: remove?
191
- # return {k: round(v, 4) for k, v in result.items()}
192
-
193
- # # generate trainer and finetune
194
- # trainer = Trainer(
195
- # model=model,
196
- # args=training_args,
197
- # train_dataset=train_ds,
198
- # eval_dataset=eval_ds,
199
- # data_collator=data_collator,
200
- # compute_metrics=_compute_metrics,
201
- # )
202
-
203
- # return tokenizer, trainer
204
-
205
-
206
- # def _finetune_fill_mask(df, args):
207
- # raise NotImplementedError("Finetuning fill-mask models is not yet supported.")
208
-
209
-
210
- # def _finetune_text_generation(df, args):
211
- # raise NotImplementedError("Finetuning text-generation models is not yet supported.")
212
-
213
-
214
- # def _finetune_question_answering(df, args):
215
- # raise NotImplementedError(
216
- # "Finetuning question-answering models is not yet supported."
217
- # )
218
-
219
-
220
- # def _finetune_text_2_text_generation(df, args):
221
- # raise NotImplementedError(
222
- # "Finetuning text-2-text generation models is not yet supported."
223
- # )
1
+ import evaluate
2
+ import nltk
3
+ import numpy as np
4
+ from datasets import Dataset
5
+ from transformers import (
6
+ AutoConfig,
7
+ AutoModelForSeq2SeqLM,
8
+ AutoModelForSequenceClassification,
9
+ AutoTokenizer,
10
+ DataCollatorForSeq2Seq,
11
+ Seq2SeqTrainingArguments,
12
+ Trainer,
13
+ TrainingArguments,
14
+ )
15
+
16
+ # todo add support for question answering task
17
+ # todo add support for fill mask
18
+ # todo add support for text_generation (causal language model)
19
+ # todo add support for text_2_text generation
20
+
21
+
22
+ def _finetune_cls(df, args):
23
+ df = df.rename(columns={args["target"]: "labels", args["input_column"]: "text"})
24
+ tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
25
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
26
+ dataset = Dataset.from_pandas(df)
27
+
28
+ def _tokenize_text_cls_fn(examples):
29
+ return tokenizer(examples["text"], padding="max_length", truncation=True)
30
+
31
+ tokenized_datasets = dataset.map(_tokenize_text_cls_fn, batched=True)
32
+ ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
33
+ train_ds = ds["train"]
34
+ eval_ds = ds["test"]
35
+
36
+ ft_args = args.get("using", {}).get("trainer_args", {})
37
+ ft_args["output_dir"] = args["model_folder"]
38
+
39
+ n_labels = len(args["labels_map"])
40
+ # todo replace for prod
41
+ assert n_labels == df["labels"].nunique(), (
42
+ f"Label mismatch! Ensure labels match what the model was originally trained on. Found {df['labels'].nunique()} classes, expected {n_labels}."
43
+ ) # noqa
44
+ # TODO: ideally check that labels are a subset of the original ones, too.
45
+ config = AutoConfig.from_pretrained(args["model_name"])
46
+ model = AutoModelForSequenceClassification.from_pretrained(args["model_name"], config=config)
47
+ metric = evaluate.load("accuracy")
48
+ training_args = TrainingArguments(**ft_args)
49
+
50
+ def _compute_metrics(eval_pred):
51
+ logits, labels = eval_pred
52
+ predictions = np.argmax(logits, axis=-1)
53
+ return metric.compute(predictions=predictions, references=labels)
54
+
55
+ # generate trainer and finetune
56
+ trainer = Trainer(
57
+ model=model,
58
+ args=training_args,
59
+ train_dataset=train_ds,
60
+ eval_dataset=eval_ds,
61
+ compute_metrics=_compute_metrics,
62
+ )
63
+
64
+ return tokenizer, trainer
65
+
66
+
67
+ # TODO: merge with summarization?
68
+ def _finetune_translate(df, args):
69
+ config = AutoConfig.from_pretrained(args["model_name"])
70
+ df = df.rename(columns={args["target"]: "translation", args["input_column"]: "text"})
71
+ tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
72
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
73
+ dataset = Dataset.from_pandas(df)
74
+
75
+ def _tokenize_translate_fn(examples):
76
+ source_lang = args["lang_input"]
77
+ target_lang = args["lang_output"]
78
+ max_target_length = config.task_specific_params["summarization"]["max_length"]
79
+ prefix = f"translate {source_lang} to {target_lang}: "
80
+ inputs = [prefix + ex for ex in examples["text"]]
81
+ targets = [ex for ex in examples["translation"]]
82
+ model_inputs = tokenizer(inputs, max_length=config.n_positions, truncation=True)
83
+
84
+ # Setup the tokenizer for targets
85
+ with tokenizer.as_target_tokenizer():
86
+ labels = tokenizer(targets, max_length=max_target_length, truncation=True)
87
+
88
+ model_inputs["labels"] = labels["input_ids"]
89
+ return model_inputs
90
+
91
+ tokenized_datasets = dataset.map(_tokenize_translate_fn, batched=True)
92
+ ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
93
+ train_ds = ds["train"]
94
+ eval_ds = ds["test"]
95
+ ft_args = args.get("using", {}).get("trainer_args", {})
96
+ ft_args["output_dir"] = args["model_folder"]
97
+ ft_args["predict_with_generate"] = True
98
+
99
+ model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
100
+ model.resize_token_embeddings(len(tokenizer))
101
+ training_args = Seq2SeqTrainingArguments(**ft_args)
102
+ data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
103
+
104
+ # generate trainer and finetune
105
+ trainer = Trainer(
106
+ model=model,
107
+ args=training_args,
108
+ train_dataset=train_ds,
109
+ eval_dataset=eval_ds,
110
+ data_collator=data_collator,
111
+ # compute_metrics=_compute_metrics,
112
+ )
113
+
114
+ return tokenizer, trainer
115
+
116
+
117
+ def _finetune_summarization(df, args):
118
+ df = df.rename(columns={args["target"]: "summary", args["input_column"]: "text"})
119
+ tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
120
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
121
+ dataset = Dataset.from_pandas(df)
122
+ config = AutoConfig.from_pretrained(args["model_name"])
123
+
124
+ def _tokenize_summarize_fn(examples):
125
+ prefix = "summarize: " if "t5" in args["model_name"] else ""
126
+ inputs = [prefix + doc for doc in examples["text"]]
127
+ model_inputs = tokenizer(
128
+ inputs,
129
+ padding="max_length",
130
+ truncation=True,
131
+ max_length=config.max_position_embeddings,
132
+ pad_to_max_length=True,
133
+ ) # noqa
134
+ labels = tokenizer(
135
+ text_target=examples["summary"],
136
+ max_length=config.max_position_embeddings,
137
+ truncation=True,
138
+ ) # noqa
139
+ model_inputs["labels"] = labels["input_ids"]
140
+ return model_inputs
141
+
142
+ tokenized_datasets = dataset.map(_tokenize_summarize_fn, batched=True)
143
+ ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
144
+ train_ds = ds["train"]
145
+ eval_ds = ds["test"]
146
+
147
+ ft_args = args.get("using", {}).get("trainer_args", {})
148
+ ft_args["output_dir"] = args["model_folder"]
149
+ ft_args["predict_with_generate"] = True
150
+
151
+ model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
152
+ metric = evaluate.load("rouge")
153
+ training_args = Seq2SeqTrainingArguments(**ft_args)
154
+ data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
155
+
156
+ def _compute_metrics(eval_pred):
157
+ # ref: github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb
158
+ predictions, labels = eval_pred
159
+ decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
160
+ decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
161
+
162
+ # Rogue expects a newline after each sentence
163
+ decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
164
+ decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
165
+
166
+ result = metric.compute(
167
+ predictions=decoded_preds,
168
+ references=decoded_labels,
169
+ use_stemmer=True,
170
+ use_aggregator=True,
171
+ )
172
+ result = {key: value * 100 for key, value in result.items()}
173
+ prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
174
+ result["gen_len"] = np.mean(prediction_lens) # todo: remove?
175
+ return {k: round(v, 4) for k, v in result.items()}
176
+
177
+ # generate trainer and finetune
178
+ trainer = Trainer(
179
+ model=model,
180
+ args=training_args,
181
+ train_dataset=train_ds,
182
+ eval_dataset=eval_ds,
183
+ data_collator=data_collator,
184
+ compute_metrics=_compute_metrics,
185
+ )
186
+
187
+ return tokenizer, trainer
188
+
189
+
190
+ def _finetune_fill_mask(df, args):
191
+ raise NotImplementedError("Finetuning fill-mask models is not yet supported.")
192
+
193
+
194
+ def _finetune_text_generation(df, args):
195
+ raise NotImplementedError("Finetuning text-generation models is not yet supported.")
196
+
197
+
198
+ def _finetune_question_answering(df, args):
199
+ raise NotImplementedError("Finetuning question-answering models is not yet supported.")
200
+
201
+
202
+ def _finetune_text_2_text_generation(df, args):
203
+ raise NotImplementedError("Finetuning text-2-text generation models is not yet supported.")