MindsDB 25.1.2.1__py3-none-any.whl → 25.1.4.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of MindsDB might be problematic.
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.4.0.dist-info}/METADATA +244 -242
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.4.0.dist-info}/RECORD +76 -67
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +5 -3
- mindsdb/api/executor/__init__.py +0 -1
- mindsdb/api/executor/command_executor.py +2 -1
- mindsdb/api/executor/data_types/answer.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +7 -2
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +8 -1
- mindsdb/api/executor/sql_query/__init__.py +1 -0
- mindsdb/api/executor/sql_query/result_set.py +36 -21
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
- mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
- mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
- mindsdb/api/executor/utilities/sql.py +2 -10
- mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
- mindsdb/api/http/namespaces/sql.py +3 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
- mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
- mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
- mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
- mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
- mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +48 -16
- mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
- mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
- mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -1
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
- mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
- mindsdb/integrations/utilities/rag/pipelines/rag.py +73 -18
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +36 -14
- mindsdb/integrations/utilities/rag/settings.py +8 -2
- mindsdb/integrations/utilities/sql_utils.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +3 -5
- mindsdb/interfaces/agents/langchain_agent.py +112 -150
- mindsdb/interfaces/agents/langfuse_callback_handler.py +0 -37
- mindsdb/interfaces/agents/mindsdb_database_agent.py +15 -13
- mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
- mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
- mindsdb/interfaces/chatbot/memory.py +58 -13
- mindsdb/interfaces/database/projects.py +17 -15
- mindsdb/interfaces/database/views.py +12 -25
- mindsdb/interfaces/knowledge_base/controller.py +39 -15
- mindsdb/interfaces/model/functions.py +15 -4
- mindsdb/interfaces/model/model_controller.py +4 -7
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +47 -38
- mindsdb/interfaces/skills/retrieval_tool.py +10 -3
- mindsdb/interfaces/skills/skill_tool.py +97 -53
- mindsdb/interfaces/skills/sql_agent.py +77 -36
- mindsdb/interfaces/storage/db.py +1 -1
- mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
- mindsdb/utilities/cache.py +7 -4
- mindsdb/utilities/context.py +11 -1
- mindsdb/utilities/langfuse.py +264 -0
- mindsdb/utilities/log.py +20 -2
- mindsdb/utilities/otel/__init__.py +206 -0
- mindsdb/utilities/otel/logger.py +25 -0
- mindsdb/utilities/otel/meter.py +19 -0
- mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
- mindsdb/utilities/otel/tracer.py +16 -0
- mindsdb/utilities/partitioning.py +52 -0
- mindsdb/utilities/render/sqlalchemy_render.py +7 -1
- mindsdb/utilities/utils.py +34 -0
- mindsdb/utilities/otel.py +0 -72
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.4.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.4.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.1.2.1.dist-info → MindsDB-25.1.4.0.dist-info}/top_level.txt +0 -0
mindsdb/api/executor/sql_query/result_set.py
@@ -1,5 +1,6 @@
-from typing import List
 import copy
+from typing import List, Optional
+
 import numpy as np
 import pandas as pd
 
@@ -35,6 +36,19 @@ class Column:
         return f'{self.__class__.__name__}({self.__dict__})'
 
 
+def rename_df_columns(df: pd.DataFrame, names: Optional[List] = None) -> None:
+    """Inplace rename of dataframe columns
+
+    Args:
+        df (pd.DataFrame): dataframe
+        names (Optional[List]): columns names to set
+    """
+    if names is not None:
+        df.columns = names
+    else:
+        df.columns = list(range(len(df.columns)))
+
+
 class ResultSet:
     def __init__(self, columns=None, values: List[List] = None, df: pd.DataFrame = None):
         '''
@@ -73,20 +87,19 @@ class ResultSet:
     # --- converters ---
 
     def from_df(self, df, database=None, table_name=None, table_alias=None):
-
-
-
-        for i, col in enumerate(df.columns):
-            self._columns.append(Column(
-                name=col,
+        self._columns = [
+            Column(
+                name=column_name,
                 table_name=table_name,
                 table_alias=table_alias,
                 database=database,
-                type=
-            )
+                type=column_dtype
+            ) for column_name, column_dtype
+            in zip(df.columns, df.dtypes)
+        ]
 
-
-        self._df = df
+        rename_df_columns(df)
+        self._df = df
 
         return self
 
@@ -97,9 +110,6 @@
             if col.alias is not None:
                 alias_idx[col.alias] = col
 
-        # resp_dict = df.to_dict(orient='split')
-        # self._records = resp_dict['data']
-
         for col in df.columns:
             if col in col_names or strict:
                 column = col_names[col]
@@ -109,13 +119,16 @@
                 column = Column(col)
             self._columns.append(column)
 
-
+        rename_df_columns(df)
+        self._df = df
 
         return self
 
     def to_df(self):
-
-
+        columns_names = self.get_column_names()
+        df = self.get_raw_df()
+        rename_df_columns(df, columns_names)
+        return df
 
     def to_df_cols(self, prefix=''):
         # returns dataframe and dict of columns
@@ -128,7 +141,9 @@
             columns.append(name)
             col_names[name] = col
 
-
+        df = self.get_raw_df()
+        rename_df_columns(df, columns)
+        return df, col_names
 
     # --- tables ---
 
@@ -174,7 +189,7 @@
         self._columns.pop(idx)
 
        self._df.drop(idx, axis=1, inplace=True)
-
+        rename_df_columns(self._df)
 
     @property
     def columns(self):
@@ -226,7 +241,7 @@
         if len(df.columns) != len(self._columns):
             raise WrongArgumentError(f'Record length mismatch columns length: {len(df.columns)} != {len(self.columns)}')
 
-
+        rename_df_columns(df)
 
         if self._df is None:
             self._df = df
@@ -269,7 +284,7 @@
     def get_column_values(self, col_idx):
         # get by column index
         df = self.get_raw_df()
-        return list(df[col_idx])
+        return list(df[df.columns[col_idx]])
 
     def set_column_values(self, col_name, values):
         # values is one value or list of values
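The new module-level helper `rename_df_columns` is shown in full above: `ResultSet` now keeps positional integer labels on its internal frame (presumably so duplicate or unnamed result columns stay unambiguous) and restores the real names only on export in `to_df`/`to_df_cols`. A minimal standalone sketch of that behavior, with the helper copied from the hunk:

```python
from typing import List, Optional
import pandas as pd

def rename_df_columns(df: pd.DataFrame, names: Optional[List] = None) -> None:
    """Inplace rename of dataframe columns (copied from the diff above)."""
    if names is not None:
        df.columns = names
    else:
        df.columns = list(range(len(df.columns)))

# Duplicate column names are legal in pandas but ambiguous to select by label.
df = pd.DataFrame([[1, 2]], columns=['x', 'x'])

rename_df_columns(df)               # columns become [0, 1]
assert list(df.columns) == [0, 1]

# Mirrors the get_column_values fix: select by position, not by label,
# so the code keeps working regardless of what labels are currently set.
col_idx = 1
assert list(df[df.columns[col_idx]]) == [2]

rename_df_columns(df, ['a', 'b'])   # to_df() restores real names this way
assert list(df.columns) == ['a', 'b']
```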
mindsdb/api/executor/sql_query/steps/apply_predictor_step.py
@@ -213,7 +213,7 @@ class ApplyPredictorStepCall(ApplyPredictorBaseCall):
        columns = list(table_df.columns)
        for col_idx, name in cols_to_rename.items():
            columns[col_idx] = name
-        table_df =
+        table_df.columns = columns
 
        version = None
        if len(step.predictor.parts) > 1 and step.predictor.parts[-1].isdigit():
mindsdb/api/executor/sql_query/steps/join_step.py
@@ -90,15 +90,15 @@ class JoinStepCall(BaseStepCall):
        table_b, names_b = right_data.to_df_cols(prefix='B')
 
        query = f"""
-
-
-
+            SELECT * FROM table_a {join_type} table_b
+            ON {join_condition}
+        """
        resp_df, _description = query_df_with_type_infer_fallback(query, {
            'table_a': table_a,
            'table_b': table_b
        })
 
-        resp_df
+        resp_df.replace({np.nan: None}, inplace=True)
 
        names_a.update(names_b)
        data = ResultSet().from_df_cols(resp_df, col_names=names_a)
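For context, `DataFrame.replace({np.nan: None})` converts pandas' float `NaN` markers into real Python `None` values (casting affected columns to `object`), which serialize cleanly as SQL `NULL` further down the pipeline. A minimal pandas-only illustration:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan]})
df.replace({np.nan: None}, inplace=True)

# The missing entry is now a real None rather than float('nan')
print(df["a"].tolist())  # [1.0, None]
```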
mindsdb/api/executor/sql_query/steps/map_reduce_step.py
@@ -1,4 +1,3 @@
-import os
 import copy
 
 from mindsdb_sql_parser.ast import (
@@ -15,8 +14,7 @@ from mindsdb.api.executor.planner.steps import (
 
 from mindsdb.api.executor.sql_query.result_set import ResultSet
 from mindsdb.api.executor.exceptions import LogicError
-from mindsdb.utilities.
-from mindsdb.utilities.context_executor import execute_in_threads
+from mindsdb.utilities.partitioning import process_dataframe_in_partitions
 
 from .base import BaseStepCall
 
@@ -88,43 +86,12 @@ class MapReduceStepCall(BaseStepCall):
 
        df = input_data.get_raw_df()
 
-
-
-            chunk = 0
-            while chunk * partition < len(df):
-                # create results with partition
-                df1 = df.iloc[chunk * partition: (chunk + 1) * partition]
-                chunk += 1
-                yield df1, substeps, input_idx, input_columns
-
-        tasks = split_data_f(df)
-
-        # workers count
-        is_cloud = Config().get('cloud', False)
-        if is_cloud:
-            max_threads = int(os.getenv('MAX_QUERY_PARTITIONS', 10))
-        else:
-            max_threads = os.cpu_count() - 2
-
-        # don't exceed chunk_count
-        chunk_count = int(len(df) / partition)
-        max_threads = min(max_threads, chunk_count)
-
-        if max_threads < 1:
-            max_threads = 1
+        def callback(chunk):
+            return self._exec_partition(chunk, substeps, input_idx, input_columns)
 
-
-
-
-        for task in tasks:
-            sub_data = self._exec_partition(*task)
-            if sub_data:
-                data = join_query_data(data, sub_data)
-
-        else:
-            for sub_data in execute_in_threads(self._exec_partition, tasks, thread_count=max_threads):
-                if sub_data:
-                    data = join_query_data(data, sub_data)
+        for result in process_dataframe_in_partitions(df, callback, partition):
+            if result:
+                data = join_query_data(data, result)
 
        return data
 
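The chunking and threading logic deleted here moves into the new `process_dataframe_in_partitions` helper in `mindsdb/utilities/partitioning.py` (+52 lines, body not shown in this diff). Its implementation isn't visible, so the following is only a plausible single-threaded sketch of the contract the call site implies: slice the frame into `partition`-sized chunks, apply the callback to each, and yield the results (the real helper presumably also parallelizes across threads, as the deleted code did):

```python
from typing import Callable, Iterator
import pandas as pd

def process_dataframe_in_partitions(
    df: pd.DataFrame, callback: Callable, partition: int
) -> Iterator:
    """Hypothetical sketch; not the actual MindsDB implementation."""
    for start in range(0, len(df), partition):
        chunk = df.iloc[start:start + partition]
        yield callback(chunk)
```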
mindsdb/api/executor/utilities/sql.py
@@ -182,14 +182,6 @@ def query_df(df, query, session=None):
        df = df.astype({'CONNECTION_DATA': 'string'})
 
    result_df, description = query_df_with_type_infer_fallback(query_str, {'df': df}, user_functions=user_functions)
-    result_df
-
-    new_column_names = {}
-    real_column_names = [x[0] for x in description]
-    for i, duck_column_name in enumerate(result_df.columns):
-        new_column_names[duck_column_name] = real_column_names[i]
-    result_df = result_df.rename(
-        new_column_names,
-        axis='columns'
-    )
+    result_df.replace({np.nan: None}, inplace=True)
+    result_df.columns = [x[0] for x in description]
    return result_df
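The `description` used here is the DB-API style cursor description that `query_df_with_type_infer_fallback` returns alongside the frame; taking element 0 of each tuple restores the engine's real column names in a single assignment, replacing the old rename-map loop. A small self-contained illustration (the description values are made up):

```python
import pandas as pd

# A DB-API cursor description is a sequence of tuples whose first item is the name
description = [("id", "INTEGER"), ("total", "DOUBLE")]

result_df = pd.DataFrame([[1, 2.5]])   # engine-returned frame, placeholder labels
result_df.columns = [x[0] for x in description]
print(result_df.columns.tolist())      # ['id', 'total']
```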
mindsdb/api/http/namespaces/knowledge_bases.py
@@ -185,8 +185,10 @@ class KnowledgeBaseResource(Resource):
            )
 
        try:
+            kb_data = request.json['knowledge_base']
+
            # Retrieve the knowledge base table for updates
-            table = session.kb_controller.get_table(knowledge_base_name, project.id)
+            table = session.kb_controller.get_table(knowledge_base_name, project.id, params=kb_data.get('params'))
            if table is None:
                return http_error(
                    HTTPStatus.NOT_FOUND,
@@ -194,8 +196,6 @@ class KnowledgeBaseResource(Resource):
                    f'Knowledge Base with name {knowledge_base_name} does not exist'
                )
 
-            kb_data = request.json['knowledge_base']
-
            # Set up dependencies for DocumentLoader
            file_controller = FileController()
            file_splitter_config = FileSplitterConfig()
mindsdb/api/http/namespaces/sql.py
@@ -78,6 +78,7 @@ class Query(Resource):
                "error_code": 0,
                "error_message": str(e),
            }
+            logger.error(f"Error query processing: \n{traceback.format_exc()}")
 
        except UnknownError as e:
            # unclassified
@@ -87,6 +88,7 @@ class Query(Resource):
                "error_code": 0,
                "error_message": str(e),
            }
+            logger.error(f"Error query processing: \n{traceback.format_exc()}")
 
        except Exception as e:
            error_type = "unexpected"
@@ -95,7 +97,7 @@ class Query(Resource):
                "error_code": 0,
                "error_message": str(e),
            }
-            logger.
+            logger.error(f"Error query processing: \n{traceback.format_exc()}")
 
        if query_response.get("type") == SQL_RESPONSE_TYPE.ERROR:
            error_type = "expected"
mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py
@@ -2,7 +2,8 @@ from mindsdb_sql_parser import parse_sql
 from mindsdb.api.executor.planner import utils as planner_utils
 
 import mindsdb.utilities.profiler as profiler
-from mindsdb.api.executor import Column
+from mindsdb.api.executor.sql_query.result_set import Column
+from mindsdb.api.executor.sql_query import SQLQuery
 from mindsdb.api.executor.command_executor import ExecuteCommands
 from mindsdb.api.mysql.mysql_proxy.utilities import ErSqlSyntaxError
 from mindsdb.utilities import log
mindsdb/api/mysql/mysql_proxy/mysql_proxy.py
@@ -83,6 +83,7 @@ from mindsdb.api.mysql.mysql_proxy.utilities.lightwood_dtype import dtype
 from mindsdb.utilities import log
 from mindsdb.utilities.config import Config
 from mindsdb.utilities.context import context as ctx
+from mindsdb.utilities.otel.metric_handlers import get_query_request_counter
 from mindsdb.utilities.wizards import make_ssl_cert
 
 logger = log.getLogger(__name__)
@@ -562,6 +563,12 @@ class MysqlProxy(SocketServer.BaseRequestHandler):
            data=executor.data,
            status=executor.server_status,
        )
+
+        # Increment the counter and include metadata in attributes
+        metadata = ctx.metadata(query=sql)
+        query_request_counter = get_query_request_counter()
+        query_request_counter.add(1, metadata)
+
        return resp
 
    def answer_stmt_prepare(self, sql):
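`get_query_request_counter` lives in the new `mindsdb/utilities/otel/metric_handlers/__init__.py` (+25 lines), whose body this diff view doesn't show. A plausible sketch using the standard OpenTelemetry metrics API, where the metric name and description are assumptions:

```python
from opentelemetry import metrics

_query_request_counter = None

def get_query_request_counter():
    """Lazily create a Counter; name/description are assumed, not from the diff."""
    global _query_request_counter
    if _query_request_counter is None:
        meter = metrics.get_meter(__name__)
        _query_request_counter = meter.create_counter(
            "mindsdb.query.requests",  # hypothetical metric name
            description="Number of SQL queries handled by the MySQL proxy",
        )
    return _query_request_counter

# Call site shape, as in the hunk above: counter.add(1, attributes_dict)
```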
mindsdb/api/postgres/postgres_proxy/executor/executor.py
@@ -6,7 +6,8 @@ from mindsdb.api.executor.planner import utils as planner_utils
 from numpy import dtype as np_dtype
 from pandas.api import types as pd_types
 
-from mindsdb.api.executor import SQLQuery
+from mindsdb.api.executor.sql_query import SQLQuery
+from mindsdb.api.executor.sql_query.result_set import Column
 from mindsdb.api.mysql.mysql_proxy.utilities.lightwood_dtype import dtype
 from mindsdb.api.executor.command_executor import ExecuteCommands
 from mindsdb.api.mysql.mysql_proxy.utilities import SqlApiException
mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py
@@ -286,7 +286,7 @@ class ChromaDBHandler(VectorStoreHandler):
        else:
            # general get query
            result = collection.get(
-                ids=id_filters,
+                ids=id_filters or None,
                where=filters,
                limit=limit,
                offset=offset,
@@ -475,7 +475,7 @@ class ChromaDBHandler(VectorStoreHandler):
        collections = self._client.list_collections()
        collections_name = pd.DataFrame(
            columns=["table_name"],
-            data=
+            data=collections,
        )
        return Response(resp_type=RESPONSE_TYPE.TABLE, data_frame=collections_name)
 
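The `or None` guard is the notable fix here: an empty `id_filters` list presumably restricted `collection.get` to zero ids and returned nothing, whereas `None` means no id filter at all. The idiom in isolation:

```python
id_filters = []
print(id_filters or None)    # None -> no id restriction is passed to Chroma

id_filters = ["doc-1"]
print(id_filters or None)    # ['doc-1'] -> the filter applies as before
```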
mindsdb/integrations/handlers/chromadb_handler/requirements.txt
@@ -1 +1 @@
-chromadb~=0.
+chromadb~=0.6.3
mindsdb/integrations/handlers/file_handler/file_handler.py
@@ -276,7 +276,7 @@ class FileHandler(DatabaseHandler):
 
        header = df.columns.values.tolist()
 
-        df
+        df.columns = [key.strip() for key in header]
        df = df.applymap(clean_cell)
 
        header = [x.strip() for x in header]
mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py
@@ -25,6 +25,11 @@ test_file_content = [
    [3, -3, 0.3, "C"],
 ]
 
+test_excel_sheet_content = [
+    ["Sheet_Name"],
+    ["Sheet1"],
+]
+
 file_records = [("one", 1, test_file_content[0]), ("two", 2, test_file_content[0])]
 
 
@@ -349,7 +354,18 @@ def test_get_file_path_with_url(mock_fetch_url):
    ],
 )
 def test_handle_source(file_path, expected_columns):
-
+    sheet_name = None
+    # Excel files return a list of sheets when queried without a sheet name
+    if file_path.endswith(".xlsx"):
+        df, _ = FileHandler._handle_source(file_path)
+        assert isinstance(df, pandas.DataFrame)
+
+        assert df.columns.tolist() == test_excel_sheet_content[0]
+        assert len(df) == len(test_excel_sheet_content) - 1
+        assert df.values.tolist() == test_excel_sheet_content[1:]
+        sheet_name = test_excel_sheet_content[1][0]
+
+    df, _ = FileHandler._handle_source(file_path, sheet_name=sheet_name)
    assert isinstance(df, pandas.DataFrame)
    assert df.columns.tolist() == expected_columns
 
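The new test pins down a convention: calling `_handle_source` on an .xlsx file without a `sheet_name` yields a single-column `Sheet_Name` frame listing the workbook's sheets, and a second call with a chosen sheet returns that sheet's data. In plain pandas, such a sheet-list frame could be built like this (a sketch of the convention, not MindsDB's implementation):

```python
import pandas as pd

xls = pd.ExcelFile("book.xlsx")   # hypothetical workbook
sheets_df = pd.DataFrame({"Sheet_Name": xls.sheet_names})
```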
mindsdb/integrations/handlers/jira_handler/jira_handler.py
@@ -55,9 +55,23 @@ class JiraHandler(APIHandler):
            return self.connection
 
        s = requests.Session()
+        if self.connection_data.get("cloud", False):
+            params = {
+                "cloud": True,
+                "username": self.connection_data['jira_username'],
+                "password": self.connection_data['jira_api_token'],
+                "url": self.connection_data['jira_url'],
+            }
+        else:
+            params = {
+                "cloud": False,
+                "url": self.connection_data['jira_url'],
+                "session": s
+            }
+
        s.headers['Authorization'] = f"Bearer {self.connection_data['jira_api_token']}"
 
-        self.connection = Jira(
+        self.connection = Jira(**params)
        self.is_connected = True
 
 
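The `Jira` client here comes from the `atlassian-python-api` package, and the branch mirrors its two auth modes: Cloud uses basic auth (email plus API token), while Server/Data Center reuses the session carrying the bearer header. A sketch of what the two parameter sets produce (all connection values are placeholders):

```python
import requests
from atlassian import Jira

# Jira Cloud: basic auth with email + API token
cloud_client = Jira(
    url="https://example.atlassian.net",   # placeholder
    username="user@example.com",           # placeholder
    password="<api-token>",                # the API token goes in the password slot
    cloud=True,
)

# Server/Data Center: a pre-built session carrying a bearer token
s = requests.Session()
s.headers["Authorization"] = "Bearer <personal-access-token>"
dc_client = Jira(url="https://jira.example.com", session=s, cloud=False)
```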
mindsdb/integrations/handlers/jira_handler/jira_table.py
@@ -10,9 +10,26 @@ from mindsdb_sql_parser import ast
 
 logger = log.getLogger(__name__)
 
+
+def flatten_json(nested_json, parent_key="", separator="."):
+    """
+    Recursively flattens a nested JSON object into a dictionary with dot notation keys.
+    """
+    items = []
+    for k, v in nested_json.items():
+        new_key = f"{parent_key}{separator}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_json(v, new_key, separator=separator).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
 class JiraProjectsTable(APITable):
    """Jira Projects Table implementation"""
+
    _MAX_API_RESULTS = 100
+
    def select(self, query: ast.Select) -> pd.DataFrame:
        """Pulls data from the Jira "get_all_project_issues" API endpoint
        Parameters
@@ -42,8 +59,8 @@ class JiraProjectsTable(APITable):
 
        for an_order in query.order_by:
            if an_order.field.parts[0] != "key":
-            continue
-            if an_order.field.parts[1] in ["reporter","assignee","status"]:
+                continue
+            if an_order.field.parts[1] in ["reporter", "assignee", "status"]:
                if issues_kwargs != {}:
                    raise ValueError(
                        "Duplicate order conditions found for reporter,status and assignee"
@@ -61,9 +78,9 @@ class JiraProjectsTable(APITable):
                raise ValueError(
                    f"Order by unknown column {an_order.field.parts[1]}"
                )
-        project = self.handler.connection_data[
+        project = self.handler.connection_data["project"]
        jira_project_df = self.call_jira_api(project)
-
+
        selected_columns = []
        for target in query.targets:
            if isinstance(target, ast.Star):
@@ -74,7 +91,6 @@ class JiraProjectsTable(APITable):
            else:
                raise ValueError(f"Unknown query target {type(target)}")
 
-
        if len(jira_project_df) == 0:
            jira_project_df = pd.DataFrame([], columns=selected_columns)
            return jira_project_df
@@ -88,7 +104,7 @@ class JiraProjectsTable(APITable):
                by=order_by_conditions["columns"],
                ascending=order_by_conditions["ascending"],
            )
-
+
        if query.limit:
            jira_project_df = jira_project_df.head(total_results)
 
@@ -102,12 +118,12 @@ class JiraProjectsTable(APITable):
            List of columns
        """
        return [
-
-
-
-
-
-
+            "key",
+            "summary",
+            "status",
+            "reporter",
+            "assignee",
+            "priority",
        ]
 
    def call_jira_api(self, project):
@@ -116,36 +132,41 @@ class JiraProjectsTable(APITable):
        max_records = jira.get_project_issues_count(project)
        max_records = 100
        jql_query = self.handler.construct_jql()
-        max_results = self._MAX_API_RESULTS
+        max_results = self._MAX_API_RESULTS
        start_index = 0
        total = 1
        fields = [
-
-
-
-
-
-
+            "key",
+            "fields.summary",
+            "fields.status.name",
+            "fields.reporter.displayName",
+            "fields.assignee.displayName",
+            "fields.priority.name",
        ]
 
        all_jira_issues_df = pd.DataFrame(columns=fields)
 
        while start_index <= total:
-            results = self.handler.connect().jql(
-
+            results = self.handler.connect().jql(
+                jql_query, start=start_index, limit=max_results
+            )
+            flattened_data = [flatten_json(item) for item in results["issues"]]
+            df = pd.DataFrame(flattened_data)
            df = df[fields]
            start_index += max_results
-            total =
+            total = results["total"]
            all_jira_issues_df = pd.concat([all_jira_issues_df, df], axis=0)
 
+        all_jira_issues_df = all_jira_issues_df.rename(
+            columns={
+                "key": "key",
+                "fields.summary": "summary",
+                "fields.reporter.displayName": "reporter",
+                "fields.assignee.displayName": "assignee",
+                "fields.priority.name": "priority",
+                "fields.status.name": "status",
+            },
+            errors="ignore",
+        )
 
-        all_jira_issues_df = all_jira_issues_df.rename(columns={
-            'key': 'key',
-            'fields.summary': 'summary',
-            'fields.reporter.name':'reporter',
-            'fields.assignee.name':'assignee',
-            'fields.priority.name':'priority',
-            'fields.status.name':'status'})
-
        return all_jira_issues_df
-
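Since `flatten_json` is shown in full above, its effect on a nested issue payload can be demonstrated directly; this is why the `fields` list can address values like `fields.status.name` as flat column names (the issue data below is illustrative):

```python
def flatten_json(nested_json, parent_key="", separator="."):
    """Copied from the hunk above."""
    items = []
    for k, v in nested_json.items():
        new_key = f"{parent_key}{separator}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_json(v, new_key, separator=separator).items())
        else:
            items.append((new_key, v))
    return dict(items)

issue = {
    "key": "PROJ-1",
    "fields": {
        "summary": "Fix login bug",
        "status": {"name": "In Progress"},
        "reporter": {"displayName": "Ada"},
    },
}
print(flatten_json(issue))
# {'key': 'PROJ-1', 'fields.summary': 'Fix login bug',
#  'fields.status.name': 'In Progress', 'fields.reporter.displayName': 'Ada'}
```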
mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py
@@ -0,0 +1,82 @@
+from typing import Any, List
+from langchain_core.embeddings import Embeddings
+import requests
+
+
+class FastAPIEmbeddings(Embeddings):
+    """An embedding extension that interfaces with FAST API. Useful for custom serving solutions."""
+
+    def __init__(
+        self,
+        api_base: str,
+        model: str,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Initialize the embeddings class.
+
+        Args:
+            api_base: Base URL for the VLLM server
+            model: Model name/path to use for embeddings
+            batch_size: Batch size for generating embeddings
+        """
+        super().__init__()
+        self.api_base = api_base
+        self.model = model
+        self.batch_size = batch_size
+
+        # initialize requests here with the api_base
+
+    def _get_embeddings(self, texts: List[str]) -> List[str]:
+        """Get embeddings for a batch of text chunks.
+
+        Returns:
+            List of embeddings as strings. For sparse vectors, returns strings in format
+            "{key:value,...}/size" where size is the dimension of the vector space.
+        """
+
+        headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+        data = {
+            "input": texts,
+            "model": self.model
+        }
+
+        response = requests.post(self.api_base, headers=headers, json=data)
+
+        response.raise_for_status()
+
+        embeddings = []
+        for response_dict in response.json()["data"]:
+            embedding = response_dict["embedding"]
+            embeddings.append(embedding)
+
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[str]:
+        """Embed a list of documents using vLLM.
+
+        Args:
+            texts: List of documents to embed
+
+        Returns:
+            List of embeddings as strings, one for each document.
+            For sparse embeddings, returns strings in format "{key:value,...}/size"
+            For dense embeddings, returns JSON strings of float lists
+        """
+
+        return self._get_embeddings(texts)
+
+    def embed_query(self, text: str) -> str:
+        """Embed a single query text using vLLM.
+
+        Args:
+            text: Query text to embed
+
+        Returns:
+            Query embedding as a string.
+            For sparse embeddings, returns string in format "{key:value,...}/size"
+            For dense embeddings, returns JSON string of float list
+        """
+
+        return self._get_embeddings([text])[0]
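The request payload (`{"input": ..., "model": ...}`) and the `data[*].embedding` response field match OpenAI-compatible embedding endpoints such as vLLM's `/v1/embeddings`. A usage sketch, where the URL and model name are placeholders:

```python
from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings

embedder = FastAPIEmbeddings(
    api_base="http://localhost:8000/v1/embeddings",   # placeholder endpoint
    model="BAAI/bge-small-en-v1.5",                   # placeholder model id
)

doc_vectors = embedder.embed_documents(["hello world", "goodbye world"])
query_vector = embedder.embed_query("hello")
```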
mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py
@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.base import BaseMLEngine
 from mindsdb.utilities import log
 from langchain_core.embeddings import Embeddings
 from mindsdb.integrations.handlers.langchain_embedding_handler.vllm_embeddings import VLLMEmbeddings
+from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings
 
 logger = log.getLogger(__name__)
 
@@ -20,7 +21,10 @@ logger = log.getLogger(__name__)
 # This is used for the user to select the embedding model
 EMBEDDING_MODELS = {
    'VLLM': 'VLLMEmbeddings',
-    'vllm': 'VLLMEmbeddings'
+    'vllm': 'VLLMEmbeddings',
+    'FastAPI': 'FastAPIEmbeddings',
+    'fastapi': 'FastAPIEmbeddings'
+
 }
 
 try:
@@ -55,6 +59,9 @@ def get_langchain_class(class_name: str) -> Embeddings:
    if class_name == "VLLMEmbeddings":
        return VLLMEmbeddings
 
+    if class_name == "FastAPIEmbeddings":
+        return FastAPIEmbeddings
+
    # Then try langchain_community.embeddings
    try:
        module = importlib.import_module("langchain_community.embeddings")
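Tying the registry changes together, the new engine then resolves like any other entry; a sketch using the names patched above, with placeholder constructor arguments:

```python
# EMBEDDING_MODELS and get_langchain_class as patched in this file
class_name = EMBEDDING_MODELS["fastapi"]          # -> 'FastAPIEmbeddings'
embeddings_cls = get_langchain_class(class_name)  # short-circuits to FastAPIEmbeddings
embedder = embeddings_cls(
    api_base="http://localhost:8000/v1/embeddings",  # placeholder
    model="my-embedding-model",                      # placeholder
)
```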