MindsDB 25.2.2.2__py3-none-any.whl → 25.2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/METADATA +209 -228
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/RECORD +52 -50
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +1 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +4 -1
- mindsdb/api/http/initialize.py +8 -5
- mindsdb/api/http/namespaces/agents.py +0 -7
- mindsdb/api/http/namespaces/config.py +0 -48
- mindsdb/api/http/namespaces/databases.py +69 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
- mindsdb/api/http/namespaces/util.py +0 -28
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +28 -46
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +8 -11
- mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +1 -1
- mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
- mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
- mindsdb/integrations/handlers/timegpt_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
- mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
- mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
- mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
- mindsdb/integrations/libs/llm/utils.py +5 -0
- mindsdb/integrations/libs/process_cache.py +2 -2
- mindsdb/integrations/utilities/files/file_reader.py +66 -14
- mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
- mindsdb/interfaces/agents/agents_controller.py +3 -3
- mindsdb/interfaces/agents/callback_handlers.py +52 -5
- mindsdb/interfaces/agents/langchain_agent.py +5 -3
- mindsdb/interfaces/database/database.py +1 -1
- mindsdb/interfaces/database/integrations.py +1 -1
- mindsdb/interfaces/file/file_controller.py +140 -11
- mindsdb/interfaces/jobs/scheduler.py +1 -1
- mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
- mindsdb/interfaces/skills/skills_controller.py +2 -2
- mindsdb/interfaces/skills/sql_agent.py +6 -1
- mindsdb/interfaces/storage/db.py +1 -12
- mindsdb/migrations/versions/2025-02-09_4943359e354a_file_metadata.py +31 -0
- mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
- mindsdb/utilities/config.py +1 -0
- mindsdb/utilities/log.py +17 -2
- mindsdb/utilities/ml_task_queue/consumer.py +4 -2
- mindsdb/utilities/render/sqlalchemy_render.py +15 -5
- mindsdb/utilities/log_controller.py +0 -39
- mindsdb/utilities/telemetry.py +0 -44
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/top_level.txt +0 -0
mindsdb/api/http/namespaces/util.py

@@ -9,12 +9,6 @@ from flask import current_app as ca
 
 from mindsdb.metrics.metrics import api_endpoint_metrics
 from mindsdb.api.http.namespaces.configs.util import ns_conf
-from mindsdb.utilities.telemetry import (
-    enable_telemetry,
-    disable_telemetry,
-    telemetry_file_exists,
-    inject_telemetry_to_static
-)
 from mindsdb.api.http.gui import update_static
 from mindsdb.utilities.fs import clean_unlinked_process_marks
 from mindsdb.api.http.utils import http_error
@@ -98,28 +92,6 @@ class PingNative(Resource):
         return get_active_tasks()
 
 
-@ns_conf.route('/telemetry')
-class Telemetry(Resource):
-    @ns_conf.doc('get_telemetry_status')
-    @api_endpoint_metrics('GET', '/util/telemetry')
-    def get(self):
-        root_storage_path = ca.config_obj['paths']['root']
-        status = "enabled" if telemetry_file_exists(root_storage_path) else "disabled"
-        return {"status": status}
-
-    @ns_conf.doc('set_telemetry')
-    @api_endpoint_metrics('POST', '/util/telemetry')
-    def post(self):
-        data = request.json
-        action = data['action']
-        if str(action).lower() in ["true", "enable", "on"]:
-            enable_telemetry(ca.config_obj['paths']['root'])
-        else:
-            disable_telemetry(ca.config_obj['paths']['root'])
-        inject_telemetry_to_static(ca.config_obj.paths['static'])
-        return '', 200
-
-
 @ns_conf.route('/validate_json_ai')
 class ValidateJsonAI(Resource):
     @api_endpoint_metrics('POST', '/util/validate_json_ai')
mindsdb/integrations/handlers/file_handler/file_handler.py

@@ -1,11 +1,10 @@
 import os
 import shutil
 import tempfile
-from pathlib import Path
 
 import pandas as pd
 from mindsdb_sql_parser import parse_sql
-from mindsdb_sql_parser.ast import CreateTable, DropTables, Insert, Select
+from mindsdb_sql_parser.ast import CreateTable, DropTables, Insert, Select, Identifier
 from mindsdb_sql_parser.ast.base import ASTNode
 
 from mindsdb.api.executor.utilities.sql import query_df
@@ -15,8 +14,6 @@ from mindsdb.integrations.libs.response import HandlerResponse as Response
 from mindsdb.integrations.libs.response import HandlerStatusResponse as StatusResponse
 from mindsdb.utilities import log
 
-from mindsdb.integrations.utilities.files.file_reader import FileReader
-
 
 logger = log.getLogger(__name__)
 
@@ -63,6 +60,18 @@ class FileHandler(DatabaseHandler):
     def check_connection(self) -> StatusResponse:
         return StatusResponse(True)
 
+    def _get_table_page_names(self, table: Identifier):
+        table_name_parts = table.parts
+
+        # Check if it's a multi-part name (e.g., `file_name.sheet_name`)
+        if len(table_name_parts) > 1:
+            table_name = table_name_parts[-2]
+            page_name = table_name_parts[-1]  # Get the sheet name
+        else:
+            table_name = table_name_parts[-1]
+            page_name = None
+        return table_name, page_name
+
     def query(self, query: ASTNode) -> Response:
         if type(query) is DropTables:
             for table_identifier in query.tables:
@@ -84,7 +93,7 @@ class FileHandler(DatabaseHandler):
             )
             return Response(RESPONSE_TYPE.OK)
 
-        if …
+        if isinstance(query, CreateTable):
             # Check if the table already exists or if the table name contains more than one namespace
             existing_files = self.file_controller.get_files_names()
 
@@ -96,13 +105,13 @@ class FileHandler(DatabaseHandler):
 
             table_name = query.name.parts[-1]
             if table_name in existing_files:
-                …
+                if query.is_replace:
+                    self.file_controller.delete_file(table_name)
+                else:
+                    return Response(
+                        RESPONSE_TYPE.ERROR,
+                        error_message=f"Table '{table_name}' already exists",
+                    )
 
             temp_dir_path = tempfile.mkdtemp(prefix="mindsdb_file_")
 
@@ -126,31 +135,19 @@ class FileHandler(DatabaseHandler):
 
             return Response(RESPONSE_TYPE.OK)
 
-        elif …
-            table_name = table_name_parts[-1]
-
-            # Check if it's a multi-part name (e.g., `files.file_name.sheet_name`)
-            if len(table_name_parts) > 1:
-                table_name = table_name_parts[-2]
-                sheet_name = table_name_parts[-1]  # Get the sheet name
-            else:
-                sheet_name = None
-            file_path = self.file_controller.get_file_path(table_name)
+        elif isinstance(query, Select):
+            table_name, page_name = self._get_table_page_names(query.from_table)
 
-            df = self.…
+            df = self.file_controller.get_file_data(table_name, page_name)
 
             # Process the SELECT query
             result_df = query_df(df, query)
             return Response(RESPONSE_TYPE.TABLE, data_frame=result_df)
 
-        elif …
-            table_name = query.table
-            file_path = self.file_controller.get_file_path(table_name)
+        elif isinstance(query, Insert):
+            table_name, page_name = self._get_table_page_names(query.table)
 
-            …
-            df = file_reader.to_df()
+            df = self.file_controller.get_file_data(table_name, page_name)
 
             # Create a new dataframe with the values from the query
             new_df = pd.DataFrame(query.values, columns=[col.name for col in query.columns])
@@ -158,10 +155,7 @@ class FileHandler(DatabaseHandler):
             # Concatenate the new dataframe with the existing one
             df = pd.concat([df, new_df], ignore_index=True)
 
-            format = Path(file_path).suffix.strip(".").lower()
-            write_method = getattr(df, f"to_{format}")
-            write_method(file_path, index=False)
+            self.file_controller.set_file_data(table_name, df, page_name=page_name)
 
             return Response(RESPONSE_TYPE.OK)
@@ -175,18 +169,6 @@ class FileHandler(DatabaseHandler):
         ast = self.parser(query)
         return self.query(ast)
 
-    @staticmethod
-    def handle_source(file_path, **kwargs):
-        file_reader = FileReader(path=file_path)
-
-        df = file_reader.to_df(**kwargs)
-
-        header = df.columns.values.tolist()
-
-        df.columns = [key.strip() for key in header]
-        df = df.applymap(clean_cell)
-        return df
-
     def get_tables(self) -> Response:
         """
         List all files
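Taken together, the file_handler changes route all reads and writes through the FileController (get_file_data/set_file_data) and resolve multi-part table names into a (table, page) pair, so a spreadsheet sheet can be addressed as `file_name.sheet_name`. A minimal standalone sketch of that resolution logic, where resolve_table_and_page is a hypothetical stand-in for the new _get_table_page_names helper (which takes a parser Identifier rather than a plain list):

    def resolve_table_and_page(parts):
        # 'my_file'        -> ('my_file', None)
        # 'my_file.Sheet2' -> ('my_file', 'Sheet2')
        if len(parts) > 1:
            return parts[-2], parts[-1]
        return parts[-1], None

    assert resolve_table_and_page(['my_file']) == ('my_file', None)
    assert resolve_table_and_page(['my_file', 'Sheet2']) == ('my_file', 'Sheet2')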
mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py

@@ -64,6 +64,12 @@ class MockFileController:
     def save_file(self, name, file_path, file_name=None):
         return True
 
+    def get_file_data(self, name, page_name=None):
+        return pandas.DataFrame(test_file_content[1:], columns=test_file_content[0])
+
+    def set_file_data(self, name, df, page_name=None):
+        return True
+
 
 def curr_dir():
     return os.path.dirname(os.path.realpath(__file__))
@@ -296,18 +302,9 @@ def test_handle_source(file_path, expected_columns):
 
     # using different methods to create reader
     for reader in get_reader(file_path):
-        df = reader.…
+        df = reader.get_page_content()
         assert isinstance(df, pandas.DataFrame)
 
-        if reader.get_format() == 'xlsx':
-
-            assert df.columns.tolist() == test_excel_sheet_content[0]
-            assert len(df) == len(test_excel_sheet_content) - 1
-            assert df.values.tolist() == test_excel_sheet_content[1:]
-            sheet_name = test_excel_sheet_content[1][0]
-
-            df = reader.to_df(sheet_name=sheet_name)
-
         assert df.columns.tolist() == expected_columns
 
         # The pdf and txt files have some different content
@@ -336,7 +333,7 @@ def test_tsv():
     assert reader.get_format() == 'csv'
     assert reader.parameters['delimiter'] == '\t'
 
-    df = reader.…
+    df = reader.get_page_content()
     assert len(df.columns) == 2
 
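Judging by the updated tests, FileReader.to_df() is gone and get_page_content() is the way to read a page as a DataFrame. A hedged usage sketch, assuming MindsDB is installed and data.tsv is a hypothetical local tab-separated file:

    from mindsdb.integrations.utilities.files.file_reader import FileReader

    reader = FileReader(path='data.tsv')   # hypothetical local file
    assert reader.get_format() == 'csv'    # per the tests, TSV is detected as csv with a '\t' delimiter
    df = reader.get_page_content()         # returns a pandas DataFrame for the default page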
mindsdb/integrations/handlers/openai_handler/constants.py

@@ -8,7 +8,9 @@ CHAT_MODELS = (
     'gpt-4-32k',
     'gpt-4-1106-preview',
     'gpt-4-0125-preview',
-    'gpt-4o'
+    'gpt-4o',
+    'o3-mini',
+    'o1-mini'
 )
 COMPLETION_MODELS = ('babbage-002', 'davinci-002')
 FINETUNING_MODELS = ('gpt-3.5-turbo', 'babbage-002', 'davinci-002', 'gpt-4')
mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py

@@ -1,9 +1,11 @@
+import io
 import json
 
 import requests
 from typing import Dict, Optional
 
 import pandas as pd
+import pyarrow.parquet as pq
 
 from mindsdb.integrations.libs.base import BaseMLEngine
 
@@ -37,9 +39,17 @@ class RayServeHandler(BaseMLEngine):
         args['target'] = target
         self.model_storage.json_set('args', args)
         try:
-            …
+            if args.get('is_parquet', False):
+                buffer = io.BytesIO()
+                df.to_parquet(buffer)
+                resp = requests.post(args['train_url'],
+                                     files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
+                                     data={"args": json.dumps(args), "target": target},
+                                     )
+            else:
+                resp = requests.post(args['train_url'],
+                                     json={'df': df.to_json(orient='records'), 'target': target, 'args': args},
+                                     headers={'content-type': 'application/json; format=pandas-records'})
         except requests.exceptions.InvalidSchema:
             raise Exception("Error: The URL provided for the training endpoint is invalid.")
 
@@ -59,14 +69,29 @@ class RayServeHandler(BaseMLEngine):
         args = {**(self.model_storage.json_get('args')), **args}  # merge incoming args
         pred_args = args.get('predict_params', {})
         args = {**args, **pred_args}  # merge pred_args
-        …
+        if args.get('is_parquet', False):
+            buffer = io.BytesIO()
+            df.attrs['pred_args'] = pred_args
+            df.to_parquet(buffer)
+            resp = requests.post(args['predict_url'],
+                                 files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
+                                 data={"pred_args": json.dumps(pred_args)},
+                                 )
+        else:
+            resp = requests.post(args['predict_url'],
+                                 json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
+                                 headers={'content-type': 'application/json; format=pandas-records'})
         try:
-            …
+            if args.get('is_parquet', False):
+                buffer = io.BytesIO(resp.content)
+                table = pq.read_table(buffer)
+                response = table.to_pandas()
+            else:
+                response = resp.json()
         except json.JSONDecodeError:
             error = resp.text
+        except Exception:
+            error = 'Could not decode parquet.'
         else:
             if 'prediction' in response:
                 target = args['target']
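Both create() and predict() gain an is_parquet transport alongside the existing JSON one. A minimal round-trip sketch of the serialization the handler now performs on each side of the HTTP call (no Ray Serve endpoint involved; assumes pandas and pyarrow are installed):

    import io

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})

    buffer = io.BytesIO()
    df.to_parquet(buffer)        # what create()/predict() send as files={'df': ...}
    payload = buffer.getvalue()

    # predict() decodes a parquet response body the same way:
    result = pq.read_table(io.BytesIO(payload)).to_pandas()
    assert result.equals(df)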
mindsdb/integrations/handlers/timegpt_handler/requirements.txt

@@ -1 +1 @@
-nixtla==0.…
+nixtla==0.6.6
mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py

@@ -220,8 +220,6 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
     if limit is not None:
         if len(reviewed_urls) >= limit:
             return reviewed_urls
-    if crawl_depth == current_depth:
-        return reviewed_urls
 
     if not filters:
         matches_filter = True
@@ -241,6 +239,9 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
             "error": str(error_message),
         }
 
+    if crawl_depth is not None and crawl_depth == current_depth:
+        return reviewed_urls
+
     to_rev_url_list = []
 
     # create a list of new urls to review that don't exist in the already reviewed ones
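The depth check moves below the point where the current page's content (or error) has been recorded, and it now tolerates crawl_depth=None (no depth limit). A toy recursion, not the real helper, illustrating the effect:

    def crawl(url, reviewed_urls, crawl_depth=None, current_depth=0):
        reviewed_urls[url] = f'content of {url}'   # the page itself is recorded first
        if crawl_depth is not None and crawl_depth == current_depth:
            return reviewed_urls                   # stop descending, but keep this page
        for child in (url + '/a', url + '/b'):     # stand-in for links extracted from the page
            if child not in reviewed_urls:
                crawl(child, reviewed_urls, crawl_depth, current_depth + 1)
        return reviewed_urls

    # with crawl_depth=0 the root page itself is now returned instead of an empty result
    assert list(crawl('https://example.com', {}, crawl_depth=0)) == ['https://example.com']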
mindsdb/integrations/handlers/web_handler/web_handler.py

@@ -1,62 +1,71 @@
+from typing import List
+
 import pandas as pd
 from mindsdb.integrations.libs.response import HandlerStatusResponse
-from …
-from mindsdb.integrations.libs.api_handler import APIHandler, APITable
-from mindsdb.utilities.config import Config
-from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, project_dataframe
+from mindsdb.utilities.config import config
 from mindsdb.utilities.security import validate_urls
 from .urlcrawl_helpers import get_all_websites
 
+from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
+from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
 
-class CrawlerTable(APITable):
 
-    …
-        super().__init__(handler)
-        self.config = Config()
+class CrawlerTable(APIResource):
 
-    def …
+    def list(
+        self,
+        conditions: List[FilterCondition] = None,
+        limit: int = None,
+        **kwargs
+    ) -> pd.DataFrame:
         """
         Selects data from the provided websites
 
-        Args:
-            query (ast.Select): Given SQL SELECT query
-
         Returns:
             dataframe: Dataframe containing the crawled data
 
         Raises:
             NotImplementedError: If the query is not supported
         """
-        conditions = extract_comparison_conditions(query.where)
         urls = []
-        …
-        if …
-        if …
-        urls = …
-        …
+        crawl_depth = None
+        per_url_limit = None
+        for condition in conditions:
+            if condition.column == 'url':
+                if condition.op == FilterOperator.IN:
+                    urls = condition.value
+                elif condition.op == FilterOperator.EQUAL:
+                    urls = [condition.value]
+                condition.applied = True
+            if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
+                crawl_depth = condition.value
+                condition.applied = True
+            if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
+                per_url_limit = condition.value
+                condition.applied = True
 
         if len(urls) == 0:
             raise NotImplementedError(
-                'You must specify what url you want to crawl, for example: SELECT * FROM …
+                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
 
-        allowed_urls = …
+        allowed_urls = config.get('web_crawling_allowed_sites', [])
         if allowed_urls and not validate_urls(urls, allowed_urls):
             raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
 
-        if …
-        …
+        if limit is None and per_url_limit is None and crawl_depth is None:
+            per_url_limit = 1
+        if per_url_limit is not None:
+            # crawl every url separately
+            results = []
+            for url in urls:
+                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+            result = pd.concat(results)
+        else:
+            result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+
+        if limit is not None and len(result) > limit:
             result = result[:limit]
-
-        result = project_dataframe(result, query.targets, self.get_columns())
+
         return result
 
     def get_columns(self):
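CrawlerTable now receives already-parsed filter conditions through APIResource.list() instead of walking the SELECT AST itself. A standalone sketch of the shapes involved, using stand-ins for FilterCondition/FilterOperator from mindsdb.integrations.utilities.sql_utils (the real classes may differ in detail):

    from dataclasses import dataclass
    from enum import Enum
    from typing import Any

    class FilterOperator(Enum):
        EQUAL = '='
        IN = 'in'

    @dataclass
    class FilterCondition:
        column: str
        op: FilterOperator
        value: Any
        applied: bool = False   # list() marks the conditions it consumed

    # WHERE url = 'https://example.com' AND crawl_depth = 1 AND per_url_limit = 5
    conditions = [
        FilterCondition('url', FilterOperator.EQUAL, 'https://example.com'),
        FilterCondition('crawl_depth', FilterOperator.EQUAL, 1),
        FilterCondition('per_url_limit', FilterOperator.EQUAL, 5),
    ]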
mindsdb/integrations/handlers/youtube_handler/__init__.py

@@ -5,6 +5,7 @@ from .__about__ import __version__ as version, __description__ as description
 
 try:
     from .youtube_handler import YoutubeHandler as Handler
+    from .connection_args import connection_args
    import_error = None
 except Exception as e:
     Handler = None
@@ -24,4 +25,5 @@ __all__ = [
     "description",
     "import_error",
     "icon_path",
+    "connection_args",
 ]
mindsdb/integrations/handlers/youtube_handler/connection_args.py (new file)

@@ -0,0 +1,32 @@
+from collections import OrderedDict
+
+from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_TYPE
+
+
+connection_args = OrderedDict(
+    youtube_api_token={
+        'type': ARG_TYPE.STR,
+        'description': 'Youtube API Token',
+        'label': 'Youtube API Token',
+    },
+    credentials_url={
+        'type': ARG_TYPE.STR,
+        'description': 'URL to Service Account Keys',
+        'label': 'URL to Service Account Keys',
+    },
+    credentials_file={
+        'type': ARG_TYPE.STR,
+        'description': 'Location of Service Account Keys',
+        'label': 'Path to Service Account Keys',
+    },
+    credentials={
+        'type': ARG_TYPE.PATH,
+        'description': 'Service Account Keys',
+        'label': 'Upload Service Account Keys',
+    },
+    code={
+        'type': ARG_TYPE.STR,
+        'description': 'Code After Authorisation',
+        'label': 'Code After Authorisation',
+    },
+)
mindsdb/integrations/libs/llm/utils.py

@@ -115,6 +115,11 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
     """
     temperature = min(1.0, max(0.0, args.get("temperature", 0.0)))
     if provider == "openai":
+
+        if any(x in args.get("model_name", "") for x in ['o1', 'o3']):
+            # for o1 and o3, 'temperature' does not support 0.0; only the default value (1) is supported
+            temperature = 1
+
         return OpenAIConfig(
             model_name=args.get("model_name", DEFAULT_OPENAI_MODEL),
             temperature=temperature,
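A standalone restatement of the new clamp, assuming only that OpenAI's o1/o3 reasoning models reject non-default temperatures:

    def effective_temperature(model_name: str, requested: float = 0.0) -> float:
        temperature = min(1.0, max(0.0, requested))
        if any(x in model_name for x in ['o1', 'o3']):
            return 1   # these models only accept the API default
        return temperature

    assert effective_temperature('o3-mini') == 1
    assert effective_temperature('gpt-4o', 0.2) == 0.2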
mindsdb/integrations/libs/process_cache.py

@@ -186,7 +186,6 @@ class ProcessCache:
         self._keep_alive = {}
         self._stop_event = threading.Event()
         self.cleaner_thread = None
-        self._start_clean()
 
     def __del__(self):
         self._stop_clean()
@@ -200,7 +199,7 @@ class ProcessCache:
         ):
             return
         self._stop_event.clear()
-        self.cleaner_thread = threading.Thread(target=self._clean)
+        self.cleaner_thread = threading.Thread(target=self._clean, name='ProcessCache.clean')
         self.cleaner_thread.daemon = True
         self.cleaner_thread.start()
@@ -258,6 +257,7 @@ class ProcessCache:
         Returns:
             Future
         """
+        self._start_clean()
         handler_module_path = payload['handler_meta']['module_path']
         integration_id = payload['handler_meta']['integration_id']
         if task_type in (ML_TASK_TYPE.LEARN, ML_TASK_TYPE.FINETUNE):