MindsDB 25.2.3.0__py3-none-any.whl → 25.3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +16 -11
- mindsdb/api/executor/command_executor.py +1 -1
- mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -2
- mindsdb/api/executor/planner/query_planner.py +6 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -1
- mindsdb/api/http/initialize.py +8 -5
- mindsdb/api/http/namespaces/agents.py +0 -7
- mindsdb/api/http/namespaces/config.py +0 -48
- mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
- mindsdb/api/http/namespaces/util.py +0 -28
- mindsdb/api/mongo/classes/query_sql.py +2 -1
- mindsdb/api/mongo/responders/aggregate.py +2 -2
- mindsdb/api/mongo/responders/coll_stats.py +3 -2
- mindsdb/api/mongo/responders/db_stats.py +2 -1
- mindsdb/api/mongo/responders/insert.py +4 -2
- mindsdb/api/mysql/mysql_proxy/classes/fake_mysql_proxy/fake_mysql_proxy.py +2 -1
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +5 -4
- mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +2 -4
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/autosklearn_handler/autosklearn_handler.py +1 -1
- mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/gmail_handler/connection_args.py +2 -2
- mindsdb/integrations/handlers/gmail_handler/gmail_handler.py +19 -66
- mindsdb/integrations/handlers/gmail_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/google_calendar_handler/connection_args.py +15 -0
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_handler.py +31 -41
- mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +0 -2
- mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
- mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
- mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
- mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
- mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
- mindsdb/integrations/handlers/youtube_handler/youtube_handler.py +2 -38
- mindsdb/integrations/libs/llm/utils.py +7 -1
- mindsdb/integrations/libs/process_cache.py +2 -2
- mindsdb/integrations/utilities/handlers/auth_utilities/google/google_user_oauth_utilities.py +29 -38
- mindsdb/integrations/utilities/pydantic_utils.py +208 -0
- mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
- mindsdb/integrations/utilities/rag/pipelines/rag.py +11 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +800 -135
- mindsdb/integrations/utilities/rag/settings.py +390 -152
- mindsdb/integrations/utilities/sql_utils.py +2 -1
- mindsdb/interfaces/agents/agents_controller.py +14 -10
- mindsdb/interfaces/agents/callback_handlers.py +52 -5
- mindsdb/interfaces/agents/langchain_agent.py +5 -3
- mindsdb/interfaces/agents/mindsdb_chat_model.py +4 -2
- mindsdb/interfaces/chatbot/chatbot_controller.py +9 -8
- mindsdb/interfaces/database/database.py +3 -2
- mindsdb/interfaces/database/integrations.py +1 -1
- mindsdb/interfaces/database/projects.py +28 -2
- mindsdb/interfaces/jobs/jobs_controller.py +4 -1
- mindsdb/interfaces/jobs/scheduler.py +1 -1
- mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
- mindsdb/interfaces/model/model_controller.py +5 -2
- mindsdb/interfaces/skills/retrieval_tool.py +128 -39
- mindsdb/interfaces/skills/skill_tool.py +7 -7
- mindsdb/interfaces/skills/skills_controller.py +10 -6
- mindsdb/interfaces/skills/sql_agent.py +6 -1
- mindsdb/interfaces/storage/db.py +14 -12
- mindsdb/interfaces/storage/json.py +59 -0
- mindsdb/interfaces/storage/model_fs.py +85 -3
- mindsdb/interfaces/triggers/triggers_controller.py +2 -1
- mindsdb/migrations/versions/2022-10-14_43c52d23845a_projects.py +17 -3
- mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
- mindsdb/migrations/versions/2025-02-14_4521dafe89ab_added_encrypted_content_to_json_storage.py +29 -0
- mindsdb/migrations/versions/2025-02-19_11347c213b36_added_metadata_to_projects.py +41 -0
- mindsdb/utilities/config.py +6 -1
- mindsdb/utilities/functions.py +11 -0
- mindsdb/utilities/log.py +17 -2
- mindsdb/utilities/ml_task_queue/consumer.py +4 -2
- mindsdb/utilities/render/sqlalchemy_render.py +4 -0
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/METADATA +226 -247
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/RECORD +83 -80
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/WHEEL +1 -1
- mindsdb/integrations/handlers/gmail_handler/utils.py +0 -45
- mindsdb/utilities/log_controller.py +0 -39
- mindsdb/utilities/telemetry.py +0 -44
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,4 @@
|
|
|
1
1
|
import json
|
|
2
|
-
from shutil import copyfile
|
|
3
|
-
|
|
4
|
-
import requests
|
|
5
2
|
|
|
6
3
|
from mindsdb.integrations.libs.response import (
|
|
7
4
|
HandlerStatusResponse as StatusResponse,
|
|
@@ -15,31 +12,28 @@ from mindsdb.utilities import log
|
|
|
15
12
|
from mindsdb_sql_parser import parse_sql
|
|
16
13
|
from mindsdb.utilities.config import Config
|
|
17
14
|
|
|
18
|
-
import os
|
|
19
15
|
import time
|
|
20
16
|
from typing import List
|
|
21
17
|
import pandas as pd
|
|
22
18
|
|
|
23
|
-
from google.auth.transport.requests import Request
|
|
24
|
-
from google.oauth2.credentials import Credentials
|
|
25
|
-
from google_auth_oauthlib.flow import Flow
|
|
26
19
|
from googleapiclient.discovery import build
|
|
27
20
|
from googleapiclient.errors import HttpError
|
|
28
21
|
from email.message import EmailMessage
|
|
29
22
|
|
|
30
23
|
from base64 import urlsafe_b64encode, urlsafe_b64decode
|
|
31
24
|
|
|
32
|
-
from .
|
|
25
|
+
from mindsdb.integrations.utilities.handlers.auth_utilities import GoogleUserOAuth2Manager
|
|
26
|
+
from mindsdb.integrations.utilities.handlers.auth_utilities.exceptions import AuthException
|
|
33
27
|
|
|
34
|
-
DEFAULT_SCOPES = [
|
|
35
|
-
|
|
36
|
-
|
|
28
|
+
DEFAULT_SCOPES = [
|
|
29
|
+
'https://www.googleapis.com/auth/gmail.compose',
|
|
30
|
+
'https://www.googleapis.com/auth/gmail.readonly',
|
|
31
|
+
'https://www.googleapis.com/auth/gmail.modify'
|
|
32
|
+
]
|
|
37
33
|
|
|
38
34
|
logger = log.getLogger(__name__)
|
|
39
35
|
|
|
40
36
|
|
|
41
|
-
|
|
42
|
-
|
|
43
37
|
class EmailsTable(APITable):
|
|
44
38
|
"""Implementation for the emails table for Gmail"""
|
|
45
39
|
|
|
@@ -283,6 +277,14 @@ class GmailHandler(APIHandler):
|
|
|
283
277
|
super().__init__(name)
|
|
284
278
|
self.connection_args = kwargs.get('connection_data', {})
|
|
285
279
|
|
|
280
|
+
self.token_file = None
|
|
281
|
+
self.max_page_size = 500
|
|
282
|
+
self.max_batch_size = 100
|
|
283
|
+
self.service = None
|
|
284
|
+
self.is_connected = False
|
|
285
|
+
|
|
286
|
+
self.handler_storage = kwargs['handler_storage']
|
|
287
|
+
|
|
286
288
|
self.credentials_url = self.connection_args.get('credentials_url', None)
|
|
287
289
|
self.credentials_file = self.connection_args.get('credentials_file', None)
|
|
288
290
|
if self.connection_args.get('credentials'):
|
|
@@ -298,63 +300,11 @@ class GmailHandler(APIHandler):
|
|
|
298
300
|
self.credentials_url = secret_url
|
|
299
301
|
|
|
300
302
|
self.scopes = self.connection_args.get('scopes', DEFAULT_SCOPES)
|
|
301
|
-
self.token_file = None
|
|
302
|
-
self.max_page_size = 500
|
|
303
|
-
self.max_batch_size = 100
|
|
304
|
-
self.service = None
|
|
305
|
-
self.is_connected = False
|
|
306
|
-
|
|
307
|
-
self.handler_storage = kwargs['handler_storage']
|
|
308
303
|
|
|
309
304
|
emails = EmailsTable(self)
|
|
310
305
|
self.emails = emails
|
|
311
306
|
self._register_table('emails', emails)
|
|
312
307
|
|
|
313
|
-
def _download_secret_file(self, secret_file):
|
|
314
|
-
# Giving more priority to the S3 file
|
|
315
|
-
if self.credentials_url:
|
|
316
|
-
response = requests.get(self.credentials_url)
|
|
317
|
-
if response.status_code == 200:
|
|
318
|
-
with open(secret_file, 'w') as creds:
|
|
319
|
-
creds.write(response.text)
|
|
320
|
-
return True
|
|
321
|
-
else:
|
|
322
|
-
logger.error("Failed to get credentials from S3", response.status_code)
|
|
323
|
-
|
|
324
|
-
if self.credentials_file and os.path.isfile(self.credentials_file):
|
|
325
|
-
copyfile(self.credentials_file, secret_file)
|
|
326
|
-
return True
|
|
327
|
-
return False
|
|
328
|
-
|
|
329
|
-
def create_connection(self):
|
|
330
|
-
creds = None
|
|
331
|
-
|
|
332
|
-
# Get the current dir, we'll check for Token & Creds files in this dir
|
|
333
|
-
curr_dir = self.handler_storage.folder_get('config')
|
|
334
|
-
|
|
335
|
-
creds_file = os.path.join(curr_dir, 'creds.json')
|
|
336
|
-
secret_file = os.path.join(curr_dir, 'secret.json')
|
|
337
|
-
|
|
338
|
-
if os.path.isfile(creds_file):
|
|
339
|
-
creds = Credentials.from_authorized_user_file(creds_file, self.scopes)
|
|
340
|
-
|
|
341
|
-
if not creds or not creds.valid:
|
|
342
|
-
if creds and creds.expired and creds.refresh_token:
|
|
343
|
-
creds.refresh(Request())
|
|
344
|
-
|
|
345
|
-
if self._download_secret_file(secret_file):
|
|
346
|
-
# save to storage
|
|
347
|
-
self.handler_storage.folder_sync('config')
|
|
348
|
-
else:
|
|
349
|
-
raise ValueError('No valid Gmail Credentials filepath or S3 url found.')
|
|
350
|
-
|
|
351
|
-
creds = google_auth_flow(secret_file, self.scopes, self.connection_args.get('code'))
|
|
352
|
-
|
|
353
|
-
save_creds_to_file(creds, creds_file)
|
|
354
|
-
self.handler_storage.folder_sync('config')
|
|
355
|
-
|
|
356
|
-
return build('gmail', 'v1', credentials=creds)
|
|
357
|
-
|
|
358
308
|
def connect(self):
|
|
359
309
|
"""Authenticate with the Gmail API using the credentials file.
|
|
360
310
|
|
|
@@ -366,7 +316,10 @@ class GmailHandler(APIHandler):
|
|
|
366
316
|
if self.is_connected and self.service is not None:
|
|
367
317
|
return self.service
|
|
368
318
|
|
|
369
|
-
self.
|
|
319
|
+
google_oauth2_manager = GoogleUserOAuth2Manager(self.handler_storage, self.scopes, self.credentials_file, self.credentials_url, self.connection_args.get('code'))
|
|
320
|
+
creds = google_oauth2_manager.get_oauth2_credentials()
|
|
321
|
+
|
|
322
|
+
self.service = build('gmail', 'v1', credentials=creds)
|
|
370
323
|
|
|
371
324
|
self.is_connected = True
|
|
372
325
|
return self.service
|
|
@@ -4,9 +4,24 @@ from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_T
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
connection_args = OrderedDict(
|
|
7
|
+
credentials_url={
|
|
8
|
+
'type': ARG_TYPE.STR,
|
|
9
|
+
'description': 'URL to Service Account Keys',
|
|
10
|
+
'label': 'URL to Service Account Keys',
|
|
11
|
+
},
|
|
12
|
+
credentials_file={
|
|
13
|
+
'type': ARG_TYPE.STR,
|
|
14
|
+
'description': 'Location of Service Account Keys',
|
|
15
|
+
'label': 'Path to Service Account Keys',
|
|
16
|
+
},
|
|
7
17
|
credentials={
|
|
8
18
|
'type': ARG_TYPE.PATH,
|
|
9
19
|
'description': 'Service Account Keys',
|
|
10
20
|
'label': 'Upload Service Account Keys',
|
|
11
21
|
},
|
|
22
|
+
code={
|
|
23
|
+
'type': ARG_TYPE.STR,
|
|
24
|
+
'description': 'Code After Authorisation',
|
|
25
|
+
'label': 'Code After Authorisation',
|
|
26
|
+
},
|
|
12
27
|
)
|
|
@@ -1,8 +1,4 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
1
|
import pandas as pd
|
|
4
|
-
from google.auth.transport.requests import Request
|
|
5
|
-
from google.oauth2.credentials import Credentials
|
|
6
2
|
from googleapiclient.discovery import build
|
|
7
3
|
|
|
8
4
|
from mindsdb.api.executor.data_types.response_type import RESPONSE_TYPE
|
|
@@ -11,13 +7,21 @@ from mindsdb.integrations.libs.response import (
|
|
|
11
7
|
HandlerStatusResponse as StatusResponse,
|
|
12
8
|
HandlerResponse as Response,
|
|
13
9
|
)
|
|
10
|
+
from mindsdb.utilities.config import Config
|
|
14
11
|
from mindsdb.utilities import log
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
from mindsdb.integrations.handlers.gmail_handler.utils import AuthException, google_auth_flow, save_creds_to_file
|
|
12
|
+
from mindsdb.integrations.utilities.handlers.auth_utilities import GoogleUserOAuth2Manager
|
|
13
|
+
from mindsdb.integrations.utilities.handlers.auth_utilities.exceptions import AuthException
|
|
18
14
|
|
|
19
15
|
from .google_calendar_tables import GoogleCalendarEventsTable
|
|
20
16
|
|
|
17
|
+
DEFAULT_SCOPES = [
|
|
18
|
+
'https://www.googleapis.com/auth/calendar',
|
|
19
|
+
'https://www.googleapis.com/auth/calendar.events',
|
|
20
|
+
'https://www.googleapis.com/auth/calendar.readonly'
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
logger = log.getLogger(__name__)
|
|
24
|
+
|
|
21
25
|
|
|
22
26
|
class GoogleCalendarHandler(APIHandler):
|
|
23
27
|
"""
|
|
@@ -35,20 +39,29 @@ class GoogleCalendarHandler(APIHandler):
|
|
|
35
39
|
events (GoogleCalendarEventsTable): The `GoogleCalendarEventsTable` object for interacting with the events table.
|
|
36
40
|
"""
|
|
37
41
|
super().__init__(name)
|
|
42
|
+
self.connection_data = kwargs.get('connection_data', {})
|
|
38
43
|
|
|
39
|
-
self.token = None
|
|
40
44
|
self.service = None
|
|
41
|
-
self.connection_data = kwargs.get('connection_data', {})
|
|
42
|
-
self.credentials_file = self.connection_data['credentials']
|
|
43
|
-
self.scopes = [
|
|
44
|
-
'https://www.googleapis.com/auth/calendar',
|
|
45
|
-
'https://www.googleapis.com/auth/calendar.events',
|
|
46
|
-
'https://www.googleapis.com/auth/calendar.readonly'
|
|
47
|
-
]
|
|
48
45
|
self.is_connected = False
|
|
49
46
|
|
|
50
47
|
self.handler_storage = kwargs['handler_storage']
|
|
51
48
|
|
|
49
|
+
self.credentials_url = self.connection_data.get('credentials_url', None)
|
|
50
|
+
self.credentials_file = self.connection_data.get('credentials_file', None)
|
|
51
|
+
if self.connection_data.get('credentials'):
|
|
52
|
+
self.credentials_file = self.connection_data.pop('credentials')
|
|
53
|
+
if not self.credentials_file and not self.credentials_url:
|
|
54
|
+
# try to get from config
|
|
55
|
+
gcalendar_config = Config().get('handlers', {}).get('youtube', {})
|
|
56
|
+
secret_file = gcalendar_config.get('credentials_file')
|
|
57
|
+
secret_url = gcalendar_config.get('credentials_url')
|
|
58
|
+
if secret_file:
|
|
59
|
+
self.credentials_file = secret_file
|
|
60
|
+
elif secret_url:
|
|
61
|
+
self.credentials_url = secret_url
|
|
62
|
+
|
|
63
|
+
self.scopes = self.connection_data.get('scopes', DEFAULT_SCOPES)
|
|
64
|
+
|
|
52
65
|
events = GoogleCalendarEventsTable(self)
|
|
53
66
|
self.events = events
|
|
54
67
|
self._register_table('events', events)
|
|
@@ -64,32 +77,8 @@ class GoogleCalendarHandler(APIHandler):
|
|
|
64
77
|
if self.is_connected is True:
|
|
65
78
|
return self.service
|
|
66
79
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
curr_dir = self.handler_storage.folder_get('config')
|
|
70
|
-
|
|
71
|
-
creds_file = None
|
|
72
|
-
try:
|
|
73
|
-
creds_file = os.path.join(curr_dir, 'secret.json')
|
|
74
|
-
except Exception:
|
|
75
|
-
pass
|
|
76
|
-
|
|
77
|
-
creds = None
|
|
78
|
-
if os.path.isfile(creds_file):
|
|
79
|
-
creds = Credentials.from_authorized_user_file(creds_file, self.scopes)
|
|
80
|
-
|
|
81
|
-
if not creds or not creds.valid:
|
|
82
|
-
if creds and creds.expired and creds.refresh_token:
|
|
83
|
-
creds.refresh(Request())
|
|
84
|
-
|
|
85
|
-
save_creds_to_file(creds, creds_file)
|
|
86
|
-
self.handler_storage.folder_sync('config')
|
|
87
|
-
|
|
88
|
-
else:
|
|
89
|
-
creds = google_auth_flow(secret_file, self.scopes, self.connection_data.get('code'))
|
|
90
|
-
|
|
91
|
-
save_creds_to_file(creds, creds_file)
|
|
92
|
-
self.handler_storage.folder_sync('config')
|
|
80
|
+
google_oauth2_manager = GoogleUserOAuth2Manager(self.handler_storage, self.scopes, self.credentials_file, self.credentials_url, self.connection_data.get('code'))
|
|
81
|
+
creds = google_oauth2_manager.get_oauth2_credentials()
|
|
93
82
|
|
|
94
83
|
self.service = build('calendar', 'v3', credentials=creds)
|
|
95
84
|
return self.service
|
|
@@ -106,6 +95,7 @@ class GoogleCalendarHandler(APIHandler):
|
|
|
106
95
|
try:
|
|
107
96
|
self.connect()
|
|
108
97
|
response.success = True
|
|
98
|
+
response.copy_storage = True
|
|
109
99
|
|
|
110
100
|
except AuthException as error:
|
|
111
101
|
response.error_message = str(error)
|
|
@@ -8,7 +8,9 @@ CHAT_MODELS = (
|
|
|
8
8
|
'gpt-4-32k',
|
|
9
9
|
'gpt-4-1106-preview',
|
|
10
10
|
'gpt-4-0125-preview',
|
|
11
|
-
'gpt-4o'
|
|
11
|
+
'gpt-4o',
|
|
12
|
+
'o3-mini',
|
|
13
|
+
'o1-mini'
|
|
12
14
|
)
|
|
13
15
|
COMPLETION_MODELS = ('babbage-002', 'davinci-002')
|
|
14
16
|
FINETUNING_MODELS = ('gpt-3.5-turbo', 'babbage-002', 'davinci-002', 'gpt-4')
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
+
import io
|
|
1
2
|
import json
|
|
2
3
|
|
|
3
4
|
import requests
|
|
4
5
|
from typing import Dict, Optional
|
|
5
6
|
|
|
6
7
|
import pandas as pd
|
|
8
|
+
import pyarrow.parquet as pq
|
|
7
9
|
|
|
8
10
|
from mindsdb.integrations.libs.base import BaseMLEngine
|
|
9
11
|
|
|
@@ -37,9 +39,17 @@ class RayServeHandler(BaseMLEngine):
|
|
|
37
39
|
args['target'] = target
|
|
38
40
|
self.model_storage.json_set('args', args)
|
|
39
41
|
try:
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
42
|
+
if args.get('is_parquet', False):
|
|
43
|
+
buffer = io.BytesIO()
|
|
44
|
+
df.to_parquet(buffer)
|
|
45
|
+
resp = requests.post(args['train_url'],
|
|
46
|
+
files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
|
|
47
|
+
data={"args": json.dumps(args), "target": target},
|
|
48
|
+
)
|
|
49
|
+
else:
|
|
50
|
+
resp = requests.post(args['train_url'],
|
|
51
|
+
json={'df': df.to_json(orient='records'), 'target': target, 'args': args},
|
|
52
|
+
headers={'content-type': 'application/json; format=pandas-records'})
|
|
43
53
|
except requests.exceptions.InvalidSchema:
|
|
44
54
|
raise Exception("Error: The URL provided for the training endpoint is invalid.")
|
|
45
55
|
|
|
@@ -59,14 +69,29 @@ class RayServeHandler(BaseMLEngine):
|
|
|
59
69
|
args = {**(self.model_storage.json_get('args')), **args} # merge incoming args
|
|
60
70
|
pred_args = args.get('predict_params', {})
|
|
61
71
|
args = {**args, **pred_args} # merge pred_args
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
72
|
+
if args.get('is_parquet', False):
|
|
73
|
+
buffer = io.BytesIO()
|
|
74
|
+
df.attrs['pred_args'] = pred_args
|
|
75
|
+
df.to_parquet(buffer)
|
|
76
|
+
resp = requests.post(args['predict_url'],
|
|
77
|
+
files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
|
|
78
|
+
data={"pred_args": json.dumps(pred_args)},
|
|
79
|
+
)
|
|
80
|
+
else:
|
|
81
|
+
resp = requests.post(args['predict_url'],
|
|
82
|
+
json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
|
|
83
|
+
headers={'content-type': 'application/json; format=pandas-records'})
|
|
66
84
|
try:
|
|
67
|
-
|
|
85
|
+
if args.get('is_parquet', False):
|
|
86
|
+
buffer = io.BytesIO(resp.content)
|
|
87
|
+
table = pq.read_table(buffer)
|
|
88
|
+
response = table.to_pandas()
|
|
89
|
+
else:
|
|
90
|
+
response = resp.json()
|
|
68
91
|
except json.JSONDecodeError:
|
|
69
92
|
error = resp.text
|
|
93
|
+
except Exception:
|
|
94
|
+
error = 'Could not decode parquet.'
|
|
70
95
|
else:
|
|
71
96
|
if 'prediction' in response:
|
|
72
97
|
target = args['target']
|
|
@@ -220,8 +220,6 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
|
|
|
220
220
|
if limit is not None:
|
|
221
221
|
if len(reviewed_urls) >= limit:
|
|
222
222
|
return reviewed_urls
|
|
223
|
-
if crawl_depth == current_depth:
|
|
224
|
-
return reviewed_urls
|
|
225
223
|
|
|
226
224
|
if not filters:
|
|
227
225
|
matches_filter = True
|
|
@@ -241,6 +239,9 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
|
|
|
241
239
|
"error": str(error_message),
|
|
242
240
|
}
|
|
243
241
|
|
|
242
|
+
if crawl_depth is not None and crawl_depth == current_depth:
|
|
243
|
+
return reviewed_urls
|
|
244
|
+
|
|
244
245
|
to_rev_url_list = []
|
|
245
246
|
|
|
246
247
|
# create a list of new urls to review that don't exist in the already reviewed ones
|
|
@@ -1,62 +1,71 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
1
3
|
import pandas as pd
|
|
2
4
|
from mindsdb.integrations.libs.response import HandlerStatusResponse
|
|
3
|
-
from
|
|
4
|
-
from mindsdb.integrations.libs.api_handler import APIHandler, APITable
|
|
5
|
-
from mindsdb.utilities.config import Config
|
|
6
|
-
from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, project_dataframe
|
|
5
|
+
from mindsdb.utilities.config import config
|
|
7
6
|
from mindsdb.utilities.security import validate_urls
|
|
8
7
|
from .urlcrawl_helpers import get_all_websites
|
|
9
8
|
|
|
9
|
+
from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
|
|
10
|
+
from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
|
|
10
11
|
|
|
11
|
-
class CrawlerTable(APITable):
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
super().__init__(handler)
|
|
15
|
-
self.config = Config()
|
|
13
|
+
class CrawlerTable(APIResource):
|
|
16
14
|
|
|
17
|
-
def
|
|
15
|
+
def list(
|
|
16
|
+
self,
|
|
17
|
+
conditions: List[FilterCondition] = None,
|
|
18
|
+
limit: int = None,
|
|
19
|
+
**kwargs
|
|
20
|
+
) -> pd.DataFrame:
|
|
18
21
|
"""
|
|
19
22
|
Selects data from the provided websites
|
|
20
23
|
|
|
21
|
-
Args:
|
|
22
|
-
query (ast.Select): Given SQL SELECT query
|
|
23
|
-
|
|
24
24
|
Returns:
|
|
25
25
|
dataframe: Dataframe containing the crawled data
|
|
26
26
|
|
|
27
27
|
Raises:
|
|
28
28
|
NotImplementedError: If the query is not supported
|
|
29
29
|
"""
|
|
30
|
-
conditions = extract_comparison_conditions(query.where)
|
|
31
30
|
urls = []
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
if
|
|
36
|
-
if
|
|
37
|
-
urls =
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
31
|
+
crawl_depth = None
|
|
32
|
+
per_url_limit = None
|
|
33
|
+
for condition in conditions:
|
|
34
|
+
if condition.column == 'url':
|
|
35
|
+
if condition.op == FilterOperator.IN:
|
|
36
|
+
urls = condition.value
|
|
37
|
+
elif condition.op == FilterOperator.EQUAL:
|
|
38
|
+
urls = [condition.value]
|
|
39
|
+
condition.applied = True
|
|
40
|
+
if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
|
|
41
|
+
crawl_depth = condition.value
|
|
42
|
+
condition.applied = True
|
|
43
|
+
if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
|
|
44
|
+
per_url_limit = condition.value
|
|
45
|
+
condition.applied = True
|
|
41
46
|
|
|
42
47
|
if len(urls) == 0:
|
|
43
48
|
raise NotImplementedError(
|
|
44
|
-
'You must specify what url you want to crawl, for example: SELECT * FROM
|
|
49
|
+
'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
|
|
45
50
|
|
|
46
|
-
allowed_urls =
|
|
51
|
+
allowed_urls = config.get('web_crawling_allowed_sites', [])
|
|
47
52
|
if allowed_urls and not validate_urls(urls, allowed_urls):
|
|
48
53
|
raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
|
|
49
54
|
|
|
50
|
-
if
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
if limit is None and per_url_limit is None and crawl_depth is None:
|
|
56
|
+
per_url_limit = 1
|
|
57
|
+
if per_url_limit is not None:
|
|
58
|
+
# crawl every url separately
|
|
59
|
+
results = []
|
|
60
|
+
for url in urls:
|
|
61
|
+
results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
|
|
62
|
+
result = pd.concat(results)
|
|
63
|
+
else:
|
|
64
|
+
result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
|
|
65
|
+
|
|
66
|
+
if limit is not None and len(result) > limit:
|
|
57
67
|
result = result[:limit]
|
|
58
|
-
|
|
59
|
-
result = project_dataframe(result, query.targets, self.get_columns())
|
|
68
|
+
|
|
60
69
|
return result
|
|
61
70
|
|
|
62
71
|
def get_columns(self):
|
|
@@ -5,6 +5,7 @@ from .__about__ import __version__ as version, __description__ as description
|
|
|
5
5
|
|
|
6
6
|
try:
|
|
7
7
|
from .youtube_handler import YoutubeHandler as Handler
|
|
8
|
+
from .connection_args import connection_args
|
|
8
9
|
import_error = None
|
|
9
10
|
except Exception as e:
|
|
10
11
|
Handler = None
|
|
@@ -24,4 +25,5 @@ __all__ = [
|
|
|
24
25
|
"description",
|
|
25
26
|
"import_error",
|
|
26
27
|
"icon_path",
|
|
28
|
+
"connection_args",
|
|
27
29
|
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
2
|
+
|
|
3
|
+
from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_TYPE
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
connection_args = OrderedDict(
|
|
7
|
+
youtube_api_token={
|
|
8
|
+
'type': ARG_TYPE.STR,
|
|
9
|
+
'description': 'Youtube API Token',
|
|
10
|
+
'label': 'Youtube API Token',
|
|
11
|
+
},
|
|
12
|
+
credentials_url={
|
|
13
|
+
'type': ARG_TYPE.STR,
|
|
14
|
+
'description': 'URL to Service Account Keys',
|
|
15
|
+
'label': 'URL to Service Account Keys',
|
|
16
|
+
},
|
|
17
|
+
credentials_file={
|
|
18
|
+
'type': ARG_TYPE.STR,
|
|
19
|
+
'description': 'Location of Service Account Keys',
|
|
20
|
+
'label': 'Path to Service Account Keys',
|
|
21
|
+
},
|
|
22
|
+
credentials={
|
|
23
|
+
'type': ARG_TYPE.PATH,
|
|
24
|
+
'description': 'Service Account Keys',
|
|
25
|
+
'label': 'Upload Service Account Keys',
|
|
26
|
+
},
|
|
27
|
+
code={
|
|
28
|
+
'type': ARG_TYPE.STR,
|
|
29
|
+
'description': 'Code After Authorisation',
|
|
30
|
+
'label': 'Code After Authorisation',
|
|
31
|
+
},
|
|
32
|
+
)
|
|
@@ -10,9 +10,7 @@ from mindsdb.integrations.libs.response import (
|
|
|
10
10
|
from mindsdb.utilities import log
|
|
11
11
|
from mindsdb_sql_parser import parse_sql
|
|
12
12
|
|
|
13
|
-
from collections import OrderedDict
|
|
14
13
|
from mindsdb.utilities.config import Config
|
|
15
|
-
from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_TYPE
|
|
16
14
|
|
|
17
15
|
from googleapiclient.discovery import build
|
|
18
16
|
|
|
@@ -38,12 +36,10 @@ class YoutubeHandler(APIHandler):
|
|
|
38
36
|
name of a handler instance
|
|
39
37
|
"""
|
|
40
38
|
super().__init__(name)
|
|
41
|
-
|
|
42
|
-
|
|
39
|
+
self.connection_data = kwargs.get("connection_data", {})
|
|
40
|
+
self.kwargs = kwargs
|
|
43
41
|
|
|
44
42
|
self.parser = parse_sql
|
|
45
|
-
self.connection_data = connection_data
|
|
46
|
-
self.kwargs = kwargs
|
|
47
43
|
self.connection = None
|
|
48
44
|
self.is_connected = False
|
|
49
45
|
|
|
@@ -104,7 +100,6 @@ class YoutubeHandler(APIHandler):
|
|
|
104
100
|
Status confirmation
|
|
105
101
|
"""
|
|
106
102
|
response = StatusResponse(False)
|
|
107
|
-
need_to_close = self.is_connected is False
|
|
108
103
|
|
|
109
104
|
try:
|
|
110
105
|
self.connect()
|
|
@@ -131,34 +126,3 @@ class YoutubeHandler(APIHandler):
|
|
|
131
126
|
"""
|
|
132
127
|
ast = parse_sql(query)
|
|
133
128
|
return self.query(ast)
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
connection_args = OrderedDict(
|
|
137
|
-
youtube_access_token={
|
|
138
|
-
"type": ARG_TYPE.STR,
|
|
139
|
-
"description": "API Key",
|
|
140
|
-
"label": "API Key",
|
|
141
|
-
},
|
|
142
|
-
credentials_url={
|
|
143
|
-
'type': ARG_TYPE.STR,
|
|
144
|
-
'description': 'URL to OAuth2 Credentials',
|
|
145
|
-
'label': 'URL to OAuth2 Credentials',
|
|
146
|
-
},
|
|
147
|
-
credentials_file={
|
|
148
|
-
'type': ARG_TYPE.STR,
|
|
149
|
-
'description': 'Location of OAuth2 Credentials',
|
|
150
|
-
'label': 'Location of OAuth2 Credentials',
|
|
151
|
-
},
|
|
152
|
-
credentials={
|
|
153
|
-
'type': ARG_TYPE.PATH,
|
|
154
|
-
'description': 'OAuth2 Credentials',
|
|
155
|
-
'label': 'Upload OAuth2 Credentials',
|
|
156
|
-
},
|
|
157
|
-
code={
|
|
158
|
-
'type': ARG_TYPE.STR,
|
|
159
|
-
'description': 'Authentication Code',
|
|
160
|
-
'label': 'Authentication Code',
|
|
161
|
-
}
|
|
162
|
-
)
|
|
163
|
-
|
|
164
|
-
connection_args_example = OrderedDict(youtube_api_token="<your-youtube-api-token>")
|
|
@@ -16,6 +16,7 @@ from mindsdb.integrations.libs.llm.config import (
|
|
|
16
16
|
NvidiaNIMConfig,
|
|
17
17
|
MindsdbConfig,
|
|
18
18
|
)
|
|
19
|
+
from mindsdb.utilities.config import config
|
|
19
20
|
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
|
|
20
21
|
|
|
21
22
|
|
|
@@ -115,6 +116,11 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
|
|
|
115
116
|
"""
|
|
116
117
|
temperature = min(1.0, max(0.0, args.get("temperature", 0.0)))
|
|
117
118
|
if provider == "openai":
|
|
119
|
+
|
|
120
|
+
if any(x in args.get("model_name", "") for x in ['o1', 'o3']):
|
|
121
|
+
# for o1 and 03, 'temperature' does not support 0.0 with this model. Only the default (1) value is supported
|
|
122
|
+
temperature = 1
|
|
123
|
+
|
|
118
124
|
return OpenAIConfig(
|
|
119
125
|
model_name=args.get("model_name", DEFAULT_OPENAI_MODEL),
|
|
120
126
|
temperature=temperature,
|
|
@@ -206,7 +212,7 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
|
|
|
206
212
|
if provider == "mindsdb":
|
|
207
213
|
return MindsdbConfig(
|
|
208
214
|
model_name=args["model_name"],
|
|
209
|
-
project_name=args.get("project_name", "
|
|
215
|
+
project_name=args.get("project_name", config.get("default_project")),
|
|
210
216
|
)
|
|
211
217
|
if provider == "vllm":
|
|
212
218
|
return OpenAIConfig(
|