MindsDB 25.2.3.0__py3-none-any.whl → 25.3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (86)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +16 -11
  3. mindsdb/api/executor/command_executor.py +1 -1
  4. mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -2
  5. mindsdb/api/executor/planner/query_planner.py +6 -2
  6. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -1
  7. mindsdb/api/http/initialize.py +8 -5
  8. mindsdb/api/http/namespaces/agents.py +0 -7
  9. mindsdb/api/http/namespaces/config.py +0 -48
  10. mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
  11. mindsdb/api/http/namespaces/util.py +0 -28
  12. mindsdb/api/mongo/classes/query_sql.py +2 -1
  13. mindsdb/api/mongo/responders/aggregate.py +2 -2
  14. mindsdb/api/mongo/responders/coll_stats.py +3 -2
  15. mindsdb/api/mongo/responders/db_stats.py +2 -1
  16. mindsdb/api/mongo/responders/insert.py +4 -2
  17. mindsdb/api/mysql/mysql_proxy/classes/fake_mysql_proxy/fake_mysql_proxy.py +2 -1
  18. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +5 -4
  19. mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +2 -4
  20. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
  21. mindsdb/integrations/handlers/autosklearn_handler/autosklearn_handler.py +1 -1
  22. mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
  23. mindsdb/integrations/handlers/gmail_handler/connection_args.py +2 -2
  24. mindsdb/integrations/handlers/gmail_handler/gmail_handler.py +19 -66
  25. mindsdb/integrations/handlers/gmail_handler/requirements.txt +0 -1
  26. mindsdb/integrations/handlers/google_calendar_handler/connection_args.py +15 -0
  27. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_handler.py +31 -41
  28. mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +0 -2
  29. mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
  30. mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
  31. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
  32. mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
  33. mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
  34. mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
  35. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
  36. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
  37. mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
  38. mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
  39. mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
  40. mindsdb/integrations/handlers/youtube_handler/youtube_handler.py +2 -38
  41. mindsdb/integrations/libs/llm/utils.py +7 -1
  42. mindsdb/integrations/libs/process_cache.py +2 -2
  43. mindsdb/integrations/utilities/handlers/auth_utilities/google/google_user_oauth_utilities.py +29 -38
  44. mindsdb/integrations/utilities/pydantic_utils.py +208 -0
  45. mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
  46. mindsdb/integrations/utilities/rag/pipelines/rag.py +11 -4
  47. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +800 -135
  48. mindsdb/integrations/utilities/rag/settings.py +390 -152
  49. mindsdb/integrations/utilities/sql_utils.py +2 -1
  50. mindsdb/interfaces/agents/agents_controller.py +14 -10
  51. mindsdb/interfaces/agents/callback_handlers.py +52 -5
  52. mindsdb/interfaces/agents/langchain_agent.py +5 -3
  53. mindsdb/interfaces/agents/mindsdb_chat_model.py +4 -2
  54. mindsdb/interfaces/chatbot/chatbot_controller.py +9 -8
  55. mindsdb/interfaces/database/database.py +3 -2
  56. mindsdb/interfaces/database/integrations.py +1 -1
  57. mindsdb/interfaces/database/projects.py +28 -2
  58. mindsdb/interfaces/jobs/jobs_controller.py +4 -1
  59. mindsdb/interfaces/jobs/scheduler.py +1 -1
  60. mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
  61. mindsdb/interfaces/model/model_controller.py +5 -2
  62. mindsdb/interfaces/skills/retrieval_tool.py +128 -39
  63. mindsdb/interfaces/skills/skill_tool.py +7 -7
  64. mindsdb/interfaces/skills/skills_controller.py +10 -6
  65. mindsdb/interfaces/skills/sql_agent.py +6 -1
  66. mindsdb/interfaces/storage/db.py +14 -12
  67. mindsdb/interfaces/storage/json.py +59 -0
  68. mindsdb/interfaces/storage/model_fs.py +85 -3
  69. mindsdb/interfaces/triggers/triggers_controller.py +2 -1
  70. mindsdb/migrations/versions/2022-10-14_43c52d23845a_projects.py +17 -3
  71. mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
  72. mindsdb/migrations/versions/2025-02-14_4521dafe89ab_added_encrypted_content_to_json_storage.py +29 -0
  73. mindsdb/migrations/versions/2025-02-19_11347c213b36_added_metadata_to_projects.py +41 -0
  74. mindsdb/utilities/config.py +6 -1
  75. mindsdb/utilities/functions.py +11 -0
  76. mindsdb/utilities/log.py +17 -2
  77. mindsdb/utilities/ml_task_queue/consumer.py +4 -2
  78. mindsdb/utilities/render/sqlalchemy_render.py +4 -0
  79. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/METADATA +226 -247
  80. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/RECORD +83 -80
  81. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/WHEEL +1 -1
  82. mindsdb/integrations/handlers/gmail_handler/utils.py +0 -45
  83. mindsdb/utilities/log_controller.py +0 -39
  84. mindsdb/utilities/telemetry.py +0 -44
  85. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/LICENSE +0 -0
  86. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,4 @@
1
1
  import json
2
- from shutil import copyfile
3
-
4
- import requests
5
2
 
6
3
  from mindsdb.integrations.libs.response import (
7
4
  HandlerStatusResponse as StatusResponse,
@@ -15,31 +12,28 @@ from mindsdb.utilities import log
15
12
  from mindsdb_sql_parser import parse_sql
16
13
  from mindsdb.utilities.config import Config
17
14
 
18
- import os
19
15
  import time
20
16
  from typing import List
21
17
  import pandas as pd
22
18
 
23
- from google.auth.transport.requests import Request
24
- from google.oauth2.credentials import Credentials
25
- from google_auth_oauthlib.flow import Flow
26
19
  from googleapiclient.discovery import build
27
20
  from googleapiclient.errors import HttpError
28
21
  from email.message import EmailMessage
29
22
 
30
23
  from base64 import urlsafe_b64encode, urlsafe_b64decode
31
24
 
32
- from .utils import AuthException, google_auth_flow, save_creds_to_file
25
+ from mindsdb.integrations.utilities.handlers.auth_utilities import GoogleUserOAuth2Manager
26
+ from mindsdb.integrations.utilities.handlers.auth_utilities.exceptions import AuthException
33
27
 
34
- DEFAULT_SCOPES = ['https://www.googleapis.com/auth/gmail.compose',
35
- 'https://www.googleapis.com/auth/gmail.readonly',
36
- 'https://www.googleapis.com/auth/gmail.modify']
28
+ DEFAULT_SCOPES = [
29
+ 'https://www.googleapis.com/auth/gmail.compose',
30
+ 'https://www.googleapis.com/auth/gmail.readonly',
31
+ 'https://www.googleapis.com/auth/gmail.modify'
32
+ ]
37
33
 
38
34
  logger = log.getLogger(__name__)
39
35
 
40
36
 
41
-
42
-
43
37
  class EmailsTable(APITable):
44
38
  """Implementation for the emails table for Gmail"""
45
39
 
@@ -283,6 +277,14 @@ class GmailHandler(APIHandler):
283
277
  super().__init__(name)
284
278
  self.connection_args = kwargs.get('connection_data', {})
285
279
 
280
+ self.token_file = None
281
+ self.max_page_size = 500
282
+ self.max_batch_size = 100
283
+ self.service = None
284
+ self.is_connected = False
285
+
286
+ self.handler_storage = kwargs['handler_storage']
287
+
286
288
  self.credentials_url = self.connection_args.get('credentials_url', None)
287
289
  self.credentials_file = self.connection_args.get('credentials_file', None)
288
290
  if self.connection_args.get('credentials'):
@@ -298,63 +300,11 @@ class GmailHandler(APIHandler):
298
300
  self.credentials_url = secret_url
299
301
 
300
302
  self.scopes = self.connection_args.get('scopes', DEFAULT_SCOPES)
301
- self.token_file = None
302
- self.max_page_size = 500
303
- self.max_batch_size = 100
304
- self.service = None
305
- self.is_connected = False
306
-
307
- self.handler_storage = kwargs['handler_storage']
308
303
 
309
304
  emails = EmailsTable(self)
310
305
  self.emails = emails
311
306
  self._register_table('emails', emails)
312
307
 
313
- def _download_secret_file(self, secret_file):
314
- # Giving more priority to the S3 file
315
- if self.credentials_url:
316
- response = requests.get(self.credentials_url)
317
- if response.status_code == 200:
318
- with open(secret_file, 'w') as creds:
319
- creds.write(response.text)
320
- return True
321
- else:
322
- logger.error("Failed to get credentials from S3", response.status_code)
323
-
324
- if self.credentials_file and os.path.isfile(self.credentials_file):
325
- copyfile(self.credentials_file, secret_file)
326
- return True
327
- return False
328
-
329
- def create_connection(self):
330
- creds = None
331
-
332
- # Get the current dir, we'll check for Token & Creds files in this dir
333
- curr_dir = self.handler_storage.folder_get('config')
334
-
335
- creds_file = os.path.join(curr_dir, 'creds.json')
336
- secret_file = os.path.join(curr_dir, 'secret.json')
337
-
338
- if os.path.isfile(creds_file):
339
- creds = Credentials.from_authorized_user_file(creds_file, self.scopes)
340
-
341
- if not creds or not creds.valid:
342
- if creds and creds.expired and creds.refresh_token:
343
- creds.refresh(Request())
344
-
345
- if self._download_secret_file(secret_file):
346
- # save to storage
347
- self.handler_storage.folder_sync('config')
348
- else:
349
- raise ValueError('No valid Gmail Credentials filepath or S3 url found.')
350
-
351
- creds = google_auth_flow(secret_file, self.scopes, self.connection_args.get('code'))
352
-
353
- save_creds_to_file(creds, creds_file)
354
- self.handler_storage.folder_sync('config')
355
-
356
- return build('gmail', 'v1', credentials=creds)
357
-
358
308
  def connect(self):
359
309
  """Authenticate with the Gmail API using the credentials file.
360
310
 
@@ -366,7 +316,10 @@ class GmailHandler(APIHandler):
366
316
  if self.is_connected and self.service is not None:
367
317
  return self.service
368
318
 
369
- self.service = self.create_connection()
319
+ google_oauth2_manager = GoogleUserOAuth2Manager(self.handler_storage, self.scopes, self.credentials_file, self.credentials_url, self.connection_args.get('code'))
320
+ creds = google_oauth2_manager.get_oauth2_credentials()
321
+
322
+ self.service = build('gmail', 'v1', credentials=creds)
370
323
 
371
324
  self.is_connected = True
372
325
  return self.service
@@ -1,2 +1 @@
1
1
  google-api-python-client
2
- google-auth-httplib2
@@ -4,9 +4,24 @@ from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_T
4
4
 
5
5
 
6
6
  connection_args = OrderedDict(
7
+ credentials_url={
8
+ 'type': ARG_TYPE.STR,
9
+ 'description': 'URL to Service Account Keys',
10
+ 'label': 'URL to Service Account Keys',
11
+ },
12
+ credentials_file={
13
+ 'type': ARG_TYPE.STR,
14
+ 'description': 'Location of Service Account Keys',
15
+ 'label': 'Path to Service Account Keys',
16
+ },
7
17
  credentials={
8
18
  'type': ARG_TYPE.PATH,
9
19
  'description': 'Service Account Keys',
10
20
  'label': 'Upload Service Account Keys',
11
21
  },
22
+ code={
23
+ 'type': ARG_TYPE.STR,
24
+ 'description': 'Code After Authorisation',
25
+ 'label': 'Code After Authorisation',
26
+ },
12
27
  )
@@ -1,8 +1,4 @@
1
- import os
2
-
3
1
  import pandas as pd
4
- from google.auth.transport.requests import Request
5
- from google.oauth2.credentials import Credentials
6
2
  from googleapiclient.discovery import build
7
3
 
8
4
  from mindsdb.api.executor.data_types.response_type import RESPONSE_TYPE
@@ -11,13 +7,21 @@ from mindsdb.integrations.libs.response import (
11
7
  HandlerStatusResponse as StatusResponse,
12
8
  HandlerResponse as Response,
13
9
  )
10
+ from mindsdb.utilities.config import Config
14
11
  from mindsdb.utilities import log
15
- logger = log.getLogger(__name__)
16
-
17
- from mindsdb.integrations.handlers.gmail_handler.utils import AuthException, google_auth_flow, save_creds_to_file
12
+ from mindsdb.integrations.utilities.handlers.auth_utilities import GoogleUserOAuth2Manager
13
+ from mindsdb.integrations.utilities.handlers.auth_utilities.exceptions import AuthException
18
14
 
19
15
  from .google_calendar_tables import GoogleCalendarEventsTable
20
16
 
17
+ DEFAULT_SCOPES = [
18
+ 'https://www.googleapis.com/auth/calendar',
19
+ 'https://www.googleapis.com/auth/calendar.events',
20
+ 'https://www.googleapis.com/auth/calendar.readonly'
21
+ ]
22
+
23
+ logger = log.getLogger(__name__)
24
+
21
25
 
22
26
  class GoogleCalendarHandler(APIHandler):
23
27
  """
@@ -35,20 +39,29 @@ class GoogleCalendarHandler(APIHandler):
35
39
  events (GoogleCalendarEventsTable): The `GoogleCalendarEventsTable` object for interacting with the events table.
36
40
  """
37
41
  super().__init__(name)
42
+ self.connection_data = kwargs.get('connection_data', {})
38
43
 
39
- self.token = None
40
44
  self.service = None
41
- self.connection_data = kwargs.get('connection_data', {})
42
- self.credentials_file = self.connection_data['credentials']
43
- self.scopes = [
44
- 'https://www.googleapis.com/auth/calendar',
45
- 'https://www.googleapis.com/auth/calendar.events',
46
- 'https://www.googleapis.com/auth/calendar.readonly'
47
- ]
48
45
  self.is_connected = False
49
46
 
50
47
  self.handler_storage = kwargs['handler_storage']
51
48
 
49
+ self.credentials_url = self.connection_data.get('credentials_url', None)
50
+ self.credentials_file = self.connection_data.get('credentials_file', None)
51
+ if self.connection_data.get('credentials'):
52
+ self.credentials_file = self.connection_data.pop('credentials')
53
+ if not self.credentials_file and not self.credentials_url:
54
+ # try to get from config
55
+ gcalendar_config = Config().get('handlers', {}).get('youtube', {})
56
+ secret_file = gcalendar_config.get('credentials_file')
57
+ secret_url = gcalendar_config.get('credentials_url')
58
+ if secret_file:
59
+ self.credentials_file = secret_file
60
+ elif secret_url:
61
+ self.credentials_url = secret_url
62
+
63
+ self.scopes = self.connection_data.get('scopes', DEFAULT_SCOPES)
64
+
52
65
  events = GoogleCalendarEventsTable(self)
53
66
  self.events = events
54
67
  self._register_table('events', events)
@@ -64,32 +77,8 @@ class GoogleCalendarHandler(APIHandler):
64
77
  if self.is_connected is True:
65
78
  return self.service
66
79
 
67
- secret_file = self.credentials_file
68
-
69
- curr_dir = self.handler_storage.folder_get('config')
70
-
71
- creds_file = None
72
- try:
73
- creds_file = os.path.join(curr_dir, 'secret.json')
74
- except Exception:
75
- pass
76
-
77
- creds = None
78
- if os.path.isfile(creds_file):
79
- creds = Credentials.from_authorized_user_file(creds_file, self.scopes)
80
-
81
- if not creds or not creds.valid:
82
- if creds and creds.expired and creds.refresh_token:
83
- creds.refresh(Request())
84
-
85
- save_creds_to_file(creds, creds_file)
86
- self.handler_storage.folder_sync('config')
87
-
88
- else:
89
- creds = google_auth_flow(secret_file, self.scopes, self.connection_data.get('code'))
90
-
91
- save_creds_to_file(creds, creds_file)
92
- self.handler_storage.folder_sync('config')
80
+ google_oauth2_manager = GoogleUserOAuth2Manager(self.handler_storage, self.scopes, self.credentials_file, self.credentials_url, self.connection_data.get('code'))
81
+ creds = google_oauth2_manager.get_oauth2_credentials()
93
82
 
94
83
  self.service = build('calendar', 'v3', credentials=creds)
95
84
  return self.service
@@ -106,6 +95,7 @@ class GoogleCalendarHandler(APIHandler):
106
95
  try:
107
96
  self.connect()
108
97
  response.success = True
98
+ response.copy_storage = True
109
99
 
110
100
  except AuthException as error:
111
101
  response.error_message = str(error)
@@ -1,3 +1 @@
1
1
  google-api-python-client
2
- google-auth-httplib2
3
- -r mindsdb/integrations/handlers/gmail_handler/requirements.txt
@@ -1,2 +1 @@
1
- openai==1.55.3
2
1
  tiktoken
@@ -1,4 +1,3 @@
1
- openai==1.55.3
2
1
  wikipedia==1.4.0
3
2
  tiktoken
4
3
  anthropic>=0.26.1
@@ -1,5 +1,4 @@
1
1
  llama-index==0.10.13
2
- openai == 1.24.0
3
2
  pydantic-settings >= 2.1.0
4
3
  llama-index-readers-web
5
4
  llama-index-embeddings-openai
@@ -8,7 +8,9 @@ CHAT_MODELS = (
8
8
  'gpt-4-32k',
9
9
  'gpt-4-1106-preview',
10
10
  'gpt-4-0125-preview',
11
- 'gpt-4o'
11
+ 'gpt-4o',
12
+ 'o3-mini',
13
+ 'o1-mini'
12
14
  )
13
15
  COMPLETION_MODELS = ('babbage-002', 'davinci-002')
14
16
  FINETUNING_MODELS = ('gpt-3.5-turbo', 'babbage-002', 'davinci-002', 'gpt-4')
@@ -1,2 +1 @@
1
- openai<2.0.0,>=1.54.0
2
1
  tiktoken
@@ -1,6 +1,5 @@
1
1
  -r mindsdb/integrations/handlers/chromadb_handler/requirements.txt
2
2
  faiss-cpu
3
- openai==1.55.3
4
3
  html2text
5
4
  writerai~=1.1.0
6
5
  sentence-transformers # needed for HuggingFaceEmbeddings from langchain-community
@@ -1,9 +1,11 @@
1
+ import io
1
2
  import json
2
3
 
3
4
  import requests
4
5
  from typing import Dict, Optional
5
6
 
6
7
  import pandas as pd
8
+ import pyarrow.parquet as pq
7
9
 
8
10
  from mindsdb.integrations.libs.base import BaseMLEngine
9
11
 
@@ -37,9 +39,17 @@ class RayServeHandler(BaseMLEngine):
37
39
  args['target'] = target
38
40
  self.model_storage.json_set('args', args)
39
41
  try:
40
- resp = requests.post(args['train_url'],
41
- json={'df': df.to_json(orient='records'), 'target': target},
42
- headers={'content-type': 'application/json; format=pandas-records'})
42
+ if args.get('is_parquet', False):
43
+ buffer = io.BytesIO()
44
+ df.to_parquet(buffer)
45
+ resp = requests.post(args['train_url'],
46
+ files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
47
+ data={"args": json.dumps(args), "target": target},
48
+ )
49
+ else:
50
+ resp = requests.post(args['train_url'],
51
+ json={'df': df.to_json(orient='records'), 'target': target, 'args': args},
52
+ headers={'content-type': 'application/json; format=pandas-records'})
43
53
  except requests.exceptions.InvalidSchema:
44
54
  raise Exception("Error: The URL provided for the training endpoint is invalid.")
45
55
 
@@ -59,14 +69,29 @@ class RayServeHandler(BaseMLEngine):
59
69
  args = {**(self.model_storage.json_get('args')), **args} # merge incoming args
60
70
  pred_args = args.get('predict_params', {})
61
71
  args = {**args, **pred_args} # merge pred_args
62
- resp = requests.post(args['predict_url'],
63
- json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
64
- headers={'content-type': 'application/json; format=pandas-records'})
65
-
72
+ if args.get('is_parquet', False):
73
+ buffer = io.BytesIO()
74
+ df.attrs['pred_args'] = pred_args
75
+ df.to_parquet(buffer)
76
+ resp = requests.post(args['predict_url'],
77
+ files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
78
+ data={"pred_args": json.dumps(pred_args)},
79
+ )
80
+ else:
81
+ resp = requests.post(args['predict_url'],
82
+ json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
83
+ headers={'content-type': 'application/json; format=pandas-records'})
66
84
  try:
67
- response = resp.json()
85
+ if args.get('is_parquet', False):
86
+ buffer = io.BytesIO(resp.content)
87
+ table = pq.read_table(buffer)
88
+ response = table.to_pandas()
89
+ else:
90
+ response = resp.json()
68
91
  except json.JSONDecodeError:
69
92
  error = resp.text
93
+ except Exception:
94
+ error = 'Could not decode parquet.'
70
95
  else:
71
96
  if 'prediction' in response:
72
97
  target = args['target']
@@ -220,8 +220,6 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
220
220
  if limit is not None:
221
221
  if len(reviewed_urls) >= limit:
222
222
  return reviewed_urls
223
- if crawl_depth == current_depth:
224
- return reviewed_urls
225
223
 
226
224
  if not filters:
227
225
  matches_filter = True
@@ -241,6 +239,9 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
241
239
  "error": str(error_message),
242
240
  }
243
241
 
242
+ if crawl_depth is not None and crawl_depth == current_depth:
243
+ return reviewed_urls
244
+
244
245
  to_rev_url_list = []
245
246
 
246
247
  # create a list of new urls to review that don't exist in the already reviewed ones
@@ -1,62 +1,71 @@
1
+ from typing import List
2
+
1
3
  import pandas as pd
2
4
  from mindsdb.integrations.libs.response import HandlerStatusResponse
3
- from mindsdb_sql_parser import ast
4
- from mindsdb.integrations.libs.api_handler import APIHandler, APITable
5
- from mindsdb.utilities.config import Config
6
- from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, project_dataframe
5
+ from mindsdb.utilities.config import config
7
6
  from mindsdb.utilities.security import validate_urls
8
7
  from .urlcrawl_helpers import get_all_websites
9
8
 
9
+ from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
10
+ from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
10
11
 
11
- class CrawlerTable(APITable):
12
12
 
13
- def __init__(self, handler: APIHandler):
14
- super().__init__(handler)
15
- self.config = Config()
13
+ class CrawlerTable(APIResource):
16
14
 
17
- def select(self, query: ast.Select) -> pd.DataFrame:
15
+ def list(
16
+ self,
17
+ conditions: List[FilterCondition] = None,
18
+ limit: int = None,
19
+ **kwargs
20
+ ) -> pd.DataFrame:
18
21
  """
19
22
  Selects data from the provided websites
20
23
 
21
- Args:
22
- query (ast.Select): Given SQL SELECT query
23
-
24
24
  Returns:
25
25
  dataframe: Dataframe containing the crawled data
26
26
 
27
27
  Raises:
28
28
  NotImplementedError: If the query is not supported
29
29
  """
30
- conditions = extract_comparison_conditions(query.where)
31
30
  urls = []
32
- for operator, arg1, arg2 in conditions:
33
- if operator == 'or':
34
- raise NotImplementedError('OR is not supported')
35
- if arg1 == 'url':
36
- if operator in ['=', 'in']:
37
- urls = [str(arg2)] if isinstance(arg2, str) else arg2
38
- else:
39
- raise NotImplementedError('Invalid URL format. Please provide a single URL like url = "example.com" or'
40
- 'multiple URLs using the format url IN ("url1", "url2", ...)')
31
+ crawl_depth = None
32
+ per_url_limit = None
33
+ for condition in conditions:
34
+ if condition.column == 'url':
35
+ if condition.op == FilterOperator.IN:
36
+ urls = condition.value
37
+ elif condition.op == FilterOperator.EQUAL:
38
+ urls = [condition.value]
39
+ condition.applied = True
40
+ if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
41
+ crawl_depth = condition.value
42
+ condition.applied = True
43
+ if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
44
+ per_url_limit = condition.value
45
+ condition.applied = True
41
46
 
42
47
  if len(urls) == 0:
43
48
  raise NotImplementedError(
44
- 'You must specify what url you want to crawl, for example: SELECT * FROM crawl WHERE url = "someurl"')
49
+ 'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
45
50
 
46
- allowed_urls = self.config.get('web_crawling_allowed_sites', [])
51
+ allowed_urls = config.get('web_crawling_allowed_sites', [])
47
52
  if allowed_urls and not validate_urls(urls, allowed_urls):
48
53
  raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
49
54
 
50
- if query.limit is None:
51
- raise NotImplementedError('You must specify a LIMIT clause which defines the number of pages to crawl')
52
-
53
- limit = query.limit.value
54
-
55
- result = get_all_websites(urls, limit, html=False)
56
- if len(result) > limit:
55
+ if limit is None and per_url_limit is None and crawl_depth is None:
56
+ per_url_limit = 1
57
+ if per_url_limit is not None:
58
+ # crawl every url separately
59
+ results = []
60
+ for url in urls:
61
+ results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
62
+ result = pd.concat(results)
63
+ else:
64
+ result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
65
+
66
+ if limit is not None and len(result) > limit:
57
67
  result = result[:limit]
58
- # filter targets
59
- result = project_dataframe(result, query.targets, self.get_columns())
68
+
60
69
  return result
61
70
 
62
71
  def get_columns(self):
@@ -5,6 +5,7 @@ from .__about__ import __version__ as version, __description__ as description
5
5
 
6
6
  try:
7
7
  from .youtube_handler import YoutubeHandler as Handler
8
+ from .connection_args import connection_args
8
9
  import_error = None
9
10
  except Exception as e:
10
11
  Handler = None
@@ -24,4 +25,5 @@ __all__ = [
24
25
  "description",
25
26
  "import_error",
26
27
  "icon_path",
28
+ "connection_args",
27
29
  ]
@@ -0,0 +1,32 @@
1
+ from collections import OrderedDict
2
+
3
+ from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_TYPE
4
+
5
+
6
+ connection_args = OrderedDict(
7
+ youtube_api_token={
8
+ 'type': ARG_TYPE.STR,
9
+ 'description': 'Youtube API Token',
10
+ 'label': 'Youtube API Token',
11
+ },
12
+ credentials_url={
13
+ 'type': ARG_TYPE.STR,
14
+ 'description': 'URL to Service Account Keys',
15
+ 'label': 'URL to Service Account Keys',
16
+ },
17
+ credentials_file={
18
+ 'type': ARG_TYPE.STR,
19
+ 'description': 'Location of Service Account Keys',
20
+ 'label': 'Path to Service Account Keys',
21
+ },
22
+ credentials={
23
+ 'type': ARG_TYPE.PATH,
24
+ 'description': 'Service Account Keys',
25
+ 'label': 'Upload Service Account Keys',
26
+ },
27
+ code={
28
+ 'type': ARG_TYPE.STR,
29
+ 'description': 'Code After Authorisation',
30
+ 'label': 'Code After Authorisation',
31
+ },
32
+ )
@@ -10,9 +10,7 @@ from mindsdb.integrations.libs.response import (
10
10
  from mindsdb.utilities import log
11
11
  from mindsdb_sql_parser import parse_sql
12
12
 
13
- from collections import OrderedDict
14
13
  from mindsdb.utilities.config import Config
15
- from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_TYPE
16
14
 
17
15
  from googleapiclient.discovery import build
18
16
 
@@ -38,12 +36,10 @@ class YoutubeHandler(APIHandler):
38
36
  name of a handler instance
39
37
  """
40
38
  super().__init__(name)
41
-
42
- connection_data = kwargs.get("connection_data", {})
39
+ self.connection_data = kwargs.get("connection_data", {})
40
+ self.kwargs = kwargs
43
41
 
44
42
  self.parser = parse_sql
45
- self.connection_data = connection_data
46
- self.kwargs = kwargs
47
43
  self.connection = None
48
44
  self.is_connected = False
49
45
 
@@ -104,7 +100,6 @@ class YoutubeHandler(APIHandler):
104
100
  Status confirmation
105
101
  """
106
102
  response = StatusResponse(False)
107
- need_to_close = self.is_connected is False
108
103
 
109
104
  try:
110
105
  self.connect()
@@ -131,34 +126,3 @@ class YoutubeHandler(APIHandler):
131
126
  """
132
127
  ast = parse_sql(query)
133
128
  return self.query(ast)
134
-
135
-
136
- connection_args = OrderedDict(
137
- youtube_access_token={
138
- "type": ARG_TYPE.STR,
139
- "description": "API Key",
140
- "label": "API Key",
141
- },
142
- credentials_url={
143
- 'type': ARG_TYPE.STR,
144
- 'description': 'URL to OAuth2 Credentials',
145
- 'label': 'URL to OAuth2 Credentials',
146
- },
147
- credentials_file={
148
- 'type': ARG_TYPE.STR,
149
- 'description': 'Location of OAuth2 Credentials',
150
- 'label': 'Location of OAuth2 Credentials',
151
- },
152
- credentials={
153
- 'type': ARG_TYPE.PATH,
154
- 'description': 'OAuth2 Credentials',
155
- 'label': 'Upload OAuth2 Credentials',
156
- },
157
- code={
158
- 'type': ARG_TYPE.STR,
159
- 'description': 'Authentication Code',
160
- 'label': 'Authentication Code',
161
- }
162
- )
163
-
164
- connection_args_example = OrderedDict(youtube_api_token="<your-youtube-api-token>")
@@ -16,6 +16,7 @@ from mindsdb.integrations.libs.llm.config import (
16
16
  NvidiaNIMConfig,
17
17
  MindsdbConfig,
18
18
  )
19
+ from mindsdb.utilities.config import config
19
20
  from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
20
21
 
21
22
 
@@ -115,6 +116,11 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
115
116
  """
116
117
  temperature = min(1.0, max(0.0, args.get("temperature", 0.0)))
117
118
  if provider == "openai":
119
+
120
+ if any(x in args.get("model_name", "") for x in ['o1', 'o3']):
121
+ # for o1 and 03, 'temperature' does not support 0.0 with this model. Only the default (1) value is supported
122
+ temperature = 1
123
+
118
124
  return OpenAIConfig(
119
125
  model_name=args.get("model_name", DEFAULT_OPENAI_MODEL),
120
126
  temperature=temperature,
@@ -206,7 +212,7 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
206
212
  if provider == "mindsdb":
207
213
  return MindsdbConfig(
208
214
  model_name=args["model_name"],
209
- project_name=args.get("project_name", "mindsdb"),
215
+ project_name=args.get("project_name", config.get("default_project")),
210
216
  )
211
217
  if provider == "vllm":
212
218
  return OpenAIConfig(