MindsDB 25.7.3.0__py3-none-any.whl → 25.7.4.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release. This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/common/server/server.py +16 -6
- mindsdb/api/executor/command_executor.py +206 -135
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
- mindsdb/api/executor/planner/plan_join.py +3 -0
- mindsdb/api/executor/planner/plan_join_ts.py +117 -100
- mindsdb/api/executor/planner/query_planner.py +1 -0
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
- mindsdb/api/http/initialize.py +16 -43
- mindsdb/api/http/namespaces/agents.py +23 -20
- mindsdb/api/http/namespaces/chatbots.py +83 -120
- mindsdb/api/http/namespaces/file.py +1 -1
- mindsdb/api/http/namespaces/jobs.py +38 -60
- mindsdb/api/http/namespaces/tree.py +69 -61
- mindsdb/api/mcp/start.py +2 -0
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
- mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
- mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
- mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -76
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
- mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
- mindsdb/integrations/handlers/salesforce_handler/constants.py +208 -0
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
- mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
- mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
- mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
- mindsdb/integrations/utilities/handler_utils.py +32 -12
- mindsdb/interfaces/agents/agents_controller.py +167 -108
- mindsdb/interfaces/agents/langchain_agent.py +10 -3
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
- mindsdb/interfaces/database/database.py +38 -13
- mindsdb/interfaces/database/integrations.py +20 -5
- mindsdb/interfaces/database/projects.py +63 -16
- mindsdb/interfaces/database/views.py +86 -60
- mindsdb/interfaces/jobs/jobs_controller.py +103 -110
- mindsdb/interfaces/knowledge_base/controller.py +26 -5
- mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
- mindsdb/interfaces/knowledge_base/executor.py +24 -0
- mindsdb/interfaces/query_context/context_controller.py +100 -133
- mindsdb/interfaces/skills/skills_controller.py +18 -6
- mindsdb/interfaces/storage/db.py +40 -6
- mindsdb/interfaces/variables/variables_controller.py +8 -15
- mindsdb/utilities/config.py +3 -3
- mindsdb/utilities/functions.py +72 -60
- mindsdb/utilities/log.py +38 -6
- mindsdb/utilities/ps.py +7 -7
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/METADATA +246 -247
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/RECORD +61 -60
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py

@@ -11,6 +11,7 @@ from mindsdb.integrations.libs.response import (
     RESPONSE_TYPE,
 )
 from mindsdb.integrations.handlers.salesforce_handler.salesforce_tables import create_table_class
+from mindsdb.integrations.handlers.salesforce_handler.constants import get_soql_instructions
 from mindsdb.utilities import log


@@ -156,91 +157,152 @@ class SalesforceHandler(MetaAPIHandler):

     def _get_resource_names(self) -> List[str]:
         """
-        Retrieves the names of the Salesforce resources
+        Retrieves the names of the Salesforce resources with optimized pre-filtering.
         Returns:
             List[str]: A list of filtered resource names.
         """
         if not self.resource_names:
-            all_resources = [
-                resource["name"]
-                for resource in self.connection.sobjects.describe()["sobjects"]
-                if resource.get("queryable", False)
-            ]
+            # Check for user-specified table filtering first
+            include_tables = self.connection_data.get("include_tables") or self.connection_data.get("tables")
+            exclude_tables = self.connection_data.get("exclude_tables", [])
+
+            if include_tables:
+                # OPTIMIZATION: Skip expensive global describe() call
+                # Only validate the specified tables
+                logger.info(f"Using pre-filtered table list: {include_tables}")
+                self.resource_names = self._validate_specified_tables(include_tables, exclude_tables)
+            else:
+                # Fallback to full discovery with hard-coded filtering
+                logger.info("No table filter specified, performing full discovery...")
+                self.resource_names = self._discover_all_tables_with_filtering(exclude_tables)

-            # Define patterns for tables to be filtered out.
-            # Expanded suffixes and prefixes and exact matches
-            ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent", "Tag", "Permission", "Setup", "Consent")
-            ignore_prefixes = (
-                "Apex",
-                "CommPlatform",
-                "Lightning",
-                "Flow",
-                "Transaction",
-                "AI",
-                "Aura",
-                "ContentWorkspace",
-                "Collaboration",
-                "Datacloud",
-            )
-            ignore_exact = {
-                "EntityDefinition",
-                "FieldDefinition",
-                "RecordType",
-                "CaseStatus",
-                "UserRole",
-                "UserLicense",
-                "UserPermissionAccess",
-                "UserRecordAccess",
-                "Folder",
-                "Group",
-                "Note",
-                "ProcessDefinition",
-                "ProcessInstance",
-                "ContentFolder",
-                "ContentDocumentSubscription",
-                "DashboardComponent",
-                "Report",
-                "Dashboard",
-                "Topic",
-                "TopicAssignment",
-                "Period",
-                "Partner",
-                "PackageLicense",
-                "ColorDefinition",
-                "DataUsePurpose",
-                "DataUseLegalBasis",
-            }
-
-            ignore_substrings = (
-                "CleanInfo",
-                "Template",
-                "Rule",
-                "Definition",
-                "Status",
-                "Policy",
-                "Setting",
-                "Access",
-                "Config",
-                "Subscription",
-                "DataType",
-                "MilestoneType",
-                "Entitlement",
-                "Auth",
-            )
-
-            filtered = []
-            for r in all_resources:
-                if (
-                    not r.endswith(ignore_suffixes)
-                    and not r.startswith(ignore_prefixes)
-                    and not any(sub in r for sub in ignore_substrings)
-                    and r not in ignore_exact
-                ):
-                    filtered.append(r)
-
-            self.resource_names = [r for r in filtered]
         return self.resource_names

+    def _validate_specified_tables(self, include_tables: List[str], exclude_tables: List[str]) -> List[str]:
+        """
+        Validate user-specified tables without expensive global describe() call.
+
+        Args:
+            include_tables: List of table names to include
+            exclude_tables: List of table names to exclude
+
+        Returns:
+            List[str]: Validated and filtered table names
+        """
+        validated_tables = []
+
+        for table_name in include_tables:
+            # Skip if explicitly excluded
+            if table_name in exclude_tables:
+                logger.info(f"Skipping excluded table: {table_name}")
+                continue
+
+            try:
+                # Quick validation: check if table exists and is queryable
+                # This is much faster than global describe()
+                metadata = getattr(self.connection.sobjects, table_name).describe()
+                if metadata.get("queryable", False):
+                    validated_tables.append(table_name)
+                    logger.debug(f"Validated table: {table_name}")
+                else:
+                    logger.warning(f"Table {table_name} is not queryable, skipping")
+            except Exception as e:
+                logger.warning(f"Table {table_name} not found or accessible: {e}")
+
+        logger.info(f"Validated {len(validated_tables)} tables from include_tables")
+        return validated_tables
+
+    def _discover_all_tables_with_filtering(self, exclude_tables: List[str]) -> List[str]:
+        """
+        Fallback method: discover all tables with hard-coded filtering.
+
+        Args:
+            exclude_tables: List of table names to exclude
+
+        Returns:
+            List[str]: Filtered table names
+        """
+        # This is the original expensive approach - only used when no include_tables specified
+        all_resources = [
+            resource["name"]
+            for resource in self.connection.sobjects.describe()["sobjects"]
+            if resource.get("queryable", False)
+        ]
+
+        # Apply hard-coded filtering (existing logic)
+        ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent", "Tag", "Permission", "Setup", "Consent")
+        ignore_prefixes = (
+            "Apex",
+            "CommPlatform",
+            "Lightning",
+            "Flow",
+            "Transaction",
+            "AI",
+            "Aura",
+            "ContentWorkspace",
+            "Collaboration",
+            "Datacloud",
+        )
+        ignore_exact = {
+            "EntityDefinition",
+            "FieldDefinition",
+            "RecordType",
+            "CaseStatus",
+            "UserRole",
+            "UserLicense",
+            "UserPermissionAccess",
+            "UserRecordAccess",
+            "Folder",
+            "Group",
+            "Note",
+            "ProcessDefinition",
+            "ProcessInstance",
+            "ContentFolder",
+            "ContentDocumentSubscription",
+            "DashboardComponent",
+            "Report",
+            "Dashboard",
+            "Topic",
+            "TopicAssignment",
+            "Period",
+            "Partner",
+            "PackageLicense",
+            "ColorDefinition",
+            "DataUsePurpose",
+            "DataUseLegalBasis",
+        }
+
+        ignore_substrings = (
+            "CleanInfo",
+            "Template",
+            "Rule",
+            "Definition",
+            "Status",
+            "Policy",
+            "Setting",
+            "Access",
+            "Config",
+            "Subscription",
+            "DataType",
+            "MilestoneType",
+            "Entitlement",
+            "Auth",
+        )
+
+        # Apply hard-coded filtering
+        filtered = []
+        for r in all_resources:
+            if (
+                not r.endswith(ignore_suffixes)
+                and not r.startswith(ignore_prefixes)
+                and not any(sub in r for sub in ignore_substrings)
+                and r not in ignore_exact
+                and r not in exclude_tables  # Apply user exclusions
+            ):
+                filtered.append(r)
+
+        return filtered
+
     def meta_get_handler_info(self, **kwargs) -> str:
         """
         Retrieves information about the design and implementation of the API handler.
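The new split means a connection that names its tables only describes those tables, while unfiltered connections still pay for the global describe() plus the hard-coded ignore lists, now combined with a user-supplied exclude_tables. Below is a minimal standalone sketch of that filter predicate in plain Python; the ignore lists are abbreviated and the candidate object names are invented for illustration, so it is not MindsDB code.

# Standalone sketch of the discovery-time filtering (abbreviated lists, sample names).
ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent")
ignore_prefixes = ("Apex", "Flow", "Aura")
ignore_substrings = ("Template", "Rule", "Setting")
ignore_exact = {"RecordType", "Report", "Dashboard"}
exclude_tables = ["Lead"]  # user-supplied exclusions, new in this release

candidates = ["Account", "AccountShare", "ApexClass", "EmailTemplate", "Lead", "Opportunity"]
kept = [
    name
    for name in candidates
    if not name.endswith(ignore_suffixes)
    and not name.startswith(ignore_prefixes)
    and not any(sub in name for sub in ignore_substrings)
    and name not in ignore_exact
    and name not in exclude_tables
]
print(kept)  # ['Account', 'Opportunity']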
@@ -254,8 +316,7 @@ class SalesforceHandler(MetaAPIHandler):
         Returns:
             str: A string containing information about the API handler's design and implementation.
         """
-
-        return "When filtering on a Date or DateTime field, the value MUST be an unquoted literal in YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ format. For example, CloseDate >= 2025-05-28 is correct; CloseDate >= '2025-05-28' is incorrect."
+        return get_soql_instructions(self.name)

     def meta_get_tables(self, table_names: Optional[List[str]] = None) -> Response:
         """
mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py

@@ -176,7 +176,6 @@ def create_table_class(resource_name: Text) -> MetaAPIResource:
             "table_description": "",
             "row_count": None,
         }
-
         # Get row count if Id column is aggregatable.
         row_count = None
         # if next(field for field in resource_metadata['fields'] if field['name'] == 'Id').get('aggregatable', False):
mindsdb/integrations/handlers/tpot_handler/requirements.txt

@@ -1,2 +1,2 @@
 tpot<=0.11.7
-type_infer==0.0.
+type_infer==0.0.23
mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py

@@ -100,26 +100,25 @@ def parallel_get_all_website_links(urls) -> dict:
         return url_contents

     with concurrent.futures.ProcessPoolExecutor() as executor:
-        future_to_url = {
-            executor.submit(get_all_website_links, url): url for url in urls
-        }
+        future_to_url = {executor.submit(get_all_website_links, url): url for url in urls}
         for future in concurrent.futures.as_completed(future_to_url):
             url = future_to_url[future]
             try:
                 url_contents[url] = future.result()
             except Exception as exc:
-                logger.error(f
+                logger.error(f"{url} generated an exception: {exc}")
                 # don't raise the exception, just log it, continue processing other urls

     return url_contents


-def get_all_website_links(url) -> dict:
+def get_all_website_links(url, headers: dict = None) -> dict:
     """
     Fetch all website links from a URL.

     Args:
         url (str): the URL to fetch links from
+        headers (dict): a dictionary of headers to use when fetching links

     Returns:
         A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred.
@@ -132,9 +131,12 @@ def get_all_website_links(url) -> dict:
     session = requests.Session()

     # Add headers to mimic a real browser request
-    headers
-
-
+    if headers is None:
+        headers = {}
+    if "User-Agent" not in headers:
+        headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3"
+        )

     response = session.get(url, headers=headers)
     if "cookie" in response.request.headers:
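get_all_website_links() now accepts caller-supplied headers and only injects the browser-like User-Agent when none is given. A minimal sketch of the same keep-caller-value-else-default pattern, written here with dict.setdefault instead of the handler's explicit membership check:

def with_default_user_agent(headers: dict = None) -> dict:
    # Copy the caller's headers (or start empty) and fill in the default UA only if absent.
    headers = dict(headers or {})
    headers.setdefault(
        "User-Agent",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3",
    )
    return headers


print(with_default_user_agent())                              # browser-like default
print(with_default_user_agent({"User-Agent": "my-bot/1.0"}))  # caller value preserved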
@@ -157,7 +159,7 @@ def get_all_website_links(url) -> dict:
            continue
        href = urljoin(url, href)
        parsed_href = urlparse(href)
-        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path,
+        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, "", "", ""))
        if not is_valid(href):
            continue
        if href in urls:
@@ -203,7 +205,15 @@ def get_readable_text_from_soup(soup) -> str:
     return html_converter.handle(str(soup))


-def get_all_website_links_recursively(
+def get_all_website_links_recursively(
+    url,
+    reviewed_urls,
+    limit=None,
+    crawl_depth: int = 1,
+    current_depth: int = 0,
+    filters: List[str] = None,
+    headers=None,
+):
     """
     Recursively gathers all links from a given website up to a specified limit.

@@ -227,7 +237,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
     matches_filter = any(re.match(f, url) is not None for f in filters)
     if url not in reviewed_urls and matches_filter:
         try:
-            reviewed_urls[url] = get_all_website_links(url)
+            reviewed_urls[url] = get_all_website_links(url, headers=headers)
         except Exception as e:
             error_message = traceback.format_exc().splitlines()[-1]
             logger.error("An exception occurred: %s", str(e))
@@ -271,10 +281,14 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
         reviewed_urls.update(new_revised_urls)

         for new_url in new_revised_urls:
-            get_all_website_links_recursively(
+            get_all_website_links_recursively(
+                new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters
+            )


-def get_all_websites(
+def get_all_websites(
+    urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None, headers: dict = None
+) -> pd.DataFrame:
     """
     Crawl a list of websites and return a DataFrame containing the results.

@@ -284,6 +298,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         crawl_depth (int): Crawl depth for URLs.
         html (bool): a boolean indicating whether to include the HTML content in the results
         filters (List[str]): Crawl URLs that only match these regex patterns.
+        headers (dict): headers of request

     Returns:
         A DataFrame containing the results.
@@ -299,7 +314,9 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         if urlparse(url).scheme == "":
             # Try HTTPS first
             url = "https://" + url
-        get_all_website_links_recursively(
+        get_all_website_links_recursively(
+            url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters, headers=headers
+        )

     # Use a ThreadPoolExecutor to run the helper function in parallel.
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -311,9 +328,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
     columns_to_ignore = ["urls"]
     if html is False:
         columns_to_ignore += ["html_content"]
-    df = dict_to_dataframe(
-        reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url"
-    )
+    df = dict_to_dataframe(reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url")

     if not df.empty and df[df.error.isna()].empty:
         raise Exception(str(df.iloc[0].error))
mindsdb/integrations/handlers/web_handler/web_handler.py

@@ -7,17 +7,11 @@ from mindsdb.utilities.security import validate_urls
 from .urlcrawl_helpers import get_all_websites

 from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
-from mindsdb.integrations.utilities.sql_utils import
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator


 class CrawlerTable(APIResource):
-
-    def list(
-        self,
-        conditions: List[FilterCondition] = None,
-        limit: int = None,
-        **kwargs
-    ) -> pd.DataFrame:
+    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
         """
         Selects data from the provided websites

@@ -30,27 +24,34 @@ class CrawlerTable(APIResource):
         urls = []
         crawl_depth = None
         per_url_limit = None
+        headers = {}
         for condition in conditions:
-            if condition.column ==
+            if condition.column == "url":
                 if condition.op == FilterOperator.IN:
                     urls = condition.value
                 elif condition.op == FilterOperator.EQUAL:
                     urls = [condition.value]
                 condition.applied = True
-            if condition.column ==
+            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                 crawl_depth = condition.value
                 condition.applied = True
-            if condition.column ==
+            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                 per_url_limit = condition.value
                 condition.applied = True
+            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
+                headers["User-Agent"] = condition.value
+                condition.applied = True

         if len(urls) == 0:
             raise NotImplementedError(
-                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+            )

-        allowed_urls = config.get(
+        allowed_urls = config.get("web_crawling_allowed_sites", [])
         if allowed_urls and not validate_urls(urls, allowed_urls):
-            raise ValueError(
+            raise ValueError(
+                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
+            )

         if limit is None and per_url_limit is None and crawl_depth is None:
             per_url_limit = 1
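With the new user_agent condition, a crawler query such as SELECT * FROM web.crawler WHERE url = 'https://example.com' AND user_agent = 'my-bot/1.0' should carry its own User-Agent through to every request in that crawl (assuming the handler is registered under the name web, as in the error message above). The sketch below illustrates how list() maps WHERE conditions onto crawl arguments; Cond is a hypothetical stand-in for MindsDB's FilterCondition, with "=" and "in" as its operators.

from dataclasses import dataclass


@dataclass
class Cond:
    column: str
    op: str
    value: object
    applied: bool = False


def extract_crawl_args(conditions):
    # Mirror the condition handling: urls, crawl_depth, per_url_limit, and the new User-Agent header.
    urls, headers = [], {}
    crawl_depth = per_url_limit = None
    for c in conditions:
        if c.column == "url":
            urls = list(c.value) if c.op == "in" else [c.value]
            c.applied = True
        elif c.column == "crawl_depth" and c.op == "=":
            crawl_depth, c.applied = c.value, True
        elif c.column == "per_url_limit" and c.op == "=":
            per_url_limit, c.applied = c.value, True
        elif c.column.lower() == "user_agent" and c.op == "=":
            headers["User-Agent"], c.applied = c.value, True
    return urls, crawl_depth, per_url_limit, headers


print(extract_crawl_args([Cond("url", "=", "https://example.com"), Cond("user_agent", "=", "my-bot/1.0")]))
# (['https://example.com'], None, None, {'User-Agent': 'my-bot/1.0'})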
@@ -58,10 +59,10 @@ class CrawlerTable(APIResource):
            # crawl every url separately
            results = []
            for url in urls:
-                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
            result = pd.concat(results)
        else:
-            result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)

        if limit is not None and len(result) > limit:
            result = result[:limit]
@@ -72,11 +73,7 @@ class CrawlerTable(APIResource):
         """
         Returns the columns of the crawler table
         """
-        return [
-            'url',
-            'text_content',
-            'error'
-        ]
+        return ["url", "text_content", "error"]


 class WebHandler(APIHandler):
@@ -87,7 +84,7 @@ class WebHandler(APIHandler):
     def __init__(self, name=None, **kwargs):
         super().__init__(name)
         crawler = CrawlerTable(self)
-        self._register_table(
+        self._register_table("crawler", crawler)

     def check_connection(self) -> HandlerStatusResponse:
         """
mindsdb/integrations/libs/vectordatabase_handler.py

@@ -334,12 +334,21 @@ class VectorStoreHandler(BaseHandler):

         if not df_update.empty:
             # get values of existed `created_at` and return them to metadata
-
+            origin_id_col = "_original_doc_id"
+
+            created_dates, ids = {}, {}
+            for _, row in df_existed.iterrows():
+                chunk_id = row[id_col]
+                created_dates[chunk_id] = row[metadata_col].get("_created_at")
+                ids[chunk_id] = row[metadata_col].get(origin_id_col)

             def keep_created_at(row):
                 val = created_dates.get(row[id_col])
                 if val:
                     row[metadata_col]["_created_at"] = val
+                # keep id column
+                if origin_id_col not in row[metadata_col]:
+                    row[metadata_col][origin_id_col] = ids.get(row[id_col])
                 return row

             df_update.apply(keep_created_at, axis=1)
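On update the handler now carries both the existing chunk's _created_at and its _original_doc_id forward into the replacement row's metadata. A standalone illustration of that merge rule on hypothetical sample data (not handler code):

# Pretend this is what already exists in the vector store for a chunk being overwritten.
existing = {"chunk-1": {"_created_at": "2025-07-01", "_original_doc_id": "doc-42"}}


def merge_metadata(chunk_id: str, new_meta: dict) -> dict:
    # Preserve the original creation date and document id if the new metadata lacks them.
    old = existing.get(chunk_id, {})
    if old.get("_created_at"):
        new_meta["_created_at"] = old["_created_at"]
    if "_original_doc_id" not in new_meta:
        new_meta["_original_doc_id"] = old.get("_original_doc_id")
    return new_meta


print(merge_metadata("chunk-1", {"source": "kb"}))
# {'source': 'kb', '_created_at': '2025-07-01', '_original_doc_id': 'doc-42'}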
mindsdb/integrations/utilities/handler_utils.py

@@ -37,54 +37,74 @@ def get_api_key(

     # 1
     if "using" in create_args and f"{api_name.lower()}_api_key" in create_args["using"]:
-
+        api_key = create_args["using"][f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key

     # 1.5 - Check for generic api_key in using
     if "using" in create_args and "api_key" in create_args["using"]:
-
+        api_key = create_args["using"]["api_key"]
+        if api_key:
+            return api_key

     # 2
     if f"{api_name.lower()}_api_key" in create_args:
-
+        api_key = create_args[f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key

     # 2.5 - Check for generic api_key
     if "api_key" in create_args:
-
+        api_key = create_args["api_key"]
+        if api_key:
+            return api_key

     # 3 - Check in params dictionary if it exists (for agents)
     if "params" in create_args and create_args["params"] is not None:
         if f"{api_name.lower()}_api_key" in create_args["params"]:
-
+            api_key = create_args["params"][f"{api_name.lower()}_api_key"]
+            if api_key:
+                return api_key
         # 3.5 - Check for generic api_key in params
         if "api_key" in create_args["params"]:
-
+            api_key = create_args["params"]["api_key"]
+            if api_key:
+                return api_key

     # 4
     if engine_storage is not None:
         connection_args = engine_storage.get_connection_args()
         if f"{api_name.lower()}_api_key" in connection_args:
-
+            api_key = connection_args[f"{api_name.lower()}_api_key"]
+            if api_key:
+                return api_key
         # 4.5 - Check for generic api_key in connection_args
         if "api_key" in connection_args:
-
+            api_key = connection_args["api_key"]
+            if api_key:
+                return api_key

     # 5
     api_key = os.getenv(f"{api_name.lower()}_api_key")
-    if api_key
+    if api_key:
         return api_key
     api_key = os.getenv(f"{api_name.upper()}_API_KEY")
-    if api_key
+    if api_key:
         return api_key

     # 6
     config = Config()
     api_cfg = config.get(api_name, {})
     if f"{api_name.lower()}_api_key" in api_cfg:
-
+        api_key = api_cfg[f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key

     # 7
     if "api_keys" in create_args and api_name in create_args["api_keys"]:
-
+        api_key = create_args["api_keys"][api_name]
+        if api_key:
+            return api_key

     if strict:
         provider_upper = api_name.upper()