MindsDB 25.7.3.0__py3-none-any.whl → 25.7.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (61)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/common/server/server.py +16 -6
  3. mindsdb/api/executor/command_executor.py +206 -135
  4. mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
  5. mindsdb/api/executor/planner/plan_join.py +3 -0
  6. mindsdb/api/executor/planner/plan_join_ts.py +117 -100
  7. mindsdb/api/executor/planner/query_planner.py +1 -0
  8. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
  9. mindsdb/api/http/initialize.py +16 -43
  10. mindsdb/api/http/namespaces/agents.py +23 -20
  11. mindsdb/api/http/namespaces/chatbots.py +83 -120
  12. mindsdb/api/http/namespaces/file.py +1 -1
  13. mindsdb/api/http/namespaces/jobs.py +38 -60
  14. mindsdb/api/http/namespaces/tree.py +69 -61
  15. mindsdb/api/mcp/start.py +2 -0
  16. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
  17. mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
  18. mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
  19. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
  20. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
  21. mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
  22. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
  23. mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
  24. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -76
  25. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
  27. mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
  28. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
  29. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
  30. mindsdb/integrations/handlers/salesforce_handler/constants.py +208 -0
  31. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
  32. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
  33. mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
  34. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
  35. mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
  36. mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
  37. mindsdb/integrations/utilities/handler_utils.py +32 -12
  38. mindsdb/interfaces/agents/agents_controller.py +167 -108
  39. mindsdb/interfaces/agents/langchain_agent.py +10 -3
  40. mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
  41. mindsdb/interfaces/database/database.py +38 -13
  42. mindsdb/interfaces/database/integrations.py +20 -5
  43. mindsdb/interfaces/database/projects.py +63 -16
  44. mindsdb/interfaces/database/views.py +86 -60
  45. mindsdb/interfaces/jobs/jobs_controller.py +103 -110
  46. mindsdb/interfaces/knowledge_base/controller.py +26 -5
  47. mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
  48. mindsdb/interfaces/knowledge_base/executor.py +24 -0
  49. mindsdb/interfaces/query_context/context_controller.py +100 -133
  50. mindsdb/interfaces/skills/skills_controller.py +18 -6
  51. mindsdb/interfaces/storage/db.py +40 -6
  52. mindsdb/interfaces/variables/variables_controller.py +8 -15
  53. mindsdb/utilities/config.py +3 -3
  54. mindsdb/utilities/functions.py +72 -60
  55. mindsdb/utilities/log.py +38 -6
  56. mindsdb/utilities/ps.py +7 -7
  57. {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/METADATA +246 -247
  58. {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/RECORD +61 -60
  59. {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.7.3.0.dist-info → mindsdb-25.7.4.0.dist-info}/top_level.txt +0 -0

mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py

@@ -11,6 +11,7 @@ from mindsdb.integrations.libs.response import (
     RESPONSE_TYPE,
 )
 from mindsdb.integrations.handlers.salesforce_handler.salesforce_tables import create_table_class
+from mindsdb.integrations.handlers.salesforce_handler.constants import get_soql_instructions
 from mindsdb.utilities import log


@@ -156,91 +157,152 @@ class SalesforceHandler(MetaAPIHandler):

     def _get_resource_names(self) -> List[str]:
         """
-        Retrieves the names of the Salesforce resources, with more aggressive filtering to remove tables.
+        Retrieves the names of the Salesforce resources with optimized pre-filtering.

         Returns:
             List[str]: A list of filtered resource names.
         """
         if not self.resource_names:
-            all_resources = [
-                resource["name"]
-                for resource in self.connection.sobjects.describe()["sobjects"]
-                if resource.get("queryable", False)
-            ]
+            # Check for user-specified table filtering first
+            include_tables = self.connection_data.get("include_tables") or self.connection_data.get("tables")
+            exclude_tables = self.connection_data.get("exclude_tables", [])
+
+            if include_tables:
+                # OPTIMIZATION: Skip expensive global describe() call
+                # Only validate the specified tables
+                logger.info(f"Using pre-filtered table list: {include_tables}")
+                self.resource_names = self._validate_specified_tables(include_tables, exclude_tables)
+            else:
+                # Fallback to full discovery with hard-coded filtering
+                logger.info("No table filter specified, performing full discovery...")
+                self.resource_names = self._discover_all_tables_with_filtering(exclude_tables)

-            # Define patterns for tables to be filtered out.
-            # Expanded suffixes and prefixes and exact matches
-            ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent", "Tag", "Permission", "Setup", "Consent")
-            ignore_prefixes = (
-                "Apex",
-                "CommPlatform",
-                "Lightning",
-                "Flow",
-                "Transaction",
-                "AI",
-                "Aura",
-                "ContentWorkspace",
-                "Collaboration",
-                "Datacloud",
-            )
-            ignore_exact = {
-                "EntityDefinition",
-                "FieldDefinition",
-                "RecordType",
-                "CaseStatus",
-                "UserRole",
-                "UserLicense",
-                "UserPermissionAccess",
-                "UserRecordAccess",
-                "Folder",
-                "Group",
-                "Note",
-                "ProcessDefinition",
-                "ProcessInstance",
-                "ContentFolder",
-                "ContentDocumentSubscription",
-                "DashboardComponent",
-                "Report",
-                "Dashboard",
-                "Topic",
-                "TopicAssignment",
-                "Period",
-                "Partner",
-                "PackageLicense",
-                "ColorDefinition",
-                "DataUsePurpose",
-                "DataUseLegalBasis",
-            }
-
-            ignore_substrings = (
-                "CleanInfo",
-                "Template",
-                "Rule",
-                "Definition",
-                "Status",
-                "Policy",
-                "Setting",
-                "Access",
-                "Config",
-                "Subscription",
-                "DataType",
-                "MilestoneType",
-                "Entitlement",
-                "Auth",
-            )
-
-            filtered = []
-            for r in all_resources:
-                if (
-                    not r.endswith(ignore_suffixes)
-                    and not r.startswith(ignore_prefixes)
-                    and not any(sub in r for sub in ignore_substrings)
-                    and r not in ignore_exact
-                ):
-                    filtered.append(r)
-
-            self.resource_names = [r for r in filtered]
         return self.resource_names

+    def _validate_specified_tables(self, include_tables: List[str], exclude_tables: List[str]) -> List[str]:
+        """
+        Validate user-specified tables without expensive global describe() call.
+
+        Args:
+            include_tables: List of table names to include
+            exclude_tables: List of table names to exclude
+
+        Returns:
+            List[str]: Validated and filtered table names
+        """
+        validated_tables = []
+
+        for table_name in include_tables:
+            # Skip if explicitly excluded
+            if table_name in exclude_tables:
+                logger.info(f"Skipping excluded table: {table_name}")
+                continue
+
+            try:
+                # Quick validation: check if table exists and is queryable
+                # This is much faster than global describe()
+                metadata = getattr(self.connection.sobjects, table_name).describe()
+                if metadata.get("queryable", False):
+                    validated_tables.append(table_name)
+                    logger.debug(f"Validated table: {table_name}")
+                else:
+                    logger.warning(f"Table {table_name} is not queryable, skipping")
+            except Exception as e:
+                logger.warning(f"Table {table_name} not found or accessible: {e}")
+
+        logger.info(f"Validated {len(validated_tables)} tables from include_tables")
+        return validated_tables
+
+    def _discover_all_tables_with_filtering(self, exclude_tables: List[str]) -> List[str]:
+        """
+        Fallback method: discover all tables with hard-coded filtering.
+
+        Args:
+            exclude_tables: List of table names to exclude
+
+        Returns:
+            List[str]: Filtered table names
+        """
+        # This is the original expensive approach - only used when no include_tables specified
+        all_resources = [
+            resource["name"]
+            for resource in self.connection.sobjects.describe()["sobjects"]
+            if resource.get("queryable", False)
+        ]
+
+        # Apply hard-coded filtering (existing logic)
+        ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent", "Tag", "Permission", "Setup", "Consent")
+        ignore_prefixes = (
+            "Apex",
+            "CommPlatform",
+            "Lightning",
+            "Flow",
+            "Transaction",
+            "AI",
+            "Aura",
+            "ContentWorkspace",
+            "Collaboration",
+            "Datacloud",
+        )
+        ignore_exact = {
+            "EntityDefinition",
+            "FieldDefinition",
+            "RecordType",
+            "CaseStatus",
+            "UserRole",
+            "UserLicense",
+            "UserPermissionAccess",
+            "UserRecordAccess",
+            "Folder",
+            "Group",
+            "Note",
+            "ProcessDefinition",
+            "ProcessInstance",
+            "ContentFolder",
+            "ContentDocumentSubscription",
+            "DashboardComponent",
+            "Report",
+            "Dashboard",
+            "Topic",
+            "TopicAssignment",
+            "Period",
+            "Partner",
+            "PackageLicense",
+            "ColorDefinition",
+            "DataUsePurpose",
+            "DataUseLegalBasis",
+        }
+
+        ignore_substrings = (
+            "CleanInfo",
+            "Template",
+            "Rule",
+            "Definition",
+            "Status",
+            "Policy",
+            "Setting",
+            "Access",
+            "Config",
+            "Subscription",
+            "DataType",
+            "MilestoneType",
+            "Entitlement",
+            "Auth",
+        )
+
+        # Apply hard-coded filtering
+        filtered = []
+        for r in all_resources:
+            if (
+                not r.endswith(ignore_suffixes)
+                and not r.startswith(ignore_prefixes)
+                and not any(sub in r for sub in ignore_substrings)
+                and r not in ignore_exact
+                and r not in exclude_tables  # Apply user exclusions
+            ):
+                filtered.append(r)
+
+        return filtered
+
     def meta_get_handler_info(self, **kwargs) -> str:
         """
         Retrieves information about the design and implementation of the API handler.
@@ -254,8 +316,7 @@ class SalesforceHandler(MetaAPIHandler):
         Returns:
             str: A string containing information about the API handler's design and implementation.
         """
-        # TODO: Relationships? Aliases?
-        return "When filtering on a Date or DateTime field, the value MUST be an unquoted literal in YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ format. For example, CloseDate >= 2025-05-28 is correct; CloseDate >= '2025-05-28' is incorrect."
+        return get_soql_instructions(self.name)

     def meta_get_tables(self, table_names: Optional[List[str]] = None) -> Response:
         """
mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py

@@ -176,7 +176,6 @@ def create_table_class(resource_name: Text) -> MetaAPIResource:
             "table_description": "",
             "row_count": None,
         }
-
         # Get row count if Id column is aggregatable.
         row_count = None
         # if next(field for field in resource_metadata['fields'] if field['name'] == 'Id').get('aggregatable', False):
mindsdb/integrations/handlers/tpot_handler/requirements.txt

@@ -1,2 +1,2 @@
 tpot<=0.11.7
-type_infer==0.0.20
+type_infer==0.0.23
mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py

@@ -100,26 +100,25 @@ def parallel_get_all_website_links(urls) -> dict:
         return url_contents

     with concurrent.futures.ProcessPoolExecutor() as executor:
-        future_to_url = {
-            executor.submit(get_all_website_links, url): url for url in urls
-        }
+        future_to_url = {executor.submit(get_all_website_links, url): url for url in urls}
         for future in concurrent.futures.as_completed(future_to_url):
             url = future_to_url[future]
             try:
                 url_contents[url] = future.result()
             except Exception as exc:
-                logger.error(f'{url} generated an exception: {exc}')
+                logger.error(f"{url} generated an exception: {exc}")
                 # don't raise the exception, just log it, continue processing other urls

     return url_contents


-def get_all_website_links(url) -> dict:
+def get_all_website_links(url, headers: dict = None) -> dict:
     """
     Fetch all website links from a URL.

     Args:
         url (str): the URL to fetch links from
+        headers (dict): a dictionary of headers to use when fetching links

     Returns:
         A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred.
@@ -132,9 +131,12 @@ def get_all_website_links(url) -> dict:
     session = requests.Session()

     # Add headers to mimic a real browser request
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
-    }
+    if headers is None:
+        headers = {}
+    if "User-Agent" not in headers:
+        headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3"
+        )

     response = session.get(url, headers=headers)
     if "cookie" in response.request.headers:
@@ -157,7 +159,7 @@ def get_all_website_links(url) -> dict:
             continue
         href = urljoin(url, href)
         parsed_href = urlparse(href)
-        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, '', '', ''))
+        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, "", "", ""))
         if not is_valid(href):
             continue
         if href in urls:
@@ -203,7 +205,15 @@ def get_readable_text_from_soup(soup) -> str:
     return html_converter.handle(str(soup))


-def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_depth: int = 1, current_depth: int = 0, filters: List[str] = None):
+def get_all_website_links_recursively(
+    url,
+    reviewed_urls,
+    limit=None,
+    crawl_depth: int = 1,
+    current_depth: int = 0,
+    filters: List[str] = None,
+    headers=None,
+):
     """
     Recursively gathers all links from a given website up to a specified limit.

@@ -227,7 +237,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
         matches_filter = any(re.match(f, url) is not None for f in filters)
         if url not in reviewed_urls and matches_filter:
             try:
-                reviewed_urls[url] = get_all_website_links(url)
+                reviewed_urls[url] = get_all_website_links(url, headers=headers)
             except Exception as e:
                 error_message = traceback.format_exc().splitlines()[-1]
                 logger.error("An exception occurred: %s", str(e))
@@ -271,10 +281,14 @@
         reviewed_urls.update(new_revised_urls)

         for new_url in new_revised_urls:
-            get_all_website_links_recursively(new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters)
+            get_all_website_links_recursively(
+                new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters
+            )


-def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None) -> pd.DataFrame:
+def get_all_websites(
+    urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None, headers: dict = None
+) -> pd.DataFrame:
     """
     Crawl a list of websites and return a DataFrame containing the results.

@@ -284,6 +298,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         crawl_depth (int): Crawl depth for URLs.
         html (bool): a boolean indicating whether to include the HTML content in the results
         filters (List[str]): Crawl URLs that only match these regex patterns.
+        headers (dict): headers of request

     Returns:
         A DataFrame containing the results.
@@ -299,7 +314,9 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         if urlparse(url).scheme == "":
             # Try HTTPS first
             url = "https://" + url
-        get_all_website_links_recursively(url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters)
+        get_all_website_links_recursively(
+            url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters, headers=headers
+        )

     # Use a ThreadPoolExecutor to run the helper function in parallel.
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -311,9 +328,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
     columns_to_ignore = ["urls"]
     if html is False:
         columns_to_ignore += ["html_content"]
-    df = dict_to_dataframe(
-        reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url"
-    )
+    df = dict_to_dataframe(reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url")

     if not df.empty and df[df.error.isna()].empty:
         raise Exception(str(df.iloc[0].error))
mindsdb/integrations/handlers/web_handler/web_handler.py

@@ -7,17 +7,11 @@ from mindsdb.utilities.security import validate_urls
 from .urlcrawl_helpers import get_all_websites

 from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
-from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator


 class CrawlerTable(APIResource):
-
-    def list(
-        self,
-        conditions: List[FilterCondition] = None,
-        limit: int = None,
-        **kwargs
-    ) -> pd.DataFrame:
+    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
         """
         Selects data from the provided websites

@@ -30,27 +24,34 @@ class CrawlerTable(APIResource):
         urls = []
         crawl_depth = None
         per_url_limit = None
+        headers = {}
         for condition in conditions:
-            if condition.column == 'url':
+            if condition.column == "url":
                 if condition.op == FilterOperator.IN:
                     urls = condition.value
                 elif condition.op == FilterOperator.EQUAL:
                     urls = [condition.value]
                 condition.applied = True
-            if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
+            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                 crawl_depth = condition.value
                 condition.applied = True
-            if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
+            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                 per_url_limit = condition.value
                 condition.applied = True
+            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
+                headers["User-Agent"] = condition.value
+                condition.applied = True

         if len(urls) == 0:
             raise NotImplementedError(
-                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
+                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+            )

-        allowed_urls = config.get('web_crawling_allowed_sites', [])
+        allowed_urls = config.get("web_crawling_allowed_sites", [])
         if allowed_urls and not validate_urls(urls, allowed_urls):
-            raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
+            raise ValueError(
+                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
+            )

         if limit is None and per_url_limit is None and crawl_depth is None:
             per_url_limit = 1
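
A hedged sketch of how the new user_agent condition flows from a WHERE clause into request headers. FilterCondition is stood in for by a simple dataclass and the operator is a plain string, so the shapes here are assumptions rather than the real sql_utils types:

# Simplified stand-in for the condition-to-headers mapping added to CrawlerTable.list().
from dataclasses import dataclass

@dataclass
class FakeCondition:          # hypothetical stand-in for mindsdb's FilterCondition
    column: str
    op: str                   # the real class uses a FilterOperator enum
    value: object
    applied: bool = False

def extract_headers(conditions):
    headers = {}
    for condition in conditions:
        if condition.column.lower() == "user_agent" and condition.op == "=":
            headers["User-Agent"] = condition.value
            condition.applied = True
    return headers

# e.g. SELECT * FROM web.crawler WHERE url = 'https://example.com' AND user_agent = 'my-bot/1.0'
conds = [FakeCondition("url", "=", "https://example.com"), FakeCondition("user_agent", "=", "my-bot/1.0")]
print(extract_headers(conds))  # {'User-Agent': 'my-bot/1.0'}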
@@ -58,10 +59,10 @@ class CrawlerTable(APIResource):
             # crawl every url separately
             results = []
             for url in urls:
-                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
             result = pd.concat(results)
         else:
-            result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)

         if limit is not None and len(result) > limit:
             result = result[:limit]
@@ -72,11 +73,7 @@ class CrawlerTable(APIResource):
         """
         Returns the columns of the crawler table
         """
-        return [
-            'url',
-            'text_content',
-            'error'
-        ]
+        return ["url", "text_content", "error"]


 class WebHandler(APIHandler):
@@ -87,7 +84,7 @@ class WebHandler(APIHandler):
     def __init__(self, name=None, **kwargs):
         super().__init__(name)
         crawler = CrawlerTable(self)
-        self._register_table('crawler', crawler)
+        self._register_table("crawler", crawler)

     def check_connection(self) -> HandlerStatusResponse:
         """
mindsdb/integrations/libs/vectordatabase_handler.py

@@ -334,12 +334,21 @@ class VectorStoreHandler(BaseHandler):

         if not df_update.empty:
             # get values of existed `created_at` and return them to metadata
-            created_dates = {row[id_col]: row[metadata_col].get("_created_at") for _, row in df_existed.iterrows()}
+            origin_id_col = "_original_doc_id"
+
+            created_dates, ids = {}, {}
+            for _, row in df_existed.iterrows():
+                chunk_id = row[id_col]
+                created_dates[chunk_id] = row[metadata_col].get("_created_at")
+                ids[chunk_id] = row[metadata_col].get(origin_id_col)

             def keep_created_at(row):
                 val = created_dates.get(row[id_col])
                 if val:
                     row[metadata_col]["_created_at"] = val
+                # keep id column
+                if origin_id_col not in row[metadata_col]:
+                    row[metadata_col][origin_id_col] = ids.get(row[id_col])
                 return row

             df_update.apply(keep_created_at, axis=1)
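
A minimal pandas sketch of the metadata-preservation behavior above: on update, existing _created_at and _original_doc_id values are carried over from the stored rows into the incoming rows. The column names mirror the diff, but the sample frames and standalone script are illustrative assumptions:

# Illustrative only: carry existing _created_at / _original_doc_id metadata into updated chunks.
import pandas as pd

id_col, metadata_col = "id", "metadata"
origin_id_col = "_original_doc_id"

df_existed = pd.DataFrame([
    {"id": "chunk1", "metadata": {"_created_at": "2025-07-01", "_original_doc_id": "doc42"}},
])
df_update = pd.DataFrame([
    {"id": "chunk1", "metadata": {}},  # incoming row lacks the bookkeeping fields
])

created_dates, ids = {}, {}
for _, row in df_existed.iterrows():
    chunk_id = row[id_col]
    created_dates[chunk_id] = row[metadata_col].get("_created_at")
    ids[chunk_id] = row[metadata_col].get(origin_id_col)

def keep_created_at(row):
    # Restore the stored creation date and original document id on the incoming row.
    val = created_dates.get(row[id_col])
    if val:
        row[metadata_col]["_created_at"] = val
    if origin_id_col not in row[metadata_col]:
        row[metadata_col][origin_id_col] = ids.get(row[id_col])
    return row

df_update = df_update.apply(keep_created_at, axis=1)
print(df_update.iloc[0][metadata_col])  # both fields restored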
mindsdb/integrations/utilities/handler_utils.py

@@ -37,54 +37,74 @@ def get_api_key(

     # 1
     if "using" in create_args and f"{api_name.lower()}_api_key" in create_args["using"]:
-        return create_args["using"][f"{api_name.lower()}_api_key"]
+        api_key = create_args["using"][f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key

     # 1.5 - Check for generic api_key in using
     if "using" in create_args and "api_key" in create_args["using"]:
-        return create_args["using"]["api_key"]
+        api_key = create_args["using"]["api_key"]
+        if api_key:
+            return api_key

     # 2
     if f"{api_name.lower()}_api_key" in create_args:
-        return create_args[f"{api_name.lower()}_api_key"]
+        api_key = create_args[f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key

     # 2.5 - Check for generic api_key
     if "api_key" in create_args:
-        return create_args["api_key"]
+        api_key = create_args["api_key"]
+        if api_key:
+            return api_key

     # 3 - Check in params dictionary if it exists (for agents)
     if "params" in create_args and create_args["params"] is not None:
         if f"{api_name.lower()}_api_key" in create_args["params"]:
-            return create_args["params"][f"{api_name.lower()}_api_key"]
+            api_key = create_args["params"][f"{api_name.lower()}_api_key"]
+            if api_key:
+                return api_key
         # 3.5 - Check for generic api_key in params
         if "api_key" in create_args["params"]:
-            return create_args["params"]["api_key"]
+            api_key = create_args["params"]["api_key"]
+            if api_key:
+                return api_key

     # 4
     if engine_storage is not None:
         connection_args = engine_storage.get_connection_args()
         if f"{api_name.lower()}_api_key" in connection_args:
-            return connection_args[f"{api_name.lower()}_api_key"]
+            api_key = connection_args[f"{api_name.lower()}_api_key"]
+            if api_key:
+                return api_key
         # 4.5 - Check for generic api_key in connection_args
         if "api_key" in connection_args:
-            return connection_args["api_key"]
+            api_key = connection_args["api_key"]
+            if api_key:
+                return api_key

     # 5
     api_key = os.getenv(f"{api_name.lower()}_api_key")
-    if api_key is not None:
+    if api_key:
         return api_key
     api_key = os.getenv(f"{api_name.upper()}_API_KEY")
-    if api_key is not None:
+    if api_key:
         return api_key

     # 6
     config = Config()
     api_cfg = config.get(api_name, {})
     if f"{api_name.lower()}_api_key" in api_cfg:
-        return api_cfg[f"{api_name.lower()}_api_key"]
+        api_key = api_cfg[f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key

     # 7
     if "api_keys" in create_args and api_name in create_args["api_keys"]:
-        return create_args["api_keys"][api_name]
+        api_key = create_args["api_keys"][api_name]
+        if api_key:
+            return api_key

     if strict:
         provider_upper = api_name.upper()
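
The pattern above changes every lookup from "return whatever is stored" to "return it only if it is non-empty, otherwise fall through to the next source". A small sketch of that precedence chain under stated assumptions; the helper name and the abbreviated source list are illustrative, not the real get_api_key signature:

# Sketch of the fall-through behavior: empty strings / None no longer short-circuit the search.
import os
from typing import Optional

def first_non_empty_api_key(api_name: str, create_args: dict) -> Optional[str]:
    candidates = [
        (create_args.get("using") or {}).get(f"{api_name.lower()}_api_key"),
        (create_args.get("using") or {}).get("api_key"),
        create_args.get(f"{api_name.lower()}_api_key"),
        create_args.get("api_key"),
        os.getenv(f"{api_name.upper()}_API_KEY"),
    ]
    for candidate in candidates:
        if candidate:  # skips None and "" instead of returning them
            return candidate
    return None

# An empty key in `using` previously won; with the new logic the environment variable is used instead.
os.environ["OPENAI_API_KEY"] = "sk-from-env"
print(first_non_empty_api_key("openai", {"using": {"openai_api_key": ""}}))  # sk-from-env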