MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (102)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +11 -1
  3. mindsdb/api/a2a/common/server/server.py +16 -6
  4. mindsdb/api/executor/command_executor.py +215 -150
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
  6. mindsdb/api/executor/planner/plan_join.py +3 -0
  7. mindsdb/api/executor/planner/plan_join_ts.py +117 -100
  8. mindsdb/api/executor/planner/query_planner.py +1 -0
  9. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
  10. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
  11. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
  12. mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
  13. mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
  14. mindsdb/api/executor/utilities/sql.py +30 -0
  15. mindsdb/api/http/initialize.py +18 -44
  16. mindsdb/api/http/namespaces/agents.py +23 -20
  17. mindsdb/api/http/namespaces/chatbots.py +83 -120
  18. mindsdb/api/http/namespaces/file.py +1 -1
  19. mindsdb/api/http/namespaces/jobs.py +38 -60
  20. mindsdb/api/http/namespaces/tree.py +69 -61
  21. mindsdb/api/http/namespaces/views.py +56 -72
  22. mindsdb/api/mcp/start.py +2 -0
  23. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
  24. mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
  25. mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
  27. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
  28. mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
  29. mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
  30. mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
  31. mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
  32. mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
  33. mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
  34. mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
  35. mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
  36. mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
  37. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
  38. mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
  39. mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
  40. mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
  41. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
  42. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
  43. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
  44. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  45. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -77
  46. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  47. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
  48. mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
  49. mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
  50. mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
  51. mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
  52. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
  53. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
  54. mindsdb/integrations/handlers/salesforce_handler/constants.py +215 -0
  55. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
  56. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
  57. mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
  58. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
  59. mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
  60. mindsdb/integrations/libs/llm/config.py +0 -14
  61. mindsdb/integrations/libs/llm/utils.py +0 -15
  62. mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
  63. mindsdb/integrations/utilities/files/file_reader.py +5 -19
  64. mindsdb/integrations/utilities/handler_utils.py +32 -12
  65. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
  66. mindsdb/interfaces/agents/agents_controller.py +246 -149
  67. mindsdb/interfaces/agents/constants.py +0 -1
  68. mindsdb/interfaces/agents/langchain_agent.py +11 -6
  69. mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
  70. mindsdb/interfaces/database/database.py +38 -13
  71. mindsdb/interfaces/database/integrations.py +20 -5
  72. mindsdb/interfaces/database/projects.py +174 -23
  73. mindsdb/interfaces/database/views.py +86 -60
  74. mindsdb/interfaces/jobs/jobs_controller.py +103 -110
  75. mindsdb/interfaces/knowledge_base/controller.py +33 -6
  76. mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
  77. mindsdb/interfaces/knowledge_base/executor.py +24 -0
  78. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
  79. mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
  80. mindsdb/interfaces/query_context/context_controller.py +111 -145
  81. mindsdb/interfaces/skills/skills_controller.py +18 -6
  82. mindsdb/interfaces/storage/db.py +40 -6
  83. mindsdb/interfaces/variables/variables_controller.py +8 -15
  84. mindsdb/utilities/config.py +5 -3
  85. mindsdb/utilities/fs.py +54 -17
  86. mindsdb/utilities/functions.py +72 -60
  87. mindsdb/utilities/log.py +38 -6
  88. mindsdb/utilities/ps.py +7 -7
  89. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +282 -268
  90. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +94 -92
  91. mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
  92. mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
  93. mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
  94. mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
  95. mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
  96. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
  97. mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
  98. mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
  99. mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
  100. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
  101. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
  102. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,215 @@
1
+ """
2
+ Constants for Salesforce handler.
3
+ """
4
+
5
+
6
+ def get_soql_instructions(integration_name):
7
+ return f"""This handler executes SOQL (Salesforce Object Query Language), NOT SQL! Follow these rules strictly:
8
+
9
+ **BASIC STRUCTURE:**
10
+ - NO "SELECT *" - must explicitly list all fields
11
+ SQL: SELECT * FROM Account;
12
+ SOQL: SELECT Id, Name, Industry FROM Account
13
+ - NO table aliases - use full table names only
14
+ SQL: SELECT a.Name FROM Account a;
15
+ SOQL: SELECT Name FROM Account
16
+ - NO column aliases - field names cannot be aliased
17
+ SQL: SELECT Name AS CompanyName FROM Account;
18
+ SOQL: SELECT Name FROM Account
19
+ - NO DISTINCT keyword - not supported in SOQL
20
+ SQL: SELECT DISTINCT Industry FROM Account;
21
+ SOQL: Not possible - use separate logic
22
+ - NO subqueries in FROM clause - only relationship-based subqueries allowed
23
+ SQL: SELECT * FROM (SELECT Name FROM Account) AS AccountNames;
24
+ SOQL: Not supported
25
+ - Do not use fields that are not defined in the schema or data catalog. Always reference exact field names.
26
+
27
+ **FIELD SELECTION:**
28
+ - Always include Id field when querying
29
+ CORRECT: SELECT Id, Name, Industry FROM Account
30
+ INCORRECT: SELECT Name, Industry FROM Account
31
+ - Field names are case-sensitive
32
+ CORRECT: SELECT CreatedDate FROM Account
33
+ INCORRECT: SELECT createddate FROM Account
34
+ - Use exact field names from the data catalog
35
+ CORRECT: SELECT CustomerPriority__c FROM Account
36
+ INCORRECT: SELECT customer_priority FROM Account
37
+
38
+ **FILTERING (WHERE clause):**
39
+ - Date/DateTime fields: Use unquoted literals in YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ format
40
+ CORRECT: WHERE CloseDate >= 2025-05-28
41
+ CORRECT: WHERE CreatedDate >= 2025-05-28T10:30:00Z
42
+ INCORRECT: WHERE CloseDate >= '2025-05-28'
43
+ INCORRECT: WHERE CreatedDate >= "2025-05-28"
44
+ - Special date literals: TODAY, YESTERDAY, LAST_WEEK, LAST_MONTH, LAST_QUARTER, LAST_YEAR, THIS_WEEK, THIS_MONTH, THIS_QUARTER, THIS_YEAR
45
+ CORRECT: WHERE CreatedDate = TODAY
46
+ CORRECT: WHERE LastModifiedDate >= LAST_MONTH
47
+ CORRECT: WHERE CloseDate >= THIS_QUARTER
48
+ - Date arithmetic (e.g., TODAY - 10) is not supported. Use literals like LAST_N_DAYS:10 instead.
49
+ CORRECT: WHERE CloseDate >= LAST_N_DAYS:10
50
+ INCORRECT: WHERE CloseDate >= TODAY - 10
51
+ - LIKE operator: Only supports % wildcard, NO underscore (_) wildcard
52
+ CORRECT: WHERE Name LIKE '%Corp%'
53
+ CORRECT: WHERE Name LIKE 'Acme%'
54
+ INCORRECT: WHERE Name LIKE 'A_me%'
55
+ - BETWEEN operator: NOT supported, use >= AND <= instead
56
+ SQL: WHERE CreatedDate BETWEEN '2025-01-01' AND '2025-12-31'
57
+ SOQL: WHERE CreatedDate >= 2025-01-01 AND CreatedDate <= 2025-12-31
58
+ - Boolean values: Use lowercase true/false, NOT TRUE/FALSE
59
+ CORRECT: WHERE Active__c = true
60
+ CORRECT: WHERE IsDeleted = false
61
+ INCORRECT: WHERE Active__c = TRUE
62
+ INCORRECT: WHERE IsDeleted = FALSE
63
+ - NULL values: Use lowercase null, NOT NULL
64
+ CORRECT: WHERE ParentId = null
65
+ CORRECT: WHERE Description != null
66
+ INCORRECT: WHERE ParentId IS NULL
67
+ INCORRECT: WHERE Description IS NOT NULL
68
+ - String values: Use single quotes for strings
69
+ CORRECT: WHERE Industry = 'Technology'
70
+ CORRECT: WHERE Name = 'Acme Corp'
71
+ INCORRECT: WHERE Industry = "Technology"
72
+ - Multi-select picklist fields: Use INCLUDES('value1;value2') or EXCLUDES('value1;value2')
73
+ CORRECT: WHERE Services__c INCLUDES ('Consulting;Support')
74
+ CORRECT: WHERE Services__c EXCLUDES ('Training')
75
+ INCORRECT: WHERE Services__c = 'Consulting'
76
+ - Limited subquery support - only IN/NOT IN with non-correlated subqueries in WHERE clause
77
+ CORRECT: SELECT Id FROM Contact WHERE Id NOT IN (SELECT WhoId FROM Task)
78
+ INCORRECT: SELECT Id FROM Contact WHERE NOT EXISTS (SELECT 1 FROM Task WHERE WhoId = Contact.Id)
79
+
80
+ **JOINS:**
81
+ - NO explicit JOIN syntax supported
82
+ SQL: SELECT a.Name, c.FirstName FROM Account a JOIN Contact c ON a.Id = c.AccountId
83
+ SOQL: Not supported - use relationship traversal (not applicable in this use case)
84
+
85
+ **AGGREGATES:**
86
+ - NO COUNT(*) - use COUNT(Id) instead
87
+ SQL: SELECT COUNT(*) FROM Account
88
+ SOQL: SELECT COUNT(Id) FROM Account
89
+ - Cannot mix aggregate functions with non-aggregate fields unless using GROUP BY
90
+ CORRECT: SELECT Industry, COUNT(Id) FROM Account GROUP BY Industry
91
+ CORRECT: SELECT COUNT(Id) FROM Account
92
+ INCORRECT: SELECT Industry, Name, COUNT(Id) FROM Account
93
+ - NO GROUP_CONCAT or string aggregation functions
94
+ SQL: SELECT GROUP_CONCAT(Name) FROM Account
95
+ SOQL: Not supported
96
+ - NO HAVING clause
97
+ SQL: SELECT Industry, COUNT(*) FROM Account GROUP BY Industry HAVING COUNT(*) > 5
98
+ SOQL: Not supported - filter with separate logic
99
+ - GROUP BY has limited field type support
100
+ CORRECT: SELECT Industry, COUNT(Id) FROM Account GROUP BY Industry
101
+ INCORRECT: SELECT Description, COUNT(Id) FROM Account GROUP BY Description (textarea fields not supported)
102
+
103
+ **FUNCTIONS:**
104
+ - Date functions: CALENDAR_MONTH(), CALENDAR_YEAR(), CALENDAR_QUARTER(), DAY_IN_MONTH(), DAY_IN_WEEK(), DAY_IN_YEAR(), HOUR_IN_DAY(), WEEK_IN_MONTH(), WEEK_IN_YEAR()
105
+ CORRECT: SELECT Id, Name FROM Account WHERE CALENDAR_YEAR(CreatedDate) = 2025
106
+ CORRECT: SELECT Id, Name FROM Account WHERE CALENDAR_MONTH(CreatedDate) = 5
107
+ CORRECT: SELECT Id, Name FROM Account WHERE DAY_IN_WEEK(CreatedDate) = 2
108
+ - NO math functions: ROUND, FLOOR, CEILING, ABS, etc.
109
+ SQL: SELECT ROUND(AnnualRevenue, 2) FROM Account
110
+ SOQL: Not supported
111
+ - NO conditional functions: CASE WHEN, COALESCE, NULLIF, etc.
112
+ SQL: SELECT CASE WHEN Industry = 'Technology' THEN 'Tech' ELSE 'Other' END FROM Account
113
+ SOQL: Not supported
114
+ - NO string functions except INCLUDES/EXCLUDES for multi-select picklists
115
+ SQL: SELECT UPPER(Name) FROM Account
116
+ SOQL: Not supported
117
+
118
+ **OPERATORS:**
119
+ - Supported: =, !=, <, >, <=, >=, LIKE, IN, NOT IN, INCLUDES, EXCLUDES
120
+ CORRECT: WHERE Industry = 'Technology'
121
+ CORRECT: WHERE AnnualRevenue >= 1000000
122
+ CORRECT: WHERE Industry IN ('Technology', 'Finance')
123
+ CORRECT: WHERE Industry NOT IN ('Government', 'Non-Profit')
124
+ CORRECT: WHERE Services__c INCLUDES ('Consulting')
125
+ - NOT supported: REGEXP, BETWEEN, EXISTS, NOT EXISTS
126
+ SQL: WHERE Name REGEXP '^[A-Z]'
127
+ SOQL: Not supported
128
+
129
+ **SORTING & LIMITING:**
130
+ - ORDER BY: Fully supported
131
+ CORRECT: SELECT Id, Name FROM Account ORDER BY Name ASC
132
+ CORRECT: SELECT Id, Name FROM Account ORDER BY CreatedDate DESC, Name ASC
133
+ CORRECT: SELECT Id, Name FROM Account ORDER BY Name NULLS LAST
134
+ - LIMIT: Maximum 2000 records, use smaller limits for better performance
135
+ CORRECT: SELECT Id, Name FROM Account LIMIT 100
136
+ CORRECT: SELECT Id, Name FROM Account LIMIT 2000
137
+ INCORRECT: SELECT Id, Name FROM Account LIMIT 5000
138
+ - NO OFFSET: Not supported for pagination
139
+ SQL: SELECT Id, Name FROM Account LIMIT 10 OFFSET 20
140
+ SOQL: Not supported
141
+
142
+ **DATA TYPES:**
143
+ - picklist: Single-select dropdown, use = operator with string values
144
+ CORRECT: WHERE Industry = 'Technology'
145
+ CORRECT: WHERE Rating = 'Hot'
146
+ - reference: Foreign key field, typically ends with Id
147
+ CORRECT: WHERE OwnerId = '00530000003OOwn'
148
+ CORRECT: WHERE AccountId = '0013000000UzXyz'
149
+ - boolean: Use lowercase true/false
150
+ CORRECT: WHERE IsDeleted = false
151
+ CORRECT: WHERE Active__c = true
152
+ - currency: Numeric field for money values
153
+ CORRECT: WHERE AnnualRevenue > 1000000
154
+ CORRECT: WHERE AnnualRevenue >= 500000.50
155
+ - date: Date only, use YYYY-MM-DD format
156
+ CORRECT: WHERE LastActivityDate = 2025-05-28
157
+ CORRECT: WHERE SLAExpirationDate__c >= 2025-01-01
158
+ - datetime: Date and time, use YYYY-MM-DDThh:mm:ssZ format
159
+ CORRECT: WHERE CreatedDate >= 2025-05-28T10:30:00Z
160
+ CORRECT: WHERE LastModifiedDate = 2025-05-28T00:00:00Z
161
+ - double/int: Numeric fields
162
+ CORRECT: WHERE NumberOfEmployees > 100
163
+ CORRECT: WHERE NumberofLocations__c >= 5.5
164
+ - string/textarea: Text fields, use single quotes
165
+ CORRECT: WHERE Name = 'Acme Corporation'
166
+ CORRECT: WHERE Description = 'Leading tech company'
167
+ - phone/url/email: Specialized string fields, treat as strings
168
+ CORRECT: WHERE Phone = '555-1234'
169
+ CORRECT: WHERE Website = 'https://example.com'
170
+
171
+ **COMMON MISTAKES TO AVOID:**
172
+ - Using SELECT * (not allowed)
173
+ WRONG: SELECT * FROM Account
174
+ RIGHT: SELECT Id, Name, Industry FROM Account
175
+ - Quoting date literals (dates must be unquoted)
176
+ WRONG: WHERE CreatedDate >= '2025-01-01'
177
+ RIGHT: WHERE CreatedDate >= 2025-01-01
178
+ - Using SQL JOIN syntax (not supported)
179
+ WRONG: SELECT Account.Name FROM Account JOIN Contact ON Account.Id = Contact.AccountId
180
+ RIGHT: Use relationship traversal (not applicable in this use case)
181
+ - Using BETWEEN operator (not supported)
182
+ WRONG: WHERE CreatedDate BETWEEN 2025-01-01 AND 2025-12-31
183
+ RIGHT: WHERE CreatedDate >= 2025-01-01 AND CreatedDate <= 2025-12-31
184
+ - Using uppercase TRUE/FALSE/NULL (must be lowercase)
185
+ WRONG: WHERE Active__c = TRUE
186
+ RIGHT: WHERE Active__c = true
187
+ - Using underscore _ in LIKE patterns (only % supported)
188
+ WRONG: WHERE Name LIKE 'A_me%'
189
+ RIGHT: WHERE Name LIKE 'A%me%'
190
+ - Mixing aggregate and non-aggregate fields without GROUP BY
191
+ WRONG: SELECT Name, COUNT(Id) FROM Account
192
+ RIGHT: SELECT Industry, COUNT(Id) FROM Account GROUP BY Industry
193
+
194
+ **EXAMPLE QUERIES:**
195
+ - Basic selection: SELECT Id, Name, Industry FROM Account WHERE Industry = 'Technology'
196
+ - Date filtering: SELECT Id, Name FROM Account WHERE CreatedDate >= 2025-01-01
197
+ - Multiple conditions: SELECT Id, Name FROM Account WHERE Name LIKE '%Corp%' AND Industry IN ('Technology', 'Finance')
198
+ - Aggregation: SELECT Industry, COUNT(Id) FROM Account GROUP BY Industry
199
+ - Boolean and numeric: SELECT Id, Name FROM Account WHERE Active__c = true AND NumberOfEmployees > 100
200
+ - Date functions: SELECT Id, Name FROM Account WHERE CALENDAR_YEAR(CreatedDate) = 2025
201
+ - Null checks: SELECT Id, Name FROM Account WHERE ParentId = null
202
+ - Multi-select picklist: SELECT Id, Name FROM Account WHERE Services__c INCLUDES ('Consulting;Support')
203
+ - Sorting and limiting: SELECT Id, Name FROM Account ORDER BY Name ASC LIMIT 50
204
+
205
+
206
+ ***EXECUTION INSTRUCTIONS. IMPORTANT!***
207
+ After generating the core SOQL (and nothing else), always make sure you wrap it exactly as:
208
+
209
+ SELECT *
210
+ FROM {integration_name}(
211
+ /* your generated SOQL goes here, without a trailing semicolon */
212
+ )
213
+
214
+ Return only that wrapper call.
215
+ """
@@ -11,6 +11,7 @@ from mindsdb.integrations.libs.response import (
11
11
  RESPONSE_TYPE,
12
12
  )
13
13
  from mindsdb.integrations.handlers.salesforce_handler.salesforce_tables import create_table_class
14
+ from mindsdb.integrations.handlers.salesforce_handler.constants import get_soql_instructions
14
15
  from mindsdb.utilities import log
15
16
 
16
17
 
@@ -156,91 +157,152 @@ class SalesforceHandler(MetaAPIHandler):
156
157
 
157
158
  def _get_resource_names(self) -> List[str]:
158
159
  """
159
- Retrieves the names of the Salesforce resources, with more aggressive filtering to remove tables.
160
+ Retrieves the names of the Salesforce resources with optimized pre-filtering.
160
161
  Returns:
161
162
  List[str]: A list of filtered resource names.
162
163
  """
163
164
  if not self.resource_names:
164
- all_resources = [
165
- resource["name"]
166
- for resource in self.connection.sobjects.describe()["sobjects"]
167
- if resource.get("queryable", False)
168
- ]
165
+ # Check for user-specified table filtering first
166
+ include_tables = self.connection_data.get("include_tables") or self.connection_data.get("tables")
167
+ exclude_tables = self.connection_data.get("exclude_tables", [])
168
+
169
+ if include_tables:
170
+ # OPTIMIZATION: Skip expensive global describe() call
171
+ # Only validate the specified tables
172
+ logger.info(f"Using pre-filtered table list: {include_tables}")
173
+ self.resource_names = self._validate_specified_tables(include_tables, exclude_tables)
174
+ else:
175
+ # Fallback to full discovery with hard-coded filtering
176
+ logger.info("No table filter specified, performing full discovery...")
177
+ self.resource_names = self._discover_all_tables_with_filtering(exclude_tables)
169
178
 
170
- # Define patterns for tables to be filtered out.
171
- # Expanded suffixes and prefixes and exact matches
172
- ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent", "Tag", "Permission", "Setup", "Consent")
173
- ignore_prefixes = (
174
- "Apex",
175
- "CommPlatform",
176
- "Lightning",
177
- "Flow",
178
- "Transaction",
179
- "AI",
180
- "Aura",
181
- "ContentWorkspace",
182
- "Collaboration",
183
- "Datacloud",
184
- )
185
- ignore_exact = {
186
- "EntityDefinition",
187
- "FieldDefinition",
188
- "RecordType",
189
- "CaseStatus",
190
- "UserRole",
191
- "UserLicense",
192
- "UserPermissionAccess",
193
- "UserRecordAccess",
194
- "Folder",
195
- "Group",
196
- "Note",
197
- "ProcessDefinition",
198
- "ProcessInstance",
199
- "ContentFolder",
200
- "ContentDocumentSubscription",
201
- "DashboardComponent",
202
- "Report",
203
- "Dashboard",
204
- "Topic",
205
- "TopicAssignment",
206
- "Period",
207
- "Partner",
208
- "PackageLicense",
209
- "ColorDefinition",
210
- "DataUsePurpose",
211
- "DataUseLegalBasis",
212
- }
213
-
214
- ignore_substrings = (
215
- "CleanInfo",
216
- "Template",
217
- "Rule",
218
- "Definition",
219
- "Status",
220
- "Policy",
221
- "Setting",
222
- "Access",
223
- "Config",
224
- "Subscription",
225
- "DataType",
226
- "MilestoneType",
227
- "Entitlement",
228
- "Auth",
229
- )
230
-
231
- filtered = []
232
- for r in all_resources:
233
- if (
234
- not r.endswith(ignore_suffixes)
235
- and not r.startswith(ignore_prefixes)
236
- and not any(sub in r for sub in ignore_substrings)
237
- and r not in ignore_exact
238
- ):
239
- filtered.append(r)
240
-
241
- self.resource_names = [r for r in filtered]
242
179
  return self.resource_names
243
180
 
181
def _validate_specified_tables(self, include_tables: List[str], exclude_tables: List[str]) -> List[str]:
    """
    Validate user-specified tables one at a time, without a global describe() call.

    Each requested table is described individually and kept only when the call
    succeeds and the metadata reports it as queryable. Failures are logged and
    the table is skipped, so one bad name does not abort the others.

    Args:
        include_tables: Table names to include. A repeated name is validated once.
        exclude_tables: Table names to exclude. None is treated as empty and a
            bare string as a comma-separated list.

    Returns:
        List[str]: Validated table names, in the order they were first requested.
    """
    # Normalise exclusions to a set: exact-name matching with O(1) lookups, and
    # no TypeError / accidental substring matching when the value is None / a str.
    if isinstance(exclude_tables, str):
        exclude_tables = [name.strip() for name in exclude_tables.split(",")]
    excluded = set(exclude_tables or ())

    validated_tables = []
    seen = set()

    for table_name in include_tables:
        # Avoid a second describe() round-trip (and a duplicate entry) for repeats.
        if table_name in seen:
            continue
        seen.add(table_name)

        if table_name in excluded:
            logger.info(f"Skipping excluded table: {table_name}")
            continue

        try:
            queryable = getattr(self.connection.sobjects, table_name).describe().get("queryable", False)
        except Exception as e:
            # Deliberately broad: validation is best-effort, any failure to
            # describe the table just means it is left out.
            logger.warning(f"Table {table_name} not found or accessible: {e}")
            continue

        if queryable:
            validated_tables.append(table_name)
            logger.debug(f"Validated table: {table_name}")
        else:
            logger.warning(f"Table {table_name} is not queryable, skipping")

    logger.info(f"Validated {len(validated_tables)} tables from include_tables")
    return validated_tables
214
+
215
+ def _discover_all_tables_with_filtering(self, exclude_tables: List[str]) -> List[str]:
216
+ """
217
+ Fallback method: discover all tables with hard-coded filtering.
218
+
219
+ Args:
220
+ exclude_tables: List of table names to exclude
221
+
222
+ Returns:
223
+ List[str]: Filtered table names
224
+ """
225
+ # This is the original expensive approach - only used when no include_tables specified
226
+ all_resources = [
227
+ resource["name"]
228
+ for resource in self.connection.sobjects.describe()["sobjects"]
229
+ if resource.get("queryable", False)
230
+ ]
231
+
232
+ # Apply hard-coded filtering (existing logic)
233
+ ignore_suffixes = ("Share", "History", "Feed", "ChangeEvent", "Tag", "Permission", "Setup", "Consent")
234
+ ignore_prefixes = (
235
+ "Apex",
236
+ "CommPlatform",
237
+ "Lightning",
238
+ "Flow",
239
+ "Transaction",
240
+ "AI",
241
+ "Aura",
242
+ "ContentWorkspace",
243
+ "Collaboration",
244
+ "Datacloud",
245
+ )
246
+ ignore_exact = {
247
+ "EntityDefinition",
248
+ "FieldDefinition",
249
+ "RecordType",
250
+ "CaseStatus",
251
+ "UserRole",
252
+ "UserLicense",
253
+ "UserPermissionAccess",
254
+ "UserRecordAccess",
255
+ "Folder",
256
+ "Group",
257
+ "Note",
258
+ "ProcessDefinition",
259
+ "ProcessInstance",
260
+ "ContentFolder",
261
+ "ContentDocumentSubscription",
262
+ "DashboardComponent",
263
+ "Report",
264
+ "Dashboard",
265
+ "Topic",
266
+ "TopicAssignment",
267
+ "Period",
268
+ "Partner",
269
+ "PackageLicense",
270
+ "ColorDefinition",
271
+ "DataUsePurpose",
272
+ "DataUseLegalBasis",
273
+ }
274
+
275
+ ignore_substrings = (
276
+ "CleanInfo",
277
+ "Template",
278
+ "Rule",
279
+ "Definition",
280
+ "Status",
281
+ "Policy",
282
+ "Setting",
283
+ "Access",
284
+ "Config",
285
+ "Subscription",
286
+ "DataType",
287
+ "MilestoneType",
288
+ "Entitlement",
289
+ "Auth",
290
+ )
291
+
292
+ # Apply hard-coded filtering
293
+ filtered = []
294
+ for r in all_resources:
295
+ if (
296
+ not r.endswith(ignore_suffixes)
297
+ and not r.startswith(ignore_prefixes)
298
+ and not any(sub in r for sub in ignore_substrings)
299
+ and r not in ignore_exact
300
+ and r not in exclude_tables # Apply user exclusions
301
+ ):
302
+ filtered.append(r)
303
+
304
+ return filtered
305
+
244
306
  def meta_get_handler_info(self, **kwargs) -> str:
245
307
  """
246
308
  Retrieves information about the design and implementation of the API handler.
@@ -254,8 +316,7 @@ class SalesforceHandler(MetaAPIHandler):
254
316
  Returns:
255
317
  str: A string containing information about the API handler's design and implementation.
256
318
  """
257
- # TODO: Relationships? Aliases?
258
- return "When filtering on a Date or DateTime field, the value MUST be an unquoted literal in YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ format. For example, CloseDate >= 2025-05-28 is correct; CloseDate >= '2025-05-28' is incorrect."
319
+ return get_soql_instructions(self.name)
259
320
 
260
321
  def meta_get_tables(self, table_names: Optional[List[str]] = None) -> Response:
261
322
  """
@@ -176,7 +176,6 @@ def create_table_class(resource_name: Text) -> MetaAPIResource:
176
176
  "table_description": "",
177
177
  "row_count": None,
178
178
  }
179
-
180
179
  # Get row count if Id column is aggregatable.
181
180
  row_count = None
182
181
  # if next(field for field in resource_metadata['fields'] if field['name'] == 'Id').get('aggregatable', False):
@@ -1,2 +1,2 @@
1
1
  tpot<=0.11.7
2
- type_infer==0.0.20
2
+ type_infer==0.0.23
@@ -100,26 +100,25 @@ def parallel_get_all_website_links(urls) -> dict:
100
100
  return url_contents
101
101
 
102
102
  with concurrent.futures.ProcessPoolExecutor() as executor:
103
- future_to_url = {
104
- executor.submit(get_all_website_links, url): url for url in urls
105
- }
103
+ future_to_url = {executor.submit(get_all_website_links, url): url for url in urls}
106
104
  for future in concurrent.futures.as_completed(future_to_url):
107
105
  url = future_to_url[future]
108
106
  try:
109
107
  url_contents[url] = future.result()
110
108
  except Exception as exc:
111
- logger.error(f'{url} generated an exception: {exc}')
109
+ logger.error(f"{url} generated an exception: {exc}")
112
110
  # don't raise the exception, just log it, continue processing other urls
113
111
 
114
112
  return url_contents
115
113
 
116
114
 
117
- def get_all_website_links(url) -> dict:
115
+ def get_all_website_links(url, headers: dict = None) -> dict:
118
116
  """
119
117
  Fetch all website links from a URL.
120
118
 
121
119
  Args:
122
120
  url (str): the URL to fetch links from
121
+ headers (dict): a dictionary of headers to use when fetching links
123
122
 
124
123
  Returns:
125
124
  A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred.
@@ -132,9 +131,12 @@ def get_all_website_links(url) -> dict:
132
131
  session = requests.Session()
133
132
 
134
133
  # Add headers to mimic a real browser request
135
- headers = {
136
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
137
- }
134
+ if headers is None:
135
+ headers = {}
136
+ if "User-Agent" not in headers:
137
+ headers["User-Agent"] = (
138
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3"
139
+ )
138
140
 
139
141
  response = session.get(url, headers=headers)
140
142
  if "cookie" in response.request.headers:
@@ -157,7 +159,7 @@ def get_all_website_links(url) -> dict:
157
159
  continue
158
160
  href = urljoin(url, href)
159
161
  parsed_href = urlparse(href)
160
- href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, '', '', ''))
162
+ href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, "", "", ""))
161
163
  if not is_valid(href):
162
164
  continue
163
165
  if href in urls:
@@ -203,7 +205,15 @@ def get_readable_text_from_soup(soup) -> str:
203
205
  return html_converter.handle(str(soup))
204
206
 
205
207
 
206
- def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_depth: int = 1, current_depth: int = 0, filters: List[str] = None):
208
+ def get_all_website_links_recursively(
209
+ url,
210
+ reviewed_urls,
211
+ limit=None,
212
+ crawl_depth: int = 1,
213
+ current_depth: int = 0,
214
+ filters: List[str] = None,
215
+ headers=None,
216
+ ):
207
217
  """
208
218
  Recursively gathers all links from a given website up to a specified limit.
209
219
 
@@ -227,7 +237,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
227
237
  matches_filter = any(re.match(f, url) is not None for f in filters)
228
238
  if url not in reviewed_urls and matches_filter:
229
239
  try:
230
- reviewed_urls[url] = get_all_website_links(url)
240
+ reviewed_urls[url] = get_all_website_links(url, headers=headers)
231
241
  except Exception as e:
232
242
  error_message = traceback.format_exc().splitlines()[-1]
233
243
  logger.error("An exception occurred: %s", str(e))
@@ -271,10 +281,14 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
271
281
  reviewed_urls.update(new_revised_urls)
272
282
 
273
283
  for new_url in new_revised_urls:
274
- get_all_website_links_recursively(new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters)
284
+ get_all_website_links_recursively(
285
+ new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters
286
+ )
275
287
 
276
288
 
277
- def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None) -> pd.DataFrame:
289
+ def get_all_websites(
290
+ urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None, headers: dict = None
291
+ ) -> pd.DataFrame:
278
292
  """
279
293
  Crawl a list of websites and return a DataFrame containing the results.
280
294
 
@@ -284,6 +298,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
284
298
  crawl_depth (int): Crawl depth for URLs.
285
299
  html (bool): a boolean indicating whether to include the HTML content in the results
286
300
  filters (List[str]): Crawl URLs that only match these regex patterns.
301
+ headers (dict): headers of request
287
302
 
288
303
  Returns:
289
304
  A DataFrame containing the results.
@@ -299,7 +314,9 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
299
314
  if urlparse(url).scheme == "":
300
315
  # Try HTTPS first
301
316
  url = "https://" + url
302
- get_all_website_links_recursively(url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters)
317
+ get_all_website_links_recursively(
318
+ url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters, headers=headers
319
+ )
303
320
 
304
321
  # Use a ThreadPoolExecutor to run the helper function in parallel.
305
322
  with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -311,9 +328,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
311
328
  columns_to_ignore = ["urls"]
312
329
  if html is False:
313
330
  columns_to_ignore += ["html_content"]
314
- df = dict_to_dataframe(
315
- reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url"
316
- )
331
+ df = dict_to_dataframe(reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url")
317
332
 
318
333
  if not df.empty and df[df.error.isna()].empty:
319
334
  raise Exception(str(df.iloc[0].error))