dvt-core 0.52.2__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dvt-core might be problematic; see the registry listing for more details.

Files changed (275)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2039 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +804 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +624 -0
  74. dbt/compute/federated_executor.py +837 -0
  75. dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
  76. dbt/compute/filter_pushdown.py +273 -0
  77. dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
  78. dbt/compute/jar_provisioning.py +255 -0
  79. dbt/compute/java_compat.cpython-310-darwin.so +0 -0
  80. dbt/compute/java_compat.py +689 -0
  81. dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
  82. dbt/compute/jdbc_utils.py +678 -0
  83. dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
  84. dbt/compute/smart_selector.py +311 -0
  85. dbt/compute/strategies/__init__.py +54 -0
  86. dbt/compute/strategies/base.py +165 -0
  87. dbt/compute/strategies/dataproc.py +207 -0
  88. dbt/compute/strategies/emr.py +203 -0
  89. dbt/compute/strategies/local.py +364 -0
  90. dbt/compute/strategies/standalone.py +262 -0
  91. dbt/config/__init__.py +4 -0
  92. dbt/config/catalogs.py +94 -0
  93. dbt/config/compute.cpython-310-darwin.so +0 -0
  94. dbt/config/compute.py +547 -0
  95. dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
  96. dbt/config/dvt_profile.py +342 -0
  97. dbt/config/profile.py +422 -0
  98. dbt/config/project.py +873 -0
  99. dbt/config/project_utils.py +28 -0
  100. dbt/config/renderer.py +231 -0
  101. dbt/config/runtime.py +553 -0
  102. dbt/config/selectors.py +208 -0
  103. dbt/config/utils.py +77 -0
  104. dbt/constants.py +28 -0
  105. dbt/context/__init__.py +0 -0
  106. dbt/context/base.py +745 -0
  107. dbt/context/configured.py +135 -0
  108. dbt/context/context_config.py +382 -0
  109. dbt/context/docs.py +82 -0
  110. dbt/context/exceptions_jinja.py +178 -0
  111. dbt/context/macro_resolver.py +195 -0
  112. dbt/context/macros.py +171 -0
  113. dbt/context/manifest.py +72 -0
  114. dbt/context/providers.py +2249 -0
  115. dbt/context/query_header.py +13 -0
  116. dbt/context/secret.py +58 -0
  117. dbt/context/target.py +74 -0
  118. dbt/contracts/__init__.py +0 -0
  119. dbt/contracts/files.py +413 -0
  120. dbt/contracts/graph/__init__.py +0 -0
  121. dbt/contracts/graph/manifest.py +1904 -0
  122. dbt/contracts/graph/metrics.py +97 -0
  123. dbt/contracts/graph/model_config.py +70 -0
  124. dbt/contracts/graph/node_args.py +42 -0
  125. dbt/contracts/graph/nodes.py +1806 -0
  126. dbt/contracts/graph/semantic_manifest.py +232 -0
  127. dbt/contracts/graph/unparsed.py +811 -0
  128. dbt/contracts/project.py +417 -0
  129. dbt/contracts/results.py +53 -0
  130. dbt/contracts/selection.py +23 -0
  131. dbt/contracts/sql.py +85 -0
  132. dbt/contracts/state.py +68 -0
  133. dbt/contracts/util.py +46 -0
  134. dbt/deprecations.py +346 -0
  135. dbt/deps/__init__.py +0 -0
  136. dbt/deps/base.py +152 -0
  137. dbt/deps/git.py +195 -0
  138. dbt/deps/local.py +79 -0
  139. dbt/deps/registry.py +130 -0
  140. dbt/deps/resolver.py +149 -0
  141. dbt/deps/tarball.py +120 -0
  142. dbt/docs/source/_ext/dbt_click.py +119 -0
  143. dbt/docs/source/conf.py +32 -0
  144. dbt/env_vars.py +64 -0
  145. dbt/event_time/event_time.py +40 -0
  146. dbt/event_time/sample_window.py +60 -0
  147. dbt/events/__init__.py +15 -0
  148. dbt/events/base_types.py +36 -0
  149. dbt/events/core_types_pb2.py +2 -0
  150. dbt/events/logging.py +108 -0
  151. dbt/events/types.py +2516 -0
  152. dbt/exceptions.py +1486 -0
  153. dbt/flags.py +89 -0
  154. dbt/graph/__init__.py +11 -0
  155. dbt/graph/cli.py +247 -0
  156. dbt/graph/graph.py +172 -0
  157. dbt/graph/queue.py +214 -0
  158. dbt/graph/selector.py +374 -0
  159. dbt/graph/selector_methods.py +975 -0
  160. dbt/graph/selector_spec.py +222 -0
  161. dbt/graph/thread_pool.py +18 -0
  162. dbt/hooks.py +21 -0
  163. dbt/include/README.md +49 -0
  164. dbt/include/__init__.py +3 -0
  165. dbt/include/starter_project/.gitignore +4 -0
  166. dbt/include/starter_project/README.md +15 -0
  167. dbt/include/starter_project/__init__.py +3 -0
  168. dbt/include/starter_project/analyses/.gitkeep +0 -0
  169. dbt/include/starter_project/dbt_project.yml +36 -0
  170. dbt/include/starter_project/macros/.gitkeep +0 -0
  171. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  172. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  173. dbt/include/starter_project/models/example/schema.yml +21 -0
  174. dbt/include/starter_project/seeds/.gitkeep +0 -0
  175. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  176. dbt/include/starter_project/tests/.gitkeep +0 -0
  177. dbt/internal_deprecations.py +26 -0
  178. dbt/jsonschemas/__init__.py +3 -0
  179. dbt/jsonschemas/jsonschemas.py +309 -0
  180. dbt/jsonschemas/project/0.0.110.json +4717 -0
  181. dbt/jsonschemas/project/0.0.85.json +2015 -0
  182. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  183. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  184. dbt/jsonschemas/resources/latest.json +6773 -0
  185. dbt/links.py +4 -0
  186. dbt/materializations/__init__.py +0 -0
  187. dbt/materializations/incremental/__init__.py +0 -0
  188. dbt/materializations/incremental/microbatch.py +236 -0
  189. dbt/mp_context.py +8 -0
  190. dbt/node_types.py +37 -0
  191. dbt/parser/__init__.py +23 -0
  192. dbt/parser/analysis.py +21 -0
  193. dbt/parser/base.py +548 -0
  194. dbt/parser/common.py +266 -0
  195. dbt/parser/docs.py +52 -0
  196. dbt/parser/fixtures.py +51 -0
  197. dbt/parser/functions.py +30 -0
  198. dbt/parser/generic_test.py +100 -0
  199. dbt/parser/generic_test_builders.py +333 -0
  200. dbt/parser/hooks.py +118 -0
  201. dbt/parser/macros.py +137 -0
  202. dbt/parser/manifest.py +2204 -0
  203. dbt/parser/models.py +573 -0
  204. dbt/parser/partial.py +1178 -0
  205. dbt/parser/read_files.py +445 -0
  206. dbt/parser/schema_generic_tests.py +422 -0
  207. dbt/parser/schema_renderer.py +111 -0
  208. dbt/parser/schema_yaml_readers.py +935 -0
  209. dbt/parser/schemas.py +1466 -0
  210. dbt/parser/search.py +149 -0
  211. dbt/parser/seeds.py +28 -0
  212. dbt/parser/singular_test.py +20 -0
  213. dbt/parser/snapshots.py +44 -0
  214. dbt/parser/sources.py +558 -0
  215. dbt/parser/sql.py +62 -0
  216. dbt/parser/unit_tests.py +621 -0
  217. dbt/plugins/__init__.py +20 -0
  218. dbt/plugins/contracts.py +9 -0
  219. dbt/plugins/exceptions.py +2 -0
  220. dbt/plugins/manager.py +163 -0
  221. dbt/plugins/manifest.py +21 -0
  222. dbt/profiler.py +20 -0
  223. dbt/py.typed +1 -0
  224. dbt/query_analyzer.cpython-310-darwin.so +0 -0
  225. dbt/query_analyzer.py +410 -0
  226. dbt/runners/__init__.py +2 -0
  227. dbt/runners/exposure_runner.py +7 -0
  228. dbt/runners/no_op_runner.py +45 -0
  229. dbt/runners/saved_query_runner.py +7 -0
  230. dbt/selected_resources.py +8 -0
  231. dbt/task/__init__.py +0 -0
  232. dbt/task/base.py +503 -0
  233. dbt/task/build.py +197 -0
  234. dbt/task/clean.py +56 -0
  235. dbt/task/clone.py +161 -0
  236. dbt/task/compile.py +150 -0
  237. dbt/task/compute.py +454 -0
  238. dbt/task/debug.py +505 -0
  239. dbt/task/deps.py +280 -0
  240. dbt/task/docs/__init__.py +3 -0
  241. dbt/task/docs/generate.py +660 -0
  242. dbt/task/docs/index.html +250 -0
  243. dbt/task/docs/serve.py +29 -0
  244. dbt/task/freshness.py +322 -0
  245. dbt/task/function.py +121 -0
  246. dbt/task/group_lookup.py +46 -0
  247. dbt/task/init.py +553 -0
  248. dbt/task/java.py +316 -0
  249. dbt/task/list.py +236 -0
  250. dbt/task/printer.py +175 -0
  251. dbt/task/retry.py +175 -0
  252. dbt/task/run.py +1306 -0
  253. dbt/task/run_operation.py +141 -0
  254. dbt/task/runnable.py +758 -0
  255. dbt/task/seed.py +103 -0
  256. dbt/task/show.py +149 -0
  257. dbt/task/snapshot.py +56 -0
  258. dbt/task/spark.py +414 -0
  259. dbt/task/sql.py +110 -0
  260. dbt/task/target_sync.py +759 -0
  261. dbt/task/test.py +464 -0
  262. dbt/tests/fixtures/__init__.py +1 -0
  263. dbt/tests/fixtures/project.py +620 -0
  264. dbt/tests/util.py +651 -0
  265. dbt/tracking.py +529 -0
  266. dbt/utils/__init__.py +3 -0
  267. dbt/utils/artifact_upload.py +151 -0
  268. dbt/utils/utils.py +408 -0
  269. dbt/version.py +268 -0
  270. dvt_cli/__init__.py +72 -0
  271. dvt_core-0.52.2.dist-info/METADATA +286 -0
  272. dvt_core-0.52.2.dist-info/RECORD +275 -0
  273. dvt_core-0.52.2.dist-info/WHEEL +5 -0
  274. dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
  275. dvt_core-0.52.2.dist-info/top_level.txt +2 -0
@@ -0,0 +1,273 @@
1
+ """
2
+ Filter pushdown optimization for federated queries.
3
+
4
+ Extracts filters (WHERE, LIMIT, ORDER BY) from compiled SQL and rewrites them
5
+ to be pushed down to source databases in their native SQL dialects.
6
+ """
7
+
8
+ import re
9
+ import sys
10
+ import sqlparse
11
+ from sqlparse.sql import Statement, Token, TokenList, Identifier, Where, Comparison
12
+ from sqlparse.tokens import Keyword, Whitespace
13
+ from typing import Dict, List, Optional, Any
14
+
15
+
16
class FilterPushdownOptimizer:
    """
    Optimizes federated queries by pushing filters down to source databases.

    Strategy:
        1. Parse compiled SQL to extract filters per source table
        2. Rewrite filters in each source adapter's SQL dialect
        3. Return subqueries for JDBC reads instead of plain table names

    Example:
        Input SQL:
            SELECT * FROM snowflake_table WHERE date > '2024-01-01' LIMIT 10

        Output:
            JDBC subquery: (SELECT * FROM snowflake_table WHERE date > '2024-01-01' LIMIT 10)
    """

    def __init__(self, compiled_sql: str, source_tables: List[Any]):
        """
        Initialize optimizer with compiled SQL and source table metadata.

        Args:
            compiled_sql: The fully compiled SQL from the model
            source_tables: List of SourceTableMetadata objects
        """
        self.compiled_sql = compiled_sql
        self.source_tables = source_tables
        # Kept for future sqlparse-based extraction; the current extractors
        # only use it as an "SQL is non-empty and parseable" flag.
        self.parsed = sqlparse.parse(compiled_sql)[0] if compiled_sql else None

    def extract_limit(self) -> Optional[int]:
        """
        Extract LIMIT clause from SQL.

        NOTE(review): the regex matches the first LIMIT anywhere in the
        statement (including inside subqueries or string literals) — a known
        simplification that works for most compiled models.

        Returns:
            Limit value as integer, or None if no LIMIT clause
        """
        if not self.parsed:
            return None

        limit_match = re.search(r'\bLIMIT\s+(\d+)\b', self.compiled_sql, re.IGNORECASE)
        if limit_match:
            return int(limit_match.group(1))
        return None

    def _make_sample_result(
        self, method: str, value: Any, seed: Optional[int]
    ) -> Dict[str, Any]:
        """
        Assemble the sample-extraction result dict, including the
        pushdown-ready 'full_clause' text.
        """
        if method == 'ROWS':
            clause = f"SAMPLE ({value})"
        else:
            clause = f"SAMPLE {method} ({value})"
        # Bug fix: a seed of 0 is valid — test for None instead of truthiness
        # so REPEATABLE (0) is not silently dropped.
        if seed is not None:
            clause += f" REPEATABLE ({seed})"
        return {'method': method, 'value': value, 'seed': seed, 'full_clause': clause}

    def extract_sample_clause(self) -> Optional[Dict[str, Any]]:
        """
        Extract SAMPLE/TABLESAMPLE clause from SQL (Snowflake-specific sampling).

        Snowflake supports several SAMPLE methods:
        - SAMPLE (N) or SAMPLE (N ROWS) - Row-count sampling
        - SAMPLE SYSTEM (P) - System/block sampling with P% probability
        - SAMPLE BERNOULLI (P) - Bernoulli/row-level sampling with P% probability
        - SAMPLE BLOCK (P) - Alias for SYSTEM
        - TABLESAMPLE ... - Alternative syntax
        - REPEATABLE(seed) or SEED(seed) - Reproducible sampling

        Returns:
            Dict with keys:
            - 'method': 'ROWS', 'SYSTEM' or 'BERNOULLI'
            - 'value': int row count (ROWS) or float percentage (SYSTEM/BERNOULLI)
            - 'seed': Optional int for reproducible sampling
            - 'full_clause': The complete SAMPLE clause to push down
            or None if no SAMPLE clause
        """
        if not self.parsed:
            return None

        # Row-count form: SAMPLE (N) or SAMPLE (N ROWS), optional REPEATABLE/SEED.
        sample_rows = re.search(
            r'\b(?:TABLE)?SAMPLE\s*\(\s*(\d+)(?:\s+ROWS)?\s*\)'
            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*(\d+)\s*\))?',
            self.compiled_sql,
            re.IGNORECASE,
        )
        if sample_rows:
            seed = int(sample_rows.group(2)) if sample_rows.group(2) else None
            return self._make_sample_result('ROWS', int(sample_rows.group(1)), seed)

        # Percentage form: SAMPLE SYSTEM|BERNOULLI|BLOCK (P), optional REPEATABLE/SEED.
        sample_method = re.search(
            r'\b(?:TABLE)?SAMPLE\s+(SYSTEM|BERNOULLI|BLOCK)\s*\(\s*(\d+(?:\.\d+)?)\s*\)'
            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*(\d+)\s*\))?',
            self.compiled_sql,
            re.IGNORECASE,
        )
        if sample_method:
            method = sample_method.group(1).upper()
            if method == 'BLOCK':
                # BLOCK is an alias for SYSTEM in Snowflake.
                method = 'SYSTEM'
            seed = int(sample_method.group(3)) if sample_method.group(3) else None
            return self._make_sample_result(method, float(sample_method.group(2)), seed)

        return None

    def extract_where_clauses(self) -> Dict[str, List[str]]:
        """
        Extract WHERE clauses that apply to specific source tables.

        Returns:
            Dict mapping table name/alias to list of WHERE conditions

        Example:
            {
                'snowflake_table': ['date > \'2024-01-01\'', 'status = \'active\''],
                'postgres_table': ['id > 100']
            }
        """
        # TODO: Implement WHERE clause extraction using sqlparse
        # For now, return empty dict - LIMIT pushdown is the priority
        return {}

    def build_pushdown_subquery(
        self,
        source_table: Any,
        adapter_type: str
    ) -> Optional[str]:
        """
        Build a subquery with pushed-down filters for a specific source table.

        Args:
            source_table: SourceTableMetadata object
            adapter_type: Adapter type (postgres, snowflake, etc.)

        Returns:
            SQL subquery with filters, or None if no pushdown possible

        Example:
            Input: table="schema.table", LIMIT 10
            Output: "(SELECT * FROM schema.table LIMIT 10)"
        """
        limit = self.extract_limit()
        sample_clause = self.extract_sample_clause()
        where_clauses = self.extract_where_clauses()

        # Bug fix: compare against None so a meaningful (if unusual) LIMIT 0
        # is still pushed down instead of being treated as "no limit".
        if limit is None and not sample_clause and not where_clauses:
            # Nothing to push down - read full table.
            return None

        qualified_name = source_table.qualified_name
        subquery_parts = [f"SELECT * FROM {qualified_name}"]

        # SAMPLE is Snowflake-specific and goes right after FROM.
        snowflake_sample = bool(sample_clause) and adapter_type.lower() == 'snowflake'
        if snowflake_sample:
            # full_clause carries all sampling options: method, value, and seed.
            subquery_parts.append(sample_clause['full_clause'])

        # WHERE conditions attributed to this table (currently never populated
        # by extract_where_clauses).
        table_key = source_table.identifier  # or qualified_name
        if table_key in where_clauses:
            subquery_parts.append("WHERE " + " AND ".join(where_clauses[table_key]))

        # SAMPLE takes precedence over LIMIT for Snowflake.
        if limit is not None and not snowflake_sample:
            limit_clause = self._rewrite_limit_for_adapter(limit, adapter_type)
            if limit_clause:
                subquery_parts.append(limit_clause)

        return "(" + " ".join(subquery_parts) + ")"

    def _rewrite_limit_for_adapter(self, limit: int, adapter_type: str) -> Optional[str]:
        """
        Rewrite LIMIT clause for specific adapter's SQL dialect.

        Args:
            limit: Limit value
            adapter_type: Adapter type (postgres, snowflake, redshift, etc.)

        Returns:
            LIMIT clause in adapter's dialect, or None when the dialect
            cannot express a trailing limit (SQL Server's TOP)
        """
        dialect = adapter_type.lower()

        # SQL Server / TSQL uses TOP, which belongs in the SELECT clause and
        # cannot be appended at the end; skip pushdown for now.
        if dialect in ('sqlserver', 'mssql', 'tsql'):
            return None

        # Oracle 12c+ supports FETCH FIRST; older versions would need ROWNUM.
        if dialect == 'oracle':
            return f"FETCH FIRST {limit} ROWS ONLY"

        # postgres, snowflake, redshift, mysql, sqlite, bigquery, and unknown
        # adapters all get standard LIMIT syntax.
        return f"LIMIT {limit}"
246
+
247
+
248
def optimize_jdbc_table_read(
    source_table: Any,
    compiled_sql: str,
    source_tables: List[Any],
    adapter_type: str
) -> str:
    """
    Optimize JDBC table read by pushing down filters.

    Falls back to the plain qualified table name whenever the optimizer
    finds nothing worth pushing down.

    Args:
        source_table: SourceTableMetadata for this table
        compiled_sql: Compiled SQL from the model
        source_tables: All source tables in the query
        adapter_type: Source adapter type

    Returns:
        Table identifier (plain name or subquery with filters)
    """
    pushed = FilterPushdownOptimizer(compiled_sql, source_tables).build_pushdown_subquery(
        source_table, adapter_type
    )
    return pushed if pushed else source_table.qualified_name
@@ -0,0 +1,255 @@
1
+ """
2
+ JAR Provisioning Module
3
+
4
+ Centralized JDBC JAR provisioning for Spark compute engines.
5
+
6
+ v0.5.98: Supports two provisioning strategies:
7
+ - LocalJARProvisioning: Uses spark.jars with local file paths (fast startup)
8
+ - RemoteJARProvisioning: Uses spark.jars.packages with Maven coordinates (remote clusters)
9
+
10
+ Local Spark uses local JARs from .dvt/jdbc_jars/ for instant startup.
11
+ Remote clusters (Databricks, EMR, Dataproc, Standalone) use Maven coordinates
12
+ so Spark workers can download JARs directly from Maven Central.
13
+ """
14
+
15
+ import glob
16
+ import os
17
+ from abc import ABC, abstractmethod
18
+ from pathlib import Path
19
+ from typing import Dict, List, Optional, Set
20
+
21
+
22
# Maven coordinates for JDBC drivers - used by remote clusters
# Format: groupId:artifactId:version
# These are the same JARs as downloaded by `dvt target sync`, but expressed as Maven coordinates
# Adapters without a dedicated driver map to the driver of the wire protocol
# they speak (MySQL or PostgreSQL); an empty string means "no JDBC driver needed".
JDBC_MAVEN_COORDINATES: Dict[str, str] = {
    # Official dbt-labs adapters - JDBC drivers only
    "postgres": "org.postgresql:postgresql:42.7.4",
    "snowflake": "net.snowflake:snowflake-jdbc:3.16.1",
    "bigquery": "com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.22",
    "redshift": "com.amazon.redshift:redshift-jdbc42:2.1.0.32",
    "spark": "",  # Native, no JDBC needed
    "databricks": "com.databricks:databricks-jdbc:2.6.36",
    "trino": "io.trino:trino-jdbc:443",
    "duckdb": "org.duckdb:duckdb_jdbc:1.1.3",
    # Community adapters - JDBC drivers only (verified on Maven)
    "mysql": "com.mysql:mysql-connector-j:9.1.0",
    "sqlserver": "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11",
    "synapse": "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11",
    "fabric": "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11",
    "oracle": "com.oracle.database.jdbc:ojdbc11:23.6.0.24.10",
    "teradata": "com.teradata.jdbc:terajdbc:20.00.00.20",
    "clickhouse": "com.clickhouse:clickhouse-jdbc:0.6.5",
    "greenplum": "org.postgresql:postgresql:42.7.4",  # PostgreSQL compatible
    "vertica": "com.vertica.jdbc:vertica-jdbc:24.3.0-0",
    "sqlite": "org.xerial:sqlite-jdbc:3.47.1.0",
    "mariadb": "org.mariadb.jdbc:mariadb-java-client:3.4.1",
    "exasol": "com.exasol:exasol-jdbc:24.2.0",
    "db2": "com.ibm.db2:jcc:11.5.9.0",
    "presto": "io.prestosql:presto-jdbc:350",
    "hive": "org.apache.hive:hive-jdbc:3.1.3",
    "singlestore": "com.singlestore:singlestore-jdbc-client:1.2.9",
    "starrocks": "com.mysql:mysql-connector-j:9.1.0",  # MySQL wire protocol
    "doris": "com.mysql:mysql-connector-j:9.1.0",  # MySQL wire protocol
    "materialize": "org.postgresql:postgresql:42.7.4",  # PostgreSQL wire protocol
    "neo4j": "org.neo4j:neo4j-jdbc-driver:4.0.10",
    "timescaledb": "org.postgresql:postgresql:42.7.4",  # PostgreSQL extension
    "questdb": "org.postgresql:postgresql:42.7.4",  # PostgreSQL wire protocol
}
59
+
60
+
61
class JARProvisioning(ABC):
    """Strategy interface for supplying JDBC driver JARs to a Spark session."""

    @abstractmethod
    def get_spark_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Produce the Spark configuration entries for JDBC JARs.

        :param adapter_types: Set of adapter types that need JDBC drivers
        :returns: Dictionary of Spark config keys/values
        """
        ...

    @abstractmethod
    def get_provisioning_type(self) -> str:
        """
        Identify which provisioning strategy this is.

        :returns: 'local' or 'maven'
        """
        ...
82
+
83
+
84
class LocalJARProvisioning(JARProvisioning):
    """
    Provision JDBC drivers via ``spark.jars`` pointing at local files.

    Intended for local Spark (spark-local), where `dvt target sync` has
    already downloaded the driver JARs into ``<project>/.dvt/jdbc_jars/``.

    Trade-offs:
      + instant startup (no runtime download), works offline, pinned versions
      - local filesystem only; requires running `dvt target sync` first
    """

    def __init__(self, project_dir: Optional[str] = None):
        """
        Initialize local JAR provisioning.

        :param project_dir: Project root directory; defaults to the cwd.
        """
        base = project_dir if project_dir else os.getcwd()
        self.project_dir = base
        self.jdbc_jars_dir = os.path.join(base, ".dvt", "jdbc_jars")

    def get_jar_paths(self) -> List[str]:
        """
        Discover all JDBC JAR files in the project cache.

        :returns: Sorted list of JAR file paths (empty when the cache is absent)
        """
        if not os.path.exists(self.jdbc_jars_dir):
            return []
        return sorted(glob.glob(os.path.join(self.jdbc_jars_dir, "*.jar")))

    def get_spark_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Build the ``spark.jars`` config entry from the local cache.

        :param adapter_types: Ignored — every cached JAR is attached.
        :returns: ``{"spark.jars": <comma-joined paths>}`` or ``{}`` when empty
        """
        paths = self.get_jar_paths()
        return {"spark.jars": ",".join(paths)} if paths else {}

    def get_provisioning_type(self) -> str:
        """Identify this strategy as ``'local'``."""
        return "local"
138
+
139
+
140
class RemoteJARProvisioning(JARProvisioning):
    """
    Provision JDBC drivers via ``spark.jars.packages`` Maven coordinates.

    Intended for remote Spark clusters (Databricks, EMR, Dataproc,
    Standalone), whose workers download the drivers from Maven Central.

    Trade-offs:
      + works on any remote cluster; no pre-installed JARs; Spark resolves deps
      - needs network access to Maven Central; first query pays the download;
        private JARs may require extra Maven repository configuration
    """

    def __init__(self, profiles_dir: Optional[str] = None):
        """
        Initialize remote JAR provisioning.

        :param profiles_dir: DVT profiles directory; defaults to ``~/.dvt``.
        """
        self.profiles_dir = profiles_dir if profiles_dir else str(Path.home() / ".dvt")

    def get_maven_coordinates(self, adapter_types: Set[str]) -> List[str]:
        """
        Map adapter types to de-duplicated Maven coordinates.

        Several adapters share one driver (e.g. postgres/timescaledb), so the
        coordinates are collected into a set before sorting.

        :param adapter_types: Set of adapter types
        :returns: Sorted list of Maven coordinates (group:artifact:version)
        """
        unique = {JDBC_MAVEN_COORDINATES.get(name.lower(), "") for name in adapter_types}
        # Drop unknown adapters and native engines (empty coordinate, e.g. 'spark').
        unique.discard("")
        return sorted(unique)

    def get_spark_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Build the ``spark.jars.packages`` config entry.

        :param adapter_types: Set of adapter types that need JDBC drivers
        :returns: ``{"spark.jars.packages": <comma-joined coords>}`` or ``{}``
        """
        coords = self.get_maven_coordinates(adapter_types)
        return {"spark.jars.packages": ",".join(coords)} if coords else {}

    def get_provisioning_type(self) -> str:
        """Identify this strategy as ``'maven'``."""
        return "maven"
200
+
201
+
202
def get_required_adapter_types(profiles_dir: Optional[str] = None) -> Set[str]:
    """
    Scan profiles.yml and return the set of adapter types needed.

    Best-effort: a missing or unreadable/malformed profiles.yml yields an
    empty set rather than an error.

    :param profiles_dir: Path to DVT profiles directory (defaults to ``~/.dvt``)
    :returns: Set of adapter type names (e.g. {'postgres', 'snowflake'})
    """
    from dbt.clients.yaml_helper import load_yaml_text

    base_dir = Path(profiles_dir) if profiles_dir is not None else Path.home() / ".dvt"
    profiles_path = base_dir / "profiles.yml"
    if not profiles_path.exists():
        return set()

    try:
        profiles = load_yaml_text(profiles_path.read_text()) or {}

        found: Set[str] = set()
        for profile_data in profiles.values():
            if not isinstance(profile_data, dict):
                continue
            # Each target under "outputs" declares its adapter via "type".
            for target_config in profile_data.get("outputs", {}).values():
                if isinstance(target_config, dict):
                    adapter_type = target_config.get("type")
                    if adapter_type:
                        found.add(adapter_type.lower())
        return found

    except Exception:
        # Deliberate best-effort: any read/parse failure means "no adapters found".
        return set()
236
+
237
+
238
def get_provisioning_for_platform(
    platform: str,
    project_dir: Optional[str] = None,
    profiles_dir: Optional[str] = None,
) -> JARProvisioning:
    """
    Factory function to get the appropriate JAR provisioning strategy.

    Only the local platform can read JARs off the driver's filesystem; every
    other (remote) platform resolves drivers through Maven coordinates.

    :param platform: Spark platform ('local', 'databricks', 'emr', 'dataproc', 'standalone')
    :param project_dir: Project directory (for local provisioning)
    :param profiles_dir: Profiles directory (for remote provisioning)
    :returns: JARProvisioning instance
    """
    if platform.lower() != "local":
        return RemoteJARProvisioning(profiles_dir=profiles_dir)
    return LocalJARProvisioning(project_dir=project_dir)