dvt-core 0.52.2 (cp310-cp310-macosx_10_9_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dvt-core might be problematic.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2039 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +804 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.py +624 -0
- dbt/compute/federated_executor.py +837 -0
- dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-310-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
- dbt/compute/smart_selector.py +311 -0
- dbt/compute/strategies/__init__.py +54 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.py +364 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-310-darwin.so +0 -0
- dbt/config/compute.py +547 -0
- dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +346 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +247 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-310-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.py +454 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/generate.py +660 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.py +29 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.py +553 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/printer.py +175 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1306 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.py +759 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +268 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.52.2.dist-info/METADATA +286 -0
- dvt_core-0.52.2.dist-info/RECORD +275 -0
- dvt_core-0.52.2.dist-info/WHEEL +5 -0
- dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
- dvt_core-0.52.2.dist-info/top_level.txt +2 -0
Binary file (dbt/compute/filter_pushdown.cpython-310-darwin.so)

dbt/compute/filter_pushdown.py
@@ -0,0 +1,273 @@
"""
Filter pushdown optimization for federated queries.

Extracts filters (WHERE, LIMIT, ORDER BY) from compiled SQL and rewrites them
to be pushed down to source databases in their native SQL dialects.
"""

import re
import sys
import sqlparse
from sqlparse.sql import Statement, Token, TokenList, Identifier, Where, Comparison
from sqlparse.tokens import Keyword, Whitespace
from typing import Dict, List, Optional, Any


class FilterPushdownOptimizer:
    """
    Optimizes federated queries by pushing filters down to source databases.

    Strategy:
    1. Parse compiled SQL to extract filters per source table
    2. Rewrite filters in each source adapter's SQL dialect
    3. Return subqueries for JDBC reads instead of plain table names

    Example:
        Input SQL:
            SELECT * FROM snowflake_table WHERE date > '2024-01-01' LIMIT 10

        Output:
            JDBC subquery: (SELECT * FROM snowflake_table WHERE date > '2024-01-01' LIMIT 10)
    """

    def __init__(self, compiled_sql: str, source_tables: List[Any]):
        """
        Initialize optimizer with compiled SQL and source table metadata.

        Args:
            compiled_sql: The fully compiled SQL from the model
            source_tables: List of SourceTableMetadata objects
        """
        self.compiled_sql = compiled_sql
        self.source_tables = source_tables
        self.parsed = sqlparse.parse(compiled_sql)[0] if compiled_sql else None

    def extract_limit(self) -> Optional[int]:
        """
        Extract LIMIT clause from SQL.

        Returns:
            Limit value as integer, or None if no LIMIT clause
        """
        if not self.parsed:
            return None

        # Simple regex approach for LIMIT (works for most cases)
        limit_match = re.search(r'\bLIMIT\s+(\d+)\b', self.compiled_sql, re.IGNORECASE)
        if limit_match:
            return int(limit_match.group(1))

        return None

    def extract_sample_clause(self) -> Optional[Dict[str, Any]]:
        """
        Extract SAMPLE/TABLESAMPLE clause from SQL (Snowflake-specific sampling).

        Snowflake supports several SAMPLE methods:
        - SAMPLE (N) or SAMPLE (N ROWS) - Row-count sampling
        - SAMPLE SYSTEM (P) - System/block sampling with P% probability
        - SAMPLE BERNOULLI (P) - Bernoulli/row-level sampling with P% probability
        - SAMPLE BLOCK (P) - Alias for SYSTEM
        - TABLESAMPLE ... - Alternative syntax
        - REPEATABLE(seed) or SEED(seed) - Reproducible sampling

        Returns:
            Dict with keys:
            - 'method': 'ROWS', 'SYSTEM', 'BERNOULLI', 'BLOCK'
            - 'value': int (row count or percentage)
            - 'seed': Optional int for reproducible sampling
            - 'full_clause': The complete SAMPLE clause to push down
            or None if no SAMPLE clause
        """
        if not self.parsed:
            return None

        # Try to match complete SAMPLE/TABLESAMPLE clause
        # Pattern: (TABLE)?SAMPLE <method>? (value) (REPEATABLE|SEED)?(seed)?

        # Match: SAMPLE (N) or SAMPLE (N ROWS) with optional seed
        sample_rows = re.search(
            r'\b(?:TABLE)?SAMPLE\s*\(\s*(\d+)(?:\s+ROWS)?\s*\)'
            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*(\d+)\s*\))?',
            self.compiled_sql,
            re.IGNORECASE
        )
        if sample_rows:
            result = {
                'method': 'ROWS',
                'value': int(sample_rows.group(1)),
                'seed': int(sample_rows.group(2)) if sample_rows.group(2) else None
            }
            # Build full clause
            clause = f"SAMPLE ({result['value']})"
            if result['seed']:
                clause += f" REPEATABLE ({result['seed']})"
            result['full_clause'] = clause
            return result

        # Match: SAMPLE SYSTEM|BERNOULLI|BLOCK (P) with optional seed
        sample_method = re.search(
            r'\b(?:TABLE)?SAMPLE\s+(SYSTEM|BERNOULLI|BLOCK)\s*\(\s*(\d+(?:\.\d+)?)\s*\)'
            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*(\d+)\s*\))?',
            self.compiled_sql,
            re.IGNORECASE
        )
        if sample_method:
            method = sample_method.group(1).upper()
            # BLOCK is an alias for SYSTEM
            if method == 'BLOCK':
                method = 'SYSTEM'

            result = {
                'method': method,
                'value': float(sample_method.group(2)),
                'seed': int(sample_method.group(3)) if sample_method.group(3) else None
            }
            # Build full clause
            clause = f"SAMPLE {result['method']} ({result['value']})"
            if result['seed']:
                clause += f" REPEATABLE ({result['seed']})"
            result['full_clause'] = clause
            return result

        return None

    def extract_where_clauses(self) -> Dict[str, List[str]]:
        """
        Extract WHERE clauses that apply to specific source tables.

        Returns:
            Dict mapping table name/alias to list of WHERE conditions

        Example:
            {
                'snowflake_table': ['date > \'2024-01-01\'', 'status = \'active\''],
                'postgres_table': ['id > 100']
            }
        """
        # TODO: Implement WHERE clause extraction using sqlparse
        # For now, return empty dict - LIMIT pushdown is the priority
        return {}

    def build_pushdown_subquery(
        self,
        source_table: Any,
        adapter_type: str
    ) -> Optional[str]:
        """
        Build a subquery with pushed-down filters for a specific source table.

        Args:
            source_table: SourceTableMetadata object
            adapter_type: Adapter type (postgres, snowflake, etc.)

        Returns:
            SQL subquery with filters, or None if no pushdown possible

        Example:
            Input: table="schema.table", LIMIT 10
            Output: "(SELECT * FROM schema.table LIMIT 10)"
        """
        limit = self.extract_limit()
        sample_clause = self.extract_sample_clause()
        where_clauses = self.extract_where_clauses()

        # DVT v0.4.7: Suppressed debug output for clean console
        # Debug info: LIMIT={limit}, SAMPLE={sample_clause}, WHERE={where_clauses}

        # If no filters to push down, return None (read full table)
        if not limit and not sample_clause and not where_clauses:
            return None

        # Build subquery
        qualified_name = source_table.qualified_name
        subquery_parts = [f"SELECT * FROM {qualified_name}"]

        # Add SAMPLE clause (Snowflake-specific, goes right after FROM)
        if sample_clause and adapter_type.lower() == 'snowflake':
            # Use the pre-built full_clause from extract_sample_clause
            # This includes all sampling options: method, value, and seed
            subquery_parts.append(sample_clause['full_clause'])

        # Add WHERE clauses (if any)
        table_key = source_table.identifier  # or qualified_name
        if table_key in where_clauses:
            conditions = " AND ".join(where_clauses[table_key])
            subquery_parts.append(f"WHERE {conditions}")

        # Add LIMIT (if present and no SAMPLE used)
        # Note: SAMPLE takes precedence over LIMIT for Snowflake
        if limit and not (sample_clause and adapter_type.lower() == 'snowflake'):
            # Rewrite LIMIT in adapter's dialect
            limit_clause = self._rewrite_limit_for_adapter(limit, adapter_type)
            if limit_clause:
                subquery_parts.append(limit_clause)

        subquery = " ".join(subquery_parts)
        # DVT v0.4.7: Suppressed debug output
        return f"({subquery})"

    def _rewrite_limit_for_adapter(self, limit: int, adapter_type: str) -> Optional[str]:
        """
        Rewrite LIMIT clause for specific adapter's SQL dialect.

        Args:
            limit: Limit value
            adapter_type: Adapter type (postgres, snowflake, redshift, etc.)

        Returns:
            LIMIT clause in adapter's dialect
        """
        # Most adapters support standard LIMIT syntax
        standard_adapters = [
            'postgres', 'postgresql',
            'snowflake',
            'redshift',
            'mysql',
            'sqlite',
            'bigquery'
        ]

        if adapter_type.lower() in standard_adapters:
            return f"LIMIT {limit}"

        # SQL Server / TSQL uses TOP
        if adapter_type.lower() in ['sqlserver', 'mssql', 'tsql']:
            # Note: This should go in SELECT clause, not at the end
            # For now, return None - we'll handle this in a future iteration
            return None

        # Oracle uses ROWNUM or FETCH FIRST (12c+)
        if adapter_type.lower() == 'oracle':
            return f"FETCH FIRST {limit} ROWS ONLY"

        # Default: standard LIMIT
        return f"LIMIT {limit}"


def optimize_jdbc_table_read(
    source_table: Any,
    compiled_sql: str,
    source_tables: List[Any],
    adapter_type: str
) -> str:
    """
    Optimize JDBC table read by pushing down filters.

    Args:
        source_table: SourceTableMetadata for this table
        compiled_sql: Compiled SQL from the model
        source_tables: All source tables in the query
        adapter_type: Source adapter type

    Returns:
        Table identifier (plain name or subquery with filters)
    """
    optimizer = FilterPushdownOptimizer(compiled_sql, source_tables)
    subquery = optimizer.build_pushdown_subquery(source_table, adapter_type)

    if subquery:
        return subquery
    else:
        # No filters to push down - read full table
        return source_table.qualified_name
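A minimal usage sketch of the pushdown entry point above. The SourceTableMetadata type is not shown in this diff, so a SimpleNamespace stand-in carrying the two attributes the optimizer reads (qualified_name and identifier) is assumed here for illustration; the import path simply mirrors the file location dbt/compute/filter_pushdown.py.

# Hypothetical usage sketch; the stand-in table object and sample SQL are assumptions.
from types import SimpleNamespace
from dbt.compute.filter_pushdown import optimize_jdbc_table_read

orders = SimpleNamespace(qualified_name="analytics.public.orders", identifier="orders")
compiled_sql = "SELECT * FROM analytics.public.orders LIMIT 10"

# The LIMIT is detected and wrapped into a JDBC subquery; if nothing were
# pushable, the plain qualified table name would be returned instead.
table_or_subquery = optimize_jdbc_table_read(
    source_table=orders,
    compiled_sql=compiled_sql,
    source_tables=[orders],
    adapter_type="postgres",
)
print(table_or_subquery)  # (SELECT * FROM analytics.public.orders LIMIT 10)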
Binary file (dbt/compute/jar_provisioning.cpython-310-darwin.so)

dbt/compute/jar_provisioning.py
@@ -0,0 +1,255 @@
"""
JAR Provisioning Module

Centralized JDBC JAR provisioning for Spark compute engines.

v0.5.98: Supports two provisioning strategies:
- LocalJARProvisioning: Uses spark.jars with local file paths (fast startup)
- RemoteJARProvisioning: Uses spark.jars.packages with Maven coordinates (remote clusters)

Local Spark uses local JARs from .dvt/jdbc_jars/ for instant startup.
Remote clusters (Databricks, EMR, Dataproc, Standalone) use Maven coordinates
so Spark workers can download JARs directly from Maven Central.
"""

import glob
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Set


# Maven coordinates for JDBC drivers - used by remote clusters
# Format: groupId:artifactId:version
# These are the same JARs as downloaded by `dvt target sync`, but expressed as Maven coordinates
JDBC_MAVEN_COORDINATES = {
    # Official dbt-labs adapters - JDBC drivers only
    "postgres": "org.postgresql:postgresql:42.7.4",
    "snowflake": "net.snowflake:snowflake-jdbc:3.16.1",
    "bigquery": "com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.22",
    "redshift": "com.amazon.redshift:redshift-jdbc42:2.1.0.32",
    "spark": "",  # Native, no JDBC needed
    "databricks": "com.databricks:databricks-jdbc:2.6.36",
    "trino": "io.trino:trino-jdbc:443",
    "duckdb": "org.duckdb:duckdb_jdbc:1.1.3",
    # Community adapters - JDBC drivers only (verified on Maven)
    "mysql": "com.mysql:mysql-connector-j:9.1.0",
    "sqlserver": "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11",
    "synapse": "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11",
    "fabric": "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11",
    "oracle": "com.oracle.database.jdbc:ojdbc11:23.6.0.24.10",
    "teradata": "com.teradata.jdbc:terajdbc:20.00.00.20",
    "clickhouse": "com.clickhouse:clickhouse-jdbc:0.6.5",
    "greenplum": "org.postgresql:postgresql:42.7.4",  # PostgreSQL compatible
    "vertica": "com.vertica.jdbc:vertica-jdbc:24.3.0-0",
    "sqlite": "org.xerial:sqlite-jdbc:3.47.1.0",
    "mariadb": "org.mariadb.jdbc:mariadb-java-client:3.4.1",
    "exasol": "com.exasol:exasol-jdbc:24.2.0",
    "db2": "com.ibm.db2:jcc:11.5.9.0",
    "presto": "io.prestosql:presto-jdbc:350",
    "hive": "org.apache.hive:hive-jdbc:3.1.3",
    "singlestore": "com.singlestore:singlestore-jdbc-client:1.2.9",
    "starrocks": "com.mysql:mysql-connector-j:9.1.0",  # MySQL wire protocol
    "doris": "com.mysql:mysql-connector-j:9.1.0",  # MySQL wire protocol
    "materialize": "org.postgresql:postgresql:42.7.4",  # PostgreSQL wire protocol
    "neo4j": "org.neo4j:neo4j-jdbc-driver:4.0.10",
    "timescaledb": "org.postgresql:postgresql:42.7.4",  # PostgreSQL extension
    "questdb": "org.postgresql:postgresql:42.7.4",  # PostgreSQL wire protocol
}


class JARProvisioning(ABC):
    """Abstract base class for JAR provisioning strategies."""

    @abstractmethod
    def get_spark_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Get Spark configuration for JDBC JARs.

        :param adapter_types: Set of adapter types that need JDBC drivers
        :returns: Dictionary of Spark config keys/values
        """
        pass

    @abstractmethod
    def get_provisioning_type(self) -> str:
        """
        Get the type of JAR provisioning.

        :returns: 'local' or 'maven'
        """
        pass


class LocalJARProvisioning(JARProvisioning):
    """
    Local JAR provisioning using spark.jars with local file paths.

    Best for local Spark (spark-local) where JARs are pre-downloaded
    to .dvt/jdbc_jars/ directory via `dvt target sync`.

    Advantages:
    - Instant startup (no JAR download at runtime)
    - Works offline
    - Consistent JAR versions

    Disadvantages:
    - Only works for local Spark (JARs must be on local filesystem)
    - Requires running `dvt target sync` first
    """

    def __init__(self, project_dir: Optional[str] = None):
        """
        Initialize local JAR provisioning.

        :param project_dir: Path to project root directory (defaults to cwd)
        """
        self.project_dir = project_dir or os.getcwd()
        self.jdbc_jars_dir = os.path.join(self.project_dir, ".dvt", "jdbc_jars")

    def get_jar_paths(self) -> List[str]:
        """
        Discover all JDBC JAR files from project cache.

        :returns: List of absolute JAR file paths
        """
        if not os.path.exists(self.jdbc_jars_dir):
            return []

        jar_pattern = os.path.join(self.jdbc_jars_dir, "*.jar")
        return sorted(glob.glob(jar_pattern))

    def get_spark_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Get Spark config using local JAR paths.

        :param adapter_types: Set of adapter types (ignored - uses all JARs found)
        :returns: Dictionary with spark.jars config
        """
        jar_paths = self.get_jar_paths()

        if jar_paths:
            return {"spark.jars": ",".join(jar_paths)}
        return {}

    def get_provisioning_type(self) -> str:
        """Get provisioning type."""
        return "local"


class RemoteJARProvisioning(JARProvisioning):
    """
    Remote JAR provisioning using spark.jars.packages with Maven coordinates.

    Best for remote Spark clusters (Databricks, EMR, Dataproc, Standalone)
    where Spark workers need to download JARs from Maven Central.

    Advantages:
    - Works with any remote Spark cluster
    - No need to pre-install JARs on cluster
    - Spark handles dependency resolution

    Disadvantages:
    - Requires network access to Maven Central
    - First query may be slower (JAR download)
    - May need to configure Maven repositories for private JARs
    """

    def __init__(self, profiles_dir: Optional[str] = None):
        """
        Initialize remote JAR provisioning.

        :param profiles_dir: Path to DVT profiles directory (for scanning profiles.yml)
        """
        self.profiles_dir = profiles_dir or str(Path.home() / ".dvt")

    def get_maven_coordinates(self, adapter_types: Set[str]) -> List[str]:
        """
        Get Maven coordinates for the specified adapter types.

        :param adapter_types: Set of adapter types
        :returns: List of Maven coordinates (group:artifact:version)
        """
        coordinates = []
        seen = set()  # Avoid duplicates (e.g., postgres and timescaledb both use postgresql)

        for adapter_type in adapter_types:
            coord = JDBC_MAVEN_COORDINATES.get(adapter_type.lower(), "")
            if coord and coord not in seen:
                coordinates.append(coord)
                seen.add(coord)

        return sorted(coordinates)

    def get_spark_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Get Spark config using Maven coordinates.

        :param adapter_types: Set of adapter types that need JDBC drivers
        :returns: Dictionary with spark.jars.packages config
        """
        coordinates = self.get_maven_coordinates(adapter_types)

        if coordinates:
            return {"spark.jars.packages": ",".join(coordinates)}
        return {}

    def get_provisioning_type(self) -> str:
        """Get provisioning type."""
        return "maven"


def get_required_adapter_types(profiles_dir: Optional[str] = None) -> Set[str]:
    """
    Scan profiles.yml and return the set of adapter types needed.

    :param profiles_dir: Path to DVT profiles directory
    :returns: Set of adapter type names (e.g., {'postgres', 'snowflake'})
    """
    from dbt.clients.yaml_helper import load_yaml_text

    if profiles_dir is None:
        profiles_dir = str(Path.home() / ".dvt")

    profiles_path = Path(profiles_dir) / "profiles.yml"
    if not profiles_path.exists():
        return set()

    try:
        content = profiles_path.read_text()
        profiles = load_yaml_text(content) or {}

        adapter_types = set()
        for profile_name, profile_data in profiles.items():
            if isinstance(profile_data, dict):
                outputs = profile_data.get("outputs", {})
                for target_name, target_config in outputs.items():
                    if isinstance(target_config, dict):
                        adapter_type = target_config.get("type")
                        if adapter_type:
                            adapter_types.add(adapter_type.lower())

        return adapter_types

    except Exception:
        return set()


def get_provisioning_for_platform(
    platform: str,
    project_dir: Optional[str] = None,
    profiles_dir: Optional[str] = None,
) -> JARProvisioning:
    """
    Factory function to get the appropriate JAR provisioning strategy.

    :param platform: Spark platform ('local', 'databricks', 'emr', 'dataproc', 'standalone')
    :param project_dir: Project directory (for local provisioning)
    :param profiles_dir: Profiles directory (for remote provisioning)
    :returns: JARProvisioning instance
    """
    if platform.lower() == "local":
        return LocalJARProvisioning(project_dir=project_dir)
    else:
        # All remote platforms use Maven coordinates
        return RemoteJARProvisioning(profiles_dir=profiles_dir)
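A minimal usage sketch of the provisioning factory above, assuming profiles.yml in the default ~/.dvt directory lists postgres and snowflake targets; the expected output shown in the comments follows directly from JDBC_MAVEN_COORDINATES and the sorting in get_maven_coordinates.

# Hypothetical usage sketch; the adapter set shown is an assumption.
from dbt.compute.jar_provisioning import (
    get_provisioning_for_platform,
    get_required_adapter_types,
)

adapters = get_required_adapter_types()              # e.g. {"postgres", "snowflake"}
provisioning = get_provisioning_for_platform("databricks")
spark_config = provisioning.get_spark_config(adapters)
# Remote platforms resolve to RemoteJARProvisioning, so this yields Maven coordinates, e.g.
# {"spark.jars.packages": "net.snowflake:snowflake-jdbc:3.16.1,org.postgresql:postgresql:42.7.4"}
# platform="local" would instead return LocalJARProvisioning and emit spark.jars
# pointing at the JAR files cached under <project>/.dvt/jdbc_jars/.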
Binary file (dbt/compute/java_compat.cpython-310-darwin.so)