dvt-core 1.11.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dvt-core has been flagged as possibly problematic.
- dvt/__init__.py +7 -0
- dvt/_pydantic_shim.py +26 -0
- dvt/adapters/__init__.py +16 -0
- dvt/adapters/multi_adapter_manager.py +268 -0
- dvt/artifacts/__init__.py +0 -0
- dvt/artifacts/exceptions/__init__.py +1 -0
- dvt/artifacts/exceptions/schemas.py +31 -0
- dvt/artifacts/resources/__init__.py +116 -0
- dvt/artifacts/resources/base.py +68 -0
- dvt/artifacts/resources/types.py +93 -0
- dvt/artifacts/resources/v1/analysis.py +10 -0
- dvt/artifacts/resources/v1/catalog.py +23 -0
- dvt/artifacts/resources/v1/components.py +275 -0
- dvt/artifacts/resources/v1/config.py +282 -0
- dvt/artifacts/resources/v1/documentation.py +11 -0
- dvt/artifacts/resources/v1/exposure.py +52 -0
- dvt/artifacts/resources/v1/function.py +53 -0
- dvt/artifacts/resources/v1/generic_test.py +32 -0
- dvt/artifacts/resources/v1/group.py +22 -0
- dvt/artifacts/resources/v1/hook.py +11 -0
- dvt/artifacts/resources/v1/macro.py +30 -0
- dvt/artifacts/resources/v1/metric.py +173 -0
- dvt/artifacts/resources/v1/model.py +146 -0
- dvt/artifacts/resources/v1/owner.py +10 -0
- dvt/artifacts/resources/v1/saved_query.py +112 -0
- dvt/artifacts/resources/v1/seed.py +42 -0
- dvt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dvt/artifacts/resources/v1/semantic_model.py +315 -0
- dvt/artifacts/resources/v1/singular_test.py +14 -0
- dvt/artifacts/resources/v1/snapshot.py +92 -0
- dvt/artifacts/resources/v1/source_definition.py +85 -0
- dvt/artifacts/resources/v1/sql_operation.py +10 -0
- dvt/artifacts/resources/v1/unit_test_definition.py +78 -0
- dvt/artifacts/schemas/__init__.py +0 -0
- dvt/artifacts/schemas/base.py +191 -0
- dvt/artifacts/schemas/batch_results.py +24 -0
- dvt/artifacts/schemas/catalog/__init__.py +12 -0
- dvt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dvt/artifacts/schemas/catalog/v1/catalog.py +60 -0
- dvt/artifacts/schemas/freshness/__init__.py +1 -0
- dvt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dvt/artifacts/schemas/freshness/v3/freshness.py +159 -0
- dvt/artifacts/schemas/manifest/__init__.py +2 -0
- dvt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dvt/artifacts/schemas/manifest/v12/manifest.py +212 -0
- dvt/artifacts/schemas/results.py +148 -0
- dvt/artifacts/schemas/run/__init__.py +2 -0
- dvt/artifacts/schemas/run/v5/__init__.py +0 -0
- dvt/artifacts/schemas/run/v5/run.py +184 -0
- dvt/artifacts/schemas/upgrades/__init__.py +4 -0
- dvt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dvt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dvt/artifacts/utils/validation.py +153 -0
- dvt/cli/__init__.py +1 -0
- dvt/cli/context.py +16 -0
- dvt/cli/exceptions.py +56 -0
- dvt/cli/flags.py +558 -0
- dvt/cli/main.py +971 -0
- dvt/cli/option_types.py +121 -0
- dvt/cli/options.py +79 -0
- dvt/cli/params.py +803 -0
- dvt/cli/requires.py +478 -0
- dvt/cli/resolvers.py +32 -0
- dvt/cli/types.py +40 -0
- dvt/clients/__init__.py +0 -0
- dvt/clients/checked_load.py +82 -0
- dvt/clients/git.py +164 -0
- dvt/clients/jinja.py +206 -0
- dvt/clients/jinja_static.py +245 -0
- dvt/clients/registry.py +192 -0
- dvt/clients/yaml_helper.py +68 -0
- dvt/compilation.py +833 -0
- dvt/compute/__init__.py +26 -0
- dvt/compute/base.py +288 -0
- dvt/compute/engines/__init__.py +13 -0
- dvt/compute/engines/duckdb_engine.py +368 -0
- dvt/compute/engines/spark_engine.py +273 -0
- dvt/compute/query_analyzer.py +212 -0
- dvt/compute/router.py +483 -0
- dvt/config/__init__.py +4 -0
- dvt/config/catalogs.py +95 -0
- dvt/config/compute_config.py +406 -0
- dvt/config/profile.py +411 -0
- dvt/config/profiles_v2.py +464 -0
- dvt/config/project.py +893 -0
- dvt/config/renderer.py +232 -0
- dvt/config/runtime.py +491 -0
- dvt/config/selectors.py +209 -0
- dvt/config/utils.py +78 -0
- dvt/connectors/.gitignore +6 -0
- dvt/connectors/README.md +306 -0
- dvt/connectors/catalog.yml +217 -0
- dvt/connectors/download_connectors.py +300 -0
- dvt/constants.py +29 -0
- dvt/context/__init__.py +0 -0
- dvt/context/base.py +746 -0
- dvt/context/configured.py +136 -0
- dvt/context/context_config.py +350 -0
- dvt/context/docs.py +82 -0
- dvt/context/exceptions_jinja.py +179 -0
- dvt/context/macro_resolver.py +195 -0
- dvt/context/macros.py +171 -0
- dvt/context/manifest.py +73 -0
- dvt/context/providers.py +2198 -0
- dvt/context/query_header.py +14 -0
- dvt/context/secret.py +59 -0
- dvt/context/target.py +74 -0
- dvt/contracts/__init__.py +0 -0
- dvt/contracts/files.py +413 -0
- dvt/contracts/graph/__init__.py +0 -0
- dvt/contracts/graph/manifest.py +1904 -0
- dvt/contracts/graph/metrics.py +98 -0
- dvt/contracts/graph/model_config.py +71 -0
- dvt/contracts/graph/node_args.py +42 -0
- dvt/contracts/graph/nodes.py +1806 -0
- dvt/contracts/graph/semantic_manifest.py +233 -0
- dvt/contracts/graph/unparsed.py +812 -0
- dvt/contracts/project.py +417 -0
- dvt/contracts/results.py +53 -0
- dvt/contracts/selection.py +23 -0
- dvt/contracts/sql.py +86 -0
- dvt/contracts/state.py +69 -0
- dvt/contracts/util.py +46 -0
- dvt/deprecations.py +347 -0
- dvt/deps/__init__.py +0 -0
- dvt/deps/base.py +153 -0
- dvt/deps/git.py +196 -0
- dvt/deps/local.py +80 -0
- dvt/deps/registry.py +131 -0
- dvt/deps/resolver.py +149 -0
- dvt/deps/tarball.py +121 -0
- dvt/docs/source/_ext/dbt_click.py +118 -0
- dvt/docs/source/conf.py +32 -0
- dvt/env_vars.py +64 -0
- dvt/event_time/event_time.py +40 -0
- dvt/event_time/sample_window.py +60 -0
- dvt/events/__init__.py +16 -0
- dvt/events/base_types.py +37 -0
- dvt/events/core_types_pb2.py +2 -0
- dvt/events/logging.py +109 -0
- dvt/events/types.py +2534 -0
- dvt/exceptions.py +1487 -0
- dvt/flags.py +89 -0
- dvt/graph/__init__.py +11 -0
- dvt/graph/cli.py +248 -0
- dvt/graph/graph.py +172 -0
- dvt/graph/queue.py +213 -0
- dvt/graph/selector.py +375 -0
- dvt/graph/selector_methods.py +976 -0
- dvt/graph/selector_spec.py +223 -0
- dvt/graph/thread_pool.py +18 -0
- dvt/hooks.py +21 -0
- dvt/include/README.md +49 -0
- dvt/include/__init__.py +3 -0
- dvt/include/global_project.py +4 -0
- dvt/include/starter_project/.gitignore +4 -0
- dvt/include/starter_project/README.md +15 -0
- dvt/include/starter_project/__init__.py +3 -0
- dvt/include/starter_project/analyses/.gitkeep +0 -0
- dvt/include/starter_project/dvt_project.yml +36 -0
- dvt/include/starter_project/macros/.gitkeep +0 -0
- dvt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dvt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dvt/include/starter_project/models/example/schema.yml +21 -0
- dvt/include/starter_project/seeds/.gitkeep +0 -0
- dvt/include/starter_project/snapshots/.gitkeep +0 -0
- dvt/include/starter_project/tests/.gitkeep +0 -0
- dvt/internal_deprecations.py +27 -0
- dvt/jsonschemas/__init__.py +3 -0
- dvt/jsonschemas/jsonschemas.py +309 -0
- dvt/jsonschemas/project/0.0.110.json +4717 -0
- dvt/jsonschemas/project/0.0.85.json +2015 -0
- dvt/jsonschemas/resources/0.0.110.json +2636 -0
- dvt/jsonschemas/resources/0.0.85.json +2536 -0
- dvt/jsonschemas/resources/latest.json +6773 -0
- dvt/links.py +4 -0
- dvt/materializations/__init__.py +0 -0
- dvt/materializations/incremental/__init__.py +0 -0
- dvt/materializations/incremental/microbatch.py +235 -0
- dvt/mp_context.py +8 -0
- dvt/node_types.py +37 -0
- dvt/parser/__init__.py +23 -0
- dvt/parser/analysis.py +21 -0
- dvt/parser/base.py +549 -0
- dvt/parser/common.py +267 -0
- dvt/parser/docs.py +52 -0
- dvt/parser/fixtures.py +51 -0
- dvt/parser/functions.py +30 -0
- dvt/parser/generic_test.py +100 -0
- dvt/parser/generic_test_builders.py +334 -0
- dvt/parser/hooks.py +119 -0
- dvt/parser/macros.py +137 -0
- dvt/parser/manifest.py +2204 -0
- dvt/parser/models.py +574 -0
- dvt/parser/partial.py +1179 -0
- dvt/parser/read_files.py +445 -0
- dvt/parser/schema_generic_tests.py +423 -0
- dvt/parser/schema_renderer.py +111 -0
- dvt/parser/schema_yaml_readers.py +936 -0
- dvt/parser/schemas.py +1467 -0
- dvt/parser/search.py +149 -0
- dvt/parser/seeds.py +28 -0
- dvt/parser/singular_test.py +20 -0
- dvt/parser/snapshots.py +44 -0
- dvt/parser/sources.py +557 -0
- dvt/parser/sql.py +63 -0
- dvt/parser/unit_tests.py +622 -0
- dvt/plugins/__init__.py +20 -0
- dvt/plugins/contracts.py +10 -0
- dvt/plugins/exceptions.py +2 -0
- dvt/plugins/manager.py +164 -0
- dvt/plugins/manifest.py +21 -0
- dvt/profiler.py +20 -0
- dvt/py.typed +1 -0
- dvt/runners/__init__.py +2 -0
- dvt/runners/exposure_runner.py +7 -0
- dvt/runners/no_op_runner.py +46 -0
- dvt/runners/saved_query_runner.py +7 -0
- dvt/selected_resources.py +8 -0
- dvt/task/__init__.py +0 -0
- dvt/task/base.py +504 -0
- dvt/task/build.py +197 -0
- dvt/task/clean.py +57 -0
- dvt/task/clone.py +162 -0
- dvt/task/compile.py +151 -0
- dvt/task/compute.py +366 -0
- dvt/task/debug.py +650 -0
- dvt/task/deps.py +280 -0
- dvt/task/docs/__init__.py +3 -0
- dvt/task/docs/generate.py +408 -0
- dvt/task/docs/index.html +250 -0
- dvt/task/docs/serve.py +28 -0
- dvt/task/freshness.py +323 -0
- dvt/task/function.py +122 -0
- dvt/task/group_lookup.py +46 -0
- dvt/task/init.py +374 -0
- dvt/task/list.py +237 -0
- dvt/task/printer.py +176 -0
- dvt/task/profiles.py +256 -0
- dvt/task/retry.py +175 -0
- dvt/task/run.py +1146 -0
- dvt/task/run_operation.py +142 -0
- dvt/task/runnable.py +802 -0
- dvt/task/seed.py +104 -0
- dvt/task/show.py +150 -0
- dvt/task/snapshot.py +57 -0
- dvt/task/sql.py +111 -0
- dvt/task/test.py +464 -0
- dvt/tests/fixtures/__init__.py +1 -0
- dvt/tests/fixtures/project.py +620 -0
- dvt/tests/util.py +651 -0
- dvt/tracking.py +529 -0
- dvt/utils/__init__.py +3 -0
- dvt/utils/artifact_upload.py +151 -0
- dvt/utils/utils.py +408 -0
- dvt/version.py +249 -0
- dvt_core-1.11.0b4.dist-info/METADATA +252 -0
- dvt_core-1.11.0b4.dist-info/RECORD +261 -0
- dvt_core-1.11.0b4.dist-info/WHEEL +5 -0
- dvt_core-1.11.0b4.dist-info/entry_points.txt +2 -0
- dvt_core-1.11.0b4.dist-info/top_level.txt +1 -0
dvt/config/utils.py
ADDED
@@ -0,0 +1,78 @@
from typing import Any, Dict, Optional

from dvt import deprecations
from dvt.clients import yaml_helper
from dvt.events.types import InvalidOptionYAML
from dvt.exceptions import DbtExclusivePropertyUseError, OptionNotYamlDictError

from dbt_common.events.functions import fire_event
from dbt_common.exceptions import DbtValidationError


def parse_cli_vars(var_string: str) -> Dict[str, Any]:
    return parse_cli_yaml_string(var_string, "vars")


def parse_cli_yaml_string(var_string: str, cli_option_name: str) -> Dict[str, Any]:
    try:
        cli_vars = yaml_helper.load_yaml_text(var_string)
        var_type = type(cli_vars)
        if cli_vars is not None and var_type is dict:
            return cli_vars
        else:
            raise OptionNotYamlDictError(var_type, cli_option_name)
    except (DbtValidationError, OptionNotYamlDictError):
        fire_event(InvalidOptionYAML(option_name=cli_option_name))
        raise


def exclusive_primary_alt_value_setting(
    dictionary: Optional[Dict[str, Any]],
    primary: str,
    alt: str,
    parent_config: Optional[str] = None,
) -> None:
    """Munges in place under the primary the options for the primary and alt values

    Sometimes we allow setting something via TWO keys, but not at the same time. If both the primary
    key and alt key have values, an error gets raised. If the alt key has values, then we update
    the dictionary to ensure the primary key contains the values. If neither are set, nothing happens.
    """

    if dictionary is None:
        return

    primary_options = dictionary.get(primary)
    alt_options = dictionary.get(alt)

    if primary_options and alt_options:
        where = f" in `{parent_config}`" if parent_config is not None else ""
        raise DbtExclusivePropertyUseError(
            f"Only `{alt}` or `{primary}` can be specified{where}, not both"
        )

    if alt in dictionary:
        alt_value = dictionary.pop(alt)
        dictionary[primary] = alt_value


def normalize_warn_error_options(warn_error_options: Dict[str, Any]) -> None:
    has_include = "include" in warn_error_options
    has_exclude = "exclude" in warn_error_options

    if has_include or has_exclude:
        deprecations.buffer(
            "weo-include-exclude-deprecation",
            found_include=has_include,
            found_exclude=has_exclude,
        )

    exclusive_primary_alt_value_setting(
        warn_error_options, "error", "include", "warn_error_options"
    )
    exclusive_primary_alt_value_setting(
        warn_error_options, "warn", "exclude", "warn_error_options"
    )
    for key in ("error", "warn", "silence"):
        if key in warn_error_options and warn_error_options[key] is None:
            warn_error_options[key] = []
dvt/connectors/README.md
ADDED
@@ -0,0 +1,306 @@
# DVT Database Connectors

This directory contains the connector catalog for DVT's compute engines (DuckDB and Spark).

## Overview

DVT uses **dbt adapters** for all database connections, both for reading from source databases and writing to target databases. This provides a unified, Python-based connection mechanism that works with any database supported by the dbt ecosystem.

## No JARs Required

Unlike traditional Spark solutions that require JDBC JAR files, DVT extracts data from databases using dbt adapters and transfers it to the compute layer via Apache Arrow format. This approach:

- **Eliminates JAR dependencies** - Pure Python solution
- **Works with any dbt adapter** - 30+ databases supported out of the box
- **Provides consistent interface** - Same connection mechanism for all databases
- **Reduces package size** - No 200+ MB of JARs to download
- **Simplifies configuration** - Single `profiles.yml` for all connections

## Architecture

### Data Flow

```
Source DB → dbt adapter → Agate Table → Arrow Table →
Compute Engine (DuckDB/Spark) → Arrow Table → Target dbt adapter → Target DB
```
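The Agate-to-Arrow hop in this diagram can be illustrated with the public `agate` and `pyarrow` APIs. This is a simplified sketch with fabricated data, not DVT's internal converter; the compute-engine side is sketched after the step list below.

```python
import agate
import pyarrow as pa

# Stand-in for a result set returned by a dbt adapter.
adapter_result = agate.Table(
    [(1, "alice"), (2, "bob")], column_names=["customer_id", "name"]
)

# Column-wise conversion into an Arrow table keyed by the same column names.
arrow_table = pa.table(
    {name: adapter_result.columns[name].values() for name in adapter_result.column_names}
)
print(arrow_table.schema)
```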

### Example: Cross-Database Query

```yaml
# profiles.yml
postgres_prod:
  adapter: postgres
  host: db.example.com
  port: 5432
  user: analytics
  password: "{{ env_var('POSTGRES_PASSWORD') }}"
  database: production
  schema: public

mysql_legacy:
  adapter: mysql
  host: legacy-db.example.com
  port: 3306
  user: readonly
  password: "{{ env_var('MYSQL_PASSWORD') }}"
  database: orders_db
```

```yaml
# sources.yml
sources:
  - name: postgres_source
    profile: postgres_prod
    tables:
      - name: customers

  - name: mysql_source
    profile: mysql_legacy
    tables:
      - name: orders
```

```sql
-- models/cross_db_analysis.sql
select
    c.customer_id,
    c.name,
    count(o.order_id) as order_count
from {{ source('postgres_source', 'customers') }} c
left join {{ source('mysql_source', 'orders') }} o
    on c.customer_id = o.customer_id
group by c.customer_id, c.name
```

DVT will automatically:
1. Detect heterogeneous sources (PostgreSQL + MySQL)
2. Extract data via dbt adapters (`dbt-postgres`, `dbt-mysql`)
3. Convert to Arrow format for efficient transfer
4. Load into compute engine (DuckDB or Spark)
5. Execute the query in the compute layer
6. Return unified result set
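To make steps 3 through 6 concrete, here is a hedged sketch with DuckDB standing in as the compute engine; the Arrow tables are fabricated in place of real adapter extracts.

```python
import duckdb
import pyarrow as pa

# Stand-ins for data extracted via dbt-postgres and dbt-mysql (steps 2-3).
customers = pa.table({"customer_id": [1, 2], "name": ["alice", "bob"]})
orders = pa.table({"order_id": [10, 11, 12], "customer_id": [1, 1, 2]})

con = duckdb.connect()
con.register("customers", customers)  # step 4: register Arrow tables
con.register("orders", orders)

# Step 5: run the compiled model SQL inside the compute engine.
result = con.execute(
    """
    select c.customer_id, c.name, count(o.order_id) as order_count
    from customers c
    left join orders o on c.customer_id = o.customer_id
    group by c.customer_id, c.name
    """
).arrow()  # step 6: results come back as an Arrow table
print(result)
```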

## Supported Databases

DVT works with **any database that has a dbt adapter**. This includes:

**Relational Databases:**
- PostgreSQL (`dbt-postgres`)
- MySQL (`dbt-mysql`)
- SQL Server (`dbt-sqlserver`)
- Oracle (`dbt-oracle`)

**Cloud Data Warehouses:**
- Snowflake (`dbt-snowflake`)
- BigQuery (`dbt-bigquery`)
- Redshift (`dbt-redshift`)
- Databricks (`dbt-databricks`)

**Analytics Databases:**
- DuckDB (`dbt-duckdb`)
- ClickHouse (`dbt-clickhouse`)
- Trino (`dbt-trino`)

**And many more...**

See the [dbt adapter registry](https://docs.getdbt.com/docs/supported-data-platforms) for the complete list.

## Installation

Install DVT with the dbt adapters you need:

```bash
# Install DVT core
pip install dvt-core

# Install adapters for your databases
pip install dbt-postgres dbt-mysql dbt-snowflake
```

## Configuration

Configure profiles in `~/.dvt/profiles.yml` using the same format as dbt:

```yaml
# PostgreSQL
postgres_prod:
  adapter: postgres
  host: db.example.com
  port: 5432
  user: analytics
  password: "{{ env_var('POSTGRES_PASSWORD') }}"
  database: production
  schema: public

# MySQL
mysql_legacy:
  adapter: mysql
  host: legacy-db.example.com
  port: 3306
  user: readonly
  password: "{{ env_var('MYSQL_PASSWORD') }}"
  database: orders_db

# Snowflake
snowflake_analytics:
  adapter: snowflake
  account: mycompany
  user: analytics
  password: "{{ env_var('SNOWFLAKE_PASSWORD') }}"
  database: analytics
  warehouse: compute_wh
  schema: public
```

Test your connections:

```bash
dvt profiles test --all
```

## Compute Engines

DVT supports two compute engines for cross-database queries:

### DuckDB (Default)
- In-process analytical database
- Fast for datasets < 10GB
- Zero configuration required
- Perfect for development and small-to-medium workloads

### PySpark
- Distributed compute engine
- Scales to 100GB+ datasets
- Local or cluster mode
- No JDBC JARs required - uses dbt adapters

Configure in `dvt_project.yml`:

```yaml
compute:
  default_engine: duckdb  # or 'spark'

  duckdb:
    memory_limit: '4GB'
    threads: 4

  spark:
    type: local
    master: 'local[*]'
    config:
      spark.executor.memory: '4g'
```

## How It Works

### Traditional Spark Approach (Old)
```
Spark → JDBC JARs (200+ MB) → Database
```
- Requires downloading JARs
- Different JAR for each database
- Version conflicts
- Large package size

### DVT Approach (New)
```
Database → dbt adapter → Arrow → Compute Engine
```
- Pure Python solution
- Uses existing dbt adapters
- No JARs needed
- Small package size (~10 MB core)

### Implementation Details

When DVT executes a cross-database query:

1. **Query Analysis**: DVT analyzes the SQL to identify all source databases
2. **Data Extraction**: For each source:
   - Get dbt adapter for the profile
   - Execute `SELECT * FROM table` via adapter
   - Receive results as Agate table
3. **Arrow Conversion**: Convert Agate tables to Arrow format (zero-copy)
4. **Compute Layer**:
   - **DuckDB**: Register Arrow tables directly
   - **Spark**: Convert Arrow → Pandas → Spark DataFrame
5. **Query Execution**: Execute the original query in the compute engine
6. **Results**: Return results as Arrow table
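The Spark branch of step 4 can likewise be sketched with stock `pyspark` and `pyarrow` (an illustration only, not DVT's engine code; `pandas` must be installed for `to_pandas()`):

```python
import pyarrow as pa
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("dvt-sketch").getOrCreate()

# Stand-in for an Arrow table produced in step 3.
arrow_table = pa.table({"customer_id": [1, 2], "name": ["alice", "bob"]})

# Arrow → Pandas → Spark DataFrame, as described above.
df = spark.createDataFrame(arrow_table.to_pandas())
df.createOrReplaceTempView("customers")
spark.sql("select count(*) from customers").show()
```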

## Catalog File

The `catalog.yml` file is retained for documentation purposes and to map database types to dbt adapter names:

```yaml
postgres:
  adapter_name: postgres
  dbt_package: dbt-postgres
  description: PostgreSQL database
  connection_docs: https://docs.getdbt.com/reference/warehouse-setups/postgres-setup

mysql:
  adapter_name: mysql
  dbt_package: dbt-mysql
  description: MySQL database
  connection_docs: https://docs.getdbt.com/reference/warehouse-setups/mysql-setup
```
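A hypothetical lookup against that mapping (field names as in the excerpt above; PyYAML assumed):

```python
import yaml

with open("catalog.yml") as f:
    catalog = yaml.safe_load(f)

# Resolve the pip package to install for a given database type.
print(f"pip install {catalog['mysql']['dbt_package']}")  # pip install dbt-mysql
```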

## Troubleshooting

### Missing Adapter Error

If you see `adapter not found` errors:

```bash
# Check installed adapters
pip list | grep dbt-

# Install missing adapter
pip install dbt-postgres
```

### Connection Failures

If connections fail:

```bash
# Test individual profile
dvt profiles test postgres_prod

# Test all profiles
dvt profiles test --all

# Check profile configuration
dvt profiles show postgres_prod
```

### Performance Issues

For large datasets:

1. Switch to Spark engine:
   ```yaml
   compute:
     default_engine: spark
   ```

2. Increase memory limits:
   ```yaml
   spark:
     config:
       spark.executor.memory: '8g'
       spark.driver.memory: '4g'
   ```

3. Use per-model configuration:
   ```sql
   {{ config(compute='spark') }}
   ```

## See Also

- [Multi-Profile Guide](../../docs/multi-profile-guide.md)
- [Compute Configuration](../../docs/compute-configuration.md)
- [DVT Architecture](../../docs/DVT_ARCHITECTURE.md)
- [dbt Adapter Documentation](https://docs.getdbt.com/docs/supported-data-platforms)
dvt/connectors/catalog.yml
ADDED
@@ -0,0 +1,217 @@
# DVT Spark JDBC Connector Catalog
#
# This file catalogs all JDBC connectors bundled with DVT for cross-database queries.
# Each connector enables Spark to read from a specific database type.
#
# Format:
#   connector_name:
#     description: Human-readable description
#     driver_class: JDBC driver class name
#     maven_coordinates: Maven group:artifact:version
#     url_pattern: JDBC URL pattern
#     homepage: Official project homepage
#     license: Software license

version: "1.0.0"

connectors:
  # Relational Databases
  postgresql:
    description: "PostgreSQL - Open source relational database"
    driver_class: "org.postgresql.Driver"
    maven_coordinates: "org.postgresql:postgresql:42.7.1"
    url_pattern: "jdbc:postgresql://<host>:<port>/<database>"
    homepage: "https://www.postgresql.org/"
    license: "PostgreSQL License"

  mysql:
    description: "MySQL - Popular open source relational database"
    driver_class: "com.mysql.cj.jdbc.Driver"
    maven_coordinates: "com.mysql:mysql-connector-j:8.2.0"
    url_pattern: "jdbc:mysql://<host>:<port>/<database>"
    homepage: "https://www.mysql.com/"
    license: "GPL v2"

  mariadb:
    description: "MariaDB - MySQL-compatible open source database"
    driver_class: "org.mariadb.jdbc.Driver"
    maven_coordinates: "org.mariadb.jdbc:mariadb-java-client:3.3.1"
    url_pattern: "jdbc:mariadb://<host>:<port>/<database>"
    homepage: "https://mariadb.org/"
    license: "LGPL v2.1"

  oracle:
    description: "Oracle Database - Enterprise relational database"
    driver_class: "oracle.jdbc.OracleDriver"
    maven_coordinates: "com.oracle.database.jdbc:ojdbc11:23.3.0.23.09"
    url_pattern: "jdbc:oracle:thin:@<host>:<port>:<sid>"
    homepage: "https://www.oracle.com/database/"
    license: "Oracle Free Use Terms and Conditions"

  sqlserver:
    description: "Microsoft SQL Server - Enterprise database"
    driver_class: "com.microsoft.sqlserver.jdbc.SQLServerDriver"
    maven_coordinates: "com.microsoft.sqlserver:mssql-jdbc:12.4.2.jre11"
    url_pattern: "jdbc:sqlserver://<host>:<port>;databaseName=<database>"
    homepage: "https://www.microsoft.com/sql-server/"
    license: "MIT"

  db2:
    description: "IBM Db2 - Enterprise database"
    driver_class: "com.ibm.db2.jcc.DB2Driver"
    maven_coordinates: "com.ibm.db2:jcc:11.5.9.0"
    url_pattern: "jdbc:db2://<host>:<port>/<database>"
    homepage: "https://www.ibm.com/products/db2"
    license: "IBM"

  # Cloud Databases
  snowflake:
    description: "Snowflake - Cloud data warehouse"
    driver_class: "net.snowflake.client.jdbc.SnowflakeDriver"
    maven_coordinates: "net.snowflake:snowflake-jdbc:3.14.4"
    url_pattern: "jdbc:snowflake://<account>.snowflakecomputing.com"
    homepage: "https://www.snowflake.com/"
    license: "Apache 2.0"

  bigquery:
    description: "Google BigQuery - Serverless data warehouse"
    driver_class: "com.simba.googlebigquery.jdbc.Driver"
    maven_coordinates: "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.35.1"
    url_pattern: "jdbc:bigquery://<project>:<dataset>"
    homepage: "https://cloud.google.com/bigquery"
    license: "Apache 2.0"

  redshift:
    description: "Amazon Redshift - Cloud data warehouse"
    driver_class: "com.amazon.redshift.jdbc.Driver"
    maven_coordinates: "com.amazon.redshift:redshift-jdbc42:2.1.0.25"
    url_pattern: "jdbc:redshift://<endpoint>:<port>/<database>"
    homepage: "https://aws.amazon.com/redshift/"
    license: "Apache 2.0"

  athena:
    description: "Amazon Athena - Serverless query service"
    driver_class: "com.simba.athena.jdbc.Driver"
    maven_coordinates: "com.amazonaws:aws-java-sdk-athena:1.12.565"
    url_pattern: "jdbc:awsathena://<region>.amazonaws.com:443"
    homepage: "https://aws.amazon.com/athena/"
    license: "Apache 2.0"

  # Analytics & MPP Databases
  clickhouse:
    description: "ClickHouse - Fast open-source column-oriented database"
    driver_class: "com.clickhouse.jdbc.ClickHouseDriver"
    maven_coordinates: "com.clickhouse:clickhouse-jdbc:0.6.0-patch1"
    url_pattern: "jdbc:clickhouse://<host>:<port>/<database>"
    homepage: "https://clickhouse.com/"
    license: "Apache 2.0"

  vertica:
    description: "Vertica - Unified analytics platform"
    driver_class: "com.vertica.jdbc.Driver"
    maven_coordinates: "com.vertica.jdbc:vertica-jdbc:23.4.0-0"
    url_pattern: "jdbc:vertica://<host>:<port>/<database>"
    homepage: "https://www.vertica.com/"
    license: "Apache 2.0"

  presto:
    description: "Presto - Distributed SQL query engine"
    driver_class: "com.facebook.presto.jdbc.PrestoDriver"
    maven_coordinates: "com.facebook.presto:presto-jdbc:0.285"
    url_pattern: "jdbc:presto://<host>:<port>/<catalog>"
    homepage: "https://prestodb.io/"
    license: "Apache 2.0"

  trino:
    description: "Trino - Fast distributed SQL query engine"
    driver_class: "io.trino.jdbc.TrinoDriver"
    maven_coordinates: "io.trino:trino-jdbc:433"
    url_pattern: "jdbc:trino://<host>:<port>/<catalog>"
    homepage: "https://trino.io/"
    license: "Apache 2.0"

  # NoSQL & NewSQL
  cassandra:
    description: "Apache Cassandra - Distributed NoSQL database"
    driver_class: "com.github.adejanovski.cassandra.jdbc.CassandraDriver"
    maven_coordinates: "com.github.adejanovski:cassandra-jdbc-wrapper:4.10.2"
    url_pattern: "jdbc:cassandra://<host>:<port>/<keyspace>"
    homepage: "https://cassandra.apache.org/"
    license: "Apache 2.0"

  mongodb:
    description: "MongoDB - Document database"
    driver_class: "mongodb.jdbc.MongoDriver"
    maven_coordinates: "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1"
    url_pattern: "mongodb://<host>:<port>/<database>"
    homepage: "https://www.mongodb.com/"
    license: "Server Side Public License"

  # Other Databases
  h2:
    description: "H2 - Lightweight Java SQL database"
    driver_class: "org.h2.Driver"
    maven_coordinates: "com.h2database:h2:2.2.224"
    url_pattern: "jdbc:h2:<file_path>"
    homepage: "https://h2database.com/"
    license: "MPL 2.0 or EPL 1.0"

  sqlite:
    description: "SQLite - Embedded relational database"
    driver_class: "org.sqlite.JDBC"
    maven_coordinates: "org.xerial:sqlite-jdbc:3.44.1.0"
    url_pattern: "jdbc:sqlite:<file_path>"
    homepage: "https://www.sqlite.org/"
    license: "Public Domain"

  derby:
    description: "Apache Derby - Java-based relational database"
    driver_class: "org.apache.derby.jdbc.EmbeddedDriver"
    maven_coordinates: "org.apache.derby:derby:10.16.1.1"
    url_pattern: "jdbc:derby:<database>"
    homepage: "https://db.apache.org/derby/"
    license: "Apache 2.0"

  hive:
    description: "Apache Hive - Data warehouse software"
    driver_class: "org.apache.hive.jdbc.HiveDriver"
    maven_coordinates: "org.apache.hive:hive-jdbc:3.1.3"
    url_pattern: "jdbc:hive2://<host>:<port>/<database>"
    homepage: "https://hive.apache.org/"
    license: "Apache 2.0"

  impala:
    description: "Apache Impala - MPP SQL query engine"
    driver_class: "com.cloudera.impala.jdbc.Driver"
    maven_coordinates: "org.apache.impala:impala-jdbc:2.6.30"
    url_pattern: "jdbc:impala://<host>:<port>/<database>"
    homepage: "https://impala.apache.org/"
    license: "Apache 2.0"

  phoenix:
    description: "Apache Phoenix - SQL skin over HBase"
    driver_class: "org.apache.phoenix.jdbc.PhoenixDriver"
    maven_coordinates: "org.apache.phoenix:phoenix-client-hbase-2.5:5.1.3"
    url_pattern: "jdbc:phoenix:<zookeeper_quorum>"
    homepage: "https://phoenix.apache.org/"
    license: "Apache 2.0"

  # Time Series & Specialized
  timescaledb:
    description: "TimescaleDB - Time-series extension for PostgreSQL"
    driver_class: "org.postgresql.Driver"
    maven_coordinates: "org.postgresql:postgresql:42.7.1"
    url_pattern: "jdbc:postgresql://<host>:<port>/<database>"
    homepage: "https://www.timescale.com/"
    license: "Timescale License"

  influxdb:
    description: "InfluxDB - Time series database"
    driver_class: "org.influxdb.jdbc.InfluxDBDriver"
    maven_coordinates: "org.influxdb:influxdb-java:2.24"
    url_pattern: "jdbc:influxdb://<host>:<port>/<database>"
    homepage: "https://www.influxdata.com/"
    license: "MIT"

# Estimated total size: ~500MB - 1GB for all connectors
# Individual connector sizes range from 1MB to 100MB