dvt-core 1.11.0b4 (dvt_core-1.11.0b4-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dvt-core might be problematic.

Files changed (261)
  1. dvt/__init__.py +7 -0
  2. dvt/_pydantic_shim.py +26 -0
  3. dvt/adapters/__init__.py +16 -0
  4. dvt/adapters/multi_adapter_manager.py +268 -0
  5. dvt/artifacts/__init__.py +0 -0
  6. dvt/artifacts/exceptions/__init__.py +1 -0
  7. dvt/artifacts/exceptions/schemas.py +31 -0
  8. dvt/artifacts/resources/__init__.py +116 -0
  9. dvt/artifacts/resources/base.py +68 -0
  10. dvt/artifacts/resources/types.py +93 -0
  11. dvt/artifacts/resources/v1/analysis.py +10 -0
  12. dvt/artifacts/resources/v1/catalog.py +23 -0
  13. dvt/artifacts/resources/v1/components.py +275 -0
  14. dvt/artifacts/resources/v1/config.py +282 -0
  15. dvt/artifacts/resources/v1/documentation.py +11 -0
  16. dvt/artifacts/resources/v1/exposure.py +52 -0
  17. dvt/artifacts/resources/v1/function.py +53 -0
  18. dvt/artifacts/resources/v1/generic_test.py +32 -0
  19. dvt/artifacts/resources/v1/group.py +22 -0
  20. dvt/artifacts/resources/v1/hook.py +11 -0
  21. dvt/artifacts/resources/v1/macro.py +30 -0
  22. dvt/artifacts/resources/v1/metric.py +173 -0
  23. dvt/artifacts/resources/v1/model.py +146 -0
  24. dvt/artifacts/resources/v1/owner.py +10 -0
  25. dvt/artifacts/resources/v1/saved_query.py +112 -0
  26. dvt/artifacts/resources/v1/seed.py +42 -0
  27. dvt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  28. dvt/artifacts/resources/v1/semantic_model.py +315 -0
  29. dvt/artifacts/resources/v1/singular_test.py +14 -0
  30. dvt/artifacts/resources/v1/snapshot.py +92 -0
  31. dvt/artifacts/resources/v1/source_definition.py +85 -0
  32. dvt/artifacts/resources/v1/sql_operation.py +10 -0
  33. dvt/artifacts/resources/v1/unit_test_definition.py +78 -0
  34. dvt/artifacts/schemas/__init__.py +0 -0
  35. dvt/artifacts/schemas/base.py +191 -0
  36. dvt/artifacts/schemas/batch_results.py +24 -0
  37. dvt/artifacts/schemas/catalog/__init__.py +12 -0
  38. dvt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  39. dvt/artifacts/schemas/catalog/v1/catalog.py +60 -0
  40. dvt/artifacts/schemas/freshness/__init__.py +1 -0
  41. dvt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  42. dvt/artifacts/schemas/freshness/v3/freshness.py +159 -0
  43. dvt/artifacts/schemas/manifest/__init__.py +2 -0
  44. dvt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  45. dvt/artifacts/schemas/manifest/v12/manifest.py +212 -0
  46. dvt/artifacts/schemas/results.py +148 -0
  47. dvt/artifacts/schemas/run/__init__.py +2 -0
  48. dvt/artifacts/schemas/run/v5/__init__.py +0 -0
  49. dvt/artifacts/schemas/run/v5/run.py +184 -0
  50. dvt/artifacts/schemas/upgrades/__init__.py +4 -0
  51. dvt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  52. dvt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  53. dvt/artifacts/utils/validation.py +153 -0
  54. dvt/cli/__init__.py +1 -0
  55. dvt/cli/context.py +16 -0
  56. dvt/cli/exceptions.py +56 -0
  57. dvt/cli/flags.py +558 -0
  58. dvt/cli/main.py +971 -0
  59. dvt/cli/option_types.py +121 -0
  60. dvt/cli/options.py +79 -0
  61. dvt/cli/params.py +803 -0
  62. dvt/cli/requires.py +478 -0
  63. dvt/cli/resolvers.py +32 -0
  64. dvt/cli/types.py +40 -0
  65. dvt/clients/__init__.py +0 -0
  66. dvt/clients/checked_load.py +82 -0
  67. dvt/clients/git.py +164 -0
  68. dvt/clients/jinja.py +206 -0
  69. dvt/clients/jinja_static.py +245 -0
  70. dvt/clients/registry.py +192 -0
  71. dvt/clients/yaml_helper.py +68 -0
  72. dvt/compilation.py +833 -0
  73. dvt/compute/__init__.py +26 -0
  74. dvt/compute/base.py +288 -0
  75. dvt/compute/engines/__init__.py +13 -0
  76. dvt/compute/engines/duckdb_engine.py +368 -0
  77. dvt/compute/engines/spark_engine.py +273 -0
  78. dvt/compute/query_analyzer.py +212 -0
  79. dvt/compute/router.py +483 -0
  80. dvt/config/__init__.py +4 -0
  81. dvt/config/catalogs.py +95 -0
  82. dvt/config/compute_config.py +406 -0
  83. dvt/config/profile.py +411 -0
  84. dvt/config/profiles_v2.py +464 -0
  85. dvt/config/project.py +893 -0
  86. dvt/config/renderer.py +232 -0
  87. dvt/config/runtime.py +491 -0
  88. dvt/config/selectors.py +209 -0
  89. dvt/config/utils.py +78 -0
  90. dvt/connectors/.gitignore +6 -0
  91. dvt/connectors/README.md +306 -0
  92. dvt/connectors/catalog.yml +217 -0
  93. dvt/connectors/download_connectors.py +300 -0
  94. dvt/constants.py +29 -0
  95. dvt/context/__init__.py +0 -0
  96. dvt/context/base.py +746 -0
  97. dvt/context/configured.py +136 -0
  98. dvt/context/context_config.py +350 -0
  99. dvt/context/docs.py +82 -0
  100. dvt/context/exceptions_jinja.py +179 -0
  101. dvt/context/macro_resolver.py +195 -0
  102. dvt/context/macros.py +171 -0
  103. dvt/context/manifest.py +73 -0
  104. dvt/context/providers.py +2198 -0
  105. dvt/context/query_header.py +14 -0
  106. dvt/context/secret.py +59 -0
  107. dvt/context/target.py +74 -0
  108. dvt/contracts/__init__.py +0 -0
  109. dvt/contracts/files.py +413 -0
  110. dvt/contracts/graph/__init__.py +0 -0
  111. dvt/contracts/graph/manifest.py +1904 -0
  112. dvt/contracts/graph/metrics.py +98 -0
  113. dvt/contracts/graph/model_config.py +71 -0
  114. dvt/contracts/graph/node_args.py +42 -0
  115. dvt/contracts/graph/nodes.py +1806 -0
  116. dvt/contracts/graph/semantic_manifest.py +233 -0
  117. dvt/contracts/graph/unparsed.py +812 -0
  118. dvt/contracts/project.py +417 -0
  119. dvt/contracts/results.py +53 -0
  120. dvt/contracts/selection.py +23 -0
  121. dvt/contracts/sql.py +86 -0
  122. dvt/contracts/state.py +69 -0
  123. dvt/contracts/util.py +46 -0
  124. dvt/deprecations.py +347 -0
  125. dvt/deps/__init__.py +0 -0
  126. dvt/deps/base.py +153 -0
  127. dvt/deps/git.py +196 -0
  128. dvt/deps/local.py +80 -0
  129. dvt/deps/registry.py +131 -0
  130. dvt/deps/resolver.py +149 -0
  131. dvt/deps/tarball.py +121 -0
  132. dvt/docs/source/_ext/dbt_click.py +118 -0
  133. dvt/docs/source/conf.py +32 -0
  134. dvt/env_vars.py +64 -0
  135. dvt/event_time/event_time.py +40 -0
  136. dvt/event_time/sample_window.py +60 -0
  137. dvt/events/__init__.py +16 -0
  138. dvt/events/base_types.py +37 -0
  139. dvt/events/core_types_pb2.py +2 -0
  140. dvt/events/logging.py +109 -0
  141. dvt/events/types.py +2534 -0
  142. dvt/exceptions.py +1487 -0
  143. dvt/flags.py +89 -0
  144. dvt/graph/__init__.py +11 -0
  145. dvt/graph/cli.py +248 -0
  146. dvt/graph/graph.py +172 -0
  147. dvt/graph/queue.py +213 -0
  148. dvt/graph/selector.py +375 -0
  149. dvt/graph/selector_methods.py +976 -0
  150. dvt/graph/selector_spec.py +223 -0
  151. dvt/graph/thread_pool.py +18 -0
  152. dvt/hooks.py +21 -0
  153. dvt/include/README.md +49 -0
  154. dvt/include/__init__.py +3 -0
  155. dvt/include/global_project.py +4 -0
  156. dvt/include/starter_project/.gitignore +4 -0
  157. dvt/include/starter_project/README.md +15 -0
  158. dvt/include/starter_project/__init__.py +3 -0
  159. dvt/include/starter_project/analyses/.gitkeep +0 -0
  160. dvt/include/starter_project/dvt_project.yml +36 -0
  161. dvt/include/starter_project/macros/.gitkeep +0 -0
  162. dvt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  163. dvt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  164. dvt/include/starter_project/models/example/schema.yml +21 -0
  165. dvt/include/starter_project/seeds/.gitkeep +0 -0
  166. dvt/include/starter_project/snapshots/.gitkeep +0 -0
  167. dvt/include/starter_project/tests/.gitkeep +0 -0
  168. dvt/internal_deprecations.py +27 -0
  169. dvt/jsonschemas/__init__.py +3 -0
  170. dvt/jsonschemas/jsonschemas.py +309 -0
  171. dvt/jsonschemas/project/0.0.110.json +4717 -0
  172. dvt/jsonschemas/project/0.0.85.json +2015 -0
  173. dvt/jsonschemas/resources/0.0.110.json +2636 -0
  174. dvt/jsonschemas/resources/0.0.85.json +2536 -0
  175. dvt/jsonschemas/resources/latest.json +6773 -0
  176. dvt/links.py +4 -0
  177. dvt/materializations/__init__.py +0 -0
  178. dvt/materializations/incremental/__init__.py +0 -0
  179. dvt/materializations/incremental/microbatch.py +235 -0
  180. dvt/mp_context.py +8 -0
  181. dvt/node_types.py +37 -0
  182. dvt/parser/__init__.py +23 -0
  183. dvt/parser/analysis.py +21 -0
  184. dvt/parser/base.py +549 -0
  185. dvt/parser/common.py +267 -0
  186. dvt/parser/docs.py +52 -0
  187. dvt/parser/fixtures.py +51 -0
  188. dvt/parser/functions.py +30 -0
  189. dvt/parser/generic_test.py +100 -0
  190. dvt/parser/generic_test_builders.py +334 -0
  191. dvt/parser/hooks.py +119 -0
  192. dvt/parser/macros.py +137 -0
  193. dvt/parser/manifest.py +2204 -0
  194. dvt/parser/models.py +574 -0
  195. dvt/parser/partial.py +1179 -0
  196. dvt/parser/read_files.py +445 -0
  197. dvt/parser/schema_generic_tests.py +423 -0
  198. dvt/parser/schema_renderer.py +111 -0
  199. dvt/parser/schema_yaml_readers.py +936 -0
  200. dvt/parser/schemas.py +1467 -0
  201. dvt/parser/search.py +149 -0
  202. dvt/parser/seeds.py +28 -0
  203. dvt/parser/singular_test.py +20 -0
  204. dvt/parser/snapshots.py +44 -0
  205. dvt/parser/sources.py +557 -0
  206. dvt/parser/sql.py +63 -0
  207. dvt/parser/unit_tests.py +622 -0
  208. dvt/plugins/__init__.py +20 -0
  209. dvt/plugins/contracts.py +10 -0
  210. dvt/plugins/exceptions.py +2 -0
  211. dvt/plugins/manager.py +164 -0
  212. dvt/plugins/manifest.py +21 -0
  213. dvt/profiler.py +20 -0
  214. dvt/py.typed +1 -0
  215. dvt/runners/__init__.py +2 -0
  216. dvt/runners/exposure_runner.py +7 -0
  217. dvt/runners/no_op_runner.py +46 -0
  218. dvt/runners/saved_query_runner.py +7 -0
  219. dvt/selected_resources.py +8 -0
  220. dvt/task/__init__.py +0 -0
  221. dvt/task/base.py +504 -0
  222. dvt/task/build.py +197 -0
  223. dvt/task/clean.py +57 -0
  224. dvt/task/clone.py +162 -0
  225. dvt/task/compile.py +151 -0
  226. dvt/task/compute.py +366 -0
  227. dvt/task/debug.py +650 -0
  228. dvt/task/deps.py +280 -0
  229. dvt/task/docs/__init__.py +3 -0
  230. dvt/task/docs/generate.py +408 -0
  231. dvt/task/docs/index.html +250 -0
  232. dvt/task/docs/serve.py +28 -0
  233. dvt/task/freshness.py +323 -0
  234. dvt/task/function.py +122 -0
  235. dvt/task/group_lookup.py +46 -0
  236. dvt/task/init.py +374 -0
  237. dvt/task/list.py +237 -0
  238. dvt/task/printer.py +176 -0
  239. dvt/task/profiles.py +256 -0
  240. dvt/task/retry.py +175 -0
  241. dvt/task/run.py +1146 -0
  242. dvt/task/run_operation.py +142 -0
  243. dvt/task/runnable.py +802 -0
  244. dvt/task/seed.py +104 -0
  245. dvt/task/show.py +150 -0
  246. dvt/task/snapshot.py +57 -0
  247. dvt/task/sql.py +111 -0
  248. dvt/task/test.py +464 -0
  249. dvt/tests/fixtures/__init__.py +1 -0
  250. dvt/tests/fixtures/project.py +620 -0
  251. dvt/tests/util.py +651 -0
  252. dvt/tracking.py +529 -0
  253. dvt/utils/__init__.py +3 -0
  254. dvt/utils/artifact_upload.py +151 -0
  255. dvt/utils/utils.py +408 -0
  256. dvt/version.py +249 -0
  257. dvt_core-1.11.0b4.dist-info/METADATA +252 -0
  258. dvt_core-1.11.0b4.dist-info/RECORD +261 -0
  259. dvt_core-1.11.0b4.dist-info/WHEEL +5 -0
  260. dvt_core-1.11.0b4.dist-info/entry_points.txt +2 -0
  261. dvt_core-1.11.0b4.dist-info/top_level.txt +1 -0
dvt/config/utils.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Any, Dict, Optional
+
+ from dvt import deprecations
+ from dvt.clients import yaml_helper
+ from dvt.events.types import InvalidOptionYAML
+ from dvt.exceptions import DbtExclusivePropertyUseError, OptionNotYamlDictError
+
+ from dbt_common.events.functions import fire_event
+ from dbt_common.exceptions import DbtValidationError
+
+
+ def parse_cli_vars(var_string: str) -> Dict[str, Any]:
+     return parse_cli_yaml_string(var_string, "vars")
+
+
+ def parse_cli_yaml_string(var_string: str, cli_option_name: str) -> Dict[str, Any]:
+     try:
+         cli_vars = yaml_helper.load_yaml_text(var_string)
+         var_type = type(cli_vars)
+         if cli_vars is not None and var_type is dict:
+             return cli_vars
+         else:
+             raise OptionNotYamlDictError(var_type, cli_option_name)
+     except (DbtValidationError, OptionNotYamlDictError):
+         fire_event(InvalidOptionYAML(option_name=cli_option_name))
+         raise
+
+
+ def exclusive_primary_alt_value_setting(
+     dictionary: Optional[Dict[str, Any]],
+     primary: str,
+     alt: str,
+     parent_config: Optional[str] = None,
+ ) -> None:
+     """Munges in place under the primary the options for the primary and alt values
+
+     Sometimes we allow setting something via TWO keys, but not at the same time. If both the primary
+     key and alt key have values, an error gets raised. If the alt key has values, then we update
+     the dictionary to ensure the primary key contains the values. If neither are set, nothing happens.
+     """
+
+     if dictionary is None:
+         return
+
+     primary_options = dictionary.get(primary)
+     alt_options = dictionary.get(alt)
+
+     if primary_options and alt_options:
+         where = f" in `{parent_config}`" if parent_config is not None else ""
+         raise DbtExclusivePropertyUseError(
+             f"Only `{alt}` or `{primary}` can be specified{where}, not both"
+         )
+
+     if alt in dictionary:
+         alt_value = dictionary.pop(alt)
+         dictionary[primary] = alt_value
+
+
+ def normalize_warn_error_options(warn_error_options: Dict[str, Any]) -> None:
+     has_include = "include" in warn_error_options
+     has_exclude = "exclude" in warn_error_options
+
+     if has_include or has_exclude:
+         deprecations.buffer(
+             "weo-include-exclude-deprecation",
+             found_include=has_include,
+             found_exclude=has_exclude,
+         )
+
+     exclusive_primary_alt_value_setting(
+         warn_error_options, "error", "include", "warn_error_options"
+     )
+     exclusive_primary_alt_value_setting(
+         warn_error_options, "warn", "exclude", "warn_error_options"
+     )
+     for key in ("error", "warn", "silence"):
+         if key in warn_error_options and warn_error_options[key] is None:
+             warn_error_options[key] = []
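
As a quick illustration of the key/alias munging above (a minimal sketch; it assumes `dvt-core` is installed so the import resolves, and the sample dict is hypothetical):

```python
from dvt.config.utils import exclusive_primary_alt_value_setting

# "exclude" is the alt key for the primary key "warn": its value is moved
# under "warn" in place.
opts = {"exclude": ["Deprecations"], "silence": None}
exclusive_primary_alt_value_setting(opts, primary="warn", alt="exclude")
assert opts == {"warn": ["Deprecations"], "silence": None}

# Supplying both the primary and the alt key at once raises
# DbtExclusivePropertyUseError instead of silently picking one.
```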
dvt/connectors/.gitignore ADDED
@@ -0,0 +1,6 @@
+ # Ignore downloaded JAR files
+ jars/
+ *.jar
+
+ # But keep the directory structure
+ !.gitkeep
dvt/connectors/README.md ADDED
@@ -0,0 +1,306 @@
+ # DVT Database Connectors
+
+ This directory contains the connector catalog for DVT's compute engines (DuckDB and Spark).
+
+ ## Overview
+
+ DVT uses **dbt adapters** for all database connections, both for reading from source databases and for writing to target databases. This provides a unified, Python-based connection mechanism that works with any database supported by the dbt ecosystem.
+
+ ## No JARs Required
+
+ Unlike traditional Spark solutions that require JDBC JAR files, DVT extracts data from databases using dbt adapters and transfers it to the compute layer via the Apache Arrow format. This approach:
+
+ - **Eliminates JAR dependencies** - Pure Python solution
+ - **Works with any dbt adapter** - 30+ databases supported out of the box
+ - **Provides a consistent interface** - Same connection mechanism for all databases
+ - **Reduces package size** - No 200+ MB of JARs to download
+ - **Simplifies configuration** - Single `profiles.yml` for all connections
+
+ ## Architecture
+
+ ### Data Flow
+
+ ```
+ Source DB → dbt adapter → Agate Table → Arrow Table →
+ Compute Engine (DuckDB/Spark) → Arrow Table → Target dbt adapter → Target DB
+ ```
+
+ ### Example: Cross-Database Query
+
+ ```yaml
+ # profiles.yml
+ postgres_prod:
+   adapter: postgres
+   host: db.example.com
+   port: 5432
+   user: analytics
+   password: "{{ env_var('POSTGRES_PASSWORD') }}"
+   database: production
+   schema: public
+
+ mysql_legacy:
+   adapter: mysql
+   host: legacy-db.example.com
+   port: 3306
+   user: readonly
+   password: "{{ env_var('MYSQL_PASSWORD') }}"
+   database: orders_db
+ ```
+
+ ```yaml
+ # sources.yml
+ sources:
+   - name: postgres_source
+     profile: postgres_prod
+     tables:
+       - name: customers
+
+   - name: mysql_source
+     profile: mysql_legacy
+     tables:
+       - name: orders
+ ```
+
+ ```sql
+ -- models/cross_db_analysis.sql
+ select
+     c.customer_id,
+     c.name,
+     count(o.order_id) as order_count
+ from {{ source('postgres_source', 'customers') }} c
+ left join {{ source('mysql_source', 'orders') }} o
+     on c.customer_id = o.customer_id
+ group by c.customer_id, c.name
+ ```
+
+ DVT will automatically:
+ 1. Detect the heterogeneous sources (PostgreSQL + MySQL)
+ 2. Extract data via dbt adapters (`dbt-postgres`, `dbt-mysql`)
+ 3. Convert it to Arrow format for efficient transfer
+ 4. Load it into the compute engine (DuckDB or Spark)
+ 5. Execute the query in the compute layer
+ 6. Return a unified result set
+
+ ## Supported Databases
+
+ DVT works with **any database that has a dbt adapter**. This includes:
+
+ **Relational Databases:**
+ - PostgreSQL (`dbt-postgres`)
+ - MySQL (`dbt-mysql`)
+ - SQL Server (`dbt-sqlserver`)
+ - Oracle (`dbt-oracle`)
+
+ **Cloud Data Warehouses:**
+ - Snowflake (`dbt-snowflake`)
+ - BigQuery (`dbt-bigquery`)
+ - Redshift (`dbt-redshift`)
+ - Databricks (`dbt-databricks`)
+
+ **Analytics Databases:**
+ - DuckDB (`dbt-duckdb`)
+ - ClickHouse (`dbt-clickhouse`)
+ - Trino (`dbt-trino`)
+
+ **And many more...**
+
+ See the [dbt adapter registry](https://docs.getdbt.com/docs/supported-data-platforms) for the complete list.
+
+ ## Installation
+
+ Install DVT with the dbt adapters you need:
+
+ ```bash
+ # Install DVT core
+ pip install dvt-core
+
+ # Install adapters for your databases
+ pip install dbt-postgres dbt-mysql dbt-snowflake
+ ```
+
+ ## Configuration
+
+ Configure profiles in `~/.dvt/profiles.yml` using the same format as dbt:
+
+ ```yaml
+ # PostgreSQL
+ postgres_prod:
+   adapter: postgres
+   host: db.example.com
+   port: 5432
+   user: analytics
+   password: "{{ env_var('POSTGRES_PASSWORD') }}"
+   database: production
+   schema: public
+
+ # MySQL
+ mysql_legacy:
+   adapter: mysql
+   host: legacy-db.example.com
+   port: 3306
+   user: readonly
+   password: "{{ env_var('MYSQL_PASSWORD') }}"
+   database: orders_db
+
+ # Snowflake
+ snowflake_analytics:
+   adapter: snowflake
+   account: mycompany
+   user: analytics
+   password: "{{ env_var('SNOWFLAKE_PASSWORD') }}"
+   database: analytics
+   warehouse: compute_wh
+   schema: public
+ ```
+
+ Test your connections:
+
+ ```bash
+ dvt profiles test --all
+ ```
+
+ ## Compute Engines
+
+ DVT supports two compute engines for cross-database queries:
+
+ ### DuckDB (Default)
+ - In-process analytical database
+ - Fast for datasets < 10GB
+ - Zero configuration required
+ - Perfect for development and small-to-medium workloads
+
+ ### PySpark
+ - Distributed compute engine
+ - Scales to 100GB+ datasets
+ - Local or cluster mode
+ - No JDBC JARs required - uses dbt adapters
+
+ Configure in `dvt_project.yml`:
+
+ ```yaml
+ compute:
+   default_engine: duckdb  # or 'spark'
+
+   duckdb:
+     memory_limit: '4GB'
+     threads: 4
+
+   spark:
+     type: local
+     master: 'local[*]'
+     config:
+       spark.executor.memory: '4g'
+ ```
+
+ ## How It Works
+
+ ### Traditional Spark Approach (Old)
+ ```
+ Spark → JDBC JARs (200+ MB) → Database
+ ```
+ - Requires downloading JARs
+ - A different JAR for each database
+ - Version conflicts
+ - Large package size
+
+ ### DVT Approach (New)
+ ```
+ Database → dbt adapter → Arrow → Compute Engine
+ ```
+ - Pure Python solution
+ - Uses existing dbt adapters
+ - No JARs needed
+ - Small package size (~10 MB core)
+
+ ### Implementation Details
+
+ When DVT executes a cross-database query (see the sketch after this list):
+
+ 1. **Query Analysis**: DVT analyzes the SQL to identify all source databases
+ 2. **Data Extraction**: For each source:
+    - Get the dbt adapter for the profile
+    - Execute `SELECT * FROM table` via the adapter
+    - Receive the results as an Agate table
+ 3. **Arrow Conversion**: Convert the Agate tables to Arrow format (zero-copy)
+ 4. **Compute Layer**:
+    - **DuckDB**: Register the Arrow tables directly
+    - **Spark**: Convert Arrow → Pandas → Spark DataFrame
+ 5. **Query Execution**: Execute the original query in the compute engine
+ 6. **Results**: Return the results as an Arrow table
+
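To make steps 2 through 5 concrete, here is a minimal sketch of the Arrow handoff using plain `pyarrow` and `duckdb` (both assumptions on my part, as is the inline sample data standing in for what a dbt adapter would return; DVT's actual engines live under `dvt/compute/engines/`):

```python
import duckdb
import pyarrow as pa

# Steps 2-3: stand-in for rows extracted via a dbt adapter, built as an Arrow table.
rows_from_adapter = {"customer_id": [1, 2], "name": ["Ada", "Grace"]}
arrow_tbl = pa.table(rows_from_adapter)

# Step 4 (DuckDB branch): register the Arrow table directly as a queryable view.
con = duckdb.connect()
con.register("customers", arrow_tbl)

# Step 4 (Spark branch) would instead go Arrow -> Pandas -> Spark DataFrame:
#   spark_df = spark.createDataFrame(arrow_tbl.to_pandas())

# Steps 5-6: execute the query in the compute layer, fetch the result back as Arrow.
result = con.execute("select count(*) as n from customers").arrow()
print(result)
```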
+ ## Catalog File
+
+ The `catalog.yml` file is retained for documentation purposes and to map database types to dbt adapter names:
+
+ ```yaml
+ postgres:
+   adapter_name: postgres
+   dbt_package: dbt-postgres
+   description: PostgreSQL database
+   connection_docs: https://docs.getdbt.com/reference/warehouse-setups/postgres-setup
+
+ mysql:
+   adapter_name: mysql
+   dbt_package: dbt-mysql
+   description: MySQL database
+   connection_docs: https://docs.getdbt.com/reference/warehouse-setups/mysql-setup
+ ```
+
+ ## Troubleshooting
+
+ ### Missing Adapter Error
+
+ If you see `adapter not found` errors:
+
+ ```bash
+ # Check installed adapters
+ pip list | grep dbt-
+
+ # Install the missing adapter
+ pip install dbt-postgres
+ ```
+
+ ### Connection Failures
+
+ If connections fail:
+
+ ```bash
+ # Test an individual profile
+ dvt profiles test postgres_prod
+
+ # Test all profiles
+ dvt profiles test --all
+
+ # Check the profile configuration
+ dvt profiles show postgres_prod
+ ```
+
+ ### Performance Issues
+
+ For large datasets:
+
+ 1. Switch to the Spark engine:
+    ```yaml
+    compute:
+      default_engine: spark
+    ```
+
+ 2. Increase memory limits:
+    ```yaml
+    spark:
+      config:
+        spark.executor.memory: '8g'
+        spark.driver.memory: '4g'
+    ```
+
+ 3. Use per-model configuration:
+    ```sql
+    {{ config(compute='spark') }}
+    ```
+
+ ## See Also
+
+ - [Multi-Profile Guide](../../docs/multi-profile-guide.md)
+ - [Compute Configuration](../../docs/compute-configuration.md)
+ - [DVT Architecture](../../docs/DVT_ARCHITECTURE.md)
+ - [dbt Adapter Documentation](https://docs.getdbt.com/docs/supported-data-platforms)
dvt/connectors/catalog.yml ADDED
@@ -0,0 +1,217 @@
+ # DVT Spark JDBC Connector Catalog
+ #
+ # This file catalogs all JDBC connectors bundled with DVT for cross-database queries.
+ # Each connector enables Spark to read from a specific database type.
+ #
+ # Format:
+ #   connector_name:
+ #     description: Human-readable description
+ #     driver_class: JDBC driver class name
+ #     maven_coordinates: Maven group:artifact:version
+ #     url_pattern: JDBC URL pattern
+ #     homepage: Official project homepage
+ #     license: Software license
+
+ version: "1.0.0"
+
+ connectors:
+   # Relational Databases
+   postgresql:
+     description: "PostgreSQL - Open source relational database"
+     driver_class: "org.postgresql.Driver"
+     maven_coordinates: "org.postgresql:postgresql:42.7.1"
+     url_pattern: "jdbc:postgresql://<host>:<port>/<database>"
+     homepage: "https://www.postgresql.org/"
+     license: "PostgreSQL License"
+
+   mysql:
+     description: "MySQL - Popular open source relational database"
+     driver_class: "com.mysql.cj.jdbc.Driver"
+     maven_coordinates: "com.mysql:mysql-connector-j:8.2.0"
+     url_pattern: "jdbc:mysql://<host>:<port>/<database>"
+     homepage: "https://www.mysql.com/"
+     license: "GPL v2"
+
+   mariadb:
+     description: "MariaDB - MySQL-compatible open source database"
+     driver_class: "org.mariadb.jdbc.Driver"
+     maven_coordinates: "org.mariadb.jdbc:mariadb-java-client:3.3.1"
+     url_pattern: "jdbc:mariadb://<host>:<port>/<database>"
+     homepage: "https://mariadb.org/"
+     license: "LGPL v2.1"
+
+   oracle:
+     description: "Oracle Database - Enterprise relational database"
+     driver_class: "oracle.jdbc.OracleDriver"
+     maven_coordinates: "com.oracle.database.jdbc:ojdbc11:23.3.0.23.09"
+     url_pattern: "jdbc:oracle:thin:@<host>:<port>:<sid>"
+     homepage: "https://www.oracle.com/database/"
+     license: "Oracle Free Use Terms and Conditions"
+
+   sqlserver:
+     description: "Microsoft SQL Server - Enterprise database"
+     driver_class: "com.microsoft.sqlserver.jdbc.SQLServerDriver"
+     maven_coordinates: "com.microsoft.sqlserver:mssql-jdbc:12.4.2.jre11"
+     url_pattern: "jdbc:sqlserver://<host>:<port>;databaseName=<database>"
+     homepage: "https://www.microsoft.com/sql-server/"
+     license: "MIT"
+
+   db2:
+     description: "IBM Db2 - Enterprise database"
+     driver_class: "com.ibm.db2.jcc.DB2Driver"
+     maven_coordinates: "com.ibm.db2:jcc:11.5.9.0"
+     url_pattern: "jdbc:db2://<host>:<port>/<database>"
+     homepage: "https://www.ibm.com/products/db2"
+     license: "IBM"
+
+   # Cloud Databases
+   snowflake:
+     description: "Snowflake - Cloud data warehouse"
+     driver_class: "net.snowflake.client.jdbc.SnowflakeDriver"
+     maven_coordinates: "net.snowflake:snowflake-jdbc:3.14.4"
+     url_pattern: "jdbc:snowflake://<account>.snowflakecomputing.com"
+     homepage: "https://www.snowflake.com/"
+     license: "Apache 2.0"
+
+   bigquery:
+     description: "Google BigQuery - Serverless data warehouse"
+     driver_class: "com.simba.googlebigquery.jdbc.Driver"
+     maven_coordinates: "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.35.1"
+     url_pattern: "jdbc:bigquery://<project>:<dataset>"
+     homepage: "https://cloud.google.com/bigquery"
+     license: "Apache 2.0"
+
+   redshift:
+     description: "Amazon Redshift - Cloud data warehouse"
+     driver_class: "com.amazon.redshift.jdbc.Driver"
+     maven_coordinates: "com.amazon.redshift:redshift-jdbc42:2.1.0.25"
+     url_pattern: "jdbc:redshift://<endpoint>:<port>/<database>"
+     homepage: "https://aws.amazon.com/redshift/"
+     license: "Apache 2.0"
+
+   athena:
+     description: "Amazon Athena - Serverless query service"
+     driver_class: "com.simba.athena.jdbc.Driver"
+     maven_coordinates: "com.amazonaws:aws-java-sdk-athena:1.12.565"
+     url_pattern: "jdbc:awsathena://<region>.amazonaws.com:443"
+     homepage: "https://aws.amazon.com/athena/"
+     license: "Apache 2.0"
+
+   # Analytics & MPP Databases
+   clickhouse:
+     description: "ClickHouse - Fast open-source column-oriented database"
+     driver_class: "com.clickhouse.jdbc.ClickHouseDriver"
+     maven_coordinates: "com.clickhouse:clickhouse-jdbc:0.6.0-patch1"
+     url_pattern: "jdbc:clickhouse://<host>:<port>/<database>"
+     homepage: "https://clickhouse.com/"
+     license: "Apache 2.0"
+
+   vertica:
+     description: "Vertica - Unified analytics platform"
+     driver_class: "com.vertica.jdbc.Driver"
+     maven_coordinates: "com.vertica.jdbc:vertica-jdbc:23.4.0-0"
+     url_pattern: "jdbc:vertica://<host>:<port>/<database>"
+     homepage: "https://www.vertica.com/"
+     license: "Apache 2.0"
+
+   presto:
+     description: "Presto - Distributed SQL query engine"
+     driver_class: "com.facebook.presto.jdbc.PrestoDriver"
+     maven_coordinates: "com.facebook.presto:presto-jdbc:0.285"
+     url_pattern: "jdbc:presto://<host>:<port>/<catalog>"
+     homepage: "https://prestodb.io/"
+     license: "Apache 2.0"
+
+   trino:
+     description: "Trino - Fast distributed SQL query engine"
+     driver_class: "io.trino.jdbc.TrinoDriver"
+     maven_coordinates: "io.trino:trino-jdbc:433"
+     url_pattern: "jdbc:trino://<host>:<port>/<catalog>"
+     homepage: "https://trino.io/"
+     license: "Apache 2.0"
+
+   # NoSQL & NewSQL
+   cassandra:
+     description: "Apache Cassandra - Distributed NoSQL database"
+     driver_class: "com.github.adejanovski.cassandra.jdbc.CassandraDriver"
+     maven_coordinates: "com.github.adejanovski:cassandra-jdbc-wrapper:4.10.2"
+     url_pattern: "jdbc:cassandra://<host>:<port>/<keyspace>"
+     homepage: "https://cassandra.apache.org/"
+     license: "Apache 2.0"
+
+   mongodb:
+     description: "MongoDB - Document database"
+     driver_class: "mongodb.jdbc.MongoDriver"
+     maven_coordinates: "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1"
+     url_pattern: "mongodb://<host>:<port>/<database>"
+     homepage: "https://www.mongodb.com/"
+     license: "Server Side Public License"
+
+   # Other Databases
+   h2:
+     description: "H2 - Lightweight Java SQL database"
+     driver_class: "org.h2.Driver"
+     maven_coordinates: "com.h2database:h2:2.2.224"
+     url_pattern: "jdbc:h2:<file_path>"
+     homepage: "https://h2database.com/"
+     license: "MPL 2.0 or EPL 1.0"
+
+   sqlite:
+     description: "SQLite - Embedded relational database"
+     driver_class: "org.sqlite.JDBC"
+     maven_coordinates: "org.xerial:sqlite-jdbc:3.44.1.0"
+     url_pattern: "jdbc:sqlite:<file_path>"
+     homepage: "https://www.sqlite.org/"
+     license: "Public Domain"
+
+   derby:
+     description: "Apache Derby - Java-based relational database"
+     driver_class: "org.apache.derby.jdbc.EmbeddedDriver"
+     maven_coordinates: "org.apache.derby:derby:10.16.1.1"
+     url_pattern: "jdbc:derby:<database>"
+     homepage: "https://db.apache.org/derby/"
+     license: "Apache 2.0"
+
+   hive:
+     description: "Apache Hive - Data warehouse software"
+     driver_class: "org.apache.hive.jdbc.HiveDriver"
+     maven_coordinates: "org.apache.hive:hive-jdbc:3.1.3"
+     url_pattern: "jdbc:hive2://<host>:<port>/<database>"
+     homepage: "https://hive.apache.org/"
+     license: "Apache 2.0"
+
+   impala:
+     description: "Apache Impala - MPP SQL query engine"
+     driver_class: "com.cloudera.impala.jdbc.Driver"
+     maven_coordinates: "org.apache.impala:impala-jdbc:2.6.30"
+     url_pattern: "jdbc:impala://<host>:<port>/<database>"
+     homepage: "https://impala.apache.org/"
+     license: "Apache 2.0"
+
+   phoenix:
+     description: "Apache Phoenix - SQL skin over HBase"
+     driver_class: "org.apache.phoenix.jdbc.PhoenixDriver"
+     maven_coordinates: "org.apache.phoenix:phoenix-client-hbase-2.5:5.1.3"
+     url_pattern: "jdbc:phoenix:<zookeeper_quorum>"
+     homepage: "https://phoenix.apache.org/"
+     license: "Apache 2.0"
+
+   # Time Series & Specialized
+   timescaledb:
+     description: "TimescaleDB - Time-series extension for PostgreSQL"
+     driver_class: "org.postgresql.Driver"
+     maven_coordinates: "org.postgresql:postgresql:42.7.1"
+     url_pattern: "jdbc:postgresql://<host>:<port>/<database>"
+     homepage: "https://www.timescale.com/"
+     license: "Timescale License"
+
+   influxdb:
+     description: "InfluxDB - Time series database"
+     driver_class: "org.influxdb.jdbc.InfluxDBDriver"
+     maven_coordinates: "org.influxdb:influxdb-java:2.24"
+     url_pattern: "jdbc:influxdb://<host>:<port>/<database>"
+     homepage: "https://www.influxdata.com/"
+     license: "MIT"
+
+ # Estimated total size: ~500MB - 1GB for all connectors
+ # Individual connector sizes range from 1MB to 100MB
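
Since the catalog is plain YAML, it can be read back with any YAML parser. A minimal sketch, assuming PyYAML is installed and a local copy of `catalog.yml` is in the working directory:

```python
import yaml

# Load the connector catalog shipped as dvt/connectors/catalog.yml.
with open("catalog.yml") as f:
    catalog = yaml.safe_load(f)

print(catalog["version"])
for name, info in catalog["connectors"].items():
    # Each entry maps a database type to its JDBC driver metadata.
    print(f"{name}: {info['maven_coordinates']} ({info['license']})")
```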