dagster-evidence 0.1.6__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_evidence/__init__.py +19 -2
- dagster_evidence/components/__init__.py +17 -0
- dagster_evidence/components/deployments.py +514 -0
- dagster_evidence/components/evidence_project_v2.py +185 -0
- dagster_evidence/components/projects.py +660 -0
- dagster_evidence/components/sources.py +1350 -0
- dagster_evidence/components/translator.py +168 -0
- dagster_evidence/lib/evidence_project.py +6 -5
- dagster_evidence/resource.py +4 -4
- dagster_evidence/utils/__init__.py +5 -0
- dagster_evidence/utils/sql_parser.py +87 -0
- dagster_evidence-0.2.0.dist-info/METADATA +120 -0
- dagster_evidence-0.2.0.dist-info/RECORD +17 -0
- {dagster_evidence-0.1.6.dist-info → dagster_evidence-0.2.0.dist-info}/WHEEL +1 -1
- dagster_evidence-0.1.6.dist-info/METADATA +0 -23
- dagster_evidence-0.1.6.dist-info/RECORD +0 -9
- {dagster_evidence-0.1.6.dist-info → dagster_evidence-0.2.0.dist-info}/entry_points.txt +0 -0
- {dagster_evidence-0.1.6.dist-info → dagster_evidence-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1350 @@
|
|
|
1
|
+
"""Source classes for Evidence projects.
|
|
2
|
+
|
|
3
|
+
This module defines the data structures used to represent Evidence project sources,
|
|
4
|
+
including queries, connections, and the translator data classes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
from abc import abstractmethod
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
12
|
+
|
|
13
|
+
import dagster as dg
|
|
14
|
+
from dagster import AssetKey
|
|
15
|
+
from dagster._annotations import beta, public
|
|
16
|
+
from dagster._record import record
|
|
17
|
+
from dagster._serdes import whitelist_for_serdes
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from .sources import EvidenceSourceTranslatorData
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@beta
@public
@whitelist_for_serdes
@dataclass
class SourceQuery:
    """A single SQL query belonging to an Evidence source.

    Each ``.sql`` file inside a source directory maps to one ``SourceQuery``;
    the query name is the filename with its ``.sql`` suffix removed.

    Attributes:
        name: Query name derived from the filename (without ``.sql``).
        content: Raw SQL text of the query.

    Example:

        The file ``sources/orders_db/daily_orders.sql`` is represented as:

        .. code-block:: python

            SourceQuery(
                name="daily_orders",
                content="SELECT * FROM orders WHERE date = current_date"
            )
    """

    name: str
    content: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@beta
@public
@whitelist_for_serdes
@dataclass
class SourceDagsterMetadata:
    """Dagster-specific settings for one Evidence source.

    Parsed from the ``meta.dagster`` block of a source's ``connection.yaml``.
    Every field is an optional override: ``None`` means "defer to the source
    type's default behavior".

    Attributes:
        create_source_sensor: Per-source sensor override; ``None`` falls back
            to ``get_source_sensor_enabled_default`` of the source type.
        hide_source_asset: Per-source asset-visibility override; ``None``
            falls back to ``get_hide_source_asset_default`` of the source type.
        group_name: Asset group override; ``None`` uses the source folder name.

    Example:

        .. code-block:: yaml

            name: motherduck_source
            type: motherduck
            options:
              database: analytics
            meta:
              dagster:
                create_source_sensor: false
                hide_source_asset: false
                group_name: analytics_sources

        parses to:

        .. code-block:: python

            SourceDagsterMetadata(
                create_source_sensor=False,
                hide_source_asset=False,
                group_name="analytics_sources"
            )
    """

    create_source_sensor: bool | None = None
    hide_source_asset: bool | None = None
    group_name: str | None = None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@beta
@public
@whitelist_for_serdes
@dataclass
class ProjectDagsterMetadata:
    """Dagster-specific settings for an Evidence project.

    Parsed from the ``meta.dagster`` block of ``evidence.config.yaml``.

    Attributes:
        group_name: Asset group override for the project asset; ``None``
            keeps Dagster's default grouping.

    Example:

        .. code-block:: yaml

            deployment:
              basePath: /sales-dashboard

            meta:
              dagster:
                group_name: dashboards

        parses to:

        .. code-block:: python

            ProjectDagsterMetadata(group_name="dashboards")
    """

    group_name: str | None = None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@beta
@public
@whitelist_for_serdes
@dataclass
class SourceConnection:
    """Represents connection configuration for an Evidence source.

    This is parsed from the ``connection.yaml`` file in each source directory.

    Attributes:
        type: The source type identifier (e.g., "duckdb", "bigquery", "motherduck").
        extra: Additional connection-specific fields from the YAML file.
        dagster_metadata: Dagster-specific metadata parsed from meta.dagster section.

    Example:

        A ``connection.yaml`` file:

        .. code-block:: yaml

            type: duckdb
            filename: ./data/analytics.duckdb

        Would be parsed as:

        .. code-block:: python

            SourceConnection(
                type="duckdb",
                extra={"filename": "./data/analytics.duckdb"},
                dagster_metadata=SourceDagsterMetadata()
            )
    """

    type: str
    extra: dict[str, Any]  # Additional connection-specific fields
    # The class itself is the factory — no lambda wrapper needed.
    dagster_metadata: SourceDagsterMetadata = field(
        default_factory=SourceDagsterMetadata
    )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@beta
@public
@whitelist_for_serdes
@dataclass
class SourceContent:
    """Represents the full content of an Evidence source directory.

    A source directory contains a connection.yaml and one or more .sql query files.

    Attributes:
        connection: The connection configuration parsed from connection.yaml.
        queries: List of SQL queries parsed from .sql files.

    Example:

        Source directory structure:

        .. code-block:: text

            sources/orders_db/
            ├── connection.yaml
            ├── orders.sql
            └── customers.sql

        Would be parsed as:

        .. code-block:: python

            SourceContent(
                connection=SourceConnection(type="duckdb", extra={...}),
                queries=[
                    SourceQuery(name="orders", content="SELECT ..."),
                    SourceQuery(name="customers", content="SELECT ..."),
                ]
            )
    """

    connection: SourceConnection
    queries: list[SourceQuery]

    @public
    @staticmethod
    def from_dict(data: dict[str, Any]) -> "SourceContent":
        """Create SourceContent from a raw dictionary.

        Tolerates keys that are present but empty (YAML renders a bare
        ``meta:`` or ``queries:`` key as ``None``).

        Args:
            data: Dictionary containing "connection" and "queries" keys.

        Returns:
            A SourceContent instance.

        Example:

            .. code-block:: python

                data = {
                    "connection": {"type": "duckdb", "filename": "data.db"},
                    "queries": [
                        {"name": "orders", "content": "SELECT * FROM orders"}
                    ]
                }
                source = SourceContent.from_dict(data)

            Dagster metadata is read from ``connection["meta"]["dagster"]``
            (keys: ``create_source_sensor``, ``hide_source_asset``,
            ``group_name``) when present.
        """
        # `or {}` guards against keys explicitly set to None (empty YAML keys).
        connection_data = data.get("connection") or {}
        # Parse dagster metadata from meta.dagster section
        meta = connection_data.get("meta") or {}
        dagster_meta = meta.get("dagster") or {}
        dagster_metadata = SourceDagsterMetadata(
            create_source_sensor=dagster_meta.get("create_source_sensor"),
            hide_source_asset=dagster_meta.get("hide_source_asset"),
            group_name=dagster_meta.get("group_name"),
        )
        connection = SourceConnection(
            type=connection_data.get("type", ""),
            extra={
                k: v for k, v in connection_data.items() if k not in ("type", "meta")
            },
            dagster_metadata=dagster_metadata,
        )
        queries = [
            SourceQuery(name=q.get("name", ""), content=q.get("content", ""))
            for q in data.get("queries") or []
        ]
        return SourceContent(connection=connection, queries=queries)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
@beta
@public
@record
class EvidenceSourceTranslatorData:
    """Translator input for generating a source-query AssetSpec.

    Carries everything the translator needs to build the AssetSpec for a
    single query of a single Evidence source.

    Attributes:
        source_content: Parsed source content (connection plus all queries).
        source_group: The source folder name (e.g., "orders_db").
        query: The specific query being translated.
        extracted_data: Data extracted from the source, such as table
            dependencies parsed from the SQL.
        source_path: Absolute path to the source directory, used to resolve
            relative paths; ``None`` when unknown.

    Example:

        Used in custom translator implementations:

        .. code-block:: python

            from dagster_evidence import (
                DagsterEvidenceTranslator,
                EvidenceSourceTranslatorData,
            )
            import dagster as dg

            class CustomTranslator(DagsterEvidenceTranslator):
                def get_asset_spec(self, data):
                    if isinstance(data, EvidenceSourceTranslatorData):
                        source_type = data.source_content.connection.type
                        table_deps = data.extracted_data.get("table_deps", [])
                        return dg.AssetSpec(
                            key=dg.AssetKey([data.source_group, data.query.name]),
                            kinds={"evidence", source_type},
                        )
                    return super().get_asset_spec(data)
    """

    source_content: SourceContent
    source_group: str  # Source folder name (e.g., "orders_db")
    query: SourceQuery  # The query currently being translated
    extracted_data: dict[str, Any] = {}  # Extracted extras (e.g., table_deps)
    source_path: str | None = (
        None  # Absolute path to source directory (for resolving relative paths)
    )

    @public
    @property
    def effective_group_name(self) -> str:
        """Resolve the asset group name for this source.

        The ``group_name`` override from the source's Dagster metadata wins;
        otherwise the source folder name is used.

        Returns:
            The effective group name to use for asset grouping.
        """
        override = self.source_content.connection.dagster_metadata.group_name
        return override or self.source_group
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
@beta
@public
@record
class EvidenceProjectTranslatorData:
    """Translator input for generating the main project AssetSpec.

    Carries everything the translator needs to build the AssetSpec for the
    Evidence project's build-and-deploy asset.

    Attributes:
        project_name: Name of the Evidence project.
        sources_by_id: Maps each source folder name to its parsed content.
        source_deps: AssetKeys of the source assets the project depends on.
        dagster_metadata: Dagster metadata parsed from evidence.config.yaml.

    Example:

        Used in custom translator implementations:

        .. code-block:: python

            from dagster_evidence import (
                DagsterEvidenceTranslator,
                EvidenceProjectTranslatorData,
            )
            import dagster as dg

            class CustomTranslator(DagsterEvidenceTranslator):
                def get_asset_spec(self, data):
                    if isinstance(data, EvidenceProjectTranslatorData):
                        return dg.AssetSpec(
                            key=dg.AssetKey(["dashboards", data.project_name]),
                            kinds={"evidence", "dashboard"},
                            deps=data.source_deps,
                            metadata={"source_count": len(data.sources_by_id)},
                        )
                    return super().get_asset_spec(data)
    """

    project_name: str
    sources_by_id: dict[str, SourceContent]
    source_deps: Sequence[AssetKey]  # Dependencies on source assets
    dagster_metadata: ProjectDagsterMetadata = ProjectDagsterMetadata()

    @public
    @property
    def effective_group_name(self) -> str | None:
        """Group-name override from metadata, or None for Dagster's default.

        Returns:
            The effective group name to use for asset grouping, or None.
        """
        return self.dagster_metadata.group_name
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
@beta
@public
@dataclass
class BaseEvidenceProjectSource:
    """Base class for Evidence project data sources.

    Subclass this to implement custom source types and register them in the
    translator's ``SOURCE_TYPE_REGISTRY``.

    Attributes:
        source_content: The parsed source content from the Evidence project.

    Example:

        Implementing and registering a custom PostgreSQL source:

        .. code-block:: python

            from dagster_evidence.components.sources import BaseEvidenceProjectSource
            from dagster_evidence import DagsterEvidenceTranslator

            class PostgresEvidenceProjectSource(BaseEvidenceProjectSource):
                @staticmethod
                def get_source_type() -> str:
                    return "postgres"

            class CustomTranslator(DagsterEvidenceTranslator):
                SOURCE_TYPE_REGISTRY = {
                    **DagsterEvidenceTranslator.SOURCE_TYPE_REGISTRY,
                    "postgres": PostgresEvidenceProjectSource,
                }
    """

    source_content: SourceContent

    @public
    @classmethod
    def get_hide_source_asset_default(cls) -> bool:
        """Class-level default for hiding this source type's assets.

        When ``enable_source_assets_hiding`` is on for the project, source
        types returning True skip the intermediate source assets and link
        their SQL table dependencies directly to the project asset.
        Override in subclasses to change the default.

        Returns:
            True to hide source assets by default, False to show them.
        """
        return False

    @public
    @classmethod
    def get_source_sensor_enabled_default(cls) -> bool:
        """Class-level default for enabling sensors on this source type.

        When ``enable_source_sensors`` is on for the project, source types
        returning True get change-detection sensors. Override in subclasses
        to enable sensor support.

        Returns:
            True to enable sensors by default, False to disable them.
        """
        return False

    @public
    def get_hide_source_asset(self) -> bool:
        """Whether this particular source should hide its assets.

        A per-source ``meta.dagster.hide_source_asset`` override takes
        precedence; otherwise the class default applies.

        Returns:
            True to hide source assets, False to show them.
        """
        override = self.source_content.connection.dagster_metadata.hide_source_asset
        return (
            override
            if override is not None
            else self.get_hide_source_asset_default()
        )

    @public
    def get_source_sensor_enabled(self) -> bool:
        """Whether sensors are enabled for this particular source.

        A per-source ``meta.dagster.create_source_sensor`` override takes
        precedence; otherwise the class default applies.

        Returns:
            True to enable sensors, False to disable them.
        """
        override = self.source_content.connection.dagster_metadata.create_source_sensor
        return (
            override
            if override is not None
            else self.get_source_sensor_enabled_default()
        )

    @public
    @classmethod
    def get_source_sensor(
        cls,
        data: "EvidenceSourceTranslatorData",
        asset_key: dg.AssetKey,
    ) -> dg.SensorDefinition | None:
        """Return a sensor that detects data changes for this source.

        The base implementation supports no sensors. Subclasses override
        this to watch the underlying data and trigger materialization of
        ``asset_key`` when changes are detected.

        Args:
            data: The translator data containing source and query information.
            asset_key: The asset key of the source asset to trigger.

        Returns:
            A SensorDefinition that monitors for changes, or None if not supported.
        """
        return None

    @classmethod
    def _build_description_with_sql(cls, data: "EvidenceSourceTranslatorData") -> str:
        """Build description with raw SQL for SQL-based sources."""
        header = f"Evidence {cls.get_source_type()} source: {data.query.name}"
        sql_section = f"**Raw SQL:**\n```sql\n{data.query.content}\n```"
        return f"{header}\n\n{sql_section}"

    @classmethod
    def _build_base_metadata(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dict[str, Any]:
        """Build base metadata dictionary for source assets."""
        base: dict[str, Any] = {"Source Type": cls.get_source_type()}
        sql = data.query.content
        if sql:
            base["Raw SQL"] = dg.MetadataValue.md(f"```sql\n{sql}\n```")
        deps = data.extracted_data.get("table_deps", [])
        if deps:
            base["Table Dependencies"] = dg.MetadataValue.json(deps)
        return base

    @public
    @staticmethod
    @abstractmethod
    def get_source_type() -> str:
        """Return the source type identifier (e.g., 'duckdb').

        Returns:
            The source type string that matches the 'type' field in connection.yaml.
        """
        raise NotImplementedError()

    @public
    @classmethod
    @abstractmethod
    def extract_data_from_source(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dict[str, Any]:
        """Extract additional data from the source query.

        Called before get_asset_spec; the result is stored in
        ``data.extracted_data`` for later use. A common extraction is the
        list of table dependencies parsed from the SQL query.

        Args:
            data: The translator data containing source and query information.

        Returns:
            Dictionary of extracted data. Common keys include:
            - table_deps: List of table references extracted from the SQL query.

        Example:

            .. code-block:: python

                class PostgresEvidenceProjectSource(BaseEvidenceProjectSource):
                    @classmethod
                    def extract_data_from_source(cls, data):
                        from dagster_evidence.utils import extract_table_references
                        table_refs = extract_table_references(
                            data.query.content,
                            default_schema="public",
                        )
                        return {"table_deps": table_refs}
        """
        raise NotImplementedError()

    @public
    @classmethod
    @abstractmethod
    def get_source_asset(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dg.AssetsDefinition:
        """Return the AssetsDefinition for a source query.

        Each source type defines how its assets appear in Dagster. The asset
        should carry an automation condition that fires when upstream
        dependencies update.

        Args:
            data: The translator data; ``data.extracted_data`` holds the
                result of extract_data_from_source.

        Returns:
            The AssetsDefinition for the source query with automation condition.

        Example:

            .. code-block:: python

                class PostgresEvidenceProjectSource(BaseEvidenceProjectSource):
                    @staticmethod
                    def get_source_type() -> str:
                        return "postgres"

                    @classmethod
                    def get_source_asset(cls, data):
                        deps = [
                            dg.AssetKey([ref["table"]])
                            for ref in data.extracted_data.get("table_deps", [])
                            if ref.get("table")
                        ]

                        @dg.asset(
                            key=dg.AssetKey(["postgres", data.query.name]),
                            group_name=data.source_group,
                            kinds={"evidence", "postgres"},
                            deps=deps,
                            automation_condition=dg.AutomationCondition.any_deps_match(
                                dg.AutomationCondition.newly_updated()
                            ) if deps else None,
                        )
                        def _source_asset():
                            return dg.MaterializeResult()

                        return _source_asset
        """
        raise NotImplementedError()
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
@beta
@public
class DuckdbEvidenceProjectSource(BaseEvidenceProjectSource):
    """DuckDB source for Evidence projects.

    Handles Evidence sources configured with ``type: duckdb`` in connection.yaml.

    Example:

        connection.yaml for a DuckDB source:

        .. code-block:: yaml

            type: duckdb
            filename: ./data/analytics.duckdb
    """

    @classmethod
    def get_hide_source_asset_default(cls) -> bool:
        return False

    @classmethod
    def get_source_sensor_enabled_default(cls) -> bool:
        return False

    @staticmethod
    def get_source_type() -> str:
        return "duckdb"

    @classmethod
    def get_source_sensor(
        cls,
        data: "EvidenceSourceTranslatorData",
        asset_key: dg.AssetKey,
    ) -> dg.SensorDefinition | None:
        """Get a sensor that monitors DuckDB tables for changes.

        Compares per-table estimated row counts (``duckdb_tables()``) against
        the previous cursor and requests a run when they differ. Connects
        read-only so the sensor never mutates the database file.

        Args:
            data: The translator data containing source and query information.
            asset_key: The asset key of the source asset to trigger.

        Returns:
            A SensorDefinition, or None when no database file or table
            dependencies are configured.
        """
        import json

        options = data.source_content.connection.extra.get("options", {})
        db_path = options.get("filename")
        if not db_path:
            return None

        # Resolve relative path against source_path
        if data.source_path and not os.path.isabs(db_path):
            db_path = os.path.join(data.source_path, db_path)

        table_deps = data.extracted_data.get("table_deps", [])
        if not table_deps:
            return None

        sensor_name = f"{data.source_group}_{data.query.name}_sensor"

        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
        def duckdb_sensor(context: dg.SensorEvaluationContext):
            try:
                import duckdb
            except ImportError:
                raise ImportError(
                    "duckdb is required for DuckDB sensors. "
                    "Install it with: pip install dagster-evidence[duckdb]"
                ) from None

            try:
                conn = duckdb.connect(db_path, read_only=True)
            except Exception as e:
                raise Exception(f"Could not connect to DuckDB: {e}") from e

            try:
                table_counts: dict[str, int] = {}
                for ref in table_deps:
                    table_name = ref.get("table")
                    schema = ref.get("schema", "main")
                    if table_name:
                        try:
                            result = conn.execute(
                                """
                                SELECT estimated_size
                                FROM duckdb_tables()
                                WHERE table_name = ? AND schema_name = ?
                                """,
                                [table_name, schema],
                            ).fetchone()
                            table_counts[f"{schema}.{table_name}"] = (
                                result[0] if result else 0
                            )
                        except Exception:
                            # Table may not exist yet; treat as empty.
                            table_counts[f"{schema}.{table_name}"] = 0
            finally:
                conn.close()

            cursor = json.loads(context.cursor) if context.cursor else {}
            last_counts = cursor.get("counts", {})

            if table_counts != last_counts:
                context.update_cursor(json.dumps({"counts": table_counts}))
                yield dg.RunRequest(asset_selection=[asset_key])

        return duckdb_sensor

    @classmethod
    def extract_data_from_source(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dict[str, Any]:
        """Extract table references from DuckDB source query.

        The default database name is inferred from the configured filename
        by stripping a trailing ``.duckdb`` extension.
        """
        from dagster_evidence.utils import extract_table_references

        options = data.source_content.connection.extra.get("options", {})
        filename = options.get("filename", "")
        # removesuffix only strips a trailing ".duckdb"; the previous
        # str.replace(".duckdb", "") also mangled the substring mid-path
        # (e.g. "my.duckdb.backup" -> "my.backup").
        default_database = filename.removesuffix(".duckdb") if filename else None
        default_schema = "main"  # DuckDB default schema

        table_refs = extract_table_references(
            data.query.content,
            default_database=default_database,
            default_schema=default_schema,
        )
        return {"table_deps": table_refs}

    @classmethod
    def get_source_asset(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dg.AssetsDefinition:
        """Get the AssetsDefinition for a DuckDB source query.

        Builds an asset keyed ``[source_group, query_name]`` whose deps are
        the tables extracted from the SQL; when deps exist, the asset
        auto-materializes whenever any dep is newly updated.
        """
        deps = []
        for ref in data.extracted_data.get("table_deps", []):
            if ref.get("table"):
                deps.append(dg.AssetKey([ref["table"]]))

        key = dg.AssetKey([data.source_group, data.query.name])
        group_name = data.effective_group_name
        has_deps = bool(deps)

        # Add description and metadata
        description = cls._build_description_with_sql(data)
        metadata = cls._build_base_metadata(data)

        @dg.asset(
            key=key,
            group_name=group_name,
            kinds={"evidence", "source", "duckdb"},
            deps=deps,
            description=description,
            metadata=metadata,
            automation_condition=dg.AutomationCondition.any_deps_match(
                dg.AutomationCondition.newly_updated()
            )
            if has_deps
            else None,
        )
        def _source_asset():
            return dg.MaterializeResult()

        return _source_asset
|
|
821
|
+
|
|
822
|
+
|
|
823
|
+
@beta
@public
class MotherDuckEvidenceProjectSource(BaseEvidenceProjectSource):
    """MotherDuck source for Evidence projects.

    Handles Evidence sources configured with ``type: motherduck`` in connection.yaml.

    Example:

        connection.yaml for a MotherDuck source:

        .. code-block:: yaml

            type: motherduck
            token: ${MOTHERDUCK_TOKEN}
            database: my_database
    """

    @classmethod
    def get_hide_source_asset_default(cls) -> bool:
        # MotherDuck source assets are visible by default.
        return False

    @classmethod
    def get_source_sensor_enabled_default(cls) -> bool:
        # Sensors poll a remote service, so they are opt-in.
        return False

    @staticmethod
    def get_source_type() -> str:
        return "motherduck"

    @classmethod
    def get_source_sensor(
        cls,
        data: "EvidenceSourceTranslatorData",
        asset_key: dg.AssetKey,
    ) -> dg.SensorDefinition | None:
        """Get a sensor that monitors MotherDuck tables for changes.

        Uses information_schema queries with read-only connection to detect
        changes in table row counts.

        Args:
            data: Translator data for this source query.
            asset_key: Asset key the sensor requests runs for.

        Returns:
            A sensor definition, or ``None`` when no database/token is
            configured or the query references no tables.
        """
        import json
        import os

        options = data.source_content.connection.extra.get("options", {})
        database = options.get("database")
        # Precedence: explicitly configured token first, environment fallback.
        token = options.get("token") or os.environ.get("MOTHERDUCK_TOKEN")

        if not database or not token:
            return None

        table_deps = data.extracted_data.get("table_deps", [])
        if not table_deps:
            return None

        source_group = data.source_group
        query_name = data.query.name
        sensor_name = f"{source_group}_{query_name}_sensor"

        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
        def motherduck_sensor(context: dg.SensorEvaluationContext):
            try:
                import duckdb
            except ImportError:
                raise ImportError(
                    "duckdb is required for MotherDuck sensors. "
                    "Install it with: pip install dagster-evidence[duckdb]"
                ) from None

            # BUGFIX: this previously re-read MOTHERDUCK_TOKEN from the
            # environment with ``token`` only as a fallback, silently letting
            # the env var override an explicitly configured ``options.token``.
            # Use the already-resolved token so precedence matches the
            # resolution above (option first, env fallback).
            connection_string = f"md:{database}?motherduck_token={token}"

            try:
                conn = duckdb.connect(connection_string, read_only=True)
            except Exception as e:
                raise Exception(f"Could not connect to MotherDuck: {e}") from e

            try:
                # Snapshot an estimated row count per referenced table; a
                # change in any count vs. the cursor triggers a run.
                table_counts: dict[str, int] = {}
                for ref in table_deps:
                    table_name = ref.get("table")
                    schema = ref.get("schema", "main")
                    if table_name:
                        try:
                            result = conn.execute(
                                """
                                SELECT estimated_size
                                FROM duckdb_tables()
                                WHERE table_name = ? AND schema_name = ?
                                """,
                                [table_name, schema],
                            ).fetchone()
                            table_counts[f"{schema}.{table_name}"] = (
                                result[0] if result else 0
                            )
                        except Exception:
                            # Best-effort: an unreadable table counts as 0
                            # rather than failing the whole tick.
                            table_counts[f"{schema}.{table_name}"] = 0
            finally:
                # Always release the connection, even if a query raised.
                conn.close()

            # Cursor holds the previous snapshot as JSON: {"counts": {...}}.
            cursor = json.loads(context.cursor) if context.cursor else {}
            last_counts = cursor.get("counts", {})

            if table_counts != last_counts:
                context.update_cursor(json.dumps({"counts": table_counts}))
                yield dg.RunRequest(asset_selection=[asset_key])

        return motherduck_sensor

    @classmethod
    def extract_data_from_source(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dict[str, Any]:
        """Extract table references from MotherDuck source query.

        Returns:
            ``{"table_deps": [...]}`` where each entry describes a table
            referenced by the query's SQL.
        """
        from dagster_evidence.utils import extract_table_references

        # Get database from connection config options
        options = data.source_content.connection.extra.get("options", {})
        default_database = options.get("database")
        default_schema = "main"  # MotherDuck default schema

        table_refs = extract_table_references(
            data.query.content,
            default_database=default_database,
            default_schema=default_schema,
        )
        return {"table_deps": table_refs}

    @classmethod
    def get_source_asset(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dg.AssetsDefinition:
        """Get the AssetsDefinition for a MotherDuck source query."""
        # One upstream asset key per table referenced in the SQL.
        deps = []
        for ref in data.extracted_data.get("table_deps", []):
            if ref.get("table"):
                deps.append(dg.AssetKey([ref["table"]]))

        key = dg.AssetKey([data.source_group, data.query.name])
        group_name = data.effective_group_name
        has_deps = bool(deps)

        # Add description and metadata
        description = cls._build_description_with_sql(data)
        metadata = cls._build_base_metadata(data)

        @dg.asset(
            key=key,
            group_name=group_name,
            kinds={"evidence", "source", "motherduck"},
            deps=deps,
            description=description,
            metadata=metadata,
            # Auto-materialize on upstream updates only when deps exist.
            automation_condition=dg.AutomationCondition.any_deps_match(
                dg.AutomationCondition.newly_updated()
            )
            if has_deps
            else None,
        )
        def _source_asset():
            return dg.MaterializeResult()

        return _source_asset
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
@beta
@public
class BigQueryEvidenceProjectSource(BaseEvidenceProjectSource):
    """BigQuery source for Evidence projects.

    Handles Evidence sources configured with ``type: bigquery`` in connection.yaml.

    Example:

        connection.yaml for a BigQuery source:

        .. code-block:: yaml

            type: bigquery
            project_id: my-gcp-project
            credentials: ${GOOGLE_APPLICATION_CREDENTIALS}
    """

    @classmethod
    def get_hide_source_asset_default(cls) -> bool:
        # BigQuery source assets are hidden by default.
        return True

    @classmethod
    def get_source_sensor_enabled_default(cls) -> bool:
        # Sensors poll a remote service, so they are opt-in.
        return False

    @staticmethod
    def get_source_type() -> str:
        return "bigquery"

    @classmethod
    def get_source_sensor(
        cls,
        data: "EvidenceSourceTranslatorData",
        asset_key: dg.AssetKey,
    ) -> dg.SensorDefinition | None:
        """Get a sensor that monitors BigQuery tables for changes.

        Uses BigQuery API to check table.modified timestamps.

        Args:
            data: Translator data for this source query.
            asset_key: Asset key the sensor requests runs for.

        Returns:
            A sensor definition, or ``None`` when no project_id is configured
            or the query references no tables.
        """
        import json

        options = data.source_content.connection.extra.get("options", {})
        project_id = options.get("project_id")

        if not project_id:
            return None

        table_deps = data.extracted_data.get("table_deps", [])
        if not table_deps:
            return None

        source_group = data.source_group
        query_name = data.query.name
        sensor_name = f"{source_group}_{query_name}_sensor"

        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
        def bigquery_sensor(context: dg.SensorEvaluationContext):
            try:
                from google.cloud import bigquery
            except ImportError:
                raise ImportError(
                    "google-cloud-bigquery is required for BigQuery sensors. "
                    "Install it with: pip install dagster-evidence[bigquery]"
                ) from None

            try:
                client = bigquery.Client(project=project_id)
            except Exception as e:
                raise Exception(f"Could not connect to BigQuery: {e}") from e

            # Load the previous snapshot up front so transient lookup failures
            # can fall back to the last known timestamp (see except below).
            # Cursor format: {"mod_times": {"proj.dataset.table": iso_ts}}.
            cursor = json.loads(context.cursor) if context.cursor else {}
            last_mod_times = cursor.get("mod_times", {})

            mod_times: dict[str, str] = {}
            for ref in table_deps:
                table_name = ref.get("table")
                dataset = ref.get("schema")
                if table_name and dataset:
                    table_ref = f"{project_id}.{dataset}.{table_name}"
                    try:
                        table = client.get_table(table_ref)
                        if table.modified:
                            mod_times[table_ref] = table.modified.isoformat()
                    except Exception:
                        # BUGFIX: a transient API error previously dropped the
                        # table from the snapshot entirely, so the comparison
                        # below saw a difference, fired a spurious RunRequest,
                        # and overwrote the stored timestamp (triggering yet
                        # another run on the next successful tick). Carry the
                        # last known value forward instead.
                        if table_ref in last_mod_times:
                            mod_times[table_ref] = last_mod_times[table_ref]

            if mod_times != last_mod_times:
                context.update_cursor(json.dumps({"mod_times": mod_times}))
                yield dg.RunRequest(asset_selection=[asset_key])

        return bigquery_sensor

    @classmethod
    def extract_data_from_source(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dict[str, Any]:
        """Extract table references from BigQuery source query.

        Returns:
            ``{"table_deps": [...]}`` where each entry describes a table
            referenced by the query's SQL.
        """
        from dagster_evidence.utils import extract_table_references

        # Get project and dataset from connection config options
        options = data.source_content.connection.extra.get("options", {})
        default_database = options.get("project_id")
        default_schema = options.get("dataset")

        table_refs = extract_table_references(
            data.query.content,
            default_database=default_database,
            default_schema=default_schema,
        )
        return {"table_deps": table_refs}

    @classmethod
    def get_source_asset(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dg.AssetsDefinition:
        """Get the AssetsDefinition for a BigQuery source query."""
        # One upstream asset key per table referenced in the SQL.
        deps = []
        for ref in data.extracted_data.get("table_deps", []):
            if ref.get("table"):
                deps.append(dg.AssetKey([ref["table"]]))

        key = dg.AssetKey([data.source_group, data.query.name])
        group_name = data.effective_group_name
        has_deps = bool(deps)

        # Add description and metadata
        description = cls._build_description_with_sql(data)
        metadata = cls._build_base_metadata(data)

        @dg.asset(
            key=key,
            group_name=group_name,
            kinds={"evidence", "source", "bigquery"},
            deps=deps,
            description=description,
            metadata=metadata,
            # Auto-materialize on upstream updates only when deps exist.
            automation_condition=dg.AutomationCondition.any_deps_match(
                dg.AutomationCondition.newly_updated()
            )
            if has_deps
            else None,
        )
        def _source_asset():
            return dg.MaterializeResult()

        return _source_asset
|
|
1135
|
+
|
|
1136
|
+
|
|
1137
|
+
@beta
@public
class GSheetsEvidenceProjectSource(BaseEvidenceProjectSource):
    """Google Sheets source for Evidence projects.

    Handles Evidence sources configured with ``type: gsheets`` in connection.yaml.
    Unlike SQL-based sources, gsheets sources define sheets and pages declaratively
    rather than using SQL queries.

    Example:

        connection.yaml for a Google Sheets source:

        .. code-block:: yaml

            name: my_sheets
            type: gsheets
            options:
                ratelimitms: 2500
            sheets:
                sales_data:
                    id: 1Sc4nyLSSNETSIEpNKzheh5AFJJ-YA-wQeubFgeeEw9g
                    pages:
                        - q1_sales
                        - q2_sales
                inventory:
                    id: kj235Bo3wRFG9kj3tp98grnPB-P97iu87lv877gliuId

    This generates assets:
    - ``[source_group, "sales_data", "q1_sales"]``
    - ``[source_group, "sales_data", "q2_sales"]``
    - ``[source_group, "inventory"]``
    """

    @classmethod
    def get_source_sensor_enabled_default(cls) -> bool:
        # Sensors poll the Google Drive API, so they are opt-in.
        return False

    @staticmethod
    def get_source_type() -> str:
        return "gsheets"

    @classmethod
    def get_source_sensor(
        cls,
        data: "EvidenceSourceTranslatorData",
        asset_key: dg.AssetKey,
    ) -> dg.SensorDefinition | None:
        """Get a sensor that monitors Google Sheets for changes.

        Uses Google Drive API to check modifiedTime and version.

        Returns ``None`` when the sheet referenced by ``data.query.name`` has
        no ``id`` in the connection's ``sheets`` config.
        """
        import json

        options = data.source_content.connection.extra.get("options", {})
        service_account_path = options.get("service_account_path")
        sheets_config = data.source_content.connection.extra.get("sheets", {})

        # Parse query.name to get sheet_name
        # (query names are "sheet" or "sheet/page"; only the sheet part is
        # needed to look up the spreadsheet id)
        parts = data.query.name.split("/", 1)
        sheet_name = parts[0]

        sheet_config = sheets_config.get(sheet_name, {})
        sheet_id = sheet_config.get("id") if isinstance(sheet_config, dict) else None

        if not sheet_id:
            return None

        source_group = data.source_group
        # "/" is not valid in a sensor name, so flatten it.
        query_name = data.query.name.replace("/", "_")
        sensor_name = f"{source_group}_{query_name}_sensor"

        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
        def gsheets_sensor(context: dg.SensorEvaluationContext):
            try:
                from google.oauth2 import service_account
                from googleapiclient.discovery import build
            except ImportError:
                raise ImportError(
                    "google-api-python-client is required for Google Sheets sensors. "
                    "Install it with: uv pip install 'dagster-evidence[gsheets]'"
                ) from None

            try:
                if service_account_path:
                    credentials = service_account.Credentials.from_service_account_file(
                        service_account_path,
                        scopes=["https://www.googleapis.com/auth/drive.readonly"],
                    )
                else:
                    # Use default credentials
                    # (Application Default Credentials chain — presumably env
                    # var / gcloud config; confirm against deployment setup)
                    import google.auth

                    credentials, _ = google.auth.default(
                        scopes=["https://www.googleapis.com/auth/drive.readonly"]
                    )

                service = build("drive", "v3", credentials=credentials)
                file_metadata = (
                    service.files()
                    .get(fileId=sheet_id, fields="modifiedTime,version")
                    .execute()
                )

                current_state = {
                    "modified_time": file_metadata.get("modifiedTime"),
                    "version": file_metadata.get("version"),
                }
            except Exception as e:
                raise Exception(f"Could not fetch Google Sheet metadata: {e}") from e

            # Cursor holds the last observed {"modified_time": ..., "version": ...}.
            cursor = json.loads(context.cursor) if context.cursor else {}

            if current_state.get("modified_time") != cursor.get(
                "modified_time"
            ) or current_state.get("version") != cursor.get("version"):
                context.update_cursor(json.dumps(current_state))
                yield dg.RunRequest(asset_selection=[asset_key])

        return gsheets_sensor

    @classmethod
    def extract_data_from_source(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dict[str, Any]:
        """Extract sheet configuration from Google Sheets source.

        Google Sheets sources don't have SQL to parse, so this returns
        the sheets configuration for use in get_asset_spec.
        """
        sheets_config = data.source_content.connection.extra.get("sheets", {})
        return {"sheets_config": sheets_config}

    @classmethod
    def get_source_asset(
        cls, data: "EvidenceSourceTranslatorData"
    ) -> dg.AssetsDefinition:
        """Get the AssetsDefinition for a Google Sheets source.

        Parses the query name to extract sheet_name and optional page_name,
        then builds a 3-part asset key: [source_group, sheet_name, page_name].
        """
        # Parse query.name to get sheet_name and optional page_name
        # Format: "sheet_name" or "sheet_name/page_name"
        parts = data.query.name.split("/", 1)
        sheet_name = parts[0]
        page_name = parts[1] if len(parts) > 1 else None

        # Build asset key: [source_group, sheet_name, page_name] or [source_group, sheet_name]
        if page_name:
            key = dg.AssetKey([data.source_group, sheet_name, page_name])
        else:
            key = dg.AssetKey([data.source_group, sheet_name])

        group_name = data.effective_group_name

        # Build description
        description = f"Evidence Google Sheets source: {sheet_name}"
        if page_name:
            description += f" / {page_name}"

        # Build metadata with sheet URL
        metadata: dict[str, Any] = {"Source Type": "gsheets"}
        sheets_config = data.extracted_data.get("sheets_config", {})
        sheet_config = sheets_config.get(sheet_name, {})
        sheet_id = sheet_config.get("id") if isinstance(sheet_config, dict) else None
        if sheet_id:
            metadata["Sheet ID"] = sheet_id
            metadata["Sheet URL"] = dg.MetadataValue.url(
                f"https://docs.google.com/spreadsheets/d/{sheet_id}"
            )

        @dg.asset(
            key=key,
            group_name=group_name,
            kinds={"evidence", "source", "gsheets"},
            deps=[],  # No upstream deps for gsheets - they are source of truth
            description=description,
            metadata=metadata,
        )
        def _source_asset():
            return dg.MaterializeResult()

        return _source_asset

    @classmethod
    def build_queries_from_sheets_config(
        cls, connection: dict[str, Any]
    ) -> list[dict[str, str]]:
        """Build virtual queries from sheets configuration.

        This method synthesizes SourceQuery-compatible dictionaries from
        the sheets configuration in connection.yaml. Each sheet/page
        combination becomes a "virtual query" with an empty content field.

        Args:
            connection: The full connection configuration dictionary.

        Returns:
            List of query dictionaries with "name" and "content" keys.
        """
        queries: list[dict[str, str]] = []
        sheets = connection.get("sheets", {})
        for sheet_name, sheet_config in sheets.items():
            # Malformed entries (e.g. a bare string instead of a mapping) are
            # skipped rather than raising.
            if not isinstance(sheet_config, dict):
                continue
            pages = sheet_config.get("pages", [])
            if pages:
                for page in pages:
                    queries.append({"name": f"{sheet_name}/{page}", "content": ""})
            else:
                # No pages specified - create single asset for the sheet
                queries.append({"name": sheet_name, "content": ""})
        return queries
|