dagster-evidence 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl

@@ -0,0 +1,1350 @@
+"""Source classes for Evidence projects.
+
+This module defines the data structures used to represent Evidence project sources,
+including queries, connections, and the translator data classes.
+"""
+
+import os
+from abc import abstractmethod
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import dagster as dg
+from dagster import AssetKey
+from dagster._annotations import beta, public
+from dagster._record import record
+from dagster._serdes import whitelist_for_serdes
+
+if TYPE_CHECKING:
+    from .sources import EvidenceSourceTranslatorData
+
+
+@beta
+@public
+@whitelist_for_serdes
+@dataclass
+class SourceQuery:
+    """Represents a single SQL query in an Evidence source.
+
+    Attributes:
+        name: The query name (derived from filename without .sql extension).
+        content: The SQL query content.
+
+    Example:
+
+        A query file ``sources/orders_db/daily_orders.sql`` would be parsed as:
+
+        .. code-block:: python
+
+            SourceQuery(
+                name="daily_orders",
+                content="SELECT * FROM orders WHERE date = current_date"
+            )
+    """
+
+    name: str
+    content: str
+
+
+@beta
+@public
+@whitelist_for_serdes
+@dataclass
+class SourceDagsterMetadata:
+    """Dagster-specific metadata for Evidence sources.
+
+    Parsed from the ``meta.dagster`` section of connection.yaml.
+
+    Attributes:
+        create_source_sensor: Override whether sensors are created for this source.
+            If None, uses the source type's default (get_source_sensor_enabled_default).
+        hide_source_asset: Override whether this source's assets are hidden.
+            If None, uses the source type's default (get_hide_source_asset_default).
+        group_name: Override the asset group name for this source.
+            If None, uses the source folder name.
+
+    Example:
+
+        A ``connection.yaml`` file with Dagster metadata:
+
+        .. code-block:: yaml
+
+            name: motherduck_source
+            type: motherduck
+            options:
+                database: analytics
+            meta:
+                dagster:
+                    create_source_sensor: false
+                    hide_source_asset: false
+                    group_name: analytics_sources
+
+        Would be parsed as:
+
+        .. code-block:: python
+
+            SourceDagsterMetadata(
+                create_source_sensor=False,
+                hide_source_asset=False,
+                group_name="analytics_sources"
+            )
+    """
+
+    create_source_sensor: bool | None = None
+    hide_source_asset: bool | None = None
+    group_name: str | None = None
+
+
+@beta
+@public
+@whitelist_for_serdes
+@dataclass
+class ProjectDagsterMetadata:
+    """Dagster-specific metadata for Evidence projects.
+
+    Parsed from the ``meta.dagster`` section of evidence.config.yaml.
+
+    Attributes:
+        group_name: Override the asset group name for the project asset.
+            If None, uses Dagster's default grouping.
+
+    Example:
+
+        An ``evidence.config.yaml`` file with Dagster metadata:
+
+        .. code-block:: yaml
+
+            deployment:
+                basePath: /sales-dashboard
+
+            meta:
+                dagster:
+                    group_name: dashboards
+
+        Would be parsed as:
+
+        .. code-block:: python
+
+            ProjectDagsterMetadata(
+                group_name="dashboards"
+            )
+    """
+
+    group_name: str | None = None
+
+
+@beta
+@public
+@whitelist_for_serdes
+@dataclass
+class SourceConnection:
+    """Represents connection configuration for an Evidence source.
+
+    This is parsed from the ``connection.yaml`` file in each source directory.
+
+    Attributes:
+        type: The source type identifier (e.g., "duckdb", "bigquery", "motherduck").
+        extra: Additional connection-specific fields from the YAML file.
+        dagster_metadata: Dagster-specific metadata parsed from meta.dagster section.
+
+    Example:
+
+        A ``connection.yaml`` file:
+
+        .. code-block:: yaml
+
+            type: duckdb
+            filename: ./data/analytics.duckdb
+
+        Would be parsed as:
+
+        .. code-block:: python
+
+            SourceConnection(
+                type="duckdb",
+                extra={"filename": "./data/analytics.duckdb"},
+                dagster_metadata=SourceDagsterMetadata()
+            )
+    """
+
+    type: str
+    extra: dict[str, Any]  # Additional connection-specific fields
+    dagster_metadata: SourceDagsterMetadata = field(
+        default_factory=lambda: SourceDagsterMetadata()
+    )
+
+
+@beta
+@public
+@whitelist_for_serdes
+@dataclass
+class SourceContent:
+    """Represents the full content of an Evidence source directory.
+
+    A source directory contains a connection.yaml and one or more .sql query files.
+
+    Attributes:
+        connection: The connection configuration parsed from connection.yaml.
+        queries: List of SQL queries parsed from .sql files.
+
+    Example:
+
+        Source directory structure:
+
+        .. code-block:: text
+
+            sources/orders_db/
+            ├── connection.yaml
+            ├── orders.sql
+            └── customers.sql
+
+        Would be parsed as:
+
+        .. code-block:: python
+
+            SourceContent(
+                connection=SourceConnection(type="duckdb", extra={...}),
+                queries=[
+                    SourceQuery(name="orders", content="SELECT ..."),
+                    SourceQuery(name="customers", content="SELECT ..."),
+                ]
+            )
+    """
+
+    connection: SourceConnection
+    queries: list[SourceQuery]
+
+    @public
+    @staticmethod
+    def from_dict(data: dict[str, Any]) -> "SourceContent":
+        """Create SourceContent from a raw dictionary.
+
+        Args:
+            data: Dictionary containing "connection" and "queries" keys.
+
+        Returns:
+            A SourceContent instance.
+
+        Example:
+
+            .. code-block:: python
+
+                data = {
+                    "connection": {"type": "duckdb", "filename": "data.db"},
+                    "queries": [
+                        {"name": "orders", "content": "SELECT * FROM orders"}
+                    ]
+                }
+                source = SourceContent.from_dict(data)
+
+            With Dagster metadata:
+
+            .. code-block:: python
+
+                data = {
+                    "connection": {
+                        "type": "duckdb",
+                        "filename": "data.db",
+                        "meta": {
+                            "dagster": {
+                                "create_source_sensor": False,
+                                "hide_source_asset": False,
+                                "group_name": "custom_group"
+                            }
+                        }
+                    },
+                    "queries": [...]
+                }
+        """
+        connection_data = data.get("connection", {})
+        # Parse dagster metadata from meta.dagster section
+        meta = connection_data.get("meta", {})
+        dagster_meta = meta.get("dagster", {})
+        dagster_metadata = SourceDagsterMetadata(
+            create_source_sensor=dagster_meta.get("create_source_sensor"),
+            hide_source_asset=dagster_meta.get("hide_source_asset"),
+            group_name=dagster_meta.get("group_name"),
+        )
+        connection = SourceConnection(
+            type=connection_data.get("type", ""),
+            extra={
+                k: v for k, v in connection_data.items() if k not in ("type", "meta")
+            },
+            dagster_metadata=dagster_metadata,
+        )
+        queries = [
+            SourceQuery(name=q.get("name", ""), content=q.get("content", ""))
+            for q in data.get("queries", [])
+        ]
+        return SourceContent(connection=connection, queries=queries)
+
+
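A minimal sketch of how a source directory might be fed into ``SourceContent.from_dict`` (assumes PyYAML is installed; the ``sources/orders_db`` layout follows the docstring examples above):

.. code-block:: python

    import pathlib

    import yaml  # assumption: PyYAML is available

    source_dir = pathlib.Path("sources/orders_db")  # hypothetical layout
    connection = yaml.safe_load((source_dir / "connection.yaml").read_text())
    queries = [
        {"name": sql_file.stem, "content": sql_file.read_text()}
        for sql_file in sorted(source_dir.glob("*.sql"))
    ]
    source = SourceContent.from_dict({"connection": connection, "queries": queries})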
+@beta
+@public
+@record
+class EvidenceSourceTranslatorData:
+    """Data passed to the translator for generating source asset specs.
+
+    This record contains all information needed to generate an AssetSpec
+    for a single source query.
+
+    Attributes:
+        source_content: The full source content including connection and queries.
+        source_group: The source folder name (e.g., "orders_db").
+        query: The specific query being translated.
+        extracted_data: Additional data extracted from the source (e.g., table dependencies).
+
+    Example:
+
+        Used in custom translator implementations:
+
+        .. code-block:: python
+
+            from dagster_evidence import (
+                DagsterEvidenceTranslator,
+                EvidenceSourceTranslatorData,
+            )
+            import dagster as dg
+
+            class CustomTranslator(DagsterEvidenceTranslator):
+                def get_asset_spec(self, data):
+                    if isinstance(data, EvidenceSourceTranslatorData):
+                        # Access source information
+                        source_type = data.source_content.connection.type
+                        query_name = data.query.name
+                        group = data.source_group
+                        # Access extracted table dependencies
+                        table_deps = data.extracted_data.get("table_deps", [])
+                        # Generate custom AssetSpec
+                        return dg.AssetSpec(
+                            key=dg.AssetKey([group, query_name]),
+                            kinds={"evidence", source_type},
+                        )
+                    return super().get_asset_spec(data)
+    """
+
+    source_content: SourceContent
+    source_group: str  # The source folder name (e.g., "orders_db")
+    query: SourceQuery  # The specific query being translated
+    extracted_data: dict[str, Any] = {}  # Additional extracted data (e.g., table_deps)
+    source_path: str | None = (
+        None  # Absolute path to source directory (for resolving relative paths)
+    )
+
+    @public
+    @property
+    def effective_group_name(self) -> str:
+        """Get the effective group name, considering metadata override.
+
+        Returns the group_name from dagster metadata if set, otherwise
+        returns the source_group (folder name).
+
+        Returns:
+            The effective group name to use for asset grouping.
+        """
+        meta = self.source_content.connection.dagster_metadata
+        return meta.group_name if meta.group_name else self.source_group
+
+
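To illustrate the override order, a sketch (assuming ``@record`` classes accept keyword construction, as the dataclasses above do):

.. code-block:: python

    data = EvidenceSourceTranslatorData(
        source_content=SourceContent(
            connection=SourceConnection(
                type="duckdb",
                extra={},
                dagster_metadata=SourceDagsterMetadata(group_name="analytics"),
            ),
            queries=[],
        ),
        source_group="orders_db",
        query=SourceQuery(name="orders", content="SELECT 1"),
    )
    # The meta.dagster.group_name override wins over the folder name.
    assert data.effective_group_name == "analytics"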
+@beta
+@public
+@record
+class EvidenceProjectTranslatorData:
+    """Data passed to the translator for generating the main project asset spec.
+
+    This record contains all information needed to generate an AssetSpec
+    for the Evidence project build-and-deploy asset.
+
+    Attributes:
+        project_name: The name of the Evidence project.
+        sources_by_id: Dictionary mapping source folder names to their content.
+        source_deps: List of AssetKeys for source assets this project depends on.
+        dagster_metadata: Dagster-specific metadata parsed from evidence.config.yaml.
+
+    Example:
+
+        Used in custom translator implementations:
+
+        .. code-block:: python
+
+            from dagster_evidence import (
+                DagsterEvidenceTranslator,
+                EvidenceProjectTranslatorData,
+            )
+            import dagster as dg
+
+            class CustomTranslator(DagsterEvidenceTranslator):
+                def get_asset_spec(self, data):
+                    if isinstance(data, EvidenceProjectTranslatorData):
+                        return dg.AssetSpec(
+                            key=dg.AssetKey(["dashboards", data.project_name]),
+                            kinds={"evidence", "dashboard"},
+                            deps=data.source_deps,
+                            metadata={"source_count": len(data.sources_by_id)},
+                        )
+                    return super().get_asset_spec(data)
+    """
+
+    project_name: str
+    sources_by_id: dict[str, SourceContent]
+    source_deps: Sequence[AssetKey]  # Dependencies on source assets
+    dagster_metadata: ProjectDagsterMetadata = ProjectDagsterMetadata()
+
+    @public
+    @property
+    def effective_group_name(self) -> str | None:
+        """Get the effective group name from metadata, or None for default.
+
+        Returns the group_name from dagster metadata if set, otherwise
+        returns None to use Dagster's default grouping.
+
+        Returns:
+            The effective group name to use for asset grouping, or None.
+        """
+        return self.dagster_metadata.group_name
+
+
+@beta
+@public
+@dataclass
+class BaseEvidenceProjectSource:
+    """Base class for Evidence project data sources.
+
+    Subclass this to implement custom source types that can be registered
+    with the translator's SOURCE_TYPE_REGISTRY.
+
+    Attributes:
+        source_content: The parsed source content from the Evidence project.
+
+    Example:
+
+        Implementing a custom PostgreSQL source:
+
+        .. code-block:: python
+
+            from dagster_evidence.components.sources import BaseEvidenceProjectSource
+
+            class PostgresEvidenceProjectSource(BaseEvidenceProjectSource):
+                @staticmethod
+                def get_source_type() -> str:
+                    return "postgres"
+
+            # Register with translator
+            from dagster_evidence import DagsterEvidenceTranslator
+
+            class CustomTranslator(DagsterEvidenceTranslator):
+                SOURCE_TYPE_REGISTRY = {
+                    **DagsterEvidenceTranslator.SOURCE_TYPE_REGISTRY,
+                    "postgres": PostgresEvidenceProjectSource,
+                }
+    """
+
+    source_content: SourceContent
+
+    @public
+    @classmethod
+    def get_hide_source_asset_default(cls) -> bool:
+        """Return whether this source type should hide its assets by default.
+
+        When enabled via ``enable_source_assets_hiding`` on the project, sources
+        that return True here will not create intermediate source assets. Instead,
+        their table dependencies (extracted from SQL) are linked directly to the
+        project asset.
+
+        Override in subclasses to change the default behavior.
+
+        Returns:
+            True to hide source assets by default, False to show them.
+        """
+        return False
+
+    @public
+    @classmethod
+    def get_source_sensor_enabled_default(cls) -> bool:
+        """Return whether sensors are enabled by default for this source type.
+
+        When enabled via ``enable_source_sensors`` on the project, sources
+        that return True here will have sensors created to detect changes
+        in the underlying data.
+
+        Override in subclasses to enable sensor support.
+
+        Returns:
+            True to enable sensors by default, False to disable them.
+        """
+        return False
+
+    @public
+    def get_hide_source_asset(self) -> bool:
+        """Return whether this source should hide its assets.
+
+        Checks per-source metadata override first (meta.dagster.hide_source_asset),
+        then falls back to the class default (get_hide_source_asset_default).
+
+        Returns:
+            True to hide source assets, False to show them.
+        """
+        meta = self.source_content.connection.dagster_metadata
+        if meta.hide_source_asset is not None:
+            return meta.hide_source_asset
+        return self.get_hide_source_asset_default()
+
+    @public
+    def get_source_sensor_enabled(self) -> bool:
+        """Return whether sensors are enabled for this source.
+
+        Checks per-source metadata override first (meta.dagster.create_source_sensor),
+        then falls back to the class default (get_source_sensor_enabled_default).
+
+        Returns:
+            True to enable sensors, False to disable them.
+        """
+        meta = self.source_content.connection.dagster_metadata
+        if meta.create_source_sensor is not None:
+            return meta.create_source_sensor
+        return self.get_source_sensor_enabled_default()
+
+    @public
+    @classmethod
+    def get_source_sensor(
+        cls,
+        data: "EvidenceSourceTranslatorData",
+        asset_key: dg.AssetKey,
+    ) -> dg.SensorDefinition | None:
+        """Get a sensor for this source to detect data changes.
+
+        Override in subclasses to implement source-specific change detection.
+        The sensor should detect changes in the underlying data and trigger
+        the source asset materialization when changes are detected.
+
+        Args:
+            data: The translator data containing source and query information.
+            asset_key: The asset key of the source asset to trigger.
+
+        Returns:
+            A SensorDefinition that monitors for changes, or None if not supported.
+        """
+        return None
+
+    @classmethod
+    def _build_description_with_sql(cls, data: "EvidenceSourceTranslatorData") -> str:
+        """Build description with raw SQL for SQL-based sources."""
+        return (
+            f"Evidence {cls.get_source_type()} source: {data.query.name}\n\n"
+            f"**Raw SQL:**\n```sql\n{data.query.content}\n```"
+        )
+
+    @classmethod
+    def _build_base_metadata(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dict[str, Any]:
+        """Build base metadata dictionary for source assets."""
+        metadata: dict[str, Any] = {
+            "Source Type": cls.get_source_type(),
+        }
+        if data.query.content:
+            metadata["Raw SQL"] = dg.MetadataValue.md(
+                f"```sql\n{data.query.content}\n```"
+            )
+        table_deps = data.extracted_data.get("table_deps", [])
+        if table_deps:
+            metadata["Table Dependencies"] = dg.MetadataValue.json(table_deps)
+        return metadata
+
+    @public
+    @staticmethod
+    @abstractmethod
+    def get_source_type() -> str:
+        """Return the source type identifier (e.g., 'duckdb').
+
+        Returns:
+            The source type string that matches the 'type' field in connection.yaml.
+        """
+        raise NotImplementedError()
+
+    @public
+    @classmethod
+    @abstractmethod
+    def extract_data_from_source(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dict[str, Any]:
+        """Extract additional data from the source query.
+
+        This method is called before get_asset_spec to extract information
+        from the SQL query and connection configuration. The extracted data
+        is stored in data.extracted_data and can be used in get_asset_spec.
+
+        Common extracted data includes table dependencies parsed from the SQL query.
+
+        Args:
+            data: The translator data containing source and query information.
+
+        Returns:
+            Dictionary of extracted data. Common keys include:
+            - table_deps: List of table references extracted from the SQL query.
+
+        Example:
+
+            .. code-block:: python
+
+                class PostgresEvidenceProjectSource(BaseEvidenceProjectSource):
+                    @classmethod
+                    def extract_data_from_source(cls, data):
+                        from dagster_evidence.utils import extract_table_references
+                        table_refs = extract_table_references(
+                            data.query.content,
+                            default_schema="public",
+                        )
+                        return {"table_deps": table_refs}
+        """
+        raise NotImplementedError()
+
+    @public
+    @classmethod
+    @abstractmethod
+    def get_source_asset(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dg.AssetsDefinition:
+        """Get the AssetsDefinition for a source query.
+
+        Each source type must implement this method to define how its
+        assets are represented in Dagster. The returned asset includes
+        an automation condition that triggers when upstream dependencies
+        are updated.
+
+        Args:
+            data: The translator data containing source and query information.
+                The extracted_data field contains data from extract_data_from_source.
+
+        Returns:
+            The AssetsDefinition for the source query with automation condition.
+
+        Example:
+
+            .. code-block:: python
+
+                class PostgresEvidenceProjectSource(BaseEvidenceProjectSource):
+                    @staticmethod
+                    def get_source_type() -> str:
+                        return "postgres"
+
+                    @classmethod
+                    def get_source_asset(cls, data):
+                        # Use extracted table dependencies
+                        deps = []
+                        for ref in data.extracted_data.get("table_deps", []):
+                            if ref.get("table"):
+                                deps.append(dg.AssetKey([ref["table"]]))
+
+                        key = dg.AssetKey(["postgres", data.query.name])
+                        has_deps = bool(deps)
+
+                        @dg.asset(
+                            key=key,
+                            group_name=data.source_group,
+                            kinds={"evidence", "postgres"},
+                            deps=deps,
+                            automation_condition=dg.AutomationCondition.any_deps_match(
+                                dg.AutomationCondition.newly_updated()
+                            ) if has_deps else None,
+                        )
+                        def _source_asset():
+                            return dg.MaterializeResult()
+
+                        return _source_asset
+        """
+        raise NotImplementedError()
+
+
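Putting the abstract interface together, a minimal concrete subclass might look like the sketch below (the "static" source type is hypothetical and declares no upstream dependencies):

.. code-block:: python

    class StaticEvidenceProjectSource(BaseEvidenceProjectSource):
        """Hypothetical source type with no upstream dependencies."""

        @staticmethod
        def get_source_type() -> str:
            return "static"  # hypothetical 'type' value in connection.yaml

        @classmethod
        def extract_data_from_source(cls, data):
            return {"table_deps": []}  # nothing to parse from the SQL

        @classmethod
        def get_source_asset(cls, data):
            @dg.asset(
                key=dg.AssetKey([data.source_group, data.query.name]),
                group_name=data.effective_group_name,
                kinds={"evidence", "source", "static"},
                description=cls._build_description_with_sql(data),
                metadata=cls._build_base_metadata(data),
            )
            def _source_asset():
                return dg.MaterializeResult()

            return _source_asset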
+@beta
+@public
+class DuckdbEvidenceProjectSource(BaseEvidenceProjectSource):
+    """DuckDB source for Evidence projects.
+
+    Handles Evidence sources configured with ``type: duckdb`` in connection.yaml.
+
+    Example:
+
+        connection.yaml for a DuckDB source:
+
+        .. code-block:: yaml
+
+            type: duckdb
+            filename: ./data/analytics.duckdb
+    """
+
+    @classmethod
+    def get_hide_source_asset_default(cls) -> bool:
+        return False
+
+    @classmethod
+    def get_source_sensor_enabled_default(cls) -> bool:
+        return False
+
+    @staticmethod
+    def get_source_type() -> str:
+        return "duckdb"
+
+    @classmethod
+    def get_source_sensor(
+        cls,
+        data: "EvidenceSourceTranslatorData",
+        asset_key: dg.AssetKey,
+    ) -> dg.SensorDefinition | None:
+        """Get a sensor that monitors DuckDB tables for changes.
+
697
+ Uses information_schema queries with read-only connection to detect
698
+ changes in table row counts.
+        """
+        import json
+
+        options = data.source_content.connection.extra.get("options", {})
+        db_path = options.get("filename")
+        if not db_path:
+            return None
+
+        # Resolve relative path against source_path
+        if data.source_path and not os.path.isabs(db_path):
+            db_path = os.path.join(data.source_path, db_path)
+
+        table_deps = data.extracted_data.get("table_deps", [])
+        if not table_deps:
+            return None
+
+        source_group = data.source_group
+        query_name = data.query.name
+        sensor_name = f"{source_group}_{query_name}_sensor"
+
+        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
+        def duckdb_sensor(context: dg.SensorEvaluationContext):
+            try:
+                import duckdb
+            except ImportError:
+                raise ImportError(
+                    "duckdb is required for DuckDB sensors. "
+                    "Install it with: pip install dagster-evidence[duckdb]"
+                ) from None
+
+            try:
+                conn = duckdb.connect(db_path, read_only=True)
+            except Exception as e:
+                raise Exception(f"Could not connect to DuckDB: {e}") from e
+
+            try:
+                table_counts: dict[str, int] = {}
+                for ref in table_deps:
+                    table_name = ref.get("table")
+                    schema = ref.get("schema", "main")
+                    if table_name:
+                        try:
+                            result = conn.execute(
+                                """
+                                SELECT estimated_size
+                                FROM duckdb_tables()
+                                WHERE table_name = ? AND schema_name = ?
+                                """,
+                                [table_name, schema],
+                            ).fetchone()
+                            table_counts[f"{schema}.{table_name}"] = (
+                                result[0] if result else 0
+                            )
+                        except Exception:
+                            table_counts[f"{schema}.{table_name}"] = 0
+            finally:
+                conn.close()
+
+            cursor = json.loads(context.cursor) if context.cursor else {}
+            last_counts = cursor.get("counts", {})
+
+            if table_counts != last_counts:
+                context.update_cursor(json.dumps({"counts": table_counts}))
+                yield dg.RunRequest(asset_selection=[asset_key])
+
+        return duckdb_sensor
+
+    @classmethod
+    def extract_data_from_source(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dict[str, Any]:
+        """Extract table references from DuckDB source query."""
+        from dagster_evidence.utils import extract_table_references
+
+        options = data.source_content.connection.extra.get("options", {})
+        # For DuckDB, database can be inferred from filename (without .duckdb extension)
+        filename = options.get("filename", "")
+        default_database = filename.replace(".duckdb", "") if filename else None
+        default_schema = "main"  # DuckDB default schema
+
+        table_refs = extract_table_references(
+            data.query.content,
+            default_database=default_database,
+            default_schema=default_schema,
+        )
+        return {"table_deps": table_refs}
+
+    @classmethod
+    def get_source_asset(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dg.AssetsDefinition:
+        """Get the AssetsDefinition for a DuckDB source query."""
+        deps = []
+        for ref in data.extracted_data.get("table_deps", []):
+            if ref.get("table"):
+                deps.append(dg.AssetKey([ref["table"]]))
+
+        key = dg.AssetKey([data.source_group, data.query.name])
+        group_name = data.effective_group_name
+        has_deps = bool(deps)
+
+        # Add description and metadata
+        description = cls._build_description_with_sql(data)
+        metadata = cls._build_base_metadata(data)
+
+        @dg.asset(
+            key=key,
+            group_name=group_name,
+            kinds={"evidence", "source", "duckdb"},
+            deps=deps,
+            description=description,
+            metadata=metadata,
+            automation_condition=dg.AutomationCondition.any_deps_match(
+                dg.AutomationCondition.newly_updated()
+            )
+            if has_deps
+            else None,
+        )
+        def _source_asset():
+            return dg.MaterializeResult()
+
+        return _source_asset
+
+
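The dependency wiring above assumes ``extract_table_references`` yields dicts with ``database``/``schema``/``table`` keys, and only the bare table name becomes an asset key. A small illustration with hypothetical refs:

.. code-block:: python

    refs = [
        {"database": "analytics", "schema": "main", "table": "orders"},
        {"database": "analytics", "schema": "main", "table": None},  # skipped
    ]
    deps = [dg.AssetKey([ref["table"]]) for ref in refs if ref.get("table")]
    assert deps == [dg.AssetKey(["orders"])]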
+@beta
+@public
+class MotherDuckEvidenceProjectSource(BaseEvidenceProjectSource):
+    """MotherDuck source for Evidence projects.
+
+    Handles Evidence sources configured with ``type: motherduck`` in connection.yaml.
+
+    Example:
+
+        connection.yaml for a MotherDuck source:
+
+        .. code-block:: yaml
+
+            type: motherduck
+            token: ${MOTHERDUCK_TOKEN}
+            database: my_database
+    """
+
+    @classmethod
+    def get_hide_source_asset_default(cls) -> bool:
+        return False
+
+    @classmethod
+    def get_source_sensor_enabled_default(cls) -> bool:
+        return False
+
+    @staticmethod
+    def get_source_type() -> str:
+        return "motherduck"
+
+    @classmethod
+    def get_source_sensor(
+        cls,
+        data: "EvidenceSourceTranslatorData",
+        asset_key: dg.AssetKey,
+    ) -> dg.SensorDefinition | None:
+        """Get a sensor that monitors MotherDuck tables for changes.
+
+        Polls ``duckdb_tables()`` over a read-only connection and compares
+        estimated table sizes to detect changes.
+        """
+        import json
+        import os
+
+        options = data.source_content.connection.extra.get("options", {})
+        database = options.get("database")
+        token = options.get("token") or os.environ.get("MOTHERDUCK_TOKEN")
+
+        if not database or not token:
+            return None
+
+        table_deps = data.extracted_data.get("table_deps", [])
+        if not table_deps:
+            return None
+
+        source_group = data.source_group
+        query_name = data.query.name
+        sensor_name = f"{source_group}_{query_name}_sensor"
+
+        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
+        def motherduck_sensor(context: dg.SensorEvaluationContext):
+            try:
+                import duckdb
+            except ImportError:
+                raise ImportError(
+                    "duckdb is required for MotherDuck sensors. "
+                    "Install it with: pip install dagster-evidence[duckdb]"
+                ) from None
+
+            md_token = os.environ.get("MOTHERDUCK_TOKEN", token)
+            connection_string = f"md:{database}?motherduck_token={md_token}"
+
+            try:
+                conn = duckdb.connect(connection_string, read_only=True)
+            except Exception as e:
+                raise Exception(f"Could not connect to MotherDuck: {e}") from e
+
+            try:
+                table_counts: dict[str, int] = {}
+                for ref in table_deps:
+                    table_name = ref.get("table")
+                    schema = ref.get("schema", "main")
+                    if table_name:
+                        try:
+                            result = conn.execute(
+                                """
+                                SELECT estimated_size
+                                FROM duckdb_tables()
+                                WHERE table_name = ? AND schema_name = ?
+                                """,
+                                [table_name, schema],
+                            ).fetchone()
+                            table_counts[f"{schema}.{table_name}"] = (
+                                result[0] if result else 0
+                            )
+                        except Exception:
+                            table_counts[f"{schema}.{table_name}"] = 0
+            finally:
+                conn.close()
+
+            cursor = json.loads(context.cursor) if context.cursor else {}
+            last_counts = cursor.get("counts", {})
+
+            if table_counts != last_counts:
+                context.update_cursor(json.dumps({"counts": table_counts}))
+                yield dg.RunRequest(asset_selection=[asset_key])
+
+        return motherduck_sensor
+
+    @classmethod
+    def extract_data_from_source(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dict[str, Any]:
+        """Extract table references from MotherDuck source query."""
+        from dagster_evidence.utils import extract_table_references
+
+        # Get database from connection config options
+        options = data.source_content.connection.extra.get("options", {})
+        default_database = options.get("database")
+        default_schema = "main"  # MotherDuck default schema
+
+        table_refs = extract_table_references(
+            data.query.content,
+            default_database=default_database,
+            default_schema=default_schema,
+        )
+        return {"table_deps": table_refs}
+
+    @classmethod
+    def get_source_asset(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dg.AssetsDefinition:
+        """Get the AssetsDefinition for a MotherDuck source query."""
+        deps = []
+        for ref in data.extracted_data.get("table_deps", []):
+            if ref.get("table"):
+                deps.append(dg.AssetKey([ref["table"]]))
+
+        key = dg.AssetKey([data.source_group, data.query.name])
+        group_name = data.effective_group_name
+        has_deps = bool(deps)
+
+        # Add description and metadata
+        description = cls._build_description_with_sql(data)
+        metadata = cls._build_base_metadata(data)
+
+        @dg.asset(
+            key=key,
+            group_name=group_name,
+            kinds={"evidence", "source", "motherduck"},
+            deps=deps,
+            description=description,
+            metadata=metadata,
+            automation_condition=dg.AutomationCondition.any_deps_match(
+                dg.AutomationCondition.newly_updated()
+            )
+            if has_deps
+            else None,
+        )
+        def _source_asset():
+            return dg.MaterializeResult()
+
+        return _source_asset
+
+
+@beta
+@public
+class BigQueryEvidenceProjectSource(BaseEvidenceProjectSource):
+    """BigQuery source for Evidence projects.
+
+    Handles Evidence sources configured with ``type: bigquery`` in connection.yaml.
+
+    Example:
+
+        connection.yaml for a BigQuery source:
+
+        .. code-block:: yaml
+
+            type: bigquery
+            project_id: my-gcp-project
+            credentials: ${GOOGLE_APPLICATION_CREDENTIALS}
+    """
+
+    @classmethod
+    def get_hide_source_asset_default(cls) -> bool:
+        return True
+
+    @classmethod
+    def get_source_sensor_enabled_default(cls) -> bool:
+        return False
+
+    @staticmethod
+    def get_source_type() -> str:
+        return "bigquery"
+
+    @classmethod
+    def get_source_sensor(
+        cls,
+        data: "EvidenceSourceTranslatorData",
+        asset_key: dg.AssetKey,
+    ) -> dg.SensorDefinition | None:
+        """Get a sensor that monitors BigQuery tables for changes.
+
+        Uses the BigQuery API to check ``table.modified`` timestamps.
+        """
+        import json
+
+        options = data.source_content.connection.extra.get("options", {})
+        project_id = options.get("project_id")
+
+        if not project_id:
+            return None
+
+        table_deps = data.extracted_data.get("table_deps", [])
+        if not table_deps:
+            return None
+
+        source_group = data.source_group
+        query_name = data.query.name
+        sensor_name = f"{source_group}_{query_name}_sensor"
+
+        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
+        def bigquery_sensor(context: dg.SensorEvaluationContext):
+            try:
+                from google.cloud import bigquery
+            except ImportError:
+                raise ImportError(
+                    "google-cloud-bigquery is required for BigQuery sensors. "
+                    "Install it with: pip install dagster-evidence[bigquery]"
+                ) from None
+
+            try:
+                client = bigquery.Client(project=project_id)
+            except Exception as e:
+                raise Exception(f"Could not connect to BigQuery: {e}") from e
+
+            mod_times: dict[str, str] = {}
+            for ref in table_deps:
+                table_name = ref.get("table")
+                dataset = ref.get("schema")
+                if table_name and dataset:
+                    try:
+                        table_ref = f"{project_id}.{dataset}.{table_name}"
+                        table = client.get_table(table_ref)
+                        if table.modified:
+                            mod_times[table_ref] = table.modified.isoformat()
+                    except Exception:
+                        pass
+
+            cursor = json.loads(context.cursor) if context.cursor else {}
+            last_mod_times = cursor.get("mod_times", {})
+
+            if mod_times != last_mod_times:
+                context.update_cursor(json.dumps({"mod_times": mod_times}))
+                yield dg.RunRequest(asset_selection=[asset_key])
+
+        return bigquery_sensor
+
+    @classmethod
+    def extract_data_from_source(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dict[str, Any]:
+        """Extract table references from BigQuery source query."""
+        from dagster_evidence.utils import extract_table_references
+
+        # Get project and dataset from connection config options
+        options = data.source_content.connection.extra.get("options", {})
+        default_database = options.get("project_id")
+        default_schema = options.get("dataset")
+
+        table_refs = extract_table_references(
+            data.query.content,
+            default_database=default_database,
+            default_schema=default_schema,
+        )
+        return {"table_deps": table_refs}
+
+    @classmethod
+    def get_source_asset(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dg.AssetsDefinition:
+        """Get the AssetsDefinition for a BigQuery source query."""
+        deps = []
+        for ref in data.extracted_data.get("table_deps", []):
+            if ref.get("table"):
+                deps.append(dg.AssetKey([ref["table"]]))
+
+        key = dg.AssetKey([data.source_group, data.query.name])
+        group_name = data.effective_group_name
+        has_deps = bool(deps)
+
+        # Add description and metadata
+        description = cls._build_description_with_sql(data)
+        metadata = cls._build_base_metadata(data)
+
+        @dg.asset(
+            key=key,
+            group_name=group_name,
+            kinds={"evidence", "source", "bigquery"},
+            deps=deps,
+            description=description,
+            metadata=metadata,
+            automation_condition=dg.AutomationCondition.any_deps_match(
+                dg.AutomationCondition.newly_updated()
+            )
+            if has_deps
+            else None,
+        )
+        def _source_asset():
+            return dg.MaterializeResult()
+
+        return _source_asset
+
+
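Because BigQuery defaults ``get_hide_source_asset_default`` to True, surfacing its assets takes a per-source ``meta.dagster`` override. A sketch using ``SourceContent.from_dict`` (the query content is a placeholder):

.. code-block:: python

    data = {
        "connection": {
            "type": "bigquery",
            "project_id": "my-gcp-project",
            "meta": {"dagster": {"hide_source_asset": False}},
        },
        "queries": [{"name": "daily_orders", "content": "SELECT 1"}],
    }
    source = BigQueryEvidenceProjectSource(
        source_content=SourceContent.from_dict(data)
    )
    # The per-source override beats the class default (True).
    assert source.get_hide_source_asset() is False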
+@beta
+@public
+class GSheetsEvidenceProjectSource(BaseEvidenceProjectSource):
+    """Google Sheets source for Evidence projects.
+
+    Handles Evidence sources configured with ``type: gsheets`` in connection.yaml.
+    Unlike SQL-based sources, gsheets sources define sheets and pages declaratively
+    rather than using SQL queries.
+
+    Example:
+
+        connection.yaml for a Google Sheets source:
+
+        .. code-block:: yaml
+
+            name: my_sheets
+            type: gsheets
+            options:
+                ratelimitms: 2500
+            sheets:
+                sales_data:
+                    id: 1Sc4nyLSSNETSIEpNKzheh5AFJJ-YA-wQeubFgeeEw9g
+                    pages:
+                        - q1_sales
+                        - q2_sales
+                inventory:
+                    id: kj235Bo3wRFG9kj3tp98grnPB-P97iu87lv877gliuId
+
+        This generates assets:
+        - ``[source_group, "sales_data", "q1_sales"]``
+        - ``[source_group, "sales_data", "q2_sales"]``
+        - ``[source_group, "inventory"]``
+    """
+
+    @classmethod
+    def get_source_sensor_enabled_default(cls) -> bool:
+        return False
+
+    @staticmethod
+    def get_source_type() -> str:
+        return "gsheets"
+
+    @classmethod
+    def get_source_sensor(
+        cls,
+        data: "EvidenceSourceTranslatorData",
+        asset_key: dg.AssetKey,
+    ) -> dg.SensorDefinition | None:
+        """Get a sensor that monitors Google Sheets for changes.
+
+        Uses the Google Drive API to check ``modifiedTime`` and ``version``.
+        """
+        import json
+
+        options = data.source_content.connection.extra.get("options", {})
+        service_account_path = options.get("service_account_path")
+        sheets_config = data.source_content.connection.extra.get("sheets", {})
+
+        # Parse query.name to get sheet_name
+        parts = data.query.name.split("/", 1)
+        sheet_name = parts[0]
+
+        sheet_config = sheets_config.get(sheet_name, {})
+        sheet_id = sheet_config.get("id") if isinstance(sheet_config, dict) else None
+
+        if not sheet_id:
+            return None
+
+        source_group = data.source_group
+        query_name = data.query.name.replace("/", "_")
+        sensor_name = f"{source_group}_{query_name}_sensor"
+
+        @dg.sensor(name=sensor_name, asset_selection=[asset_key])
+        def gsheets_sensor(context: dg.SensorEvaluationContext):
+            try:
+                from google.oauth2 import service_account
+                from googleapiclient.discovery import build
+            except ImportError:
+                raise ImportError(
+                    "google-api-python-client is required for Google Sheets sensors. "
+                    "Install it with: uv pip install 'dagster-evidence[gsheets]'"
+                ) from None
+
+            try:
+                if service_account_path:
+                    credentials = service_account.Credentials.from_service_account_file(
+                        service_account_path,
+                        scopes=["https://www.googleapis.com/auth/drive.readonly"],
+                    )
+                else:
+                    # Use default credentials
+                    import google.auth
+
+                    credentials, _ = google.auth.default(
+                        scopes=["https://www.googleapis.com/auth/drive.readonly"]
+                    )
+
+                service = build("drive", "v3", credentials=credentials)
+                file_metadata = (
+                    service.files()
+                    .get(fileId=sheet_id, fields="modifiedTime,version")
+                    .execute()
+                )
+
+                current_state = {
+                    "modified_time": file_metadata.get("modifiedTime"),
+                    "version": file_metadata.get("version"),
+                }
+            except Exception as e:
+                raise Exception(f"Could not fetch Google Sheet metadata: {e}") from e
+
+            cursor = json.loads(context.cursor) if context.cursor else {}
+
+            if current_state.get("modified_time") != cursor.get(
+                "modified_time"
+            ) or current_state.get("version") != cursor.get("version"):
+                context.update_cursor(json.dumps(current_state))
+                yield dg.RunRequest(asset_selection=[asset_key])
+
+        return gsheets_sensor
+
+    @classmethod
+    def extract_data_from_source(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dict[str, Any]:
+        """Extract sheet configuration from Google Sheets source.
+
+        Google Sheets sources don't have SQL to parse, so this returns
+        the sheets configuration for use in get_asset_spec.
+        """
+        sheets_config = data.source_content.connection.extra.get("sheets", {})
+        return {"sheets_config": sheets_config}
+
+    @classmethod
+    def get_source_asset(
+        cls, data: "EvidenceSourceTranslatorData"
+    ) -> dg.AssetsDefinition:
+        """Get the AssetsDefinition for a Google Sheets source.
+
+        Parses the query name to extract sheet_name and optional page_name,
+        then builds a 3-part asset key: [source_group, sheet_name, page_name].
+        """
+        # Parse query.name to get sheet_name and optional page_name
+        # Format: "sheet_name" or "sheet_name/page_name"
+        parts = data.query.name.split("/", 1)
+        sheet_name = parts[0]
+        page_name = parts[1] if len(parts) > 1 else None
+
+        # Build asset key: [source_group, sheet_name, page_name] or [source_group, sheet_name]
+        if page_name:
+            key = dg.AssetKey([data.source_group, sheet_name, page_name])
+        else:
+            key = dg.AssetKey([data.source_group, sheet_name])
+
+        group_name = data.effective_group_name
+
+        # Build description
+        description = f"Evidence Google Sheets source: {sheet_name}"
+        if page_name:
+            description += f" / {page_name}"
+
+        # Build metadata with sheet URL
+        metadata: dict[str, Any] = {"Source Type": "gsheets"}
+        sheets_config = data.extracted_data.get("sheets_config", {})
+        sheet_config = sheets_config.get(sheet_name, {})
+        sheet_id = sheet_config.get("id") if isinstance(sheet_config, dict) else None
+        if sheet_id:
+            metadata["Sheet ID"] = sheet_id
+            metadata["Sheet URL"] = dg.MetadataValue.url(
+                f"https://docs.google.com/spreadsheets/d/{sheet_id}"
+            )
+
+        @dg.asset(
+            key=key,
+            group_name=group_name,
+            kinds={"evidence", "source", "gsheets"},
+            deps=[],  # No upstream deps for gsheets - they are source of truth
+            description=description,
+            metadata=metadata,
+        )
+        def _source_asset():
+            return dg.MaterializeResult()
+
+        return _source_asset
+
+    @classmethod
+    def build_queries_from_sheets_config(
+        cls, connection: dict[str, Any]
+    ) -> list[dict[str, str]]:
+        """Build virtual queries from sheets configuration.
+
+        This method synthesizes SourceQuery-compatible dictionaries from
+        the sheets configuration in connection.yaml. Each sheet/page
+        combination becomes a "virtual query" with an empty content field.
+
+        Args:
+            connection: The full connection configuration dictionary.
+
+        Returns:
+            List of query dictionaries with "name" and "content" keys.
+        """
+        queries: list[dict[str, str]] = []
+        sheets = connection.get("sheets", {})
+        for sheet_name, sheet_config in sheets.items():
+            if not isinstance(sheet_config, dict):
+                continue
+            pages = sheet_config.get("pages", [])
+            if pages:
+                for page in pages:
+                    queries.append({"name": f"{sheet_name}/{page}", "content": ""})
+            else:
+                # No pages specified - create single asset for the sheet
+                queries.append({"name": sheet_name, "content": ""})
+        return queries
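To make the "virtual query" shape concrete, the sheets configuration from the class docstring (sheet IDs abbreviated) expands as follows:

.. code-block:: python

    connection = {
        "sheets": {
            "sales_data": {"id": "1Sc4nyLSS...", "pages": ["q1_sales", "q2_sales"]},
            "inventory": {"id": "kj235Bo3w..."},
        }
    }
    queries = GSheetsEvidenceProjectSource.build_queries_from_sheets_config(connection)
    # [
    #     {"name": "sales_data/q1_sales", "content": ""},
    #     {"name": "sales_data/q2_sales", "content": ""},
    #     {"name": "inventory", "content": ""},
    # ]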