dagster-cloud 1.8.2__py3-none-any.whl → 1.12.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_cloud/__init__.py +3 -3
- dagster_cloud/agent/__init__.py +4 -4
- dagster_cloud/agent/cli/__init__.py +56 -17
- dagster_cloud/agent/dagster_cloud_agent.py +360 -172
- dagster_cloud/agent/instrumentation/__init__.py +0 -0
- dagster_cloud/agent/instrumentation/constants.py +2 -0
- dagster_cloud/agent/instrumentation/run_launch.py +23 -0
- dagster_cloud/agent/instrumentation/schedule.py +34 -0
- dagster_cloud/agent/instrumentation/sensor.py +34 -0
- dagster_cloud/anomaly_detection/__init__.py +2 -2
- dagster_cloud/anomaly_detection/defs.py +17 -12
- dagster_cloud/anomaly_detection/types.py +3 -3
- dagster_cloud/api/dagster_cloud_api.py +209 -293
- dagster_cloud/auth/constants.py +21 -5
- dagster_cloud/batching/__init__.py +1 -0
- dagster_cloud/batching/batcher.py +210 -0
- dagster_cloud/dagster_insights/__init__.py +12 -6
- dagster_cloud/dagster_insights/bigquery/bigquery_utils.py +3 -2
- dagster_cloud/dagster_insights/bigquery/dbt_wrapper.py +39 -12
- dagster_cloud/dagster_insights/bigquery/insights_bigquery_resource.py +8 -6
- dagster_cloud/dagster_insights/insights_utils.py +18 -8
- dagster_cloud/dagster_insights/metrics_utils.py +12 -12
- dagster_cloud/dagster_insights/snowflake/dagster_snowflake_insights.py +5 -12
- dagster_cloud/dagster_insights/snowflake/dbt_wrapper.py +34 -8
- dagster_cloud/dagster_insights/snowflake/definitions.py +38 -12
- dagster_cloud/dagster_insights/snowflake/insights_snowflake_resource.py +11 -23
- dagster_cloud/definitions/__init__.py +0 -0
- dagster_cloud/definitions/job_selection.py +36 -0
- dagster_cloud/execution/cloud_run_launcher/k8s.py +1 -1
- dagster_cloud/execution/cloud_run_launcher/process.py +3 -3
- dagster_cloud/execution/monitoring/__init__.py +27 -33
- dagster_cloud/execution/utils/process.py +3 -3
- dagster_cloud/instance/__init__.py +125 -38
- dagster_cloud/instrumentation/__init__.py +32 -0
- dagster_cloud/metadata/source_code.py +13 -8
- dagster_cloud/metrics/__init__.py +0 -0
- dagster_cloud/metrics/tracer.py +59 -0
- dagster_cloud/opentelemetry/__init__.py +0 -0
- dagster_cloud/opentelemetry/config/__init__.py +73 -0
- dagster_cloud/opentelemetry/config/exporter.py +81 -0
- dagster_cloud/opentelemetry/config/log_record_processor.py +40 -0
- dagster_cloud/opentelemetry/config/logging_handler.py +14 -0
- dagster_cloud/opentelemetry/config/meter_provider.py +9 -0
- dagster_cloud/opentelemetry/config/metric_reader.py +39 -0
- dagster_cloud/opentelemetry/controller.py +319 -0
- dagster_cloud/opentelemetry/enum.py +58 -0
- dagster_cloud/opentelemetry/factories/__init__.py +1 -0
- dagster_cloud/opentelemetry/factories/logs.py +113 -0
- dagster_cloud/opentelemetry/factories/metrics.py +121 -0
- dagster_cloud/opentelemetry/metrics/__init__.py +0 -0
- dagster_cloud/opentelemetry/metrics/meter.py +140 -0
- dagster_cloud/opentelemetry/observers/__init__.py +0 -0
- dagster_cloud/opentelemetry/observers/dagster_exception_handler.py +40 -0
- dagster_cloud/opentelemetry/observers/execution_observer.py +178 -0
- dagster_cloud/pex/grpc/__generated__/multi_pex_api_pb2.pyi +175 -0
- dagster_cloud/pex/grpc/__init__.py +2 -2
- dagster_cloud/pex/grpc/client.py +4 -4
- dagster_cloud/pex/grpc/compile.py +2 -2
- dagster_cloud/pex/grpc/server/__init__.py +2 -2
- dagster_cloud/pex/grpc/server/cli/__init__.py +31 -19
- dagster_cloud/pex/grpc/server/manager.py +60 -42
- dagster_cloud/pex/grpc/server/registry.py +28 -21
- dagster_cloud/pex/grpc/server/server.py +23 -14
- dagster_cloud/pex/grpc/types.py +5 -5
- dagster_cloud/py.typed +0 -0
- dagster_cloud/secrets/__init__.py +1 -1
- dagster_cloud/secrets/loader.py +3 -3
- dagster_cloud/serverless/__init__.py +1 -1
- dagster_cloud/serverless/io_manager.py +36 -53
- dagster_cloud/storage/client.py +54 -17
- dagster_cloud/storage/compute_logs/__init__.py +3 -1
- dagster_cloud/storage/compute_logs/compute_log_manager.py +22 -17
- dagster_cloud/storage/defs_state/__init__.py +3 -0
- dagster_cloud/storage/defs_state/queries.py +15 -0
- dagster_cloud/storage/defs_state/storage.py +113 -0
- dagster_cloud/storage/event_logs/__init__.py +3 -1
- dagster_cloud/storage/event_logs/queries.py +102 -4
- dagster_cloud/storage/event_logs/storage.py +266 -73
- dagster_cloud/storage/event_logs/utils.py +88 -7
- dagster_cloud/storage/runs/__init__.py +1 -1
- dagster_cloud/storage/runs/queries.py +17 -2
- dagster_cloud/storage/runs/storage.py +88 -42
- dagster_cloud/storage/schedules/__init__.py +1 -1
- dagster_cloud/storage/schedules/storage.py +6 -8
- dagster_cloud/storage/tags.py +66 -1
- dagster_cloud/util/__init__.py +10 -12
- dagster_cloud/util/errors.py +49 -64
- dagster_cloud/version.py +1 -1
- dagster_cloud/workspace/config_schema/__init__.py +55 -13
- dagster_cloud/workspace/docker/__init__.py +76 -25
- dagster_cloud/workspace/docker/utils.py +1 -1
- dagster_cloud/workspace/ecs/__init__.py +1 -1
- dagster_cloud/workspace/ecs/client.py +51 -33
- dagster_cloud/workspace/ecs/launcher.py +76 -22
- dagster_cloud/workspace/ecs/run_launcher.py +3 -3
- dagster_cloud/workspace/ecs/utils.py +14 -5
- dagster_cloud/workspace/kubernetes/__init__.py +1 -1
- dagster_cloud/workspace/kubernetes/launcher.py +61 -29
- dagster_cloud/workspace/kubernetes/utils.py +34 -22
- dagster_cloud/workspace/user_code_launcher/__init__.py +5 -3
- dagster_cloud/workspace/user_code_launcher/process.py +16 -14
- dagster_cloud/workspace/user_code_launcher/user_code_launcher.py +552 -172
- dagster_cloud/workspace/user_code_launcher/utils.py +105 -1
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/METADATA +48 -42
- dagster_cloud-1.12.6.dist-info/RECORD +134 -0
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/WHEEL +1 -1
- dagster_cloud-1.8.2.dist-info/RECORD +0 -100
- {dagster_cloud-1.8.2.dist-info → dagster_cloud-1.12.6.dist-info}/top_level.txt +0 -0
dagster_cloud/auth/constants.py
CHANGED
@@ -1,3 +1,4 @@
+import uuid
 from typing import Optional
 
 from dagster._core.errors import DagsterInvariantViolationError
@@ -19,12 +20,27 @@ def get_organization_public_id_from_api_token(api_token: str) -> Optional[str]:
     return split_token[2]
 
 
-def decode_agent_token(agent_token: str) -> Optional[str]:
+def decode_region_from_uuid(regional_token: str) -> Optional[str]:
+    try:
+        regional_uuid = uuid.UUID(regional_token)
+    except ValueError:
+        # if it's not an actual uuid, we can't decode region
+        return None
+
+    # custom uuids contain region subdomains in the first 2 bytes
+    if regional_uuid.version != 8 or regional_uuid.variant != uuid.RFC_4122:
+        return None
+
+    uuid_bytes = regional_uuid.bytes
+    return uuid_bytes[:2].decode("ascii")
+
+
+def decode_agent_token(agent_token: str) -> tuple[Optional[str], Optional[str]]:
     split_token = agent_token.split(":")
 
     # Legacy agent token format - organization must be specified in dagster.yaml
     if len(split_token) == 1:
-        return None
+        return None, None
 
     token_type, *token = split_token
 
@@ -35,6 +51,6 @@ def get_organization_name_from_agent_token(agent_token: str) -> Optional[str]:
             "Generate a new agent token in Dagster Cloud."
         )
 
-
-
-    return organization
+    # token format: agent:<org>:<uuid>
+    organization, uuid_str = token
+    return organization, decode_region_from_uuid(uuid_str)
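Note: the new decode_region_from_uuid helper reads a two-character region subdomain out of the first two bytes of an RFC 4122, version-8 UUID. A minimal sketch of the encoding direction implied by that decoder; encode_region_into_uuid and the "us" region value are illustrative, not part of the package:

    import uuid

    def encode_region_into_uuid(region: str) -> uuid.UUID:
        # Hypothetical inverse of decode_region_from_uuid: put a 2-char ASCII
        # region code in the first two bytes, then set the bits the decoder checks.
        raw = bytearray(uuid.uuid4().bytes)
        raw[0:2] = region.encode("ascii")  # region subdomain in first 2 bytes
        raw[6] = (raw[6] & 0x0F) | 0x80    # version nibble -> 8
        raw[8] = (raw[8] & 0x3F) | 0x80    # variant bits -> RFC 4122
        return uuid.UUID(bytes=bytes(raw))

    token = encode_region_into_uuid("us")
    assert token.version == 8 and token.variant == uuid.RFC_4122
    assert token.bytes[:2].decode("ascii") == "us"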
dagster_cloud/batching/__init__.py
ADDED
@@ -0,0 +1 @@
+from dagster_cloud.batching.batcher import Batcher as Batcher
dagster_cloud/batching/batcher.py
ADDED
@@ -0,0 +1,210 @@
+import logging
+import os
+from collections.abc import Generator
+from concurrent.futures import Future, TimeoutError
+from contextlib import contextmanager
+from queue import Empty, Full, Queue
+from threading import Lock
+from typing import Callable, Generic, Optional, TypeVar
+
+import dagster._check as check
+
+from dagster_cloud.instrumentation import Instrumentation, NoOpInstrumentation
+
+logger = logging.getLogger(__name__)
+
+I = TypeVar("I")  # noqa: E741
+O = TypeVar("O")  # noqa: E741
+QueueItem = tuple[I, Future[O]]
+
+
+DEFAULT_MAX_WAIT_MS = 1000
+DEFAULT_MAX_BATCH_SIZE = 100
+DEFAULT_MAX_QUEUE_SIZE = 1000
+
+
+def _get_override_for_name(setting: str, name: str) -> Optional[int]:
+    env_name = f"DAGSTER_BATCHING__{name.upper().replace('-', '_')}__{setting.upper()}"
+    value = os.getenv(env_name)
+    if value is None:
+        return None
+
+    try:
+        value_int = int(value)
+        if value_int <= 0:
+            logger.warning(
+                f"Environment variable misconfiguration for {env_name} (should be positive int, got: '{value}')"
+            )
+            return None
+        return value_int
+    except ValueError:
+        logger.warning(
+            f"Environment variable misconfiguration for {env_name} (should be positive int, got: '{value}')"
+        )
+        return None
+
+
+def _get_config(
+    setting: str, name: str, passed_in_default: Optional[int], global_default: int
+) -> int:
+    override = _get_override_for_name(setting, name)
+    if override is not None:
+        return override
+
+    if passed_in_default is not None:
+        return passed_in_default
+
+    return global_default
+
+
+class Batcher(Generic[I, O]):
+    """the basic algorithm is.
+
+    1. insert (item, future) into queue
+    2. wait for future to complete, with max timeout
+    2a. if future completes, return result
+    2b. on timeout, acquire lock, then drain the queue until
+        the future completes
+
+    NOTE: if the queue is full, submit() will raise an exception
+    NOTE: the lock means that only one thread will ever be running the batcher_fn
+        at a time. the algorithm would still be correct without the lock but
+        locking leads to larger batches. HOWEVER without the lock we might try
+        to submit empty batches, which there is currently an invariant to protect
+        against
+    NOTE: the max queue size is meant to cap the number of inflight requests
+        in order to fail faster if the underlying function is taking too long
+        (database issues).
+
+    Configuration for queue size, max wait, and batch size is specified (by priority order) by:
+
+    1. an env var override (of the form DAGSTER_BATCHING__TEST__MAX_WAIT_MS -- see _get_override_for_name)
+    2. the passed in value
+    3. the default (specified in this file)
+    """
+
+    def __init__(
+        self,
+        name: str,
+        batcher_fn: Callable[[list[I]], list[O]],
+        max_queue_size: Optional[int] = None,
+        max_batch_size: Optional[int] = None,
+        max_wait_ms: Optional[int] = None,
+        instrumentation: Optional[Instrumentation] = None,
+    ) -> None:
+        check.invariant(
+            max_wait_ms is None or max_wait_ms > 0,
+            "max wait, if provided, must be set to a positive integer",
+        )
+        check.invariant(
+            max_queue_size is None or max_queue_size > 0,
+            "max queue size, if provided, must be set to a positive integer",
+        )
+        check.invariant(
+            max_batch_size is None or max_batch_size > 0,
+            "max batch size, if provided, must be set to a positive integer",
+        )
+        if max_queue_size and max_batch_size:
+            check.invariant(
+                max_batch_size <= max_queue_size,
+                "if max batch size and max queue size are provided, max batch size must be "
+                "less than or equal to max queue size",
+            )
+        self._name = name
+        self._batcher_fn = batcher_fn
+        self._max_batch_size = _get_config(
+            "max_batch_size", name, max_batch_size, DEFAULT_MAX_BATCH_SIZE
+        )
+        self._max_wait_ms: float = _get_config(
+            "max_wait_ms", name, max_wait_ms, DEFAULT_MAX_WAIT_MS
+        )
+        config_max_queue_size = _get_config(
+            "max_queue_size", name, max_queue_size, DEFAULT_MAX_QUEUE_SIZE
+        )
+        self._queue: Queue[QueueItem] = Queue(maxsize=config_max_queue_size)
+        self._drain_lock = Lock()
+        self._instrumentation = (instrumentation or NoOpInstrumentation()).tags([f"batcher:{name}"])
+
+    def _submit_batch(self, batch: list[QueueItem]) -> None:
+        check.invariant(len(batch) > 0, "should never submit an empty batch")
+        self._instrument_batch_size(len(batch))
+        try:
+            with self._time("batcher_fn"):
+                results = self._batcher_fn([i for i, _ in batch])
+        except Exception as e:
+            for _, fut in batch:
+                fut.set_exception(e)
+        else:
+            check.invariant(
+                len(results) == len(batch), "batcher returned fewer results than expected"
+            )
+            for (_, fut), result in zip(batch, results):
+                fut.set_result(result)
+
+    def _build_batch(self) -> list[QueueItem]:
+        batch = []
+        for _ in range(self._max_batch_size):
+            try:
+                batch.append(self._queue.get(block=False))
+            except Empty:
+                break
+        return batch
+
+    @contextmanager
+    def _lock(self) -> Generator[None, None, None]:
+        with self._time("lock_acquisition"):
+            self._drain_lock.acquire()
+        try:
+            yield
+        finally:
+            self._drain_lock.release()
+
+    def _drain_batch(self, fut: Future[O]) -> O:
+        with self._lock(), self._time("drain_batch"):
+            while not fut.done():
+                self._submit_batch(self._build_batch())
+        return fut.result()
+
+    def submit(self, i: I) -> O:
+        with self._time("submit"):
+            fut: Future[O] = Future()
+            try:
+                self._queue.put((i, fut), block=False)
+            except Full:
+                self._instrumentation.increment("dagster.batching.full")
+                logger.exception(f"Batching queue for batcher {self._name} is full!")
+                raise
+            else:
+                try:
+                    queue_size = self._queue.qsize()
+                    self._instrument_queue_size(queue_size)
+                    timeout = 0 if queue_size >= self._max_batch_size else self._max_wait_ms / 1000
+                    return fut.result(timeout=timeout)
+                except TimeoutError:
+                    self._instrumentation.increment("dagster.batching.timeout")
+                    self._drain_batch(fut)
+                    return fut.result()
+
+    def _instrument_queue_size(self, queue_size: int) -> None:
+        self._instrumentation.histogram("dagster.batching.queue_size", queue_size)
+        for bucket in [5, 10, 100]:
+            if queue_size >= bucket:
+                self._instrumentation.increment(f"dagster.batching.queue_size.ge_{bucket}")
+            else:
+                break
+
+    def _instrument_batch_size(self, batch_size: int) -> None:
+        self._instrumentation.histogram("dagster.batching.batch_size", batch_size)
+        for bucket in [5, 10, 100]:
+            if batch_size >= bucket:
+                self._instrumentation.increment(f"dagster.batching.batch_size.ge_{bucket}")
+            else:
+                break
+
+    @contextmanager
+    def _time(self, metric_name: str) -> Generator[None, None, None]:
+        with self._instrumentation.instrument_context(
+            f"dagster.batching.{metric_name}",
+            buckets_ms=[10, 100, 500, 1000],
+        ):
+            yield
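Note: the new Batcher coalesces concurrent submit() calls into single batcher_fn invocations, with whichever caller times out first draining the queue for everyone waiting. A minimal usage sketch against the API added above; load_rows and the key values are illustrative:

    from concurrent.futures import ThreadPoolExecutor

    from dagster_cloud.batching import Batcher

    # Hypothetical batch function: must return one result per input, in order.
    def load_rows(keys: list[str]) -> list[str]:
        return [f"row-for-{key}" for key in keys]

    batcher: Batcher[str, str] = Batcher("load-rows", load_rows, max_batch_size=50, max_wait_ms=100)

    # Each caller blocks on its own future; concurrent submissions are drained
    # together into one load_rows() call instead of 200 separate ones.
    with ThreadPoolExecutor(max_workers=8) as pool:
        results = list(pool.map(batcher.submit, [f"key-{n}" for n in range(200)]))

    assert results[0] == "row-for-key-0"

Per _get_override_for_name, the same settings could also be forced at runtime with environment variables such as DAGSTER_BATCHING__LOAD_ROWS__MAX_WAIT_MS.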
dagster_cloud/dagster_insights/__init__.py
CHANGED
@@ -1,15 +1,19 @@
 import sys
 from typing import Any
 
-from .snowflake.dbt_wrapper import dbt_with_snowflake_insights as dbt_with_snowflake_insights
-from .snowflake.definitions import (
+from dagster_cloud.dagster_insights.snowflake.dbt_wrapper import (
+    dbt_with_snowflake_insights as dbt_with_snowflake_insights,
+)
+from dagster_cloud.dagster_insights.snowflake.definitions import (
     create_snowflake_insights_asset_and_schedule as create_snowflake_insights_asset_and_schedule,
 )
-from .snowflake.snowflake_utils import meter_snowflake_query as meter_snowflake_query
+from dagster_cloud.dagster_insights.snowflake.snowflake_utils import (
+    meter_snowflake_query as meter_snowflake_query,
+)
 
 dagster_snowflake_req_imports = {"InsightsSnowflakeResource"}
 try:
-    from .snowflake.insights_snowflake_resource import (
+    from dagster_cloud.dagster_insights.snowflake.insights_snowflake_resource import (
         InsightsSnowflakeResource as InsightsSnowflakeResource,
     )
 except ImportError:
@@ -17,8 +21,10 @@ except ImportError:
 
 dagster_bigquery_req_imports = {"InsightsBigQueryResource", "dbt_with_bigquery_insights"}
 try:
-    from .bigquery.dbt_wrapper import dbt_with_bigquery_insights as dbt_with_bigquery_insights
-    from .bigquery.insights_bigquery_resource import (
+    from dagster_cloud.dagster_insights.bigquery.dbt_wrapper import (
+        dbt_with_bigquery_insights as dbt_with_bigquery_insights,
+    )
+    from dagster_cloud.dagster_insights.bigquery.insights_bigquery_resource import (
         InsightsBigQueryResource as InsightsBigQueryResource,
     )
 except ImportError:
dagster_cloud/dagster_insights/bigquery/bigquery_utils.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import Any, List, Mapping, Optional
+from collections.abc import Mapping
+from typing import Any, Optional
 
 from dagster import AssetKey, JobDefinition
 
@@ -15,7 +16,7 @@ def marker_asset_key_for_job(
 
 
 def build_bigquery_cost_metadata(
-    job_ids: Optional[List[str]], bytes_billed: int, slots_ms: int
+    job_ids: Optional[list[str]], bytes_billed: int, slots_ms: int
 ) -> Mapping[str, Any]:
     metadata: Mapping[str, Any] = {
         BIGQUERY_METADATA_BYTES_BILLED: bytes_billed,
dagster_cloud/dagster_insights/bigquery/dbt_wrapper.py
CHANGED
@@ -1,9 +1,11 @@
 from collections import defaultdict
+from collections.abc import Iterable, Iterator
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Iterable, Iterator, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union
 
 import yaml
 from dagster import (
+    AssetCheckEvaluation,
     AssetCheckResult,
     AssetExecutionContext,
     AssetKey,
@@ -16,8 +18,14 @@ from dagster_dbt import DbtCliInvocation
 from dagster_dbt.version import __version__ as dagster_dbt_version
 from packaging import version
 
-from ..insights_utils import extract_asset_info_from_event, handle_raise_on_error
-from .bigquery_utils import build_bigquery_cost_metadata, marker_asset_key_for_job
+from dagster_cloud.dagster_insights.bigquery.bigquery_utils import (
+    build_bigquery_cost_metadata,
+    marker_asset_key_for_job,
+)
+from dagster_cloud.dagster_insights.insights_utils import (
+    extract_asset_info_from_event,
+    handle_raise_on_error,
+)
 
 if TYPE_CHECKING:
     from dbt.adapters.base.impl import BaseAdapter
@@ -52,11 +60,21 @@ def dbt_with_bigquery_insights(
     context: Union[OpExecutionContext, AssetExecutionContext],
     dbt_cli_invocation: DbtCliInvocation,
     dagster_events: Optional[
-        Iterable[Union[Output, AssetMaterialization, AssetObservation, AssetCheckResult]]
+        Iterable[
+            Union[
+                Output,
+                AssetMaterialization,
+                AssetObservation,
+                AssetCheckResult,
+                AssetCheckEvaluation,
+            ]
+        ]
     ] = None,
     skip_config_check=False,
     record_observation_usage: bool = True,
-) -> Iterator[Union[Output, AssetMaterialization, AssetObservation, AssetCheckResult]]:
+) -> Iterator[
+    Union[Output, AssetMaterialization, AssetObservation, AssetCheckResult, AssetCheckEvaluation]
+]:
     """Wraps a dagster-dbt invocation to associate each BigQuery query with the produced
     asset materializations. This allows the cost of each query to be associated with the asset
     materialization that it produced.
@@ -67,7 +85,7 @@ def dbt_with_bigquery_insights(
     Args:
         context (AssetExecutionContext): The context of the asset that is being materialized.
         dbt_cli_invocation (DbtCliInvocation): The invocation of the dbt CLI to wrap.
-        dagster_events (Optional[Iterable[Union[Output, AssetObservation, AssetCheckResult]]]):
+        dagster_events (Optional[Iterable[Union[Output, AssetObservation, AssetCheckResult, AssetCheckEvaluation]]]):
            The events that were produced by the dbt CLI invocation. If not provided, it is assumed
            that the dbt CLI invocation has not yet been run, and it will be run and the events
            will be streamed.
@@ -116,7 +134,14 @@ def dbt_with_bigquery_insights(
     asset_info_by_unique_id = {}
     for dagster_event in dagster_events:
         if isinstance(
-            dagster_event, (AssetMaterialization, AssetObservation, Output, AssetCheckResult)
+            dagster_event,
+            (
+                AssetMaterialization,
+                AssetObservation,
+                Output,
+                AssetCheckResult,
+                AssetCheckEvaluation,
+            ),
         ):
             unique_id = dagster_event.metadata["unique_id"].value
             asset_key, partition = extract_asset_info_from_event(
@@ -133,7 +158,7 @@ def dbt_with_bigquery_insights(
     invocation_id = run_results_json["metadata"]["invocation_id"]
 
     # backcompat-proof in case the invocation does not have an instantiated adapter on it
-    adapter: Optional["BaseAdapter"] = getattr(dbt_cli_invocation, "adapter", None)
+    adapter: Optional[BaseAdapter] = getattr(dbt_cli_invocation, "adapter", None)
     if not adapter:
         if version.parse(dagster_dbt_version) < version.parse(MIN_DAGSTER_DBT_VERSION):
             upgrade_message = f" Extracting cost information requires dagster_dbt>={MIN_DAGSTER_DBT_VERSION} (found {dagster_dbt_version}). "
@@ -149,14 +174,16 @@ def dbt_with_bigquery_insights(
     cost_by_asset = defaultdict(list)
    try:
         with adapter.connection_named("dagster_insights:bigquery_cost"):
-            client: bigquery.Client = adapter.connections.get_thread_connection().handle
-            if client.location and client.project:
+            client: bigquery.Client = adapter.connections.get_thread_connection().handle  # pyright: ignore[reportAssignmentType]
+
+            if (client.location or adapter.config.credentials.location) and client.project:
                 # we should populate the location/project from the client, and use that to determine
                 # the correct INFORMATION_SCHEMA.JOBS table to query for cost information
-                location = client.location
+                # If the client doesn't have a location, fall back to the location provided
+                # in the dbt profile config
+                location = client.location or adapter.config.credentials.location
                 project = client.project
             else:
-                # try fetching the default dataset from the schema, if it exists
                 dataset = client.get_dataset(adapter.config.credentials.schema)
                 location = dataset.location if dataset else None
                 project = client.project or dataset.project
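Note: a sketch of how dbt_with_bigquery_insights is typically invoked from a @dbt_assets body, following the pattern shown in the Dagster Insights docs. The manifest path and asset function name are placeholders:

    from pathlib import Path

    from dagster import AssetExecutionContext
    from dagster_dbt import DbtCliResource, dbt_assets

    from dagster_cloud.dagster_insights import dbt_with_bigquery_insights

    @dbt_assets(manifest=Path("target", "manifest.json"))  # placeholder path
    def insights_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):
        dbt_cli_invocation = dbt.cli(["build"], context=context)
        # Streams the usual dbt events, then attributes BigQuery bytes billed
        # and slot-ms to the assets each query materialized.
        yield from dbt_with_bigquery_insights(context, dbt_cli_invocation)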
dagster_cloud/dagster_insights/bigquery/insights_bigquery_resource.py
CHANGED
@@ -1,16 +1,18 @@
+from collections.abc import Iterator
 from contextlib import contextmanager, nullcontext
-from typing import Iterator, List
 
 from dagster import AssetObservation
-from dagster._annotations import experimental
+from dagster._annotations import beta
 from dagster_gcp import BigQueryResource
 from dagster_gcp.bigquery.utils import setup_gcp_creds
 from google.cloud import bigquery
 
+from dagster_cloud.dagster_insights.bigquery.bigquery_utils import (
+    build_bigquery_cost_metadata,
+    marker_asset_key_for_job,
+)
 from dagster_cloud.dagster_insights.insights_utils import get_current_context_and_asset_key
 
-from .bigquery_utils import build_bigquery_cost_metadata, marker_asset_key_for_job
-
 OUTPUT_NON_ASSET_SIGIL = "__bigquery_query_metadata_"
 
 
@@ -29,7 +31,7 @@ class WrappedBigQueryClient(bigquery.Client):
         return bq_job
 
     @property
-    def job_ids(self) -> List[str]:
+    def job_ids(self) -> list[str]:
         return self._job_ids
 
     @property
@@ -41,7 +43,7 @@ class WrappedBigQueryClient(bigquery.Client):
         return sum([x for x in self._query_slots_ms])
 
 
-@experimental
+@beta
 class InsightsBigQueryResource(BigQueryResource):
     """A wrapper around :py:class:`BigQueryResource` which automatically collects metadata about
     BigQuery costs which can be attributed to Dagster jobs and assets.
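Note: InsightsBigQueryResource is a drop-in replacement for dagster-gcp's BigQueryResource (now marked @beta rather than @experimental). A minimal wiring sketch; the asset name and project id are placeholders:

    from dagster import Definitions, asset

    from dagster_cloud.dagster_insights import InsightsBigQueryResource

    @asset
    def my_table(bigquery: InsightsBigQueryResource) -> None:
        # The wrapped client records job ids, bytes billed, and slot-ms so the
        # query cost can be attributed back to this asset.
        with bigquery.get_client() as client:
            client.query("SELECT 1").result()

    defs = Definitions(
        assets=[my_table],
        resources={"bigquery": InsightsBigQueryResource(project="my-gcp-project")},
    )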
dagster_cloud/dagster_insights/insights_utils.py
CHANGED
@@ -1,8 +1,9 @@
 from dataclasses import replace
-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 
 import dagster._check as check
 from dagster import (
+    AssetCheckEvaluation,
     AssetCheckResult,
     AssetExecutionContext,
     AssetKey,
@@ -15,9 +16,9 @@ from dagster import (
 from dagster._core.errors import DagsterInvalidPropertyError
 
 
-def get_current_context_and_asset_key() -> (
-    Tuple[Union[OpExecutionContext, AssetExecutionContext], Optional[AssetKey]]
-):
+def get_current_context_and_asset_key() -> tuple[
+    Union[OpExecutionContext, AssetExecutionContext], Optional[AssetKey]
+]:
     asset_key = None
     try:
         context = AssetExecutionContext.get()
@@ -32,7 +33,7 @@ def get_current_context_and_asset_key() -> (
 def get_asset_key_for_output(
     context: Union[OpExecutionContext, AssetExecutionContext], output_name: str
 ) -> Optional[AssetKey]:
-    asset_key = context.job_def.asset_layer.asset_key_for_output(
+    asset_key = context.job_def.asset_layer.get_asset_key_for_node_output(
         node_handle=context.op_handle, output_name=output_name
     )
     if asset_key is None:
@@ -40,15 +41,24 @@ def get_asset_key_for_output(
     return asset_key
 
 
-def extract_asset_info_from_event(context, dagster_event, record_observation_usage):
+def extract_asset_info_from_event(
+    context,
+    dagster_event: Union[
+        Output, AssetMaterialization, AssetObservation, AssetCheckResult, AssetCheckEvaluation
+    ],
+    record_observation_usage,
+):
     if isinstance(dagster_event, AssetMaterialization):
         return dagster_event.asset_key, dagster_event.partition
 
-    if isinstance(dagster_event, (AssetCheckResult, AssetObservation)) and record_observation_usage:
+    if (
+        isinstance(dagster_event, (AssetCheckResult, AssetObservation, AssetCheckEvaluation))
+        and record_observation_usage
+    ):
         partition = dagster_event.partition if isinstance(dagster_event, AssetObservation) else None
         return dagster_event.asset_key, partition
 
-    if isinstance(dagster_event, (AssetCheckResult, AssetObservation)):
+    if isinstance(dagster_event, (AssetCheckResult, AssetObservation, AssetCheckEvaluation)):
        return None, None
 
     if isinstance(dagster_event, Output):
dagster_cloud/dagster_insights/metrics_utils.py
CHANGED
@@ -1,19 +1,19 @@
 import os
 import tempfile
-from typing import Dict, List, NamedTuple, Optional, Tuple, Union
+from typing import NamedTuple, Optional, Union
 
 import requests
 from dagster import AssetExecutionContext, DagsterInstance, OpExecutionContext
-from dagster._annotations import experimental
+from dagster._annotations import beta
 from dagster_cloud_cli.core.errors import raise_http_error
 from dagster_cloud_cli.core.headers.auth import DagsterCloudInstanceScope
 
 from dagster_cloud.instance import DagsterCloudAgentInstance
 
 
-@experimental
+@beta
 class DagsterMetric(NamedTuple):
-    """Experimental: This class gives information about a Metric.
+    """Beta: This class gives information about a Metric.
 
     Args:
         metric_name (str): name of the metric
@@ -24,16 +24,16 @@ class DagsterMetric(NamedTuple):
     metric_value: float
 
 
-def get_url_and_token_from_instance(instance: DagsterInstance) -> Tuple[str, str]:
+def get_url_and_token_from_instance(instance: DagsterInstance) -> tuple[str, str]:
     if not isinstance(instance, DagsterCloudAgentInstance):
         raise RuntimeError("This asset only functions in a running Dagster Cloud instance")
 
     return f"{instance.dagit_url}graphql", instance.dagster_cloud_agent_token
 
 
-def get_post_request_params(
+def get_insights_upload_request_params(
     instance: DagsterInstance,
-) -> Tuple[requests.Session, str, Dict[str, str], int, Optional[Dict[str, str]]]:
+) -> tuple[requests.Session, str, dict[str, str], int, Optional[dict[str, str]]]:
     if not isinstance(instance, DagsterCloudAgentInstance):
         raise RuntimeError("This asset only functions in a running Dagster Cloud instance")
 
@@ -49,7 +49,7 @@ def get_post_request_params(
 def upload_cost_information(
     context: Union[OpExecutionContext, AssetExecutionContext],
     metric_name: str,
-    cost_information: List[Tuple[str, float, str]],
+    cost_information: list[tuple[str, float, str]],
 ):
     import pyarrow as pa
     import pyarrow.parquet as pq
@@ -70,9 +70,9 @@ def upload_cost_information(
     )
 
     instance = context.instance
-    session, url, headers, timeout, proxies = get_post_request_params(instance)
+    session, url, headers, timeout, proxies = get_insights_upload_request_params(instance)
 
-    resp = session.post(url, headers=headers, timeout=timeout, proxies=proxies)
+    resp = session.get(url, headers=headers, timeout=timeout, proxies=proxies)
     raise_http_error(resp)
     resp_data = resp.json()
 
@@ -86,11 +86,11 @@ def upload_cost_information(
     )
 
 
-@experimental
+@beta
 def put_cost_information(
     context: Union[OpExecutionContext, AssetExecutionContext],
     metric_name: str,
-    cost_information: List[Tuple[str, float, str]],
+    cost_information: list[tuple[str, float, str]],
     start: float,
     end: float,
 ) -> None:
dagster_cloud/dagster_insights/snowflake/dagster_snowflake_insights.py
CHANGED
@@ -1,11 +1,12 @@
 import json
+from collections.abc import Sequence
 from dataclasses import dataclass
 from datetime import datetime, timedelta
-from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, Optional
 
 from dagster import AssetKey, AssetsDefinition, ScheduleDefinition
 
-from .snowflake_utils import OPAQUE_ID_SQL_SIGIL
+from dagster_cloud.dagster_insights.snowflake.snowflake_utils import OPAQUE_ID_SQL_SIGIL
 
 if TYPE_CHECKING:
     from dagster_snowflake import SnowflakeConnection
@@ -34,7 +35,7 @@ def get_cost_data_for_hour(
     snowflake: "SnowflakeConnection",
     start_hour: datetime,
     end_hour: datetime,
-) -> List[Tuple[str, float, str]]:
+) -> list[tuple[str, float, str]]:
     """Given a date range, queries the Snowflake query_history table for all queries that were run
     during that time period and returns a mapping from AssetMaterializationId to the cost of the
     query that produced it, as estimated by Snowflake. The cost is in Snowflake credits.
@@ -75,11 +76,7 @@ HAVING ARRAY_SIZE(opaque_ids) > 0
     assert result
     results = result.fetchall()
 
-    costs: List[Tuple[str, float, str]] = []
-
-    print(
-        f"{len(results) if results else 0} annotated queries returned from snowflake query_history"
-    )
+    costs: list[tuple[str, float, str]] = []
 
     if not results:
         return []
@@ -93,8 +90,4 @@ HAVING ARRAY_SIZE(opaque_ids) > 0
     for opaque_id in opaque_ids:
             costs.append((opaque_id, float(cost), query_id))
 
-    print(
-        f"Reported costs for {len(costs)} of {total} asset materializations found in the"
-        " query_history."
-    )
     return costs
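Note: this module backs create_snowflake_insights_asset_and_schedule, which queries Snowflake's query_history hour by hour and reports per-asset credit costs (the 1.12.x version drops the print() progress output in favor of silent operation). A wiring sketch, assuming the return value exposes .assets and .schedule as in the Insights docs; the start date and resource key are placeholders:

    from dagster import Definitions

    from dagster_cloud.dagster_insights import create_snowflake_insights_asset_and_schedule

    insights = create_snowflake_insights_asset_and_schedule(
        start_date="2024-01-01-00:00",  # placeholder backfill start hour
        snowflake_resource_key="snowflake",
    )

    defs = Definitions(
        assets=[*insights.assets],
        schedules=[insights.schedule],
    )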