contextbase-shared-plugins 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextbase_shared_plugins-0.2.3.dist-info/METADATA +22 -0
- contextbase_shared_plugins-0.2.3.dist-info/RECORD +37 -0
- contextbase_shared_plugins-0.2.3.dist-info/WHEEL +4 -0
- shared_plugins/__init__.py +12 -0
- shared_plugins/automation.py +11 -0
- shared_plugins/bindings.py +253 -0
- shared_plugins/control_plane.py +208 -0
- shared_plugins/dlt.py +84 -0
- shared_plugins/env.py +102 -0
- shared_plugins/exceptions.py +10 -0
- shared_plugins/google_client/__init__.py +1 -0
- shared_plugins/google_client/auth.py +82 -0
- shared_plugins/google_client/batch_retry.py +308 -0
- shared_plugins/google_client/http_errors.py +27 -0
- shared_plugins/microsoft_dataverse/__init__.py +27 -0
- shared_plugins/microsoft_dataverse/annotations.py +38 -0
- shared_plugins/microsoft_dataverse/auth.py +26 -0
- shared_plugins/microsoft_dataverse/binding_config.py +35 -0
- shared_plugins/microsoft_dataverse/client.py +456 -0
- shared_plugins/microsoft_dataverse/ctx.py +21 -0
- shared_plugins/microsoft_dataverse/identifiers.py +62 -0
- shared_plugins/microsoft_dataverse/ingress.py +53 -0
- shared_plugins/microsoft_dataverse/metadata.py +106 -0
- shared_plugins/microsoft_dataverse/runtime_schema.py +332 -0
- shared_plugins/microsoft_dataverse/source.py +250 -0
- shared_plugins/microsoft_dataverse/tables.py +34 -0
- shared_plugins/microsoft_dataverse/translators.py +128 -0
- shared_plugins/microsoft_dataverse/types.py +346 -0
- shared_plugins/models.py +91 -0
- shared_plugins/naming.py +83 -0
- shared_plugins/pg_column_comments.py +59 -0
- shared_plugins/pyairbyte.py +399 -0
- shared_plugins/resources.py +179 -0
- shared_plugins/scratch.py +127 -0
- shared_plugins/sqlalchemy_types.py +225 -0
- shared_plugins/sqlite.py +123 -0
- shared_plugins/values.py +117 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Monkey-patch PostgresClient to emit COMMENT ON COLUMN for dlt column descriptions.
|
|
2
|
+
|
|
3
|
+
dlt's ``TColumnSchema`` supports a ``description`` field, but the Postgres destination
|
|
4
|
+
does not act on it. Databricks and Snowflake do (inline ``COMMENT`` in DDL). Postgres
|
|
5
|
+
uses a separate ``COMMENT ON COLUMN`` statement, so we hook into
|
|
6
|
+
PostgresClient's ``_build_schema_update_sql`` to append COMMENT statements for the
|
|
7
|
+
same tables dlt is updating — regardless of whether those tables have new columns.
|
|
8
|
+
|
|
9
|
+
ctxb docs reads ``COMMENT ON COLUMN`` metadata and surfaces it in generated schema docs
|
|
10
|
+
that are injected into the agent context. This closes the loop:
|
|
11
|
+
|
|
12
|
+
dlt schema (description) → COMMENT ON COLUMN → ctxb docs → agent prompt
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from typing import Iterable, List, Tuple
|
|
18
|
+
|
|
19
|
+
from dlt.common.data_writers.escape import (
|
|
20
|
+
escape_postgres_identifier,
|
|
21
|
+
escape_postgres_literal,
|
|
22
|
+
)
|
|
23
|
+
from dlt.common.schema.typing import (
|
|
24
|
+
TSchemaTables,
|
|
25
|
+
TTableSchemaColumns,
|
|
26
|
+
)
|
|
27
|
+
from dlt.destinations.impl.postgres.postgres import PostgresClient
|
|
28
|
+
|
|
29
|
+
# Resolve the pristine dlt implementation. If this module body runs more than once
# (importlib.reload, or an import under a second module name) the class attribute is
# already our wrapper; unwrap it so the patch never stacks on itself (which would
# recurse forever on reload, or emit every COMMENT statement twice).
_current_build_schema_update_sql = PostgresClient._build_schema_update_sql
_original_build_schema_update_sql = getattr(
    _current_build_schema_update_sql,
    "_ctx_original_build_schema_update_sql",
    _current_build_schema_update_sql,
)


def _build_schema_update_sql_with_comments(
    self: PostgresClient,
    storage_tables: Iterable[Tuple[str, TTableSchemaColumns]],
) -> Tuple[List[str], TSchemaTables]:
    """Build dlt's schema-update SQL and append ``COMMENT ON COLUMN`` statements.

    Delegates to the original ``PostgresClient._build_schema_update_sql`` and then,
    for every non-dlt-internal table in ``storage_tables``, appends one
    ``COMMENT ON COLUMN`` statement per materialised column. Columns with a
    ``description`` get it as an escaped literal; columns without one get
    ``IS NULL``, which clears any existing comment.

    Args:
        storage_tables: ``(table_name, storage_columns)`` pairs handed over by dlt.
            The iterable is materialised once because it is consumed twice.

    Returns:
        The ``(sql_statements, schema_update)`` pair from the original method, with
        the COMMENT statements appended to ``sql_statements``.
    """
    storage_tables = list(storage_tables)
    sql_updates, schema_update = _original_build_schema_update_sql(self, storage_tables)

    dlt_table_names = set(self.schema.dlt_table_names())
    for table_name, _storage_columns in storage_tables:
        # dlt's own bookkeeping tables are not documented.
        if table_name in dlt_table_names:
            continue

        full_table = self.prepare_load_table(table_name)
        qualified_table = self.sql_client.make_qualified_table_name(table_name)
        for col in full_table["columns"].values():
            # Columns without a data_type are incomplete schema hints that dlt does
            # not create in the destination; commenting on them would fail with
            # "column does not exist" and abort the whole schema update.
            if not col.get("data_type"):
                continue

            description = col.get("description")
            comment_literal = (
                escape_postgres_literal(description) if description else "NULL"
            )
            col_name = escape_postgres_identifier(col["name"])
            sql_updates.append(
                f"COMMENT ON COLUMN {qualified_table}.{col_name} IS {comment_literal}"
            )

    return sql_updates, schema_update


# Remember the pristine implementation on the wrapper so a later re-execution of this
# module can find it (see the unwrap logic above), then install the patch.
_build_schema_update_sql_with_comments._ctx_original_build_schema_update_sql = (  # type: ignore[attr-defined]
    _original_build_schema_update_sql
)
PostgresClient._build_schema_update_sql = _build_schema_update_sql_with_comments  # type: ignore[method-assign]
|
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections.abc import Mapping, Sequence
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from math import isfinite
|
|
7
|
+
from typing import Any, ClassVar, cast
|
|
8
|
+
|
|
9
|
+
import airbyte as ab
|
|
10
|
+
import dagster as dg
|
|
11
|
+
import sqlalchemy
|
|
12
|
+
from dagster import AssetExecutionContext
|
|
13
|
+
from airbyte._processors.sql.postgres import PostgresSqlProcessor
|
|
14
|
+
from airbyte.caches.postgres import PostgresCache
|
|
15
|
+
from airbyte.progress import ProgressTracker
|
|
16
|
+
from airbyte.records import StreamRecordHandler
|
|
17
|
+
from airbyte_protocol.models import AirbyteRecordMessage
|
|
18
|
+
from pydantic import Field
|
|
19
|
+
from sqlalchemy.engine import make_url
|
|
20
|
+
from sqlalchemy.types import TypeEngine
|
|
21
|
+
|
|
22
|
+
from shared_types.dagster_binding_plan import DagsterAllPlanBinding
|
|
23
|
+
|
|
24
|
+
from .env import load_shared_python_settings
|
|
25
|
+
from .exceptions import PluginConfigurationError
|
|
26
|
+
from .naming import dagster_airbyte_sync_asset_key
|
|
27
|
+
from .values import parse_utc_datetime_from_str
|
|
28
|
+
|
|
29
|
+
# Column names of the ContextBase metadata fields this module injects into each
# selected stream's JSON schema and into every processed record.
_CTX_BINDING_ID_COLUMN = "_ctx_binding_id"
_CTX_SOURCE_UPDATED_AT_COLUMN = "_ctx_source_updated_at"

# JSON-schema fragment declared for the binding-id column.
_CTX_BINDING_ID_SCHEMA: dict[str, object] = {"type": "string"}

# WARNING: the schema below *looks* like it yields a Postgres timestamptz column,
# but pyairbyte/airbyte currently loses the timezone flag on its DDL path.
#
# In airbyte/shared/sql_processor.py, `_create_table_for_loading()`,
# `_ensure_final_table_exists()` and `_add_column_to_table()` render SQLAlchemy
# types with `f"{sql_type}"` / `f"{column_type}"` rather than compiling them against
# the dialect. `str(sqlalchemy.TIMESTAMP(timezone=True))` is just `"TIMESTAMP"`, so
# Postgres creates `timestamp without time zone`, not `TIMESTAMP WITH TIME ZONE`.
#
# Consequence: `_ctx_source_updated_at` can be stored incorrectly. psycopg/Postgres
# coerces aware datetimes through the session timezone and then drops the tz info,
# so the stored value may depend on the session/database timezone instead of being a
# stable absolute instant.
#
# This is documented on purpose rather than fixed for now. A future fix should keep
# the upstream catalog/row-shaping behavior but override the DDL rendering path so
# these types are compiled with the Postgres dialect (for example,
# `TIMESTAMP(timezone=True)` -> `TIMESTAMP WITH TIME ZONE`) before tables are
# created or altered.
_CTX_SOURCE_UPDATED_AT_SCHEMA: dict[str, object] = {
    "type": ["string", "null"],
    "format": "date-time",
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _coerce_source_updated_at(value: object) -> datetime | None:
    """Normalise a raw source timestamp into a UTC-aware ``datetime``.

    Accepted inputs:
        * ``None`` -> ``None``
        * ``datetime`` -> naive values are stamped as UTC, aware values are
          converted to UTC
        * ``int`` / finite ``float`` -> treated as a unix timestamp
        * ``str`` -> parsed by ``parse_utc_datetime_from_str``

    Raises:
        RuntimeError: for ``bool``, non-finite floats, unparseable strings, and any
            other type.
    """
    if value is None:
        return None

    # bool is a subclass of int, so it has to be rejected before the numeric branch.
    if isinstance(value, bool):
        raise RuntimeError("_ctx_source_updated_at does not accept bool values.")

    if isinstance(value, datetime):
        is_naive = value.tzinfo is None or value.utcoffset() is None
        if is_naive:
            return value.replace(tzinfo=timezone.utc)
        return value.astimezone(timezone.utc)

    if isinstance(value, (int, float)):
        if isinstance(value, float) and not isfinite(value):
            raise RuntimeError(
                "_ctx_source_updated_at does not accept non-finite unix timestamps."
            )
        return _coerce_unix_timestamp(value)

    if not isinstance(value, str):
        raise RuntimeError(
            "_ctx_source_updated_at must be None, a datetime, an ISO-8601 timestamp string, "
            f"or a unix timestamp, got {type(value).__name__}."
        )

    try:
        return parse_utc_datetime_from_str(value)
    except ValueError as exc:
        raise RuntimeError(
            f"_ctx_source_updated_at must be an ISO-8601 timestamp, got {value!r}."
        ) from exc
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _coerce_unix_timestamp(value: int | float) -> datetime | None:
    """Convert a unix timestamp (seconds) into a UTC-aware ``datetime``.

    A value of exactly zero is treated as "no timestamp" and yields ``None``.

    Raises:
        RuntimeError: if the value is negative or cannot be represented as a
            ``datetime``.
    """
    if value == 0:
        return None

    if value < 0:
        raise RuntimeError(
            "_ctx_source_updated_at does not accept negative unix timestamps."
        )

    try:
        converted = datetime.fromtimestamp(value, tz=timezone.utc)
    except (OverflowError, OSError, ValueError) as exc:
        raise RuntimeError(
            "_ctx_source_updated_at must be a valid unix timestamp."
        ) from exc
    else:
        return converted
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _build_source_updated_at_fields(
    stream_names: Sequence[str],
    source_updated_at_fields: Mapping[str, str | None] | None,
) -> dict[str, str | None]:
    """Return a ``stream name -> updated-at field`` dict covering every stream.

    Streams missing from ``source_updated_at_fields`` (or all streams, when the
    mapping itself is ``None``) map to ``None``. Keys that are not in
    ``stream_names`` are dropped.
    """
    if source_updated_at_fields is None:
        return dict.fromkeys(stream_names)

    lookup = source_updated_at_fields.get
    return {name: lookup(name) for name in stream_names}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _extract_source_updated_at_value(
    *,
    stream_name: str,
    record_data: Mapping[str, Any],
    source_updated_at_field: str | None,
) -> object:
    """Read the raw updated-at value for one record.

    Returns ``None`` when the stream declares no updated-at field; otherwise returns
    ``record_data[source_updated_at_field]`` unchanged.

    Raises:
        RuntimeError: if a field is declared but absent from ``record_data``.
    """
    if source_updated_at_field is None:
        return None

    if source_updated_at_field in record_data:
        return record_data[source_updated_at_field]

    raise RuntimeError(
        f"PyAirbyte stream '{stream_name}' declared SOURCE_UPDATED_AT_FIELDS[{stream_name!r}] = "
        f"{source_updated_at_field!r}, but that field was not present in the record payload."
    )
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _with_ctx_binding_primary_key(
    *,
    stream_name: str,
    source_primary_key: Sequence[Sequence[str]] | None,
) -> list[list[str]]:
    """Return the stream's primary key with ``_ctx_binding_id`` as the first column.

    Each source key path must be a single top-level column. A source key that is
    already ``_ctx_binding_id`` is not repeated.

    Raises:
        RuntimeError: if the stream has no primary key, or if any key path is nested
            (its length is not exactly one).
    """
    if not source_primary_key:
        raise RuntimeError(
            f"PyAirbyte stream '{stream_name}' does not define a primary key. "
            "ContextBase requires source-defined keys so _ctx_binding_id can be prepended safely."
        )

    key_paths = [[_CTX_BINDING_ID_COLUMN]]
    for path in source_primary_key:
        if len(path) != 1:
            raise RuntimeError(
                f"PyAirbyte stream '{stream_name}' uses nested primary keys {path!r}. "
                "ContextBase only supports top-level primary-key columns."
            )

        column = path[0]
        if column != _CTX_BINDING_ID_COLUMN:
            key_paths.append([column])

    return key_paths
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _apply_ctx_catalog_contract(
    *,
    source: Any,
    selected_stream_names: Sequence[str],
) -> None:
    """Add the ContextBase metadata columns to every selected stream, in place.

    For each stream in ``source.discovered_catalog.streams`` whose name is selected:
    declares ``_ctx_binding_id`` and ``_ctx_source_updated_at`` in the stream's JSON
    schema ``properties`` (creating the mapping if absent) and rewrites
    ``source_defined_primary_key`` so it starts with ``_ctx_binding_id``.

    Raises:
        RuntimeError: if a selected stream's ``properties`` is not a dict, if its
            primary key is missing or nested, or if any selected stream name is not
            present in the discovered catalog.
    """
    wanted = set(selected_stream_names)
    handled: set[str] = set()

    for stream in source.discovered_catalog.streams:
        if stream.name not in wanted:
            continue

        schema_properties = stream.json_schema.setdefault("properties", {})
        if not isinstance(schema_properties, dict):
            raise RuntimeError(
                f"PyAirbyte stream '{stream.name}' JSON schema must define a top-level "
                "properties mapping."
            )

        # Copy the shared schema fragments so streams never alias the module constants.
        schema_properties[_CTX_BINDING_ID_COLUMN] = dict(_CTX_BINDING_ID_SCHEMA)
        schema_properties[_CTX_SOURCE_UPDATED_AT_COLUMN] = dict(
            _CTX_SOURCE_UPDATED_AT_SCHEMA
        )
        stream.source_defined_primary_key = _with_ctx_binding_primary_key(
            stream_name=stream.name,
            source_primary_key=stream.source_defined_primary_key,
        )
        handled.add(stream.name)

    not_found = wanted - handled
    if not not_found:
        return

    not_found_str = ", ".join(sorted(not_found))
    raise RuntimeError(
        "PyAirbyte selected streams were not found in the discovered catalog: "
        f"{not_found_str}."
    )
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _normalize_record_data_for_sql_columns(
    *,
    record_data: dict[str, Any],
    column_definitions: dict[str, TypeEngine[Any]],
) -> None:
    """Make structured (dict/list) record values fit their SQL column type, in place.

    * JSON columns: the value is left untouched.
    * String columns: the value is replaced by its ``json.dumps`` text.
    * Anything else (including an unknown column): ``RuntimeError``.

    Scalar values are never modified.
    """
    for field_name, field_value in record_data.items():
        if isinstance(field_value, (dict, list)):
            sql_type = column_definitions.get(field_name)

            # JSON is tested before String so a JSON column always keeps its value.
            if isinstance(sql_type, sqlalchemy.types.JSON):
                continue

            if not isinstance(sql_type, sqlalchemy.types.String):
                raise RuntimeError(
                    f"PyAirbyte record field {field_name!r} has structured "
                    f"{type(field_value).__name__} data for unsupported SQL column type "
                    f"{type(sql_type).__name__ if sql_type is not None else 'None'}."
                )

            # Only an existing key is reassigned, so the dict size is unchanged
            # while it is being iterated.
            record_data[field_name] = json.dumps(field_value)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class CtxPostgresProcessor(PostgresSqlProcessor):
    """Postgres SQL processor that stamps ContextBase metadata onto each record."""

    def process_record_message(
        self,
        record_msg: AirbyteRecordMessage,
        stream_record_handler: StreamRecordHandler,
        progress_tracker: ProgressTracker,
    ) -> None:
        """Enrich ``record_msg.data`` in place, then defer to the parent processor.

        Steps, in order:
            1. set ``_ctx_binding_id`` from the cache configuration;
            2. set ``_ctx_source_updated_at`` from the stream's configured source
               field (``None`` when no field is configured);
            3. serialise structured values destined for string columns.

        Raises:
            RuntimeError: propagated from the helpers when the configured source
                field is missing, its value cannot be coerced, or a structured value
                targets an unsupported column type.
        """
        ctx_cache = cast(CtxPostgresCache, self.sql_config)
        stream_name = record_msg.stream
        payload = record_msg.data

        # The binding id is written first, before the updated-at value is extracted.
        payload[_CTX_BINDING_ID_COLUMN] = ctx_cache.ctx_binding_id

        updated_at_field = ctx_cache.ctx_source_updated_at_fields.get(stream_name)
        payload[_CTX_SOURCE_UPDATED_AT_COLUMN] = _coerce_source_updated_at(
            _extract_source_updated_at_value(
                stream_name=stream_name,
                record_data=payload,
                source_updated_at_field=updated_at_field,
            )
        )

        _normalize_record_data_for_sql_columns(
            record_data=payload,
            column_definitions=self._get_sql_column_definitions(stream_name),
        )

        super().process_record_message(
            record_msg,
            stream_record_handler,
            progress_tracker,
        )
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class CtxPostgresCache(PostgresCache):
    """``PostgresCache`` carrying the ContextBase settings read by ``CtxPostgresProcessor``."""

    # Processor class paired with this cache.
    # NOTE(review): presumably PyAirbyte instantiates this for writes - confirm upstream.
    _sql_processor_class: ClassVar[type] = CtxPostgresProcessor

    # Value written to the `_ctx_binding_id` column of every record.
    ctx_binding_id: str = ""
    # stream name -> record field holding the source "updated at" value (None = no field).
    ctx_source_updated_at_fields: dict[str, str | None] = Field(default_factory=dict)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _build_cache(
    *,
    schema_name: str,
    binding_id: str,
    source_updated_at_fields: dict[str, str | None],
) -> CtxPostgresCache:
    """Create a ``CtxPostgresCache`` from the shared ``ctx_database_url`` setting.

    Connection parts missing from the URL fall back to ``localhost``, port ``5432``,
    user ``postgres`` and an empty password. The database name is passed through
    as parsed, with no fallback.
    """
    url = make_url(load_shared_python_settings().ctx_database_url)

    connection = {
        "host": url.host or "localhost",
        "port": url.port or 5432,
        "database": url.database,
        "username": url.username or "postgres",
        "password": url.password or "",
    }
    return CtxPostgresCache(
        **connection,
        schema_name=schema_name,
        ctx_binding_id=binding_id,
        ctx_source_updated_at_fields=source_updated_at_fields,
    )
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def build_pyairbyte_source(
    *,
    docker_image: str,
    connector_config: dict[str, Any],
) -> Any:
    """Create a PyAirbyte source that runs the given connector Docker image.

    The connector name passed to ``ab.get_source`` is the last path component of the
    image reference with any ``:tag`` or ``@digest`` suffix removed, for example::

        airbyte/source-faker:6.2.0              -> source-faker
        localhost:5000/airbyte/source-faker:1.0 -> source-faker
        airbyte/source-faker@sha256:abc...      -> source-faker

    Args:
        docker_image: Full Docker image reference of the connector.
        connector_config: Connector configuration forwarded as ``config``.

    Returns:
        Whatever ``ab.get_source`` returns for that image and configuration.
    """
    # Take the repository path component first: a registry host may itself contain
    # ":" (host:port), so splitting on ":" before "/" would yield the host name.
    last_component = docker_image.rsplit("/", 1)[-1]
    # Then strip the digest and/or tag from that component only.
    image_name = last_component.split("@", 1)[0].split(":", 1)[0]
    return ab.get_source(
        image_name,
        docker_image=docker_image,
        config=connector_config,
    )
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def require_pyairbyte_selected_stream_names(
    binding: DagsterAllPlanBinding,
) -> tuple[str, ...]:
    """Return the stream names selected by ``binding.models.active`` as a tuple.

    Raises:
        PluginConfigurationError: if ``models.filter`` is set (not supported for
            PyAirbyte connectors), or if ``models`` / ``models.active`` is missing
            or empty.
    """
    models = binding.models

    if models is not None and models.filter is not None:
        raise PluginConfigurationError(
            f"{binding.plugin_id} models.filter is not supported for PyAirbyte connectors."
        )

    active = None if models is None else models.active
    selected_stream_names = () if active is None else tuple(active)

    # A missing selection and an empty selection are reported identically.
    if not selected_stream_names:
        raise PluginConfigurationError(
            f"{binding.plugin_id} models.active must include at least one stream."
        )

    return selected_stream_names
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def run_pyairbyte_sync(
    *,
    context: AssetExecutionContext,
    plugin_id: str,
    binding_id: str,
    selected_stream_names: Sequence[str],
    source_updated_at_fields: Mapping[str, str | None] | None = None,
    force_full_refresh: bool = False,
    source: Any,
) -> dg.MaterializeResult:
    """Sync the selected streams of ``source`` into the ContextBase Postgres cache.

    Args:
        context: Dagster execution context; only its logger is used here.
        plugin_id: Used as the cache schema name and to derive the asset key.
        binding_id: Stamped onto every record as ``_ctx_binding_id``.
        selected_stream_names: Streams passed to ``source.read``; order is kept.
        source_updated_at_fields: Optional ``stream -> record field`` mapping used
            to fill ``_ctx_source_updated_at``.
        force_full_refresh: Forwarded unchanged to ``source.read``.
        source: Object exposing ``discovered_catalog`` and ``read``.

    Returns:
        A ``MaterializeResult`` for the plugin's sync asset with the selected
        streams, their count and the ``force_full_refresh`` flag as metadata.
    """
    ordered_selected_stream_names = tuple(selected_stream_names)
    run_label = (
        f"{plugin_id} sync for binding_id={binding_id} "
        f"selected_streams={list(ordered_selected_stream_names)}"
    )
    context.log.info(f"Starting {run_label}")

    updated_at_by_stream = _build_source_updated_at_fields(
        ordered_selected_stream_names,
        source_updated_at_fields,
    )
    cache = _build_cache(
        schema_name=plugin_id,
        binding_id=binding_id,
        source_updated_at_fields=updated_at_by_stream,
    )

    _apply_ctx_catalog_contract(
        source=source,
        selected_stream_names=ordered_selected_stream_names,
    )

    # Airbyte's internal sync model lives below this call. This runtime does not
    # pick a per-stream incremental vs full-refresh policy; it decides only:
    #   1. which raw stream IDs go into Source.read(streams=[...])
    #   2. whether force_full_refresh=True is requested
    #
    # Per the original author's check against the PyAirbyte package installed in
    # the repo:
    #   - `airbyte/sources/base.py:420`
    #     `Source.get_configured_catalog(...)` builds each configured stream and
    #     assigns `sync_mode`.
    #   - `airbyte/sources/base.py:449`
    #     With `force_full_refresh=True`, PyAirbyte prefers `full_refresh` if the
    #     stream supports it; otherwise it uses `incremental`.
    #   - `airbyte/sources/base.py:873`
    #     Full refresh also disables state loading by setting
    #     `state_provider = None`.
    #   - `airbyte/caches/_state_backend.py:31`
    #     Incremental state is stored in `_airbyte_state`.
    #
    # The repo seam is therefore deliberately narrow: `binding.models.active`
    # selects streams, and `force_full_refresh` is the only runtime override.
    source.read(
        cache=cache,
        streams=list(ordered_selected_stream_names),
        force_full_refresh=force_full_refresh,
    )

    context.log.info(f"Completed {run_label}")

    result_metadata = {
        "selected_streams": dg.MetadataValue.json(list(ordered_selected_stream_names)),
        "selected_stream_count": dg.MetadataValue.int(len(ordered_selected_stream_names)),
        "force_full_refresh": dg.MetadataValue.bool(force_full_refresh),
    }
    return dg.MaterializeResult(
        asset_key=dagster_airbyte_sync_asset_key(plugin_id),
        metadata=result_metadata,
    )
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Iterable, Mapping, Sequence
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from functools import wraps
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import dlt
|
|
9
|
+
from dagster_dlt import DagsterDltResource, DagsterDltTranslator
|
|
10
|
+
|
|
11
|
+
from .models import CtxModel
|
|
12
|
+
|
|
13
|
+
# Metadata columns every serialized CtxModel row must contain (checked by
# _validate_ctx_row).
_CTX_METADATA_COLUMNS = ("_ctx_binding_id", "_ctx_source_updated_at")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _normalize_primary_key(primary_key: object) -> tuple[str, ...]:
    """Return ``primary_key`` as a non-empty tuple of column-name strings.

    A single string becomes a one-element tuple; any other sequence has each item
    converted with ``str``.

    Raises:
        ValueError: if ``primary_key`` is not a string or sequence, or is an empty
            sequence.
    """
    if isinstance(primary_key, str):
        return (primary_key,)

    columns: tuple[str, ...] = ()
    if isinstance(primary_key, Sequence):
        columns = tuple(str(item) for item in primary_key)

    if not columns:
        raise ValueError(
            "ctx_dlt_* wrappers require a non-empty primary_key that starts with '_ctx_binding_id'."
        )
    return columns
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _validate_primary_key_prefix(primary_key: object) -> None:
    """Ensure the primary key's first column is ``_ctx_binding_id``.

    Raises:
        ValueError: if the key is missing or empty (raised by
            ``_normalize_primary_key``) or starts with a different column.
    """
    key_columns = _normalize_primary_key(primary_key)
    if key_columns[0] == "_ctx_binding_id":
        return

    raise ValueError(
        "ctx_dlt_* wrappers require primary_key to start with '_ctx_binding_id'. "
        f"Received: {key_columns!r}"
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _apply_max_table_nesting(kwargs: dict[str, Any]) -> None:
    """Force ``max_table_nesting=0`` in ``kwargs`` (mutated in place).

    The key is removed before it is validated, so it is absent from ``kwargs`` when
    the error below is raised.

    Raises:
        ValueError: if the caller supplied a value other than ``None`` or ``0``.
    """
    requested = kwargs.pop("max_table_nesting", None)
    if requested in (None, 0):
        kwargs["max_table_nesting"] = 0
        return

    raise ValueError(
        "ctx_dlt_* wrappers enforce max_table_nesting=0. Omit the argument or set it to 0."
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _with_metadata_columns(columns: object) -> dict[str, Any]:
    """Return a deep copy of ``columns`` that declares ``_ctx_source_updated_at``.

    ``None`` is treated as an empty mapping. A caller-supplied definition of
    ``_ctx_source_updated_at`` is kept; otherwise it defaults to
    ``{"data_type": "timestamp"}``. The input mapping is never mutated.

    Raises:
        TypeError: if ``columns`` is neither ``None`` nor a mapping.
    """
    if columns is not None and not isinstance(columns, Mapping):
        raise TypeError("ctx_dlt_* wrappers expect 'columns' to be a mapping when set.")

    merged: dict[str, Any] = {} if columns is None else deepcopy(dict(columns))
    if "_ctx_source_updated_at" not in merged:
        merged["_ctx_source_updated_at"] = {"data_type": "timestamp"}
    return merged
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _validate_ctx_row(
    row: object,
    *,
    wrapper_name: str,
    resource_name: str,
) -> CtxModel:
    """Check that ``row`` is a ``CtxModel`` carrying the required metadata.

    The row is serialised with ``model_dump(by_alias=True)`` and must contain every
    name in ``_CTX_METADATA_COLUMNS``, with ``_ctx_binding_id`` being a
    non-blank string.

    Returns:
        The same ``row`` object, unchanged.

    Raises:
        TypeError: if any of the checks above fails. ``wrapper_name`` and
            ``resource_name`` are only used to build the message.
    """
    origin = f"{wrapper_name}('{resource_name}')"

    if not isinstance(row, CtxModel):
        raise TypeError(
            f"{origin} emitted non-CtxModel row of type {type(row).__name__}."
        )

    dumped = row.model_dump(by_alias=True)

    absent = [column for column in _CTX_METADATA_COLUMNS if column not in dumped]
    if absent:
        raise TypeError(
            f"{origin} emitted CtxModel row missing required "
            f"metadata columns: {', '.join(absent)}."
        )

    binding_id = dumped.get("_ctx_binding_id")
    if isinstance(binding_id, str) and binding_id.strip():
        return row

    raise TypeError(f"{origin} emitted row with empty '_ctx_binding_id'.")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _iter_validated_rows(
    emitted: object,
    *,
    wrapper_name: str,
    resource_name: str,
) -> Iterable[CtxModel]:
    """Turn whatever a resource function produced into validated ``CtxModel`` rows.

    * ``None`` -> empty tuple.
    * a single ``CtxModel`` -> one-element tuple, validated immediately.
    * any other iterable -> a lazy generator validating each row as it is consumed.

    Raises:
        TypeError: for ``str``/``bytes``/``bytearray``/mapping values, for
            non-iterables, and (possibly lazily) for invalid rows.
    """
    if emitted is None:
        return ()

    # Handled before the container checks below, so a CtxModel is always treated
    # as a single row.
    if isinstance(emitted, CtxModel):
        only_row = _validate_ctx_row(
            emitted,
            wrapper_name=wrapper_name,
            resource_name=resource_name,
        )
        return (only_row,)

    origin = f"{wrapper_name}('{resource_name}')"

    if isinstance(emitted, (str, bytes, bytearray, Mapping)):
        raise TypeError(
            f"{origin} emitted invalid row container of type {type(emitted).__name__}."
        )

    if not isinstance(emitted, Iterable):
        raise TypeError(
            f"{origin} emitted non-iterable value of type {type(emitted).__name__}."
        )

    def _validated() -> Iterable[CtxModel]:
        # Iteration of `emitted` starts only when the caller pulls the first row.
        for candidate in emitted:
            yield _validate_ctx_row(
                candidate,
                wrapper_name=wrapper_name,
                resource_name=resource_name,
            )

    return _validated()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _wrap_emitter(
    func: Callable[..., object],
    *,
    wrapper_name: str,
) -> Callable[..., Iterable[CtxModel]]:
    """Wrap ``func`` in a generator function that validates every emitted row.

    The wrapper forwards all arguments to ``func`` and yields its output through
    ``_iter_validated_rows``. Because the wrapper is itself a generator, ``func`` is
    not called until iteration starts. ``func.__name__`` is used as the resource
    name in error messages.
    """

    @wraps(func)
    def wrapped(*args: Any, **kwargs: Any) -> Iterable[CtxModel]:
        produced = func(*args, **kwargs)
        validated_rows = _iter_validated_rows(
            produced,
            wrapper_name=wrapper_name,
            resource_name=func.__name__,
        )
        yield from validated_rows

    return wrapped
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def ctx_dlt_resource(**kwargs: Any) -> Callable[[Callable[..., object]], Any]:
    """``dlt.resource`` decorator factory that enforces the ContextBase contract.

    Before delegating to ``dlt.resource`` it, on a copy of ``kwargs``:
        * forces ``max_table_nesting=0``;
        * requires ``primary_key`` to start with ``_ctx_binding_id``;
        * ensures ``columns`` declares ``_ctx_source_updated_at``.

    The decorated function is wrapped so every emitted row is validated as a
    ``CtxModel``.

    Raises:
        ValueError: for an unsupported ``max_table_nesting`` or an invalid
            ``primary_key``.
        TypeError: if ``columns`` is set but is not a mapping.
    """
    resource_options = {**kwargs}
    _apply_max_table_nesting(resource_options)
    _validate_primary_key_prefix(resource_options.get("primary_key"))
    resource_options["columns"] = _with_metadata_columns(resource_options.get("columns"))

    def decorator(func: Callable[..., object]) -> Any:
        make_resource = dlt.resource(**resource_options)
        return make_resource(_wrap_emitter(func, wrapper_name="ctx_dlt_resource"))

    return decorator
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def ctx_dlt_transformer(**kwargs: Any) -> Callable[[Callable[..., object]], Any]:
    """``dlt.transformer`` decorator factory that enforces the ContextBase contract.

    Before delegating to ``dlt.transformer`` it, on a copy of ``kwargs``:
        * forces ``max_table_nesting=0``;
        * requires ``primary_key`` to start with ``_ctx_binding_id``;
        * ensures ``columns`` declares ``_ctx_source_updated_at``.

    The decorated function is wrapped so every emitted row is validated as a
    ``CtxModel``.

    Raises:
        ValueError: for an unsupported ``max_table_nesting`` or an invalid
            ``primary_key``.
        TypeError: if ``columns`` is set but is not a mapping.
    """
    transformer_options = {**kwargs}
    _apply_max_table_nesting(transformer_options)
    _validate_primary_key_prefix(transformer_options.get("primary_key"))
    transformer_options["columns"] = _with_metadata_columns(
        transformer_options.get("columns")
    )

    def decorator(func: Callable[..., object]) -> Any:
        make_transformer = dlt.transformer(**transformer_options)
        return make_transformer(
            _wrap_emitter(func, wrapper_name="ctx_dlt_transformer")
        )

    return decorator
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# Module-level dagster-dlt instances, each built once at import time with the
# library's default configuration.
DLT_TRANSLATOR = DagsterDltTranslator()
DLT_RESOURCE = DagsterDltResource()
|