pytrilogy 0.3.149__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- LICENSE.md +19 -0
- _preql_import_resolver/__init__.py +5 -0
- _preql_import_resolver/_preql_import_resolver.cp313-win_amd64.pyd +0 -0
- pytrilogy-0.3.149.dist-info/METADATA +555 -0
- pytrilogy-0.3.149.dist-info/RECORD +207 -0
- pytrilogy-0.3.149.dist-info/WHEEL +4 -0
- pytrilogy-0.3.149.dist-info/entry_points.txt +2 -0
- pytrilogy-0.3.149.dist-info/licenses/LICENSE.md +19 -0
- trilogy/__init__.py +27 -0
- trilogy/ai/README.md +10 -0
- trilogy/ai/__init__.py +19 -0
- trilogy/ai/constants.py +92 -0
- trilogy/ai/conversation.py +107 -0
- trilogy/ai/enums.py +7 -0
- trilogy/ai/execute.py +50 -0
- trilogy/ai/models.py +34 -0
- trilogy/ai/prompts.py +100 -0
- trilogy/ai/providers/__init__.py +0 -0
- trilogy/ai/providers/anthropic.py +106 -0
- trilogy/ai/providers/base.py +24 -0
- trilogy/ai/providers/google.py +146 -0
- trilogy/ai/providers/openai.py +89 -0
- trilogy/ai/providers/utils.py +68 -0
- trilogy/authoring/README.md +3 -0
- trilogy/authoring/__init__.py +148 -0
- trilogy/constants.py +119 -0
- trilogy/core/README.md +52 -0
- trilogy/core/__init__.py +0 -0
- trilogy/core/constants.py +6 -0
- trilogy/core/enums.py +454 -0
- trilogy/core/env_processor.py +239 -0
- trilogy/core/environment_helpers.py +320 -0
- trilogy/core/ergonomics.py +193 -0
- trilogy/core/exceptions.py +123 -0
- trilogy/core/functions.py +1240 -0
- trilogy/core/graph_models.py +142 -0
- trilogy/core/internal.py +85 -0
- trilogy/core/models/__init__.py +0 -0
- trilogy/core/models/author.py +2670 -0
- trilogy/core/models/build.py +2603 -0
- trilogy/core/models/build_environment.py +165 -0
- trilogy/core/models/core.py +506 -0
- trilogy/core/models/datasource.py +436 -0
- trilogy/core/models/environment.py +756 -0
- trilogy/core/models/execute.py +1213 -0
- trilogy/core/optimization.py +251 -0
- trilogy/core/optimizations/__init__.py +12 -0
- trilogy/core/optimizations/base_optimization.py +17 -0
- trilogy/core/optimizations/hide_unused_concept.py +47 -0
- trilogy/core/optimizations/inline_datasource.py +102 -0
- trilogy/core/optimizations/predicate_pushdown.py +245 -0
- trilogy/core/processing/README.md +94 -0
- trilogy/core/processing/READMEv2.md +121 -0
- trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
- trilogy/core/processing/__init__.py +0 -0
- trilogy/core/processing/concept_strategies_v3.py +508 -0
- trilogy/core/processing/constants.py +15 -0
- trilogy/core/processing/discovery_node_factory.py +451 -0
- trilogy/core/processing/discovery_utility.py +548 -0
- trilogy/core/processing/discovery_validation.py +167 -0
- trilogy/core/processing/graph_utils.py +43 -0
- trilogy/core/processing/node_generators/README.md +9 -0
- trilogy/core/processing/node_generators/__init__.py +31 -0
- trilogy/core/processing/node_generators/basic_node.py +160 -0
- trilogy/core/processing/node_generators/common.py +270 -0
- trilogy/core/processing/node_generators/constant_node.py +38 -0
- trilogy/core/processing/node_generators/filter_node.py +315 -0
- trilogy/core/processing/node_generators/group_node.py +213 -0
- trilogy/core/processing/node_generators/group_to_node.py +117 -0
- trilogy/core/processing/node_generators/multiselect_node.py +207 -0
- trilogy/core/processing/node_generators/node_merge_node.py +695 -0
- trilogy/core/processing/node_generators/recursive_node.py +88 -0
- trilogy/core/processing/node_generators/rowset_node.py +165 -0
- trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
- trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
- trilogy/core/processing/node_generators/select_merge_node.py +846 -0
- trilogy/core/processing/node_generators/select_node.py +95 -0
- trilogy/core/processing/node_generators/synonym_node.py +98 -0
- trilogy/core/processing/node_generators/union_node.py +91 -0
- trilogy/core/processing/node_generators/unnest_node.py +182 -0
- trilogy/core/processing/node_generators/window_node.py +201 -0
- trilogy/core/processing/nodes/README.md +28 -0
- trilogy/core/processing/nodes/__init__.py +179 -0
- trilogy/core/processing/nodes/base_node.py +522 -0
- trilogy/core/processing/nodes/filter_node.py +75 -0
- trilogy/core/processing/nodes/group_node.py +194 -0
- trilogy/core/processing/nodes/merge_node.py +420 -0
- trilogy/core/processing/nodes/recursive_node.py +46 -0
- trilogy/core/processing/nodes/select_node_v2.py +242 -0
- trilogy/core/processing/nodes/union_node.py +53 -0
- trilogy/core/processing/nodes/unnest_node.py +62 -0
- trilogy/core/processing/nodes/window_node.py +56 -0
- trilogy/core/processing/utility.py +823 -0
- trilogy/core/query_processor.py +604 -0
- trilogy/core/statements/README.md +35 -0
- trilogy/core/statements/__init__.py +0 -0
- trilogy/core/statements/author.py +536 -0
- trilogy/core/statements/build.py +0 -0
- trilogy/core/statements/common.py +20 -0
- trilogy/core/statements/execute.py +155 -0
- trilogy/core/table_processor.py +66 -0
- trilogy/core/utility.py +8 -0
- trilogy/core/validation/README.md +46 -0
- trilogy/core/validation/__init__.py +0 -0
- trilogy/core/validation/common.py +161 -0
- trilogy/core/validation/concept.py +146 -0
- trilogy/core/validation/datasource.py +227 -0
- trilogy/core/validation/environment.py +73 -0
- trilogy/core/validation/fix.py +256 -0
- trilogy/dialect/__init__.py +32 -0
- trilogy/dialect/base.py +1432 -0
- trilogy/dialect/bigquery.py +314 -0
- trilogy/dialect/common.py +147 -0
- trilogy/dialect/config.py +159 -0
- trilogy/dialect/dataframe.py +50 -0
- trilogy/dialect/duckdb.py +397 -0
- trilogy/dialect/enums.py +151 -0
- trilogy/dialect/metadata.py +173 -0
- trilogy/dialect/mock.py +190 -0
- trilogy/dialect/postgres.py +117 -0
- trilogy/dialect/presto.py +110 -0
- trilogy/dialect/results.py +89 -0
- trilogy/dialect/snowflake.py +129 -0
- trilogy/dialect/sql_server.py +137 -0
- trilogy/engine.py +48 -0
- trilogy/execution/__init__.py +17 -0
- trilogy/execution/config.py +119 -0
- trilogy/execution/state/__init__.py +0 -0
- trilogy/execution/state/exceptions.py +26 -0
- trilogy/execution/state/file_state_store.py +0 -0
- trilogy/execution/state/sqllite_state_store.py +0 -0
- trilogy/execution/state/state_store.py +406 -0
- trilogy/executor.py +692 -0
- trilogy/hooks/__init__.py +4 -0
- trilogy/hooks/base_hook.py +40 -0
- trilogy/hooks/graph_hook.py +135 -0
- trilogy/hooks/query_debugger.py +166 -0
- trilogy/metadata/__init__.py +0 -0
- trilogy/parser.py +10 -0
- trilogy/parsing/README.md +21 -0
- trilogy/parsing/__init__.py +0 -0
- trilogy/parsing/common.py +1069 -0
- trilogy/parsing/config.py +5 -0
- trilogy/parsing/exceptions.py +8 -0
- trilogy/parsing/helpers.py +1 -0
- trilogy/parsing/parse_engine.py +2876 -0
- trilogy/parsing/render.py +775 -0
- trilogy/parsing/trilogy.lark +546 -0
- trilogy/py.typed +0 -0
- trilogy/render.py +45 -0
- trilogy/scripts/README.md +9 -0
- trilogy/scripts/__init__.py +0 -0
- trilogy/scripts/agent.py +41 -0
- trilogy/scripts/agent_info.py +306 -0
- trilogy/scripts/common.py +432 -0
- trilogy/scripts/dependency/Cargo.lock +617 -0
- trilogy/scripts/dependency/Cargo.toml +39 -0
- trilogy/scripts/dependency/README.md +131 -0
- trilogy/scripts/dependency/build.sh +25 -0
- trilogy/scripts/dependency/src/directory_resolver.rs +387 -0
- trilogy/scripts/dependency/src/lib.rs +16 -0
- trilogy/scripts/dependency/src/main.rs +770 -0
- trilogy/scripts/dependency/src/parser.rs +435 -0
- trilogy/scripts/dependency/src/preql.pest +208 -0
- trilogy/scripts/dependency/src/python_bindings.rs +311 -0
- trilogy/scripts/dependency/src/resolver.rs +716 -0
- trilogy/scripts/dependency/tests/base.preql +3 -0
- trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
- trilogy/scripts/dependency/tests/customer.preql +6 -0
- trilogy/scripts/dependency/tests/main.preql +9 -0
- trilogy/scripts/dependency/tests/orders.preql +7 -0
- trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
- trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
- trilogy/scripts/dependency.py +323 -0
- trilogy/scripts/display.py +555 -0
- trilogy/scripts/environment.py +59 -0
- trilogy/scripts/fmt.py +32 -0
- trilogy/scripts/ingest.py +487 -0
- trilogy/scripts/ingest_helpers/__init__.py +1 -0
- trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
- trilogy/scripts/ingest_helpers/formatting.py +93 -0
- trilogy/scripts/ingest_helpers/typing.py +161 -0
- trilogy/scripts/init.py +105 -0
- trilogy/scripts/parallel_execution.py +762 -0
- trilogy/scripts/plan.py +189 -0
- trilogy/scripts/refresh.py +161 -0
- trilogy/scripts/run.py +79 -0
- trilogy/scripts/serve.py +202 -0
- trilogy/scripts/serve_helpers/__init__.py +41 -0
- trilogy/scripts/serve_helpers/file_discovery.py +142 -0
- trilogy/scripts/serve_helpers/index_generation.py +206 -0
- trilogy/scripts/serve_helpers/models.py +38 -0
- trilogy/scripts/single_execution.py +131 -0
- trilogy/scripts/testing.py +143 -0
- trilogy/scripts/trilogy.py +75 -0
- trilogy/std/__init__.py +0 -0
- trilogy/std/color.preql +3 -0
- trilogy/std/date.preql +13 -0
- trilogy/std/display.preql +18 -0
- trilogy/std/geography.preql +22 -0
- trilogy/std/metric.preql +15 -0
- trilogy/std/money.preql +67 -0
- trilogy/std/net.preql +14 -0
- trilogy/std/ranking.preql +7 -0
- trilogy/std/report.preql +5 -0
- trilogy/std/semantic.preql +6 -0
- trilogy/utility.py +34 -0
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from datetime import date
|
|
3
|
+
from typing import Callable
|
|
4
|
+
|
|
5
|
+
from trilogy import Executor
|
|
6
|
+
from trilogy.core.enums import Purpose
|
|
7
|
+
from trilogy.core.models.build import Factory
|
|
8
|
+
from trilogy.core.models.datasource import (
|
|
9
|
+
Address,
|
|
10
|
+
ColumnAssignment,
|
|
11
|
+
Datasource,
|
|
12
|
+
RawColumnExpr,
|
|
13
|
+
UpdateKey,
|
|
14
|
+
UpdateKeys,
|
|
15
|
+
UpdateKeyType,
|
|
16
|
+
)
|
|
17
|
+
from trilogy.core.models.environment import Environment
|
|
18
|
+
from trilogy.core.models.execute import CTE
|
|
19
|
+
from trilogy.execution.state.exceptions import is_missing_source_error
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DatasourceWatermark:
    """Snapshot of a datasource's observed state, keyed by update-key name.

    Produced by the ``get_*_watermarks`` helpers below; ``keys`` maps a
    key name (a concept address, or the literal ``"update_time"`` for the
    last-modified fallback) to the observed UpdateKey value.
    """

    keys: dict[str, UpdateKey]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class StaleAsset:
    """Represents an asset that needs to be refreshed."""

    # Identifier of the datasource that was detected as out of date.
    datasource_id: str
    # Human-readable explanation of why the asset was flagged stale.
    reason: str
    # Incremental filters to apply on refresh; empty means full refresh.
    filters: UpdateKeys = field(default_factory=UpdateKeys)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _compare_watermark_values(
|
|
37
|
+
a: str | int | float | date, b: str | int | float | date
|
|
38
|
+
) -> int:
|
|
39
|
+
"""Compare two watermark values, returning -1, 0, or 1.
|
|
40
|
+
|
|
41
|
+
Handles type mismatches by comparing string representations.
|
|
42
|
+
"""
|
|
43
|
+
if type(a) is type(b):
|
|
44
|
+
if a < b: # type: ignore[operator]
|
|
45
|
+
return -1
|
|
46
|
+
elif a > b: # type: ignore[operator]
|
|
47
|
+
return 1
|
|
48
|
+
return 0
|
|
49
|
+
# Different types: compare as strings
|
|
50
|
+
sa, sb = str(a), str(b)
|
|
51
|
+
if sa < sb:
|
|
52
|
+
return -1
|
|
53
|
+
elif sa > sb:
|
|
54
|
+
return 1
|
|
55
|
+
return 0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_last_update_time_watermarks(
    datasource: Datasource, executor: Executor
) -> DatasourceWatermark:
    """Build a watermark from the table's last-modified timestamp.

    Fallback strategy used when a datasource declares neither freshness
    nor incremental keys and has no key columns to hash (see
    ``BaseStateStore.watermark_asset``).
    """
    last_modified = executor.generator.get_table_last_modified(
        executor, datasource.safe_address
    )
    watermark_key = UpdateKey(
        concept_name="update_time",
        type=UpdateKeyType.UPDATE_TIME,
        value=last_modified,
    )
    return DatasourceWatermark(keys={"update_time": watermark_key})
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_unique_key_hash_watermarks(
    datasource: Datasource, executor: Executor
) -> DatasourceWatermark:
    """Watermark a datasource by checksumming each of its key columns.

    For every column bound to a concept whose purpose is ``Purpose.KEY``,
    runs an aggregate checksum over the hashed column values and records
    the result as a ``KEY_HASH`` update key. Returns an empty watermark
    when no key columns exist; a missing underlying table yields a
    ``None`` checksum for that key instead of raising.
    """
    generator = executor.generator
    concepts = executor.environment.concepts

    # Only columns whose concept has KEY purpose participate in hashing.
    key_assignments = [
        assignment
        for assignment in datasource.columns
        if concepts[assignment.concept.address].purpose == Purpose.KEY
    ]
    if not key_assignments:
        return DatasourceWatermark(keys={})

    table_ref = (
        generator.render_source(datasource.address)
        if isinstance(datasource.address, Address)
        else datasource.safe_address
    )

    collected: dict[str, UpdateKey] = {}
    for assignment in key_assignments:
        alias = assignment.alias
        if isinstance(alias, str):
            column_name = alias
        elif isinstance(alias, RawColumnExpr):
            column_name = alias.text
        else:
            # Function alias - use its rendered string form.
            column_name = str(alias)

        checksum_expr = generator.aggregate_checksum(
            generator.hash_column_value(column_name)
        )
        query = f"SELECT {checksum_expr} as checksum FROM {table_ref}"

        try:
            row = executor.execute_raw_sql(query).fetchone()
            checksum_value = row[0] if row else None
        except Exception as e:
            if not is_missing_source_error(e, generator):
                raise
            # Table does not exist yet: record "no checksum" and reset the
            # connection so subsequent statements can run.
            checksum_value = None
            executor.connection.rollback()

        address = assignment.concept.address
        collected[address] = UpdateKey(
            concept_name=address,
            type=UpdateKeyType.KEY_HASH,
            value=checksum_value,
        )

    return DatasourceWatermark(keys=collected)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def get_incremental_key_watermarks(
    datasource: Datasource, executor: Executor
) -> DatasourceWatermark:
    """Watermark a datasource by the MAX of each ``incremental_by`` concept.

    For each incremental-by concept, issues ``SELECT MAX(...)`` against the
    datasource's table and records the result as an ``INCREMENTAL_KEY``
    update key. Returns an empty watermark when the datasource declares no
    incremental keys; a missing underlying table yields a ``None`` value
    for that key instead of raising.
    """
    if not datasource.incremental_by:
        return DatasourceWatermark(keys={})

    if isinstance(datasource.address, Address):
        table_ref = executor.generator.render_source(datasource.address)
    else:
        table_ref = datasource.safe_address

    watermarks = {}
    factory = Factory(environment=executor.environment)

    dialect = executor.generator
    # These depend only on the datasource, not on the concept being probed,
    # so build them once instead of once per incremental key.
    build_datasource = factory.build(datasource)
    cte: CTE = CTE.from_datasource(build_datasource)
    output_addresses = {c.address for c in datasource.output_concepts}

    for concept_ref in datasource.incremental_by:
        concept = executor.environment.concepts[concept_ref.address]
        build_concept = factory.build(concept)
        # Concepts materialized as output columns render directly; derived
        # concepts render from their lineage expression instead.
        if concept.address in output_addresses:
            query = f"SELECT MAX({dialect.render_concept_sql(build_concept, cte=cte, alias=False)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"
        else:
            query = f"SELECT MAX({dialect.render_expr(build_concept.lineage, cte=cte)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"

        try:
            result = executor.execute_raw_sql(query).fetchone()
            max_value = result[0] if result else None
        except Exception as e:
            if is_missing_source_error(e, dialect):
                # Table missing: treat as "no watermark" and reset the
                # connection so later statements can run.
                max_value = None
                executor.connection.rollback()
            else:
                raise

        watermarks[concept.name] = UpdateKey(
            concept_name=concept.name,
            type=UpdateKeyType.INCREMENTAL_KEY,
            value=max_value,
        )

    return DatasourceWatermark(keys=watermarks)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def get_freshness_watermarks(
    datasource: Datasource, executor: Executor
) -> DatasourceWatermark:
    """Watermark a datasource by the MAX of each ``freshness_by`` concept.

    For each freshness concept, issues ``SELECT MAX(...)`` against the
    datasource's table and records the result as an ``UPDATE_TIME`` update
    key. Returns an empty watermark when the datasource declares no
    freshness keys; a missing underlying table yields a ``None`` value for
    that key instead of raising.
    """
    if not datasource.freshness_by:
        return DatasourceWatermark(keys={})

    if isinstance(datasource.address, Address):
        table_ref = executor.generator.render_source(datasource.address)
    else:
        table_ref = datasource.safe_address

    watermarks = {}
    factory = Factory(environment=executor.environment)

    dialect = executor.generator
    # These depend only on the datasource, not on the concept being probed,
    # so build them once instead of once per freshness key.
    build_datasource = factory.build(datasource)
    cte: CTE = CTE.from_datasource(build_datasource)
    output_addresses = {c.address for c in datasource.output_concepts}

    for concept_ref in datasource.freshness_by:
        concept = executor.environment.concepts[concept_ref.address]
        build_concept = factory.build(concept)
        # Concepts materialized as output columns render directly; derived
        # concepts render from their lineage expression instead.
        if concept.address in output_addresses:
            query = f"SELECT MAX({dialect.render_concept_sql(build_concept, cte=cte, alias=False)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"
        else:
            query = f"SELECT MAX({dialect.render_expr(build_concept.lineage, cte=cte)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"

        try:
            result = executor.execute_raw_sql(query).fetchone()
            max_value = result[0] if result else None
        except Exception as e:
            if is_missing_source_error(e, dialect):
                # Table missing: treat as "no watermark" and reset the
                # connection so later statements can run.
                max_value = None
                executor.connection.rollback()
            else:
                raise

        watermarks[concept.name] = UpdateKey(
            concept_name=concept.name,
            type=UpdateKeyType.UPDATE_TIME,
            value=max_value,
        )

    return DatasourceWatermark(keys=watermarks)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class BaseStateStore:
    """In-memory state store tracking per-datasource watermarks.

    Watermarks are collected via the module-level ``get_*_watermarks``
    helpers and cached in ``self.watermarks`` keyed by datasource
    identifier. ``get_stale_assets`` then compares non-root datasources
    against the maximum watermark observed across root datasources.
    """

    def __init__(self) -> None:
        # Cache of collected watermarks, keyed by datasource identifier.
        self.watermarks: dict[str, DatasourceWatermark] = {}

    def watermark_asset(
        self, datasource: Datasource, executor: Executor
    ) -> DatasourceWatermark:
        """Collect and cache the watermark for a single datasource.

        Strategy precedence: freshness keys, then incremental keys, then
        key-column hashing, falling back to the table's last-modified
        timestamp when no key columns exist.
        """
        if datasource.freshness_by:
            watermarks = get_freshness_watermarks(datasource, executor)
        elif datasource.incremental_by:
            watermarks = get_incremental_key_watermarks(datasource, executor)
        else:
            key_columns = [
                col
                for col in datasource.columns
                if executor.environment.concepts[col.concept.address].purpose
                == Purpose.KEY
            ]
            if key_columns:
                watermarks = get_unique_key_hash_watermarks(datasource, executor)
            else:
                watermarks = get_last_update_time_watermarks(datasource, executor)

        self.watermarks[datasource.identifier] = watermarks
        return watermarks

    def get_datasource_watermarks(
        self, datasource: Datasource
    ) -> DatasourceWatermark | None:
        """Return the cached watermark for a datasource, or None if unseen."""
        return self.watermarks.get(datasource.identifier)

    def check_datasource_state(self, datasource: Datasource) -> bool:
        """Return True if this datasource has already been watermarked."""
        return datasource.identifier in self.watermarks

    def watermark_all_assets(
        self, env: Environment, executor: Executor
    ) -> dict[str, DatasourceWatermark]:
        """Watermark all datasources in the environment."""
        for ds in env.datasources.values():
            self.watermark_asset(ds, executor)
        return self.watermarks

    def get_stale_assets(
        self,
        env: Environment,
        executor: Executor,
        root_assets: set[str] | None = None,
    ) -> list[StaleAsset]:
        """Find all assets that are stale and need refresh.

        Args:
            env: The environment containing datasources
            executor: Executor for querying current state
            root_assets: Optional set of datasource identifiers that are "source of truth"
                and should not be marked stale. If None, uses datasources marked
                with is_root=True in the model.

        Returns:
            List of StaleAsset objects describing what needs refresh and why.
        """
        if root_assets is None:
            root_assets = {
                ds.identifier for ds in env.datasources.values() if ds.is_root
            }
        stale: list[StaleAsset] = []

        # First pass: watermark all assets to get current state
        self.watermark_all_assets(env, executor)

        # Build map of concept -> max watermark across root assets
        concept_max_watermarks: dict[str, UpdateKey] = {}
        for ds_id, watermark in self.watermarks.items():
            if ds_id in root_assets:
                for key, val in watermark.keys.items():
                    # Only orderable key types contribute to the max;
                    # KEY_HASH values carry no ordering.
                    if (
                        val.type
                        in (UpdateKeyType.INCREMENTAL_KEY, UpdateKeyType.UPDATE_TIME)
                        and val.value is not None
                    ):
                        existing = concept_max_watermarks.get(key)
                        if existing is None or (
                            existing.value is not None
                            and _compare_watermark_values(val.value, existing.value) > 0
                        ):
                            concept_max_watermarks[key] = val

        # Second pass: check non-root assets against max watermarks
        for ds_id, watermark in self.watermarks.items():
            if ds_id in root_assets:
                continue

            for key, val in watermark.keys.items():
                if val.type == UpdateKeyType.INCREMENTAL_KEY:
                    max_val = concept_max_watermarks.get(key)
                    if max_val and max_val.value is not None:
                        # Stale when the asset has no value yet or its
                        # value trails the best root watermark.
                        if (
                            val.value is None
                            or _compare_watermark_values(val.value, max_val.value) < 0
                        ):
                            # With a known local value we can refresh
                            # incrementally; otherwise refresh everything.
                            filters = (
                                UpdateKeys(keys={key: val})
                                if val.value
                                else UpdateKeys()
                            )
                            stale.append(
                                StaleAsset(
                                    datasource_id=ds_id,
                                    reason=f"incremental key '{key}' behind: {val.value} < {max_val.value}",
                                    filters=filters,
                                )
                            )
                            # One stale reason per datasource is enough.
                            break

                elif val.type == UpdateKeyType.UPDATE_TIME:
                    max_val = concept_max_watermarks.get(key)
                    if max_val and max_val.value is not None:
                        if (
                            val.value is None
                            or _compare_watermark_values(val.value, max_val.value) < 0
                        ):
                            # Freshness lag always triggers a full refresh
                            # (no incremental filters).
                            stale.append(
                                StaleAsset(
                                    datasource_id=ds_id,
                                    reason=f"freshness '{key}' behind: {val.value} < {max_val.value}",
                                    filters=UpdateKeys(),
                                )
                            )
                            break

                elif val.type == UpdateKeyType.KEY_HASH:
                    # Hash watermarks have no ordering; no staleness check
                    # is implemented for them here.
                    pass

        return stale
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
@dataclass
class RefreshResult:
    """Result of refreshing stale assets."""

    # Number of assets detected as stale.
    stale_count: int
    # Number of assets actually refreshed.
    refreshed_count: int
    # Count of root ("source of truth") datasources in the environment.
    root_assets: int
    # Total number of datasources considered.
    all_assets: int

    @property
    def had_stale(self) -> bool:
        """True when at least one stale asset was detected."""
        return self.stale_count > 0
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def refresh_stale_assets(
    executor: "Executor",
    on_stale_found: Callable[[int, int, int], None] | None = None,
    on_refresh: Callable[[str, str], None] | None = None,
    on_watermarks: Callable[[dict[str, DatasourceWatermark]], None] | None = None,
) -> RefreshResult:
    """Detect stale datasources and refresh each one in turn.

    Args:
        executor: Executor whose environment has already been parsed.
        on_stale_found: Optional callback invoked once with
            ``(stale_count, root_assets, all_assets)`` after detection.
        on_refresh: Optional callback invoked with ``(asset_id, reason)``
            immediately before each asset is refreshed.
        on_watermarks: Optional callback invoked with the collected
            watermark mapping once watermarking completes.

    Returns:
        A RefreshResult summarizing stale/refreshed counts and the
        root/total datasource counts.
    """
    store = BaseStateStore()
    stale = store.get_stale_assets(executor.environment, executor)

    if on_watermarks:
        on_watermarks(store.watermarks)

    datasources = executor.environment.datasources
    root_count = len([ds for ds in datasources.values() if ds.is_root])
    total_count = len(datasources)

    if on_stale_found:
        on_stale_found(len(stale), root_count, total_count)

    refreshed_total = 0
    for stale_asset in stale:
        if on_refresh:
            on_refresh(stale_asset.datasource_id, stale_asset.reason)
        executor.update_datasource(datasources[stale_asset.datasource_id])
        refreshed_total += 1

    return RefreshResult(
        stale_count=len(stale),
        refreshed_count=refreshed_total,
        root_assets=root_count,
        all_assets=total_count,
    )
|