pytrilogy 0.3.148__cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- LICENSE.md +19 -0
- _preql_import_resolver/__init__.py +5 -0
- _preql_import_resolver/_preql_import_resolver.cpython-312-aarch64-linux-gnu.so +0 -0
- pytrilogy-0.3.148.dist-info/METADATA +555 -0
- pytrilogy-0.3.148.dist-info/RECORD +206 -0
- pytrilogy-0.3.148.dist-info/WHEEL +5 -0
- pytrilogy-0.3.148.dist-info/entry_points.txt +2 -0
- pytrilogy-0.3.148.dist-info/licenses/LICENSE.md +19 -0
- trilogy/__init__.py +27 -0
- trilogy/ai/README.md +10 -0
- trilogy/ai/__init__.py +19 -0
- trilogy/ai/constants.py +92 -0
- trilogy/ai/conversation.py +107 -0
- trilogy/ai/enums.py +7 -0
- trilogy/ai/execute.py +50 -0
- trilogy/ai/models.py +34 -0
- trilogy/ai/prompts.py +100 -0
- trilogy/ai/providers/__init__.py +0 -0
- trilogy/ai/providers/anthropic.py +106 -0
- trilogy/ai/providers/base.py +24 -0
- trilogy/ai/providers/google.py +146 -0
- trilogy/ai/providers/openai.py +89 -0
- trilogy/ai/providers/utils.py +68 -0
- trilogy/authoring/README.md +3 -0
- trilogy/authoring/__init__.py +148 -0
- trilogy/constants.py +119 -0
- trilogy/core/README.md +52 -0
- trilogy/core/__init__.py +0 -0
- trilogy/core/constants.py +6 -0
- trilogy/core/enums.py +454 -0
- trilogy/core/env_processor.py +239 -0
- trilogy/core/environment_helpers.py +320 -0
- trilogy/core/ergonomics.py +193 -0
- trilogy/core/exceptions.py +123 -0
- trilogy/core/functions.py +1240 -0
- trilogy/core/graph_models.py +142 -0
- trilogy/core/internal.py +85 -0
- trilogy/core/models/__init__.py +0 -0
- trilogy/core/models/author.py +2662 -0
- trilogy/core/models/build.py +2603 -0
- trilogy/core/models/build_environment.py +165 -0
- trilogy/core/models/core.py +506 -0
- trilogy/core/models/datasource.py +434 -0
- trilogy/core/models/environment.py +756 -0
- trilogy/core/models/execute.py +1213 -0
- trilogy/core/optimization.py +251 -0
- trilogy/core/optimizations/__init__.py +12 -0
- trilogy/core/optimizations/base_optimization.py +17 -0
- trilogy/core/optimizations/hide_unused_concept.py +47 -0
- trilogy/core/optimizations/inline_datasource.py +102 -0
- trilogy/core/optimizations/predicate_pushdown.py +245 -0
- trilogy/core/processing/README.md +94 -0
- trilogy/core/processing/READMEv2.md +121 -0
- trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
- trilogy/core/processing/__init__.py +0 -0
- trilogy/core/processing/concept_strategies_v3.py +508 -0
- trilogy/core/processing/constants.py +15 -0
- trilogy/core/processing/discovery_node_factory.py +451 -0
- trilogy/core/processing/discovery_utility.py +548 -0
- trilogy/core/processing/discovery_validation.py +167 -0
- trilogy/core/processing/graph_utils.py +43 -0
- trilogy/core/processing/node_generators/README.md +9 -0
- trilogy/core/processing/node_generators/__init__.py +31 -0
- trilogy/core/processing/node_generators/basic_node.py +160 -0
- trilogy/core/processing/node_generators/common.py +270 -0
- trilogy/core/processing/node_generators/constant_node.py +38 -0
- trilogy/core/processing/node_generators/filter_node.py +315 -0
- trilogy/core/processing/node_generators/group_node.py +213 -0
- trilogy/core/processing/node_generators/group_to_node.py +117 -0
- trilogy/core/processing/node_generators/multiselect_node.py +207 -0
- trilogy/core/processing/node_generators/node_merge_node.py +695 -0
- trilogy/core/processing/node_generators/recursive_node.py +88 -0
- trilogy/core/processing/node_generators/rowset_node.py +165 -0
- trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
- trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
- trilogy/core/processing/node_generators/select_merge_node.py +786 -0
- trilogy/core/processing/node_generators/select_node.py +95 -0
- trilogy/core/processing/node_generators/synonym_node.py +98 -0
- trilogy/core/processing/node_generators/union_node.py +91 -0
- trilogy/core/processing/node_generators/unnest_node.py +182 -0
- trilogy/core/processing/node_generators/window_node.py +201 -0
- trilogy/core/processing/nodes/README.md +28 -0
- trilogy/core/processing/nodes/__init__.py +179 -0
- trilogy/core/processing/nodes/base_node.py +522 -0
- trilogy/core/processing/nodes/filter_node.py +75 -0
- trilogy/core/processing/nodes/group_node.py +194 -0
- trilogy/core/processing/nodes/merge_node.py +420 -0
- trilogy/core/processing/nodes/recursive_node.py +46 -0
- trilogy/core/processing/nodes/select_node_v2.py +242 -0
- trilogy/core/processing/nodes/union_node.py +53 -0
- trilogy/core/processing/nodes/unnest_node.py +62 -0
- trilogy/core/processing/nodes/window_node.py +56 -0
- trilogy/core/processing/utility.py +823 -0
- trilogy/core/query_processor.py +604 -0
- trilogy/core/statements/README.md +35 -0
- trilogy/core/statements/__init__.py +0 -0
- trilogy/core/statements/author.py +536 -0
- trilogy/core/statements/build.py +0 -0
- trilogy/core/statements/common.py +20 -0
- trilogy/core/statements/execute.py +155 -0
- trilogy/core/table_processor.py +66 -0
- trilogy/core/utility.py +8 -0
- trilogy/core/validation/README.md +46 -0
- trilogy/core/validation/__init__.py +0 -0
- trilogy/core/validation/common.py +161 -0
- trilogy/core/validation/concept.py +146 -0
- trilogy/core/validation/datasource.py +227 -0
- trilogy/core/validation/environment.py +73 -0
- trilogy/core/validation/fix.py +256 -0
- trilogy/dialect/__init__.py +32 -0
- trilogy/dialect/base.py +1431 -0
- trilogy/dialect/bigquery.py +314 -0
- trilogy/dialect/common.py +147 -0
- trilogy/dialect/config.py +159 -0
- trilogy/dialect/dataframe.py +50 -0
- trilogy/dialect/duckdb.py +376 -0
- trilogy/dialect/enums.py +149 -0
- trilogy/dialect/metadata.py +173 -0
- trilogy/dialect/mock.py +190 -0
- trilogy/dialect/postgres.py +117 -0
- trilogy/dialect/presto.py +110 -0
- trilogy/dialect/results.py +89 -0
- trilogy/dialect/snowflake.py +129 -0
- trilogy/dialect/sql_server.py +137 -0
- trilogy/engine.py +48 -0
- trilogy/execution/__init__.py +17 -0
- trilogy/execution/config.py +119 -0
- trilogy/execution/state/__init__.py +0 -0
- trilogy/execution/state/file_state_store.py +0 -0
- trilogy/execution/state/sqllite_state_store.py +0 -0
- trilogy/execution/state/state_store.py +301 -0
- trilogy/executor.py +656 -0
- trilogy/hooks/__init__.py +4 -0
- trilogy/hooks/base_hook.py +40 -0
- trilogy/hooks/graph_hook.py +135 -0
- trilogy/hooks/query_debugger.py +166 -0
- trilogy/metadata/__init__.py +0 -0
- trilogy/parser.py +10 -0
- trilogy/parsing/README.md +21 -0
- trilogy/parsing/__init__.py +0 -0
- trilogy/parsing/common.py +1069 -0
- trilogy/parsing/config.py +5 -0
- trilogy/parsing/exceptions.py +8 -0
- trilogy/parsing/helpers.py +1 -0
- trilogy/parsing/parse_engine.py +2863 -0
- trilogy/parsing/render.py +773 -0
- trilogy/parsing/trilogy.lark +544 -0
- trilogy/py.typed +0 -0
- trilogy/render.py +45 -0
- trilogy/scripts/README.md +9 -0
- trilogy/scripts/__init__.py +0 -0
- trilogy/scripts/agent.py +41 -0
- trilogy/scripts/agent_info.py +306 -0
- trilogy/scripts/common.py +430 -0
- trilogy/scripts/dependency/Cargo.lock +617 -0
- trilogy/scripts/dependency/Cargo.toml +39 -0
- trilogy/scripts/dependency/README.md +131 -0
- trilogy/scripts/dependency/build.sh +25 -0
- trilogy/scripts/dependency/src/directory_resolver.rs +387 -0
- trilogy/scripts/dependency/src/lib.rs +16 -0
- trilogy/scripts/dependency/src/main.rs +770 -0
- trilogy/scripts/dependency/src/parser.rs +435 -0
- trilogy/scripts/dependency/src/preql.pest +208 -0
- trilogy/scripts/dependency/src/python_bindings.rs +311 -0
- trilogy/scripts/dependency/src/resolver.rs +716 -0
- trilogy/scripts/dependency/tests/base.preql +3 -0
- trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
- trilogy/scripts/dependency/tests/customer.preql +6 -0
- trilogy/scripts/dependency/tests/main.preql +9 -0
- trilogy/scripts/dependency/tests/orders.preql +7 -0
- trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
- trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
- trilogy/scripts/dependency.py +323 -0
- trilogy/scripts/display.py +555 -0
- trilogy/scripts/environment.py +59 -0
- trilogy/scripts/fmt.py +32 -0
- trilogy/scripts/ingest.py +472 -0
- trilogy/scripts/ingest_helpers/__init__.py +1 -0
- trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
- trilogy/scripts/ingest_helpers/formatting.py +93 -0
- trilogy/scripts/ingest_helpers/typing.py +161 -0
- trilogy/scripts/init.py +105 -0
- trilogy/scripts/parallel_execution.py +748 -0
- trilogy/scripts/plan.py +189 -0
- trilogy/scripts/refresh.py +106 -0
- trilogy/scripts/run.py +79 -0
- trilogy/scripts/serve.py +202 -0
- trilogy/scripts/serve_helpers/__init__.py +41 -0
- trilogy/scripts/serve_helpers/file_discovery.py +142 -0
- trilogy/scripts/serve_helpers/index_generation.py +206 -0
- trilogy/scripts/serve_helpers/models.py +38 -0
- trilogy/scripts/single_execution.py +131 -0
- trilogy/scripts/testing.py +129 -0
- trilogy/scripts/trilogy.py +75 -0
- trilogy/std/__init__.py +0 -0
- trilogy/std/color.preql +3 -0
- trilogy/std/date.preql +13 -0
- trilogy/std/display.preql +18 -0
- trilogy/std/geography.preql +22 -0
- trilogy/std/metric.preql +15 -0
- trilogy/std/money.preql +67 -0
- trilogy/std/net.preql +14 -0
- trilogy/std/ranking.preql +7 -0
- trilogy/std/report.preql +5 -0
- trilogy/std/semantic.preql +6 -0
- trilogy/utility.py +34 -0
trilogy/dialect/sql_server.py
ADDED

@@ -0,0 +1,137 @@
+from typing import Any, Callable, Mapping
+
+from jinja2 import Template
+
+from trilogy.core.enums import FunctionType, WindowType
+from trilogy.core.statements.execute import (
+    PROCESSED_STATEMENT_TYPES,
+    ProcessedQuery,
+    ProcessedQueryPersist,
+)
+from trilogy.dialect.base import BaseDialect
+from trilogy.utility import string_to_hash
+
+WINDOW_FUNCTION_MAP: Mapping[WindowType, Callable[[Any, Any, Any], str]] = {}
+
+FUNCTION_MAP = {
+    FunctionType.COUNT: lambda args, types: f"count({args[0]})",
+    FunctionType.SUM: lambda args, types: f"sum({args[0]})",
+    FunctionType.AVG: lambda args, types: f"avg({args[0]})",
+    FunctionType.LENGTH: lambda args, types: f"length({args[0]})",
+    FunctionType.LIKE: lambda args, types: (
+        f" CASE WHEN {args[0]} like {args[1]} THEN True ELSE False END"
+    ),
+    FunctionType.CONCAT: lambda args, types: (
+        f"CONCAT({','.join([f''' '{a}' ''' for a in args])})"
+    ),
+}
+
+# if an aggregate function is called on a source that is at the same grain as the aggregate
+# we may return a static value
+FUNCTION_GRAIN_MATCH_MAP = {
+    **FUNCTION_MAP,
+    FunctionType.COUNT_DISTINCT: lambda args, types: f"CASE WHEN {args[0]} IS NOT NULL THEN 1 ELSE 0 END",
+    FunctionType.COUNT: lambda args, types: f"CASE WHEN {args[0]} IS NOT NULL THEN 1 ELSE 0 END",
+    FunctionType.SUM: lambda args, types: f"{args[0]}",
+    FunctionType.AVG: lambda args, types: f"{args[0]}",
+}
+
+TSQL_TEMPLATE = Template(
+    """{%- if ctes %}
+WITH {% for cte in ctes %}
+{{cte.name}} as ({{cte.statement}}){% if not loop.last %},{% endif %}{% endfor %}{% endif %}
+{%- if full_select -%}{{full_select}}
+{%- else -%}{%- if comment %}
+-- {{ comment }}{%- endif -%}
+SELECT
+{%- if limit is not none %}
+TOP {{ limit }}{% endif %}
+{%- for select in select_columns %}
+    {{ select }}{% if not loop.last %},{% endif %}{% endfor %}
+{% if base %}FROM
+    {{ base }}{% endif %}{% if joins %}
+{% for join in joins %}
+{{ join }}
+{% endfor %}{% endif %}
+{% if where %}WHERE
+    {{ where }}
+{% endif %}
+{%- if group_by %}
+GROUP BY {% for group in group_by %}
+    {{group}}{% if not loop.last %},{% endif %}
+    {% endfor %}{% endif %}{% if having %}
+HAVING
+\t{{ having }}{% endif %}
+{%- if order_by %}
+ORDER BY {% for order in order_by %}
+    {{ order }}{% if not loop.last %},{% endif %}
+{% endfor %}{% endif %}{% endif %}
+"""
+)
+
+MAX_IDENTIFIER_LENGTH = 128
+
+
+class SqlServerDialect(BaseDialect):
+    WINDOW_FUNCTION_MAP = {**BaseDialect.WINDOW_FUNCTION_MAP, **WINDOW_FUNCTION_MAP}
+    FUNCTION_MAP = {**BaseDialect.FUNCTION_MAP, **FUNCTION_MAP}
+    FUNCTION_GRAIN_MATCH_MAP = {
+        **BaseDialect.FUNCTION_GRAIN_MATCH_MAP,
+        **FUNCTION_GRAIN_MATCH_MAP,
+    }
+    QUOTE_CHARACTER = '"'
+    SQL_TEMPLATE = TSQL_TEMPLATE
+
+    def get_table_schema(
+        self, executor, table_name: str, schema: str | None = None
+    ) -> list[tuple]:
+        """Defaults to 'dbo' schema if none specified."""
+        if not schema:
+            schema = "dbo"
+
+        column_query = f"""
+        SELECT
+            column_name,
+            data_type,
+            is_nullable,
+            '' as column_comment
+        FROM information_schema.columns
+        WHERE table_name = '{table_name}'
+        AND table_schema = '{schema}'
+        ORDER BY ordinal_position
+        """
+
+        rows = executor.execute_raw_sql(column_query).fetchall()
+        return rows
+
+    def get_table_primary_keys(
+        self, executor, table_name: str, schema: str | None = None
+    ) -> list[str]:
+        """Uses sys catalog views for more reliable constraint information."""
+        if not schema:
+            schema = "dbo"
+
+        pk_query = f"""
+        SELECT c.name
+        FROM sys.indexes i
+        INNER JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
+        INNER JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+        INNER JOIN sys.tables t ON i.object_id = t.object_id
+        INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
+        WHERE i.is_primary_key = 1
+        AND t.name = '{table_name}'
+        AND s.name = '{schema}'
+        ORDER BY ic.key_ordinal
+        """
+
+        rows = executor.execute_raw_sql(pk_query).fetchall()
+        return [row[0] for row in rows]
+
+    def compile_statement(self, query: PROCESSED_STATEMENT_TYPES) -> str:
+        base = super().compile_statement(query)
+        if isinstance(query, (ProcessedQuery, ProcessedQueryPersist)):
+            for cte in query.ctes:
+                if len(cte.name) > MAX_IDENTIFIER_LENGTH:
+                    new_name = f"rhash_{string_to_hash(cte.name)}"
+                    base = base.replace(cte.name, new_name)
+        return base
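The grain-match map above encodes a planner shortcut: when the source already sits at the aggregate's grain, each group holds exactly one row, so sum(x) and avg(x) collapse to x, and count(x) becomes a 0/1 null check. A minimal sketch of the difference, assuming the module path from the wheel layout above and a hypothetical pre-rendered column name:

from trilogy.core.enums import FunctionType
from trilogy.dialect.sql_server import FUNCTION_GRAIN_MATCH_MAP, FUNCTION_MAP

# args is the list of already-rendered SQL expressions the dialect passes in
args, types = ["orders.revenue"], []

FUNCTION_MAP[FunctionType.SUM](args, types)              # 'sum(orders.revenue)'
FUNCTION_GRAIN_MATCH_MAP[FunctionType.SUM](args, types)  # 'orders.revenue'
FUNCTION_GRAIN_MATCH_MAP[FunctionType.COUNT](args, types)
# 'CASE WHEN orders.revenue IS NOT NULL THEN 1 ELSE 0 END'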
trilogy/engine.py
ADDED

@@ -0,0 +1,48 @@
+from typing import Any, Generator, List, Optional, Protocol
+
+from trilogy.core.models.environment import Environment
+
+
+class ResultProtocol(Protocol):
+
+    def fetchall(self) -> List[Any]: ...
+
+    def keys(self) -> List[str]: ...
+
+    def fetchone(self) -> Optional[Any]: ...
+
+    def fetchmany(self, size: int) -> List[Any]: ...
+
+    def __iter__(self) -> Generator[Any, None, None]: ...
+
+
+class EngineConnection(Protocol):
+    pass
+
+    def execute(self, statement: str, parameters: Any | None = None) -> ResultProtocol:
+        pass
+
+    def commit(self):
+        raise NotImplementedError()
+
+    def begin(self):
+        raise NotImplementedError()
+
+    def rollback(self):
+        raise NotImplementedError()
+
+    def close(self) -> None:
+        return
+
+
+class ExecutionEngine(Protocol):
+    pass
+
+    def connect(self) -> EngineConnection:
+        pass
+
+    def setup(self, env: Environment, connection):
+        pass
+
+    def dispose(self, close: bool = True):
+        pass
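ResultProtocol, EngineConnection, and ExecutionEngine are typing.Protocol classes, so conformance is structural: any object exposing these methods can serve as a backend, with no inheritance required. A minimal in-memory sketch (not part of the package; all names here are illustrative) that satisfies the result and connection surfaces:

from typing import Any, List, Optional


class ListResult:
    """Rows wrapped in the shape ResultProtocol expects."""

    def __init__(self, rows: List[Any], columns: List[str]):
        self._rows = rows
        self._columns = columns

    def fetchall(self) -> List[Any]:
        return list(self._rows)

    def keys(self) -> List[str]:
        return list(self._columns)

    def fetchone(self) -> Optional[Any]:
        return self._rows[0] if self._rows else None

    def fetchmany(self, size: int) -> List[Any]:
        return self._rows[:size]

    def __iter__(self):
        yield from self._rows


class EchoConnection:
    """Trivial connection: echoes each statement back as a single row."""

    def execute(self, statement: str, parameters: Any | None = None) -> ListResult:
        return ListResult([(statement,)], ["statement"])

    def commit(self) -> None: ...

    def begin(self) -> None: ...

    def rollback(self) -> None: ...

    def close(self) -> None: ...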
trilogy/execution/__init__.py
ADDED

@@ -0,0 +1,17 @@
+from trilogy.dialect.config import (
+    BigQueryConfig,
+    DialectConfig,
+    DuckDBConfig,
+    PostgresConfig,
+    PrestoConfig,
+    SnowflakeConfig,
+)
+
+__all__ = [
+    "DialectConfig",
+    "DuckDBConfig",
+    "PrestoConfig",
+    "SnowflakeConfig",
+    "PostgresConfig",
+    "BigQueryConfig",
+]
trilogy/execution/config.py
ADDED

@@ -0,0 +1,119 @@
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from tomllib import loads
+
+from trilogy.dialect import (
+    BigQueryConfig,
+    DialectConfig,
+    DuckDBConfig,
+    PostgresConfig,
+    PrestoConfig,
+    SnowflakeConfig,
+    SQLServerConfig,
+)
+from trilogy.dialect.enums import Dialects
+
+DEFAULT_PARALLELISM = 4
+
+
+def load_env_file(env_file_path: Path) -> dict[str, str]:
+    """Load environment variables from a .env file."""
+    env_vars: dict[str, str] = {}
+    if not env_file_path.exists():
+        raise FileNotFoundError(f"Environment file not found: {env_file_path}")
+
+    with open(env_file_path, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            if "=" not in line:
+                continue
+            key, _, value = line.partition("=")
+            key = key.strip()
+            value = value.strip()
+            # Remove surrounding quotes if present
+            if (value.startswith('"') and value.endswith('"')) or (
+                value.startswith("'") and value.endswith("'")
+            ):
+                value = value[1:-1]
+            env_vars[key] = value
+    return env_vars
+
+
+def apply_env_vars(env_vars: dict[str, str]) -> None:
+    """Apply environment variables to os.environ."""
+    for key, value in env_vars.items():
+        os.environ[key] = value
+
+
+@dataclass
+class RuntimeConfig:
+
+    startup_trilogy: list[Path]
+    startup_sql: list[Path]
+    parallelism: int = DEFAULT_PARALLELISM
+    engine_dialect: Dialects | None = None
+    engine_config: DialectConfig | None = None
+    source_path: Path | None = None
+    env_files: list[Path] = field(default_factory=list)
+
+
+def load_config_file(path: Path) -> RuntimeConfig:
+    with open(path, "r") as f:
+        toml_content = f.read()
+    config_data = loads(toml_content)
+
+    engine_raw: dict = config_data.get("engine", {})
+    engine_config_raw = engine_raw.get("config", {})
+    engine = Dialects(engine_raw.get("dialect")) if engine_raw.get("dialect") else None
+    engine_config: DialectConfig | None
+    if engine:
+        if engine == Dialects.DUCK_DB:
+            engine_config = (
+                DuckDBConfig(**engine_config_raw) if engine_config_raw else None
+            )
+        elif engine == Dialects.POSTGRES:
+            engine_config = (
+                PostgresConfig(**engine_config_raw) if engine_config_raw else None
+            )
+        elif engine == Dialects.PRESTO:
+            engine_config = (
+                PrestoConfig(**engine_config_raw) if engine_config_raw else None
+            )
+        elif engine == Dialects.SNOWFLAKE:
+            engine_config = (
+                SnowflakeConfig(**engine_config_raw) if engine_config_raw else None
+            )
+        elif engine == Dialects.SQL_SERVER:
+            engine_config = (
+                SQLServerConfig(**engine_config_raw) if engine_config_raw else None
+            )
+        elif engine == Dialects.BIGQUERY:
+            engine_config = (
+                BigQueryConfig(**engine_config_raw) if engine_config_raw else None
+            )
+        else:
+            engine_config = None
+    else:
+        engine_config = None
+    setup: dict = config_data.get("setup", {})
+
+    # Parse env_file - can be a single string or list of strings
+    env_raw = engine_raw.get("env_file", [])
+    if isinstance(env_raw, str):
+        env_files = [path.parent / env_raw]
+    else:
+        env_files = [path.parent / p for p in env_raw]
+
+    return RuntimeConfig(
+        startup_trilogy=[path.parent / p for p in setup.get("trilogy", [])],
+        startup_sql=[path.parent / p for p in setup.get("sql", [])],
+        parallelism=config_data.get("parallelism", DEFAULT_PARALLELISM),
+        engine_dialect=engine,
+        engine_config=engine_config,
+        source_path=path,
+        env_files=env_files,
+    )
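load_config_file expects a TOML document with an [engine] table (dialect, an optional nested config table, and env_file as a string or list) plus a [setup] table of trilogy and sql startup scripts; relative paths resolve against the config file's parent directory. An illustrative document follows; the values, the path field, and the exact dialect string are assumptions, since the Dialects enum values and DuckDBConfig fields are not shown in this diff:

from tomllib import loads

sample = loads("""
parallelism = 8

[engine]
dialect = "duck_db"   # assumed spelling of the Dialects enum value
env_file = ".env"     # a single path or a list of paths

[engine.config]
path = "local.duckdb" # hypothetical DuckDBConfig field

[setup]
trilogy = ["models/setup.preql"]
sql = ["sql/seed.sql"]
""")
assert sample["engine"]["dialect"] == "duck_db"

Note that env_file is read from the [engine] table, and load_env_file skips comments and blank lines and strips matching surrounding quotes from values, so a line like API_KEY="abc123" yields {"API_KEY": "abc123"}.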
trilogy/execution/state/__init__.py
File without changes

trilogy/execution/state/file_state_store.py
File without changes

trilogy/execution/state/sqllite_state_store.py
File without changes
trilogy/execution/state/state_store.py
ADDED

@@ -0,0 +1,301 @@
+from dataclasses import dataclass, field
+from datetime import date
+
+from sqlalchemy.exc import ProgrammingError
+
+from trilogy import Executor
+from trilogy.core.enums import Purpose
+from trilogy.core.models.build import Factory
+from trilogy.core.models.datasource import (
+    Address,
+    ColumnAssignment,
+    Datasource,
+    RawColumnExpr,
+    UpdateKey,
+    UpdateKeys,
+    UpdateKeyType,
+)
+from trilogy.core.models.environment import Environment
+from trilogy.core.models.execute import CTE
+
+
+def _is_table_not_found_error(exc: ProgrammingError, dialect) -> bool:
+    """Check if exception is a table-not-found error for the given dialect."""
+    pattern = dialect.TABLE_NOT_FOUND_PATTERN
+    if pattern is None:
+        return False
+    error_msg = str(exc.orig) if exc.orig else str(exc)
+    return pattern in error_msg
+
+
+@dataclass
+class DatasourceWatermark:
+    keys: dict[str, UpdateKey]
+
+
+@dataclass
+class StaleAsset:
+    """Represents an asset that needs to be refreshed."""
+
+    datasource_id: str
+    reason: str
+    filters: UpdateKeys = field(default_factory=UpdateKeys)
+
+
+def _compare_watermark_values(
+    a: str | int | float | date, b: str | int | float | date
+) -> int:
+    """Compare two watermark values, returning -1, 0, or 1.
+
+    Handles type mismatches by comparing string representations.
+    """
+    if type(a) is type(b):
+        if a < b:  # type: ignore[operator]
+            return -1
+        elif a > b:  # type: ignore[operator]
+            return 1
+        return 0
+    # Different types: compare as strings
+    sa, sb = str(a), str(b)
+    if sa < sb:
+        return -1
+    elif sa > sb:
+        return 1
+    return 0
+
+
+def get_last_update_time_watermarks(
+    datasource: Datasource, executor: Executor
+) -> DatasourceWatermark:
+    update_time = executor.generator.get_table_last_modified(
+        executor, datasource.safe_address
+    )
+    return DatasourceWatermark(
+        keys={
+            "update_time": UpdateKey(
+                concept_name="update_time",
+                type=UpdateKeyType.UPDATE_TIME,
+                value=update_time,
+            )
+        }
+    )
+
+
+def get_unique_key_hash_watermarks(
+    datasource: Datasource, executor: Executor
+) -> DatasourceWatermark:
+    key_columns: list[ColumnAssignment] = []
+    for col_assignment in datasource.columns:
+        concrete = executor.environment.concepts[col_assignment.concept.address]
+        if concrete.purpose == Purpose.KEY:
+            key_columns.append(col_assignment)
+
+    if not key_columns:
+        return DatasourceWatermark(keys={})
+
+    if isinstance(datasource.address, Address):
+        table_ref = executor.generator.render_source(datasource.address)
+    else:
+        table_ref = datasource.safe_address
+
+    dialect = executor.generator
+    watermarks = {}
+    for col in key_columns:
+        if isinstance(col.alias, str):
+            column_name = col.alias
+        elif isinstance(col.alias, RawColumnExpr):
+            column_name = col.alias.text
+        else:
+            # Function - use rendered expression
+            column_name = str(col.alias)
+        hash_expr = dialect.hash_column_value(column_name)
+        checksum_expr = dialect.aggregate_checksum(hash_expr)
+        query = f"SELECT {checksum_expr} as checksum FROM {table_ref}"
+
+        try:
+            result = executor.execute_raw_sql(query).fetchone()
+            checksum_value = result[0] if result else None
+        except ProgrammingError as e:
+            if _is_table_not_found_error(e, dialect):
+                checksum_value = None
+            else:
+                raise
+
+        watermarks[col.concept.address] = UpdateKey(
+            concept_name=col.concept.address,
+            type=UpdateKeyType.KEY_HASH,
+            value=checksum_value,
+        )
+
+    return DatasourceWatermark(keys=watermarks)
+
+
+def get_incremental_key_watermarks(
+    datasource: Datasource, executor: Executor
+) -> DatasourceWatermark:
+    if not datasource.incremental_by:
+        return DatasourceWatermark(keys={})
+
+    if isinstance(datasource.address, Address):
+        table_ref = executor.generator.render_source(datasource.address)
+    else:
+        table_ref = datasource.safe_address
+
+    watermarks = {}
+    factory = Factory(environment=executor.environment)
+
+    dialect = executor.generator
+    for concept_ref in datasource.incremental_by:
+        concept = executor.environment.concepts[concept_ref.address]
+        build_concept = factory.build(concept)
+        build_datasource = factory.build(datasource)
+        cte: CTE = CTE.from_datasource(build_datasource)
+        # Check if concept is in output_concepts by comparing addresses
+        output_addresses = {c.address for c in datasource.output_concepts}
+        if concept.address in output_addresses:
+            query = f"SELECT MAX({dialect.render_concept_sql(build_concept, cte=cte, alias=False)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"
+        else:
+            query = f"SELECT MAX({dialect.render_expr(build_concept.lineage, cte=cte)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"
+
+        try:
+            result = executor.execute_raw_sql(query).fetchone()
+            max_value = result[0] if result else None
+        except ProgrammingError as e:
+            if _is_table_not_found_error(e, dialect):
+                max_value = None
+            else:
+                raise
+
+        watermarks[concept.name] = UpdateKey(
+            concept_name=concept.name,
+            type=UpdateKeyType.INCREMENTAL_KEY,
+            value=max_value,
+        )
+
+    return DatasourceWatermark(keys=watermarks)
+
+
+class BaseStateStore:
+
+    def __init__(self) -> None:
+        self.watermarks: dict[str, DatasourceWatermark] = {}
+
+    def watermark_asset(
+        self, datasource: Datasource, executor: Executor
+    ) -> DatasourceWatermark:
+        if datasource.incremental_by:
+            watermarks = get_incremental_key_watermarks(datasource, executor)
+        else:
+            key_columns = [
+                col
+                for col in datasource.columns
+                if executor.environment.concepts[col.concept.address].purpose
+                == Purpose.KEY
+            ]
+            if key_columns:
+                watermarks = get_unique_key_hash_watermarks(datasource, executor)
+            else:
+                watermarks = get_last_update_time_watermarks(datasource, executor)
+
+        self.watermarks[datasource.identifier] = watermarks
+        return watermarks
+
+    def get_datasource_watermarks(
+        self, datasource: Datasource
+    ) -> DatasourceWatermark | None:
+        return self.watermarks.get(datasource.identifier)
+
+    def check_datasource_state(self, datasource: Datasource) -> bool:
+        return datasource.identifier in self.watermarks
+
+    def watermark_all_assets(
+        self, env: Environment, executor: Executor
+    ) -> dict[str, DatasourceWatermark]:
+        """Watermark all datasources in the environment."""
+        for ds in env.datasources.values():
+            self.watermark_asset(ds, executor)
+        return self.watermarks
+
+    def get_stale_assets(
+        self,
+        env: Environment,
+        executor: Executor,
+        root_assets: set[str] | None = None,
+    ) -> list[StaleAsset]:
+        """Find all assets that are stale and need refresh.
+
+        Args:
+            env: The environment containing datasources
+            executor: Executor for querying current state
+            root_assets: Optional set of datasource identifiers that are "source of truth"
+                and should not be marked stale. If None, uses datasources marked
+                with is_root=True in the model.
+
+        Returns:
+            List of StaleAsset objects describing what needs refresh and why.
+        """
+        if root_assets is None:
+            root_assets = {
+                ds.identifier for ds in env.datasources.values() if ds.is_root
+            }
+        stale: list[StaleAsset] = []
+
+        # First pass: watermark all assets to get current state
+        self.watermark_all_assets(env, executor)
+
+        # Build map of concept -> max watermark across all assets
+        concept_max_watermarks: dict[str, UpdateKey] = {}
+        for ds_id, watermark in self.watermarks.items():
+            if ds_id in root_assets:
+                # Root assets define the "truth" for incremental keys
+                for key, val in watermark.keys.items():
+                    if (
+                        val.type == UpdateKeyType.INCREMENTAL_KEY
+                        and val.value is not None
+                    ):
+                        existing = concept_max_watermarks.get(key)
+                        if existing is None or (
+                            existing.value is not None
+                            and _compare_watermark_values(val.value, existing.value) > 0
+                        ):
+                            concept_max_watermarks[key] = val
+
+        # Second pass: check non-root assets against max watermarks
+        for ds_id, watermark in self.watermarks.items():
+            if ds_id in root_assets:
+                continue
+
+            for key, val in watermark.keys.items():
+                if val.type == UpdateKeyType.INCREMENTAL_KEY:
+                    max_val = concept_max_watermarks.get(key)
+                    if max_val and max_val.value is not None:
+                        if (
+                            val.value is None
+                            or _compare_watermark_values(val.value, max_val.value) < 0
+                        ):
+                            # Create UpdateKeys with the filter for incremental update
+                            filters = (
+                                UpdateKeys(keys={key: val})
+                                if val.value
+                                else UpdateKeys()
+                            )
+                            stale.append(
+                                StaleAsset(
+                                    datasource_id=ds_id,
+                                    reason=f"incremental key '{key}' behind: {val.value} < {max_val.value}",
+                                    filters=filters,
+                                )
+                            )
+                            break
+
+                elif val.type == UpdateKeyType.UPDATE_TIME:
+                    # For update_time, we'd need root asset update times to compare
+                    # This is tricky without explicit dependency tracking
+                    pass
+
+                elif val.type == UpdateKeyType.KEY_HASH:
+                    # Hash changes indicate data changed, but we need a reference
+                    # to compare against - requires dependency graph
+                    pass
+
+        return stale
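_compare_watermark_values deliberately falls back to string comparison when the two sides have different types, which keeps mixed date/string watermarks consistently ordered (ISO-formatted dates compare correctly as strings); get_stale_assets relies on that ordering in its second pass to flag non-root datasources whose incremental keys lag the root maximum. A small worked example of the rule, assuming the import path matches the wheel layout above:

from datetime import date

from trilogy.execution.state.state_store import _compare_watermark_values

assert _compare_watermark_values(1, 2) == -1                            # same type: native compare
assert _compare_watermark_values(date(2024, 1, 2), date(2024, 1, 1)) == 1
# mixed types fall back to str(): '2024-01-02' < '2024-01-10'
assert _compare_watermark_values(date(2024, 1, 2), "2024-01-10") == -1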