pytrilogy-0.3.148-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. LICENSE.md +19 -0
  2. _preql_import_resolver/__init__.py +5 -0
  3. _preql_import_resolver/_preql_import_resolver.cpython-312-aarch64-linux-gnu.so +0 -0
  4. pytrilogy-0.3.148.dist-info/METADATA +555 -0
  5. pytrilogy-0.3.148.dist-info/RECORD +206 -0
  6. pytrilogy-0.3.148.dist-info/WHEEL +5 -0
  7. pytrilogy-0.3.148.dist-info/entry_points.txt +2 -0
  8. pytrilogy-0.3.148.dist-info/licenses/LICENSE.md +19 -0
  9. trilogy/__init__.py +27 -0
  10. trilogy/ai/README.md +10 -0
  11. trilogy/ai/__init__.py +19 -0
  12. trilogy/ai/constants.py +92 -0
  13. trilogy/ai/conversation.py +107 -0
  14. trilogy/ai/enums.py +7 -0
  15. trilogy/ai/execute.py +50 -0
  16. trilogy/ai/models.py +34 -0
  17. trilogy/ai/prompts.py +100 -0
  18. trilogy/ai/providers/__init__.py +0 -0
  19. trilogy/ai/providers/anthropic.py +106 -0
  20. trilogy/ai/providers/base.py +24 -0
  21. trilogy/ai/providers/google.py +146 -0
  22. trilogy/ai/providers/openai.py +89 -0
  23. trilogy/ai/providers/utils.py +68 -0
  24. trilogy/authoring/README.md +3 -0
  25. trilogy/authoring/__init__.py +148 -0
  26. trilogy/constants.py +119 -0
  27. trilogy/core/README.md +52 -0
  28. trilogy/core/__init__.py +0 -0
  29. trilogy/core/constants.py +6 -0
  30. trilogy/core/enums.py +454 -0
  31. trilogy/core/env_processor.py +239 -0
  32. trilogy/core/environment_helpers.py +320 -0
  33. trilogy/core/ergonomics.py +193 -0
  34. trilogy/core/exceptions.py +123 -0
  35. trilogy/core/functions.py +1240 -0
  36. trilogy/core/graph_models.py +142 -0
  37. trilogy/core/internal.py +85 -0
  38. trilogy/core/models/__init__.py +0 -0
  39. trilogy/core/models/author.py +2662 -0
  40. trilogy/core/models/build.py +2603 -0
  41. trilogy/core/models/build_environment.py +165 -0
  42. trilogy/core/models/core.py +506 -0
  43. trilogy/core/models/datasource.py +434 -0
  44. trilogy/core/models/environment.py +756 -0
  45. trilogy/core/models/execute.py +1213 -0
  46. trilogy/core/optimization.py +251 -0
  47. trilogy/core/optimizations/__init__.py +12 -0
  48. trilogy/core/optimizations/base_optimization.py +17 -0
  49. trilogy/core/optimizations/hide_unused_concept.py +47 -0
  50. trilogy/core/optimizations/inline_datasource.py +102 -0
  51. trilogy/core/optimizations/predicate_pushdown.py +245 -0
  52. trilogy/core/processing/README.md +94 -0
  53. trilogy/core/processing/READMEv2.md +121 -0
  54. trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
  55. trilogy/core/processing/__init__.py +0 -0
  56. trilogy/core/processing/concept_strategies_v3.py +508 -0
  57. trilogy/core/processing/constants.py +15 -0
  58. trilogy/core/processing/discovery_node_factory.py +451 -0
  59. trilogy/core/processing/discovery_utility.py +548 -0
  60. trilogy/core/processing/discovery_validation.py +167 -0
  61. trilogy/core/processing/graph_utils.py +43 -0
  62. trilogy/core/processing/node_generators/README.md +9 -0
  63. trilogy/core/processing/node_generators/__init__.py +31 -0
  64. trilogy/core/processing/node_generators/basic_node.py +160 -0
  65. trilogy/core/processing/node_generators/common.py +270 -0
  66. trilogy/core/processing/node_generators/constant_node.py +38 -0
  67. trilogy/core/processing/node_generators/filter_node.py +315 -0
  68. trilogy/core/processing/node_generators/group_node.py +213 -0
  69. trilogy/core/processing/node_generators/group_to_node.py +117 -0
  70. trilogy/core/processing/node_generators/multiselect_node.py +207 -0
  71. trilogy/core/processing/node_generators/node_merge_node.py +695 -0
  72. trilogy/core/processing/node_generators/recursive_node.py +88 -0
  73. trilogy/core/processing/node_generators/rowset_node.py +165 -0
  74. trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
  75. trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
  76. trilogy/core/processing/node_generators/select_merge_node.py +786 -0
  77. trilogy/core/processing/node_generators/select_node.py +95 -0
  78. trilogy/core/processing/node_generators/synonym_node.py +98 -0
  79. trilogy/core/processing/node_generators/union_node.py +91 -0
  80. trilogy/core/processing/node_generators/unnest_node.py +182 -0
  81. trilogy/core/processing/node_generators/window_node.py +201 -0
  82. trilogy/core/processing/nodes/README.md +28 -0
  83. trilogy/core/processing/nodes/__init__.py +179 -0
  84. trilogy/core/processing/nodes/base_node.py +522 -0
  85. trilogy/core/processing/nodes/filter_node.py +75 -0
  86. trilogy/core/processing/nodes/group_node.py +194 -0
  87. trilogy/core/processing/nodes/merge_node.py +420 -0
  88. trilogy/core/processing/nodes/recursive_node.py +46 -0
  89. trilogy/core/processing/nodes/select_node_v2.py +242 -0
  90. trilogy/core/processing/nodes/union_node.py +53 -0
  91. trilogy/core/processing/nodes/unnest_node.py +62 -0
  92. trilogy/core/processing/nodes/window_node.py +56 -0
  93. trilogy/core/processing/utility.py +823 -0
  94. trilogy/core/query_processor.py +604 -0
  95. trilogy/core/statements/README.md +35 -0
  96. trilogy/core/statements/__init__.py +0 -0
  97. trilogy/core/statements/author.py +536 -0
  98. trilogy/core/statements/build.py +0 -0
  99. trilogy/core/statements/common.py +20 -0
  100. trilogy/core/statements/execute.py +155 -0
  101. trilogy/core/table_processor.py +66 -0
  102. trilogy/core/utility.py +8 -0
  103. trilogy/core/validation/README.md +46 -0
  104. trilogy/core/validation/__init__.py +0 -0
  105. trilogy/core/validation/common.py +161 -0
  106. trilogy/core/validation/concept.py +146 -0
  107. trilogy/core/validation/datasource.py +227 -0
  108. trilogy/core/validation/environment.py +73 -0
  109. trilogy/core/validation/fix.py +256 -0
  110. trilogy/dialect/__init__.py +32 -0
  111. trilogy/dialect/base.py +1431 -0
  112. trilogy/dialect/bigquery.py +314 -0
  113. trilogy/dialect/common.py +147 -0
  114. trilogy/dialect/config.py +159 -0
  115. trilogy/dialect/dataframe.py +50 -0
  116. trilogy/dialect/duckdb.py +376 -0
  117. trilogy/dialect/enums.py +149 -0
  118. trilogy/dialect/metadata.py +173 -0
  119. trilogy/dialect/mock.py +190 -0
  120. trilogy/dialect/postgres.py +117 -0
  121. trilogy/dialect/presto.py +110 -0
  122. trilogy/dialect/results.py +89 -0
  123. trilogy/dialect/snowflake.py +129 -0
  124. trilogy/dialect/sql_server.py +137 -0
  125. trilogy/engine.py +48 -0
  126. trilogy/execution/__init__.py +17 -0
  127. trilogy/execution/config.py +119 -0
  128. trilogy/execution/state/__init__.py +0 -0
  129. trilogy/execution/state/file_state_store.py +0 -0
  130. trilogy/execution/state/sqllite_state_store.py +0 -0
  131. trilogy/execution/state/state_store.py +301 -0
  132. trilogy/executor.py +656 -0
  133. trilogy/hooks/__init__.py +4 -0
  134. trilogy/hooks/base_hook.py +40 -0
  135. trilogy/hooks/graph_hook.py +135 -0
  136. trilogy/hooks/query_debugger.py +166 -0
  137. trilogy/metadata/__init__.py +0 -0
  138. trilogy/parser.py +10 -0
  139. trilogy/parsing/README.md +21 -0
  140. trilogy/parsing/__init__.py +0 -0
  141. trilogy/parsing/common.py +1069 -0
  142. trilogy/parsing/config.py +5 -0
  143. trilogy/parsing/exceptions.py +8 -0
  144. trilogy/parsing/helpers.py +1 -0
  145. trilogy/parsing/parse_engine.py +2863 -0
  146. trilogy/parsing/render.py +773 -0
  147. trilogy/parsing/trilogy.lark +544 -0
  148. trilogy/py.typed +0 -0
  149. trilogy/render.py +45 -0
  150. trilogy/scripts/README.md +9 -0
  151. trilogy/scripts/__init__.py +0 -0
  152. trilogy/scripts/agent.py +41 -0
  153. trilogy/scripts/agent_info.py +306 -0
  154. trilogy/scripts/common.py +430 -0
  155. trilogy/scripts/dependency/Cargo.lock +617 -0
  156. trilogy/scripts/dependency/Cargo.toml +39 -0
  157. trilogy/scripts/dependency/README.md +131 -0
  158. trilogy/scripts/dependency/build.sh +25 -0
  159. trilogy/scripts/dependency/src/directory_resolver.rs +387 -0
  160. trilogy/scripts/dependency/src/lib.rs +16 -0
  161. trilogy/scripts/dependency/src/main.rs +770 -0
  162. trilogy/scripts/dependency/src/parser.rs +435 -0
  163. trilogy/scripts/dependency/src/preql.pest +208 -0
  164. trilogy/scripts/dependency/src/python_bindings.rs +311 -0
  165. trilogy/scripts/dependency/src/resolver.rs +716 -0
  166. trilogy/scripts/dependency/tests/base.preql +3 -0
  167. trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
  168. trilogy/scripts/dependency/tests/customer.preql +6 -0
  169. trilogy/scripts/dependency/tests/main.preql +9 -0
  170. trilogy/scripts/dependency/tests/orders.preql +7 -0
  171. trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
  172. trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
  173. trilogy/scripts/dependency.py +323 -0
  174. trilogy/scripts/display.py +555 -0
  175. trilogy/scripts/environment.py +59 -0
  176. trilogy/scripts/fmt.py +32 -0
  177. trilogy/scripts/ingest.py +472 -0
  178. trilogy/scripts/ingest_helpers/__init__.py +1 -0
  179. trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
  180. trilogy/scripts/ingest_helpers/formatting.py +93 -0
  181. trilogy/scripts/ingest_helpers/typing.py +161 -0
  182. trilogy/scripts/init.py +105 -0
  183. trilogy/scripts/parallel_execution.py +748 -0
  184. trilogy/scripts/plan.py +189 -0
  185. trilogy/scripts/refresh.py +106 -0
  186. trilogy/scripts/run.py +79 -0
  187. trilogy/scripts/serve.py +202 -0
  188. trilogy/scripts/serve_helpers/__init__.py +41 -0
  189. trilogy/scripts/serve_helpers/file_discovery.py +142 -0
  190. trilogy/scripts/serve_helpers/index_generation.py +206 -0
  191. trilogy/scripts/serve_helpers/models.py +38 -0
  192. trilogy/scripts/single_execution.py +131 -0
  193. trilogy/scripts/testing.py +129 -0
  194. trilogy/scripts/trilogy.py +75 -0
  195. trilogy/std/__init__.py +0 -0
  196. trilogy/std/color.preql +3 -0
  197. trilogy/std/date.preql +13 -0
  198. trilogy/std/display.preql +18 -0
  199. trilogy/std/geography.preql +22 -0
  200. trilogy/std/metric.preql +15 -0
  201. trilogy/std/money.preql +67 -0
  202. trilogy/std/net.preql +14 -0
  203. trilogy/std/ranking.preql +7 -0
  204. trilogy/std/report.preql +5 -0
  205. trilogy/std/semantic.preql +6 -0
  206. trilogy/utility.py +34 -0
trilogy/dialect/sql_server.py ADDED
@@ -0,0 +1,137 @@
+ from typing import Any, Callable, Mapping
+
+ from jinja2 import Template
+
+ from trilogy.core.enums import FunctionType, WindowType
+ from trilogy.core.statements.execute import (
+     PROCESSED_STATEMENT_TYPES,
+     ProcessedQuery,
+     ProcessedQueryPersist,
+ )
+ from trilogy.dialect.base import BaseDialect
+ from trilogy.utility import string_to_hash
+
+ WINDOW_FUNCTION_MAP: Mapping[WindowType, Callable[[Any, Any, Any], str]] = {}
+
+ FUNCTION_MAP = {
+     FunctionType.COUNT: lambda args, types: f"count({args[0]})",
+     FunctionType.SUM: lambda args, types: f"sum({args[0]})",
+     FunctionType.AVG: lambda args, types: f"avg({args[0]})",
+     FunctionType.LENGTH: lambda args, types: f"length({args[0]})",
+     FunctionType.LIKE: lambda args, types: (
+         f" CASE WHEN {args[0]} like {args[1]} THEN True ELSE False END"
+     ),
+     FunctionType.CONCAT: lambda args, types: (
+         f"CONCAT({','.join([f''' '{a}' ''' for a in args])})"
+     ),
+ }
+
+ # if an aggregate function is called on a source that is at the same grain as the aggregate,
+ # we may return a static value
+ FUNCTION_GRAIN_MATCH_MAP = {
+     **FUNCTION_MAP,
+     FunctionType.COUNT_DISTINCT: lambda args, types: f"CASE WHEN {args[0]} IS NOT NULL THEN 1 ELSE 0 END",
+     FunctionType.COUNT: lambda args, types: f"CASE WHEN {args[0]} IS NOT NULL THEN 1 ELSE 0 END",
+     FunctionType.SUM: lambda args, types: f"{args[0]}",
+     FunctionType.AVG: lambda args, types: f"{args[0]}",
+ }
+
+ TSQL_TEMPLATE = Template(
+     """{%- if ctes %}
+ WITH {% for cte in ctes %}
+ {{cte.name}} as ({{cte.statement}}){% if not loop.last %},{% endif %}{% endfor %}{% endif %}
+ {%- if full_select -%}{{full_select}}
+ {%- else -%}{%- if comment %}
+ -- {{ comment }}{%- endif -%}
+ SELECT
+ {%- if limit is not none %}
+ TOP {{ limit }}{% endif %}
+ {%- for select in select_columns %}
+ {{ select }}{% if not loop.last %},{% endif %}{% endfor %}
+ {% if base %}FROM
+ {{ base }}{% endif %}{% if joins %}
+ {% for join in joins %}
+ {{ join }}
+ {% endfor %}{% endif %}
+ {% if where %}WHERE
+ {{ where }}
+ {% endif %}
+ {%- if group_by %}
+ GROUP BY {% for group in group_by %}
+ {{group}}{% if not loop.last %},{% endif %}
+ {% endfor %}{% endif %}{% if having %}
+ HAVING
+ \t{{ having }}{% endif %}
+ {%- if order_by %}
+ ORDER BY {% for order in order_by %}
+ {{ order }}{% if not loop.last %},{% endif %}
+ {% endfor %}{% endif %}{% endif %}
+ """
+ )
+
+ MAX_IDENTIFIER_LENGTH = 128
+
+
+ class SqlServerDialect(BaseDialect):
+     WINDOW_FUNCTION_MAP = {**BaseDialect.WINDOW_FUNCTION_MAP, **WINDOW_FUNCTION_MAP}
+     FUNCTION_MAP = {**BaseDialect.FUNCTION_MAP, **FUNCTION_MAP}
+     FUNCTION_GRAIN_MATCH_MAP = {
+         **BaseDialect.FUNCTION_GRAIN_MATCH_MAP,
+         **FUNCTION_GRAIN_MATCH_MAP,
+     }
+     QUOTE_CHARACTER = '"'
+     SQL_TEMPLATE = TSQL_TEMPLATE
+
+     def get_table_schema(
+         self, executor, table_name: str, schema: str | None = None
+     ) -> list[tuple]:
+         """Defaults to 'dbo' schema if none specified."""
+         if not schema:
+             schema = "dbo"
+
+         column_query = f"""
+         SELECT
+             column_name,
+             data_type,
+             is_nullable,
+             '' as column_comment
+         FROM information_schema.columns
+         WHERE table_name = '{table_name}'
+             AND table_schema = '{schema}'
+         ORDER BY ordinal_position
+         """
+
+         rows = executor.execute_raw_sql(column_query).fetchall()
+         return rows
+
+     def get_table_primary_keys(
+         self, executor, table_name: str, schema: str | None = None
+     ) -> list[str]:
+         """Uses sys catalog views for more reliable constraint information."""
+         if not schema:
+             schema = "dbo"
+
+         pk_query = f"""
+         SELECT c.name
+         FROM sys.indexes i
+         INNER JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
+         INNER JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+         INNER JOIN sys.tables t ON i.object_id = t.object_id
+         INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
+         WHERE i.is_primary_key = 1
+             AND t.name = '{table_name}'
+             AND s.name = '{schema}'
+         ORDER BY ic.key_ordinal
+         """
+
+         rows = executor.execute_raw_sql(pk_query).fetchall()
+         return [row[0] for row in rows]
+
+     def compile_statement(self, query: PROCESSED_STATEMENT_TYPES) -> str:
+         base = super().compile_statement(query)
+         if isinstance(query, (ProcessedQuery, ProcessedQueryPersist)):
+             for cte in query.ctes:
+                 if len(cte.name) > MAX_IDENTIFIER_LENGTH:
+                     new_name = f"rhash_{string_to_hash(cte.name)}"
+                     base = base.replace(cte.name, new_name)
+         return base
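Editor's note: FUNCTION_GRAIN_MATCH_MAP above encodes a small optimization. When the source rows are already at the grain of the aggregate, each group holds exactly one row, so sum(x) is just x and count(x) collapses to a null check. A minimal, self-contained sketch of that rewrite, using hypothetical pre-rendered argument strings rather than the dialect's real rendering pipeline:

# Sketch of the grain-match rewrite: "args" are already-rendered SQL fragments,
# mirroring the (args, types) lambda signature used in the map above.
from enum import Enum


class FunctionType(Enum):  # stand-in for trilogy.core.enums.FunctionType
    COUNT = "count"
    SUM = "sum"


GRAIN_MATCH = {
    FunctionType.COUNT: lambda args, types: f"CASE WHEN {args[0]} IS NOT NULL THEN 1 ELSE 0 END",
    FunctionType.SUM: lambda args, types: f"{args[0]}",
}

# At matching grain each group is a single row, so SUM passes the value through:
print(GRAIN_MATCH[FunctionType.SUM](['"orders"."amount"'], []))
# -> "orders"."amount"
print(GRAIN_MATCH[FunctionType.COUNT](['"orders"."id"'], []))
# -> CASE WHEN "orders"."id" IS NOT NULL THEN 1 ELSE 0 END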
trilogy/engine.py ADDED
@@ -0,0 +1,48 @@
+ from typing import Any, Generator, List, Optional, Protocol
+
+ from trilogy.core.models.environment import Environment
+
+
+ class ResultProtocol(Protocol):
+
+     def fetchall(self) -> List[Any]: ...
+
+     def keys(self) -> List[str]: ...
+
+     def fetchone(self) -> Optional[Any]: ...
+
+     def fetchmany(self, size: int) -> List[Any]: ...
+
+     def __iter__(self) -> Generator[Any, None, None]: ...
+
+
+ class EngineConnection(Protocol):
+
+     def execute(self, statement: str, parameters: Any | None = None) -> ResultProtocol:
+         pass
+
+     def commit(self):
+         raise NotImplementedError()
+
+     def begin(self):
+         raise NotImplementedError()
+
+     def rollback(self):
+         raise NotImplementedError()
+
+     def close(self) -> None:
+         return
+
+
+ class ExecutionEngine(Protocol):
+
+     def connect(self) -> EngineConnection:
+         pass
+
+     def setup(self, env: Environment, connection):
+         pass
+
+     def dispose(self, close: bool = True):
+         pass
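Editor's note: ResultProtocol, EngineConnection, and ExecutionEngine are structural typing.Protocol classes, so any object with matching methods conforms without inheriting from them. A rough sketch of a conforming engine, using sqlite3 purely as an illustration (it is not an engine trilogy ships, and sqlite3 cursors cover most but not all of ResultProtocol):

# Illustrative structural conformance to the Protocols above.
import sqlite3
from typing import Any


class SqliteConnection:
    def __init__(self, conn: sqlite3.Connection):
        self._conn = conn

    def execute(self, statement: str, parameters: Any | None = None):
        # sqlite3 cursors provide fetchall/fetchone/fetchmany and iteration.
        return self._conn.execute(statement, parameters or [])

    def commit(self):
        self._conn.commit()

    def begin(self):
        self._conn.execute("BEGIN")

    def rollback(self):
        self._conn.rollback()

    def close(self) -> None:
        self._conn.close()


class SqliteEngine:
    def connect(self) -> SqliteConnection:
        return SqliteConnection(sqlite3.connect(":memory:"))

    def setup(self, env, connection):
        pass  # no per-environment setup needed for this sketch

    def dispose(self, close: bool = True):
        pass


print(SqliteEngine().connect().execute("SELECT 1 AS x").fetchall())  # [(1,)]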
trilogy/execution/__init__.py ADDED
@@ -0,0 +1,17 @@
+ from trilogy.dialect.config import (
+     BigQueryConfig,
+     DialectConfig,
+     DuckDBConfig,
+     PostgresConfig,
+     PrestoConfig,
+     SnowflakeConfig,
+ )
+
+ __all__ = [
+     "DialectConfig",
+     "DuckDBConfig",
+     "PrestoConfig",
+     "SnowflakeConfig",
+     "PostgresConfig",
+     "BigQueryConfig",
+ ]
trilogy/execution/config.py ADDED
@@ -0,0 +1,119 @@
+ import os
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ from tomllib import loads
+
+ from trilogy.dialect import (
+     BigQueryConfig,
+     DialectConfig,
+     DuckDBConfig,
+     PostgresConfig,
+     PrestoConfig,
+     SnowflakeConfig,
+     SQLServerConfig,
+ )
+ from trilogy.dialect.enums import Dialects
+
+ DEFAULT_PARALLELISM = 4
+
+
+ def load_env_file(env_file_path: Path) -> dict[str, str]:
+     """Load environment variables from a .env file."""
+     env_vars: dict[str, str] = {}
+     if not env_file_path.exists():
+         raise FileNotFoundError(f"Environment file not found: {env_file_path}")
+
+     with open(env_file_path, "r") as f:
+         for line in f:
+             line = line.strip()
+             if not line or line.startswith("#"):
+                 continue
+             if "=" not in line:
+                 continue
+             key, _, value = line.partition("=")
+             key = key.strip()
+             value = value.strip()
+             # Remove surrounding quotes if present
+             if (value.startswith('"') and value.endswith('"')) or (
+                 value.startswith("'") and value.endswith("'")
+             ):
+                 value = value[1:-1]
+             env_vars[key] = value
+     return env_vars
+
+
+ def apply_env_vars(env_vars: dict[str, str]) -> None:
+     """Apply environment variables to os.environ."""
+     for key, value in env_vars.items():
+         os.environ[key] = value
+
+
+ @dataclass
+ class RuntimeConfig:
+
+     startup_trilogy: list[Path]
+     startup_sql: list[Path]
+     parallelism: int = DEFAULT_PARALLELISM
+     engine_dialect: Dialects | None = None
+     engine_config: DialectConfig | None = None
+     source_path: Path | None = None
+     env_files: list[Path] = field(default_factory=list)
+
+
+ def load_config_file(path: Path) -> RuntimeConfig:
+     with open(path, "r") as f:
+         toml_content = f.read()
+     config_data = loads(toml_content)
+
+     engine_raw: dict = config_data.get("engine", {})
+     engine_config_raw = engine_raw.get("config", {})
+     engine = Dialects(engine_raw.get("dialect")) if engine_raw.get("dialect") else None
+     engine_config: DialectConfig | None
+     if engine:
+         if engine == Dialects.DUCK_DB:
+             engine_config = (
+                 DuckDBConfig(**engine_config_raw) if engine_config_raw else None
+             )
+         elif engine == Dialects.POSTGRES:
+             engine_config = (
+                 PostgresConfig(**engine_config_raw) if engine_config_raw else None
+             )
+         elif engine == Dialects.PRESTO:
+             engine_config = (
+                 PrestoConfig(**engine_config_raw) if engine_config_raw else None
+             )
+         elif engine == Dialects.SNOWFLAKE:
+             engine_config = (
+                 SnowflakeConfig(**engine_config_raw) if engine_config_raw else None
+             )
+         elif engine == Dialects.SQL_SERVER:
+             engine_config = (
+                 SQLServerConfig(**engine_config_raw) if engine_config_raw else None
+             )
+         elif engine == Dialects.BIGQUERY:
+             engine_config = (
+                 BigQueryConfig(**engine_config_raw) if engine_config_raw else None
+             )
+         else:
+             engine_config = None
+     else:
+         engine_config = None
+     setup: dict = config_data.get("setup", {})
+
+     # Parse env_file - can be a single string or list of strings
+     env_raw = engine_raw.get("env_file", [])
+     if isinstance(env_raw, str):
+         env_files = [path.parent / env_raw]
+     else:
+         env_files = [path.parent / p for p in env_raw]
+
+     return RuntimeConfig(
+         startup_trilogy=[path.parent / p for p in setup.get("trilogy", [])],
+         startup_sql=[path.parent / p for p in setup.get("sql", [])],
+         parallelism=config_data.get("parallelism", DEFAULT_PARALLELISM),
+         engine_dialect=engine,
+         engine_config=engine_config,
+         source_path=path,
+         env_files=env_files,
+     )
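Editor's note: load_config_file reads a TOML file with an [engine] table (dialect string, optional config sub-table, optional env_file), a [setup] table of startup scripts, and a top-level parallelism key, resolving all paths relative to the config file. A sketch of a file shaped the way the parser above reads it; the "duck_db" dialect string and the "path" field of DuckDBConfig are assumptions for illustration, not documented values:

# Hypothetical trilogy config TOML, keyed off the parser above.
from tomllib import loads

EXAMPLE_TOML = """
parallelism = 8

[engine]
dialect = "duck_db"    # must match a trilogy.dialect.enums.Dialects value
env_file = ".env"      # string or list; resolved relative to this file

[engine.config]
path = "local.duckdb"  # assumed DuckDBConfig field

[setup]
trilogy = ["models/base.preql"]
sql = ["bootstrap.sql"]
"""

parsed = loads(EXAMPLE_TOML)
assert parsed["parallelism"] == 8
assert parsed["engine"]["dialect"] == "duck_db"
# load_config_file(path) would additionally build the DialectConfig and
# resolve env_file and [setup] paths against the config file's parent dir.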
trilogy/execution/state/__init__.py: File without changes
trilogy/execution/state/file_state_store.py: File without changes
trilogy/execution/state/sqllite_state_store.py: File without changes
trilogy/execution/state/state_store.py ADDED
@@ -0,0 +1,301 @@
+ from dataclasses import dataclass, field
+ from datetime import date
+
+ from sqlalchemy.exc import ProgrammingError
+
+ from trilogy import Executor
+ from trilogy.core.enums import Purpose
+ from trilogy.core.models.build import Factory
+ from trilogy.core.models.datasource import (
+     Address,
+     ColumnAssignment,
+     Datasource,
+     RawColumnExpr,
+     UpdateKey,
+     UpdateKeys,
+     UpdateKeyType,
+ )
+ from trilogy.core.models.environment import Environment
+ from trilogy.core.models.execute import CTE
+
+
+ def _is_table_not_found_error(exc: ProgrammingError, dialect) -> bool:
+     """Check if exception is a table-not-found error for the given dialect."""
+     pattern = dialect.TABLE_NOT_FOUND_PATTERN
+     if pattern is None:
+         return False
+     error_msg = str(exc.orig) if exc.orig else str(exc)
+     return pattern in error_msg
+
+
+ @dataclass
+ class DatasourceWatermark:
+     keys: dict[str, UpdateKey]
+
+
+ @dataclass
+ class StaleAsset:
+     """Represents an asset that needs to be refreshed."""
+
+     datasource_id: str
+     reason: str
+     filters: UpdateKeys = field(default_factory=UpdateKeys)
+
+
+ def _compare_watermark_values(
+     a: str | int | float | date, b: str | int | float | date
+ ) -> int:
+     """Compare two watermark values, returning -1, 0, or 1.
+
+     Handles type mismatches by comparing string representations.
+     """
+     if type(a) is type(b):
+         if a < b:  # type: ignore[operator]
+             return -1
+         elif a > b:  # type: ignore[operator]
+             return 1
+         return 0
+     # Different types: compare as strings
+     sa, sb = str(a), str(b)
+     if sa < sb:
+         return -1
+     elif sa > sb:
+         return 1
+     return 0
+
+
+ def get_last_update_time_watermarks(
+     datasource: Datasource, executor: Executor
+ ) -> DatasourceWatermark:
+     update_time = executor.generator.get_table_last_modified(
+         executor, datasource.safe_address
+     )
+     return DatasourceWatermark(
+         keys={
+             "update_time": UpdateKey(
+                 concept_name="update_time",
+                 type=UpdateKeyType.UPDATE_TIME,
+                 value=update_time,
+             )
+         }
+     )
+
+
+ def get_unique_key_hash_watermarks(
+     datasource: Datasource, executor: Executor
+ ) -> DatasourceWatermark:
+     key_columns: list[ColumnAssignment] = []
+     for col_assignment in datasource.columns:
+         concrete = executor.environment.concepts[col_assignment.concept.address]
+         if concrete.purpose == Purpose.KEY:
+             key_columns.append(col_assignment)
+
+     if not key_columns:
+         return DatasourceWatermark(keys={})
+
+     if isinstance(datasource.address, Address):
+         table_ref = executor.generator.render_source(datasource.address)
+     else:
+         table_ref = datasource.safe_address
+
+     dialect = executor.generator
+     watermarks = {}
+     for col in key_columns:
+         if isinstance(col.alias, str):
+             column_name = col.alias
+         elif isinstance(col.alias, RawColumnExpr):
+             column_name = col.alias.text
+         else:
+             # Function - use rendered expression
+             column_name = str(col.alias)
+         hash_expr = dialect.hash_column_value(column_name)
+         checksum_expr = dialect.aggregate_checksum(hash_expr)
+         query = f"SELECT {checksum_expr} as checksum FROM {table_ref}"
+
+         try:
+             result = executor.execute_raw_sql(query).fetchone()
+             checksum_value = result[0] if result else None
+         except ProgrammingError as e:
+             if _is_table_not_found_error(e, dialect):
+                 checksum_value = None
+             else:
+                 raise
+
+         watermarks[col.concept.address] = UpdateKey(
+             concept_name=col.concept.address,
+             type=UpdateKeyType.KEY_HASH,
+             value=checksum_value,
+         )
+
+     return DatasourceWatermark(keys=watermarks)
+
+
+ def get_incremental_key_watermarks(
+     datasource: Datasource, executor: Executor
+ ) -> DatasourceWatermark:
+     if not datasource.incremental_by:
+         return DatasourceWatermark(keys={})
+
+     if isinstance(datasource.address, Address):
+         table_ref = executor.generator.render_source(datasource.address)
+     else:
+         table_ref = datasource.safe_address
+
+     watermarks = {}
+     factory = Factory(environment=executor.environment)
+
+     dialect = executor.generator
+     for concept_ref in datasource.incremental_by:
+         concept = executor.environment.concepts[concept_ref.address]
+         build_concept = factory.build(concept)
+         build_datasource = factory.build(datasource)
+         cte: CTE = CTE.from_datasource(build_datasource)
+         # Check if concept is in output_concepts by comparing addresses
+         output_addresses = {c.address for c in datasource.output_concepts}
+         if concept.address in output_addresses:
+             query = f"SELECT MAX({dialect.render_concept_sql(build_concept, cte=cte, alias=False)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"
+         else:
+             query = f"SELECT MAX({dialect.render_expr(build_concept.lineage, cte=cte)}) as max_value FROM {table_ref} as {dialect.quote(cte.base_alias)}"
+
+         try:
+             result = executor.execute_raw_sql(query).fetchone()
+             max_value = result[0] if result else None
+         except ProgrammingError as e:
+             if _is_table_not_found_error(e, dialect):
+                 max_value = None
+             else:
+                 raise
+
+         watermarks[concept.name] = UpdateKey(
+             concept_name=concept.name,
+             type=UpdateKeyType.INCREMENTAL_KEY,
+             value=max_value,
+         )
+
+     return DatasourceWatermark(keys=watermarks)
+
+
+ class BaseStateStore:
+
+     def __init__(self) -> None:
+         self.watermarks: dict[str, DatasourceWatermark] = {}
+
+     def watermark_asset(
+         self, datasource: Datasource, executor: Executor
+     ) -> DatasourceWatermark:
+         if datasource.incremental_by:
+             watermarks = get_incremental_key_watermarks(datasource, executor)
+         else:
+             key_columns = [
+                 col
+                 for col in datasource.columns
+                 if executor.environment.concepts[col.concept.address].purpose
+                 == Purpose.KEY
+             ]
+             if key_columns:
+                 watermarks = get_unique_key_hash_watermarks(datasource, executor)
+             else:
+                 watermarks = get_last_update_time_watermarks(datasource, executor)
+
+         self.watermarks[datasource.identifier] = watermarks
+         return watermarks
+
+     def get_datasource_watermarks(
+         self, datasource: Datasource
+     ) -> DatasourceWatermark | None:
+         return self.watermarks.get(datasource.identifier)
+
+     def check_datasource_state(self, datasource: Datasource) -> bool:
+         return datasource.identifier in self.watermarks
+
+     def watermark_all_assets(
+         self, env: Environment, executor: Executor
+     ) -> dict[str, DatasourceWatermark]:
+         """Watermark all datasources in the environment."""
+         for ds in env.datasources.values():
+             self.watermark_asset(ds, executor)
+         return self.watermarks
+
+     def get_stale_assets(
+         self,
+         env: Environment,
+         executor: Executor,
+         root_assets: set[str] | None = None,
+     ) -> list[StaleAsset]:
+         """Find all assets that are stale and need refresh.
+
+         Args:
+             env: The environment containing datasources
+             executor: Executor for querying current state
+             root_assets: Optional set of datasource identifiers that are "source of truth"
+                 and should not be marked stale. If None, uses datasources marked
+                 with is_root=True in the model.
+
+         Returns:
+             List of StaleAsset objects describing what needs refresh and why.
+         """
+         if root_assets is None:
+             root_assets = {
+                 ds.identifier for ds in env.datasources.values() if ds.is_root
+             }
+         stale: list[StaleAsset] = []
+
+         # First pass: watermark all assets to get current state
+         self.watermark_all_assets(env, executor)
+
+         # Build map of concept -> max watermark across all assets
+         concept_max_watermarks: dict[str, UpdateKey] = {}
+         for ds_id, watermark in self.watermarks.items():
+             if ds_id in root_assets:
+                 # Root assets define the "truth" for incremental keys
+                 for key, val in watermark.keys.items():
+                     if (
+                         val.type == UpdateKeyType.INCREMENTAL_KEY
+                         and val.value is not None
+                     ):
+                         existing = concept_max_watermarks.get(key)
+                         if existing is None or (
+                             existing.value is not None
+                             and _compare_watermark_values(val.value, existing.value) > 0
+                         ):
+                             concept_max_watermarks[key] = val
+
+         # Second pass: check non-root assets against max watermarks
+         for ds_id, watermark in self.watermarks.items():
+             if ds_id in root_assets:
+                 continue
+
+             for key, val in watermark.keys.items():
+                 if val.type == UpdateKeyType.INCREMENTAL_KEY:
+                     max_val = concept_max_watermarks.get(key)
+                     if max_val and max_val.value is not None:
+                         if (
+                             val.value is None
+                             or _compare_watermark_values(val.value, max_val.value) < 0
+                         ):
+                             # Create UpdateKeys with the filter for incremental update
+                             filters = (
+                                 UpdateKeys(keys={key: val})
+                                 if val.value
+                                 else UpdateKeys()
+                             )
+                             stale.append(
+                                 StaleAsset(
+                                     datasource_id=ds_id,
+                                     reason=f"incremental key '{key}' behind: {val.value} < {max_val.value}",
+                                     filters=filters,
+                                 )
+                             )
+                             break
+
+                 elif val.type == UpdateKeyType.UPDATE_TIME:
+                     # For update_time, we'd need root asset update times to compare
+                     # This is tricky without explicit dependency tracking
+                     pass
+
+                 elif val.type == UpdateKeyType.KEY_HASH:
+                     # Hash changes indicate data changed, but we need a reference
+                     # to compare against - requires dependency graph
+                     pass
+
+         return stale
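Editor's note: taken together, BaseStateStore implements a three-tier watermark strategy (incremental-key maximum, key-column checksum, table last-modified time) and flags non-root assets whose incremental keys lag behind the root assets. A sketch of the intended call pattern, assuming a configured Executor whose environment declares incremental_by datasources; this follows the module as shown above, not a tested script:

# Sketch of the staleness-check flow defined above.
from trilogy import Executor
from trilogy.execution.state.state_store import BaseStateStore


def plan_refresh(executor: Executor) -> list[str]:
    store = BaseStateStore()
    stale = store.get_stale_assets(executor.environment, executor)
    plans = []
    for asset in stale:
        # asset.filters carries the lagging incremental key, so a refresh can
        # target rows past the stale watermark instead of a full rebuild.
        plans.append(f"{asset.datasource_id}: {asset.reason}")
    return plans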