pytrilogy 0.0.1.102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pytrilogy might be problematic. Click here for more details.

Files changed (77)
  1. pytrilogy-0.0.1.102.dist-info/LICENSE.md +19 -0
  2. pytrilogy-0.0.1.102.dist-info/METADATA +277 -0
  3. pytrilogy-0.0.1.102.dist-info/RECORD +77 -0
  4. pytrilogy-0.0.1.102.dist-info/WHEEL +5 -0
  5. pytrilogy-0.0.1.102.dist-info/entry_points.txt +2 -0
  6. pytrilogy-0.0.1.102.dist-info/top_level.txt +1 -0
  7. trilogy/__init__.py +8 -0
  8. trilogy/compiler.py +0 -0
  9. trilogy/constants.py +30 -0
  10. trilogy/core/__init__.py +0 -0
  11. trilogy/core/constants.py +3 -0
  12. trilogy/core/enums.py +270 -0
  13. trilogy/core/env_processor.py +33 -0
  14. trilogy/core/environment_helpers.py +156 -0
  15. trilogy/core/ergonomics.py +187 -0
  16. trilogy/core/exceptions.py +23 -0
  17. trilogy/core/functions.py +320 -0
  18. trilogy/core/graph_models.py +55 -0
  19. trilogy/core/internal.py +37 -0
  20. trilogy/core/models.py +3145 -0
  21. trilogy/core/processing/__init__.py +0 -0
  22. trilogy/core/processing/concept_strategies_v3.py +603 -0
  23. trilogy/core/processing/graph_utils.py +44 -0
  24. trilogy/core/processing/node_generators/__init__.py +25 -0
  25. trilogy/core/processing/node_generators/basic_node.py +71 -0
  26. trilogy/core/processing/node_generators/common.py +239 -0
  27. trilogy/core/processing/node_generators/concept_merge.py +152 -0
  28. trilogy/core/processing/node_generators/filter_node.py +83 -0
  29. trilogy/core/processing/node_generators/group_node.py +92 -0
  30. trilogy/core/processing/node_generators/group_to_node.py +99 -0
  31. trilogy/core/processing/node_generators/merge_node.py +148 -0
  32. trilogy/core/processing/node_generators/multiselect_node.py +189 -0
  33. trilogy/core/processing/node_generators/rowset_node.py +130 -0
  34. trilogy/core/processing/node_generators/select_node.py +328 -0
  35. trilogy/core/processing/node_generators/unnest_node.py +37 -0
  36. trilogy/core/processing/node_generators/window_node.py +85 -0
  37. trilogy/core/processing/nodes/__init__.py +76 -0
  38. trilogy/core/processing/nodes/base_node.py +251 -0
  39. trilogy/core/processing/nodes/filter_node.py +49 -0
  40. trilogy/core/processing/nodes/group_node.py +110 -0
  41. trilogy/core/processing/nodes/merge_node.py +326 -0
  42. trilogy/core/processing/nodes/select_node_v2.py +198 -0
  43. trilogy/core/processing/nodes/unnest_node.py +54 -0
  44. trilogy/core/processing/nodes/window_node.py +34 -0
  45. trilogy/core/processing/utility.py +278 -0
  46. trilogy/core/query_processor.py +331 -0
  47. trilogy/dialect/__init__.py +0 -0
  48. trilogy/dialect/base.py +679 -0
  49. trilogy/dialect/bigquery.py +80 -0
  50. trilogy/dialect/common.py +43 -0
  51. trilogy/dialect/config.py +55 -0
  52. trilogy/dialect/duckdb.py +83 -0
  53. trilogy/dialect/enums.py +95 -0
  54. trilogy/dialect/postgres.py +86 -0
  55. trilogy/dialect/presto.py +82 -0
  56. trilogy/dialect/snowflake.py +82 -0
  57. trilogy/dialect/sql_server.py +89 -0
  58. trilogy/docs/__init__.py +0 -0
  59. trilogy/engine.py +48 -0
  60. trilogy/executor.py +242 -0
  61. trilogy/hooks/__init__.py +0 -0
  62. trilogy/hooks/base_hook.py +37 -0
  63. trilogy/hooks/graph_hook.py +24 -0
  64. trilogy/hooks/query_debugger.py +133 -0
  65. trilogy/metadata/__init__.py +0 -0
  66. trilogy/parser.py +10 -0
  67. trilogy/parsing/__init__.py +0 -0
  68. trilogy/parsing/common.py +176 -0
  69. trilogy/parsing/config.py +5 -0
  70. trilogy/parsing/exceptions.py +2 -0
  71. trilogy/parsing/helpers.py +1 -0
  72. trilogy/parsing/parse_engine.py +1951 -0
  73. trilogy/parsing/render.py +483 -0
  74. trilogy/py.typed +0 -0
  75. trilogy/scripts/__init__.py +0 -0
  76. trilogy/scripts/trilogy.py +127 -0
  77. trilogy/utility.py +31 -0
trilogy/core/models.py ADDED
@@ -0,0 +1,3145 @@
1
+ from __future__ import annotations
2
+ import difflib
3
+ import os
4
+ from enum import Enum
5
+ from typing import (
6
+ Dict,
7
+ TypeVar,
8
+ List,
9
+ Optional,
10
+ Union,
11
+ Set,
12
+ Any,
13
+ Sequence,
14
+ ValuesView,
15
+ Callable,
16
+ Annotated,
17
+ get_args,
18
+ Generic,
19
+ Tuple,
20
+ Type,
21
+ ItemsView,
22
+ )
23
+ from pydantic_core import core_schema
24
+ from pydantic.functional_validators import PlainValidator
25
+ from pydantic import (
26
+ BaseModel,
27
+ Field,
28
+ ConfigDict,
29
+ field_validator,
30
+ ValidationInfo,
31
+ ValidatorFunctionWrapHandler,
32
+ computed_field,
33
+ )
34
+ from lark.tree import Meta
35
+ from pathlib import Path
36
+ from trilogy.constants import logger, DEFAULT_NAMESPACE, ENV_CACHE_NAME, MagicConstants
37
+ from trilogy.core.constants import ALL_ROWS_CONCEPT, INTERNAL_NAMESPACE
38
+ from trilogy.core.enums import (
39
+ InfiniteFunctionArgs,
40
+ Purpose,
41
+ JoinType,
42
+ Ordering,
43
+ Modifier,
44
+ FunctionType,
45
+ FunctionClass,
46
+ BooleanOperator,
47
+ ComparisonOperator,
48
+ WindowOrder,
49
+ PurposeLineage,
50
+ SourceType,
51
+ WindowType,
52
+ ConceptSource,
53
+ DatePart,
54
+ ShowCategory,
55
+ Granularity,
56
+ )
57
+ from trilogy.core.exceptions import UndefinedConceptException, InvalidSyntaxException
58
+ from trilogy.utility import unique
59
+ from collections import UserList
60
+ from trilogy.utility import string_to_hash
61
+ from functools import cached_property
62
+ from abc import ABC
63
+
64
+
65
+ LOGGER_PREFIX = "[MODELS]"
66
+
67
+ KT = TypeVar("KT")
68
+ VT = TypeVar("VT")
69
+ LT = TypeVar("LT")
70
+
71
+
72
+ def get_version():
73
+ from trilogy import __version__
74
+
75
+ return __version__
76
+
77
+
78
+ def get_concept_arguments(expr) -> List["Concept"]:
79
+ output = []
80
+ if isinstance(expr, Concept):
81
+ output += [expr]
82
+
83
+ elif isinstance(
84
+ expr,
85
+ (
86
+ Comparison,
87
+ Conditional,
88
+ Function,
89
+ Parenthetical,
90
+ AggregateWrapper,
91
+ CaseWhen,
92
+ CaseElse,
93
+ ),
94
+ ):
95
+ output += expr.concept_arguments
96
+ return output
97
+
98
+
99
+ ALL_TYPES = Union["DataType", "MapType", "ListType", "StructType", "Concept"]
100
+
101
+ NAMESPACED_TYPES = Union[
102
+ "WindowItem",
103
+ "FilterItem",
104
+ "Conditional",
105
+ "Comparison",
106
+ "Concept",
107
+ "CaseWhen",
108
+ "CaseElse",
109
+ "Function",
110
+ "AggregateWrapper",
111
+ "Parenthetical",
112
+ ]
113
+
114
+
115
+ class Namespaced(ABC):
116
+ pass
117
+
118
+ def with_namespace(self, namespace: str):
119
+ raise NotImplementedError
120
+
121
+
122
+ class SelectGrain(ABC):
123
+ pass
124
+
125
+ def with_select_grain(self, grain: Grain):
126
+ raise NotImplementedError
127
+
128
+
129
+ class DataType(Enum):
130
+ # PRIMITIVES
131
+ STRING = "string"
132
+ BOOL = "bool"
133
+ MAP = "map"
134
+ LIST = "list"
135
+ NUMBER = "number"
136
+ FLOAT = "float"
137
+ NUMERIC = "numeric"
138
+ INTEGER = "int"
139
+ BIGINT = "bigint"
140
+ DATE = "date"
141
+ DATETIME = "datetime"
142
+ TIMESTAMP = "timestamp"
143
+ ARRAY = "array"
144
+ DATE_PART = "date_part"
145
+ STRUCT = "struct"
146
+
147
+ # GRANULAR
148
+ UNIX_SECONDS = "unix_seconds"
149
+
150
+ # PARSING
151
+ UNKNOWN = "unknown"
152
+
153
+ @property
154
+ def data_type(self):
155
+ return self
156
+
157
+
158
+ class ListType(BaseModel):
159
+ model_config = ConfigDict(frozen=True)
160
+ type: ALL_TYPES
161
+
162
+ def __str__(self) -> str:
163
+ return f"ListType<{self.type}>"
164
+
165
+ @property
166
+ def data_type(self):
167
+ return DataType.LIST
168
+
169
+ @property
170
+ def value(self):
171
+ return self.data_type.value
172
+
173
+ @property
174
+ def value_data_type(self) -> DataType | StructType | MapType | ListType:
175
+ if isinstance(self.type, Concept):
176
+ return self.type.datatype
177
+ return self.type
178
+
179
+
180
+ class MapType(BaseModel):
181
+ key_type: DataType
182
+ content_type: ALL_TYPES
183
+
184
+ @property
185
+ def data_type(self):
186
+ return DataType.MAP
187
+
188
+ @property
189
+ def value(self):
190
+ return self.data_type.value
191
+
192
+
193
+ class StructType(BaseModel):
194
+ fields: List[ALL_TYPES]
195
+ fields_map: Dict[str, Concept] = Field(default_factory=dict)
196
+
197
+ @property
198
+ def data_type(self):
199
+ return DataType.STRUCT
200
+
201
+ @property
202
+ def value(self):
203
+ return self.data_type.value
204
+
205
+
206
+ class ListWrapper(Generic[VT], UserList):
207
+ """Used to distinguish parsed list objects from other lists"""
208
+
209
+ def __init__(self, *args, type: DataType, **kwargs):
210
+ super().__init__(*args, **kwargs)
211
+ self.type = type
212
+
213
+ @classmethod
214
+ def __get_pydantic_core_schema__(
215
+ cls, source_type: Any, handler: Callable[[Any], core_schema.CoreSchema]
216
+ ) -> core_schema.CoreSchema:
217
+ args = get_args(source_type)
218
+ if args:
219
+ schema = handler(List[args]) # type: ignore
220
+ else:
221
+ schema = handler(List)
222
+ return core_schema.no_info_after_validator_function(cls.validate, schema)
223
+
224
+ @classmethod
225
+ def validate(cls, v):
226
+ return cls(v, type=arg_to_datatype(v[0]))
227
+
228
+
229
+ class Metadata(BaseModel):
230
+ """Metadata container object.
231
+ TODO: support arbitrary tags"""
232
+
233
+ description: Optional[str] = None
234
+ line_number: Optional[int] = None
235
+ concept_source: ConceptSource = ConceptSource.MANUAL
236
+
237
+
238
+ def lineage_validator(
239
+ v: Any, handler: ValidatorFunctionWrapHandler, info: ValidationInfo
240
+ ) -> Union[Function, WindowItem, FilterItem, AggregateWrapper]:
241
+ if v and not isinstance(v, (Function, WindowItem, FilterItem, AggregateWrapper)):
242
+ raise ValueError(v)
243
+ return v
244
+
245
+
246
+ def empty_grain() -> Grain:
247
+ return Grain(components=[])
248
+
249
+
250
+ class Concept(Namespaced, SelectGrain, BaseModel):
251
+ name: str
252
+ datatype: DataType | ListType | StructType | MapType
253
+ purpose: Purpose
254
+ metadata: Optional[Metadata] = Field(
255
+ default_factory=lambda: Metadata(description=None, line_number=None),
256
+ validate_default=True,
257
+ )
258
+ lineage: Optional[
259
+ Union[
260
+ Function,
261
+ WindowItem,
262
+ FilterItem,
263
+ AggregateWrapper,
264
+ RowsetItem,
265
+ MultiSelectStatement | MergeStatement,
266
+ ]
267
+ ] = None
268
+ # lineage: Annotated[Optional[
269
+ # Union[Function, WindowItem, FilterItem, AggregateWrapper]
270
+ # ], WrapValidator(lineage_validator)] = None
271
+ namespace: Optional[str] = Field(default=DEFAULT_NAMESPACE, validate_default=True)
272
+ keys: Optional[Tuple["Concept", ...]] = None
273
+ grain: "Grain" = Field(default=None, validate_default=True)
274
+ modifiers: Optional[List[Modifier]] = Field(default_factory=list)
275
+
276
+ def __hash__(self):
277
+ return hash(str(self))
278
+
279
@field_validator("keys", mode="before")
@classmethod
def keys_validator(cls, v, info: ValidationInfo):
    """Normalise ``keys`` before field validation.

    ``None`` and tuples pass through untouched, lists are converted to
    tuples, and any other type is rejected with ``ValueError``.
    """
    if v is None or isinstance(v, tuple):
        return v
    if isinstance(v, list):
        return tuple(v)
    raise ValueError(f"Keys must be a list or tuple, got {type(v)}")
289
+
290
+ @field_validator("namespace", mode="plain")
291
+ @classmethod
292
+ def namespace_validation(cls, v):
293
+ return v or DEFAULT_NAMESPACE
294
+
295
+ @field_validator("metadata")
296
+ @classmethod
297
+ def metadata_validation(cls, v):
298
+ v = v or Metadata()
299
+ return v
300
+
301
+ @field_validator("purpose", mode="after")
302
+ @classmethod
303
+ def purpose_validation(cls, v):
304
+ if v == Purpose.AUTO:
305
+ raise ValueError("Cannot set purpose to AUTO")
306
+ return v
307
+
308
+ @field_validator("grain", mode="before")
309
+ @classmethod
310
+ def parse_grain(cls, v, info: ValidationInfo) -> Grain:
311
+ # this is silly - rethink how we do grains
312
+ values = info.data
313
+ if not v and values.get("purpose", None) == Purpose.KEY:
314
+ v = Grain(
315
+ components=[
316
+ Concept(
317
+ namespace=values.get("namespace", DEFAULT_NAMESPACE),
318
+ name=values["name"],
319
+ datatype=values["datatype"],
320
+ purpose=values["purpose"],
321
+ grain=Grain(),
322
+ )
323
+ ]
324
+ )
325
+ elif (
326
+ "lineage" in values
327
+ and isinstance(values["lineage"], AggregateWrapper)
328
+ and values["lineage"].by
329
+ ):
330
+ v = Grain(components=values["lineage"].by)
331
+ elif not v:
332
+ v = Grain(components=[])
333
+ elif isinstance(v, Concept):
334
+ v = Grain(components=[v])
335
+ if not v:
336
+ raise SyntaxError(f"Invalid grain {v} for concept {values['name']}")
337
+ return v
338
+
339
def __eq__(self, other: object):
    """Compare this concept with another concept or with an address string.

    A string is equal when it matches ``self.address``
    (``"<namespace>.<name>"``). Another ``Concept`` is equal when name,
    datatype, purpose, namespace and grain all match. Any other type
    compares unequal.
    """
    if isinstance(other, str):
        # The original compared against the builtin ``str`` type itself,
        # which can never equal an address, so this branch was dead.
        return self.address == other
    if not isinstance(other, Concept):
        return False
    return (
        self.name == other.name
        and self.datatype == other.datatype
        and self.purpose == other.purpose
        and self.namespace == other.namespace
        and self.grain == other.grain
        # keys are not part of equality (that comparison is disabled)
        # and self.keys == other.keys
    )
353
+
354
+ def __str__(self):
355
+ grain = ",".join([str(c.address) for c in self.grain.components])
356
+ return f"{self.namespace}.{self.name}<{grain}>"
357
+
358
+ @property
359
+ def address(self) -> str:
360
+ return f"{self.namespace}.{self.name}"
361
+
362
+ @property
363
+ def output(self) -> "Concept":
364
+ return self
365
+
366
@property
def safe_address(self) -> str:
    """Address with every ``.`` replaced by ``_``.

    Concepts in the default namespace (or with no namespace) yield just the
    flattened name; otherwise the flattened namespace is prefixed with an
    underscore separator.
    """
    flat_name = self.name.replace(".", "_")
    if self.namespace and self.namespace != DEFAULT_NAMESPACE:
        return f"{self.namespace.replace('.','_')}_{flat_name}"
    return flat_name
373
+
374
+ @property
375
+ def grain_components(self) -> List["Concept"]:
376
+ return self.grain.components_copy if self.grain else []
377
+
378
+ def with_namespace(self, namespace: str) -> "Concept":
379
+ return self.__class__(
380
+ name=self.name,
381
+ datatype=self.datatype,
382
+ purpose=self.purpose,
383
+ metadata=self.metadata,
384
+ lineage=self.lineage.with_namespace(namespace) if self.lineage else None,
385
+ grain=(
386
+ self.grain.with_namespace(namespace)
387
+ if self.grain
388
+ else Grain(components=[])
389
+ ),
390
+ namespace=(
391
+ namespace + "." + self.namespace
392
+ if self.namespace
393
+ and self.namespace != DEFAULT_NAMESPACE
394
+ and self.namespace != namespace
395
+ else namespace
396
+ ),
397
+ keys=(
398
+ tuple([x.with_namespace(namespace) for x in self.keys])
399
+ if self.keys
400
+ else None
401
+ ),
402
+ modifiers=self.modifiers,
403
+ )
404
+
405
+ def with_select_grain(self, grain: Optional["Grain"] = None) -> "Concept":
406
+ if not all([isinstance(x, Concept) for x in self.keys or []]):
407
+ raise ValueError(f"Invalid keys {self.keys} for concept {self.address}")
408
+ new_grain = grain or self.grain
409
+ new_lineage = self.lineage
410
+ if isinstance(self.lineage, SelectGrain):
411
+ new_lineage = self.lineage.with_select_grain(new_grain)
412
+ return self.__class__(
413
+ name=self.name,
414
+ datatype=self.datatype,
415
+ purpose=self.purpose,
416
+ metadata=self.metadata,
417
+ lineage=new_lineage,
418
+ grain=new_grain,
419
+ namespace=self.namespace,
420
+ keys=self.keys,
421
+ modifiers=self.modifiers,
422
+ )
423
+
424
+ def with_grain(self, grain: Optional["Grain"] = None) -> "Concept":
425
+ if not all([isinstance(x, Concept) for x in self.keys or []]):
426
+ raise ValueError(f"Invalid keys {self.keys} for concept {self.address}")
427
+ return self.__class__(
428
+ name=self.name,
429
+ datatype=self.datatype,
430
+ purpose=self.purpose,
431
+ metadata=self.metadata,
432
+ lineage=self.lineage,
433
+ grain=grain if grain else Grain(components=[]),
434
+ namespace=self.namespace,
435
+ keys=self.keys,
436
+ modifiers=self.modifiers,
437
+ )
438
+
439
+ def with_default_grain(self) -> "Concept":
440
+ if self.purpose == Purpose.KEY:
441
+ # we need to make this abstract
442
+ grain = Grain(components=[self.with_grain(Grain())], nested=True)
443
+ elif self.purpose == Purpose.PROPERTY:
444
+ components = []
445
+ if self.keys:
446
+ components = [*self.keys]
447
+ if self.lineage:
448
+ for item in self.lineage.arguments:
449
+ if isinstance(item, Concept):
450
+ if item.keys and not all(c in components for c in item.keys):
451
+ components += item.sources
452
+ else:
453
+ components += item.sources
454
+ grain = Grain(components=components)
455
+ elif self.purpose == Purpose.METRIC:
456
+ grain = Grain()
457
+ elif self.purpose == Purpose.CONSTANT:
458
+ if self.derivation != PurposeLineage.CONSTANT:
459
+ grain = Grain(components=[self.with_grain(Grain())], nested=True)
460
+ else:
461
+ grain = self.grain
462
+ else:
463
+ grain = self.grain # type: ignore
464
+ return self.__class__(
465
+ name=self.name,
466
+ datatype=self.datatype,
467
+ purpose=self.purpose,
468
+ metadata=self.metadata,
469
+ lineage=self.lineage,
470
+ grain=grain,
471
+ keys=self.keys,
472
+ namespace=self.namespace,
473
+ modifiers=self.modifiers,
474
+ )
475
+
476
@property
def sources(self) -> List["Concept"]:
    """All concepts this concept is derived from, found recursively.

    Walks the Concept entries of ``lineage.arguments``; each one is listed
    followed by its own ``sources``. Returns an empty list when there is no
    lineage.

    Raises:
        SyntaxError: if a direct argument has the same address as this
            concept.
    """
    if not self.lineage:
        return []
    collected = []
    for argument in self.lineage.arguments:
        if not isinstance(argument, Concept):
            continue
        if argument.address == self.address:
            raise SyntaxError(f"Concept {self.address} references itself")
        collected.extend([argument, *argument.sources])
    return collected
488
+
489
+ @property
490
+ def concept_arguments(self) -> List[Concept]:
491
+ return self.lineage.concept_arguments if self.lineage else []
492
+
493
+ @property
494
+ def input(self):
495
+ return [self] + self.sources
496
+
497
@property
def derivation(self) -> PurposeLineage:
    """Classify how this concept is produced, based on its lineage.

    Non-function lineage types map directly to a ``PurposeLineage`` member.
    Function lineage is classified by operator, and a function with no
    concept arguments counts as a constant. With no matching lineage the
    result is CONSTANT for constant-purpose concepts and ROOT otherwise.
    """
    lineage = self.lineage
    if lineage:
        # Checked in order; the first matching lineage type wins.
        by_lineage_type = (
            (WindowItem, PurposeLineage.WINDOW),
            (FilterItem, PurposeLineage.FILTER),
            (AggregateWrapper, PurposeLineage.AGGREGATE),
            (RowsetItem, PurposeLineage.ROWSET),
            (MultiSelectStatement, PurposeLineage.MULTISELECT),
            (MergeStatement, PurposeLineage.MERGE),
        )
        for lineage_type, result in by_lineage_type:
            if isinstance(lineage, lineage_type):
                return result
        if isinstance(lineage, Function):
            operator = lineage.operator
            if operator in FunctionClass.AGGREGATE_FUNCTIONS.value:
                return PurposeLineage.AGGREGATE
            if operator == FunctionType.UNNEST:
                return PurposeLineage.UNNEST
            if operator in FunctionClass.SINGLE_ROW.value:
                return PurposeLineage.CONSTANT
            if not lineage.concept_arguments:
                return PurposeLineage.CONSTANT
            return PurposeLineage.BASIC
    if self.purpose == Purpose.CONSTANT:
        return PurposeLineage.CONSTANT
    return PurposeLineage.ROOT
537
+
538
@property
def granularity(self) -> Granularity:
    """Used to determine if concepts need to be included in grain
    calculations.

    Returns SINGLE_ROW for constants, for aggregates whose grain components
    are all the all-rows concept (including an empty grain), for the
    internal all-rows concept itself, and for derived concepts whose concept
    arguments are all single-row. Everything else is MULTI_ROW.
    """
    derivation = self.derivation
    if derivation == PurposeLineage.CONSTANT:
        # constants are a single row
        return Granularity.SINGLE_ROW
    if derivation == PurposeLineage.AGGREGATE:
        # an aggregate grouped over all rows leaves exactly one row,
        # so it is fine to cross join
        over_all_rows = all(
            [x.name == ALL_ROWS_CONCEPT for x in self.grain.components]
        )
        return Granularity.SINGLE_ROW if over_all_rows else Granularity.MULTI_ROW
    if self.namespace == INTERNAL_NAMESPACE and self.name == ALL_ROWS_CONCEPT:
        return Granularity.SINGLE_ROW
    lineage = self.lineage
    if not lineage:
        return Granularity.MULTI_ROW
    if isinstance(lineage, Function) and lineage.operator == FunctionType.UNNEST:
        return Granularity.MULTI_ROW
    inputs_single_row = all(
        [
            x.granularity == Granularity.SINGLE_ROW
            for x in lineage.concept_arguments
        ]
    )
    if inputs_single_row:
        return Granularity.SINGLE_ROW
    return Granularity.MULTI_ROW
566
+
567
+
568
+ class Grain(BaseModel):
569
+ nested: bool = False
570
+ components: List[Concept] = Field(default_factory=list, validate_default=True)
571
+
572
+ @field_validator("components")
573
+ def component_validator(cls, v, info: ValidationInfo):
574
+ values = info.data
575
+ if not values.get("nested", False):
576
+ v2: List[Concept] = unique(
577
+ [safe_concept(c).with_default_grain() for c in v], "address"
578
+ )
579
+ else:
580
+ v2 = unique(v, "address")
581
+ final = []
582
+ for sub in v2:
583
+ if sub.purpose in (Purpose.PROPERTY, Purpose.METRIC) and sub.keys:
584
+ if all([c in v2 for c in sub.keys]):
585
+ continue
586
+ final.append(sub)
587
+ v2 = sorted(final, key=lambda x: x.name)
588
+ return v2
589
+
590
+ @property
591
+ def components_copy(self) -> List[Concept]:
592
+ return [*self.components]
593
+
594
+ def __str__(self):
595
+ if self.abstract:
596
+ return (
597
+ "Grain<Abstract" + ",".join([c.address for c in self.components]) + ">"
598
+ )
599
+ return "Grain<" + ",".join([c.address for c in self.components]) + ">"
600
+
601
+ def with_namespace(self, namespace: str) -> "Grain":
602
+ return Grain(
603
+ components=[c.with_namespace(namespace) for c in self.components],
604
+ nested=self.nested,
605
+ )
606
+
607
+ @property
608
+ def abstract(self):
609
+ return not self.components or all(
610
+ [c.name == ALL_ROWS_CONCEPT for c in self.components]
611
+ )
612
+
613
+ @property
614
+ def set(self):
615
+ return set([c.address for c in self.components_copy])
616
+
617
+ def __eq__(self, other: object):
618
+ if isinstance(other, list):
619
+ return self.set == set([c.address for c in other])
620
+ if not isinstance(other, Grain):
621
+ return False
622
+ return self.set == other.set
623
+
624
+ def issubset(self, other: "Grain"):
625
+ return self.set.issubset(other.set)
626
+
627
+ def union(self, other: "Grain"):
628
+ addresses = self.set.union(other.set)
629
+
630
+ return Grain(
631
+ components=[c for c in self.components if c.address in addresses]
632
+ + [c for c in other.components if c.address in addresses]
633
+ )
634
+
635
+ def isdisjoint(self, other: "Grain"):
636
+ return self.set.isdisjoint(other.set)
637
+
638
+ def intersection(self, other: "Grain") -> "Grain":
639
+ intersection = self.set.intersection(other.set)
640
+ components = [i for i in self.components if i.address in intersection]
641
+ return Grain(components=components)
642
+
643
+ def __add__(self, other: "Grain") -> "Grain":
644
+ components: List[Concept] = []
645
+ for clist in [self.components_copy, other.components_copy]:
646
+ for component in clist:
647
+ if component.with_default_grain() in components:
648
+ continue
649
+ components.append(component.with_default_grain())
650
+ base_components = [c for c in components if c.purpose == Purpose.KEY]
651
+ for c in components:
652
+ if c.purpose == Purpose.PROPERTY and not any(
653
+ [key in base_components for key in (c.keys or [])]
654
+ ):
655
+ base_components.append(c)
656
+ elif (
657
+ c.purpose == Purpose.CONSTANT
658
+ and not c.derivation == PurposeLineage.CONSTANT
659
+ ):
660
+ base_components.append(c)
661
+ return Grain(components=base_components)
662
+
663
+ def __radd__(self, other) -> "Grain":
664
+ if other == 0:
665
+ return self
666
+ else:
667
+ return self.__add__(other)
668
+
669
+
670
+ class RawColumnExpr(BaseModel):
671
+ text: str
672
+
673
+
674
+ class ColumnAssignment(BaseModel):
675
+ alias: str | RawColumnExpr | Function
676
+ concept: Concept
677
+ modifiers: List[Modifier] = Field(default_factory=list)
678
+
679
+ @property
680
+ def is_complete(self) -> bool:
681
+ return Modifier.PARTIAL not in self.modifiers
682
+
683
+ def with_namespace(self, namespace: str) -> "ColumnAssignment":
684
+ return ColumnAssignment(
685
+ alias=(
686
+ self.alias.with_namespace(namespace)
687
+ if isinstance(self.alias, Function)
688
+ else self.alias
689
+ ),
690
+ concept=self.concept.with_namespace(namespace),
691
+ modifiers=self.modifiers,
692
+ )
693
+
694
+
695
+ class Statement(BaseModel):
696
+ pass
697
+
698
+
699
+ class LooseConceptList(BaseModel):
700
+ concepts: List[Concept]
701
+
702
+ @cached_property
703
+ def addresses(self) -> set[str]:
704
+ return {s.address for s in self.concepts}
705
+
706
+ @classmethod
707
+ def validate(cls, v):
708
+ return cls(v)
709
+
710
+ def __str__(self) -> str:
711
+ return f"lcl{str(self.addresses)}"
712
+
713
+ def __iter__(self):
714
+ return iter(self.concepts)
715
+
716
+ def __eq__(self, other):
717
+ if not isinstance(other, LooseConceptList):
718
+ return False
719
+ return self.addresses == other.addresses
720
+
721
+ def issubset(self, other):
722
+ if not isinstance(other, LooseConceptList):
723
+ return False
724
+ return self.addresses.issubset(other.addresses)
725
+
726
+ def __contains__(self, other):
727
+ if isinstance(other, str):
728
+ return other in self.addresses
729
+ if not isinstance(other, Concept):
730
+ return False
731
+ return other.address in self.addresses
732
+
733
+ def difference(self, other):
734
+ if not isinstance(other, LooseConceptList):
735
+ return False
736
+ return self.addresses.difference(other.addresses)
737
+
738
+ def isdisjoint(self, other):
739
+ if not isinstance(other, LooseConceptList):
740
+ return False
741
+ return self.addresses.isdisjoint(other.addresses)
742
+
743
+
744
+ class Function(Namespaced, SelectGrain, BaseModel):
745
+ operator: FunctionType
746
+ arg_count: int = Field(default=1)
747
+ output_datatype: DataType | ListType | StructType | MapType
748
+ output_purpose: Purpose
749
+ valid_inputs: Optional[
750
+ Union[
751
+ Set[DataType | ListType | StructType],
752
+ List[Set[DataType | ListType | StructType]],
753
+ ]
754
+ ] = None
755
+ arguments: Sequence[
756
+ Union[
757
+ Concept,
758
+ "AggregateWrapper",
759
+ "Function",
760
+ int,
761
+ float,
762
+ str,
763
+ DataType,
764
+ ListType,
765
+ DatePart,
766
+ "Parenthetical",
767
+ CaseWhen,
768
+ "CaseElse",
769
+ ListWrapper[int],
770
+ ListWrapper[str],
771
+ ListWrapper[float],
772
+ ]
773
+ ]
774
+
775
+ def __str__(self):
776
+ return f'{self.operator.value}({",".join([str(a) for a in self.arguments])})'
777
+
778
+ @property
779
+ def datatype(self):
780
+ return self.output_datatype
781
+
782
+ def with_select_grain(self, grain: Grain) -> Function:
783
+ return Function(
784
+ operator=self.operator,
785
+ arguments=[
786
+ (
787
+ c.with_select_grain(grain)
788
+ if isinstance(
789
+ c,
790
+ SelectGrain,
791
+ )
792
+ else c
793
+ )
794
+ for c in self.arguments
795
+ ],
796
+ output_datatype=self.output_datatype,
797
+ output_purpose=self.output_purpose,
798
+ valid_inputs=self.valid_inputs,
799
+ arg_count=self.arg_count,
800
+ )
801
+
802
@field_validator("arguments")
@classmethod
def parse_arguments(cls, v, info: ValidationInfo):
    """Validate the argument count and argument types of a function call.

    Reads ``arg_count``, ``operator`` and ``valid_inputs`` from the fields
    validated before ``arguments``. Returns ``v`` unchanged when every check
    passes, or when there is nothing to check against.

    Raises:
        ParseError: more arguments than ``arg_count`` allows, unless
            ``arg_count`` equals ``InfiniteFunctionArgs``.
        TypeError: a Concept, Function or constant argument whose type is
            not among the valid inputs for its position.
    """
    from trilogy.parsing.exceptions import ParseError

    values = info.data
    arg_count = len(v)
    # NOTE(review): these two lookups raise KeyError if either field is
    # missing from info.data -- confirm that cannot happen here.
    target_arg_count = values["arg_count"]
    operator_name = values["operator"].name
    # surface right error
    # (valid_inputs absent from info.data: return early so this validator
    # adds no error of its own)
    if "valid_inputs" not in values:
        return v
    valid_inputs = values["valid_inputs"]
    # Too few arguments is accepted; only an excess is an error.
    if not arg_count <= target_arg_count:
        if target_arg_count != InfiniteFunctionArgs:
            raise ParseError(
                f"Incorrect argument count to {operator_name} function, expects"
                f" {target_arg_count}, got {arg_count}"
            )
    # if all arguments can be any of the set type
    # turn this into an array for validation
    if isinstance(valid_inputs, set):
        valid_inputs = [valid_inputs for _ in v]
    elif not valid_inputs:
        return v
    # From here on valid_inputs is indexed by argument position.
    for idx, arg in enumerate(v):
        if (
            isinstance(arg, Concept)
            and arg.datatype.data_type not in valid_inputs[idx]
        ):
            # UNKNOWN datatypes are let through without a type error.
            if arg.datatype != DataType.UNKNOWN:
                raise TypeError(
                    f"Invalid input datatype {arg.datatype.data_type} passed into position {idx}"
                    f" for {operator_name} from concept {arg.name}, valid is {valid_inputs[idx]}"
                )
        if (
            isinstance(arg, Function)
            and arg.output_datatype not in valid_inputs[idx]
        ):
            if arg.output_datatype != DataType.UNKNOWN:
                raise TypeError(
                    f"Invalid input datatype {arg.output_datatype} passed into"
                    f" {operator_name} from function {arg.operator.name}"
                )
        # check constants
        # NOTE(review): bool is a subclass of int, so a bool argument matches
        # the int entry before the bool entry is reached.
        comparisons: List[Tuple[Type, DataType]] = [
            (str, DataType.STRING),
            (int, DataType.INTEGER),
            (float, DataType.FLOAT),
            (bool, DataType.BOOL),
            (DatePart, DataType.DATE_PART),
        ]
        for ptype, dtype in comparisons:
            if isinstance(arg, ptype) and dtype in valid_inputs[idx]:
                # attempt to exit early to avoid checking all types
                break
            elif isinstance(arg, ptype):
                raise TypeError(
                    f"Invalid {dtype} constant passed into {operator_name} {arg}, expecting one of {valid_inputs[idx]}"
                )
    return v
863
+
864
+ def with_namespace(self, namespace: str) -> "Function":
865
+ return Function(
866
+ operator=self.operator,
867
+ arguments=[
868
+ (
869
+ c.with_namespace(namespace)
870
+ if isinstance(
871
+ c,
872
+ Namespaced,
873
+ )
874
+ else c
875
+ )
876
+ for c in self.arguments
877
+ ],
878
+ output_datatype=self.output_datatype,
879
+ output_purpose=self.output_purpose,
880
+ valid_inputs=self.valid_inputs,
881
+ arg_count=self.arg_count,
882
+ )
883
+
884
@property
def concept_arguments(self) -> List[Concept]:
    """Concepts referenced by the arguments, flattened in argument order."""
    return [
        concept
        for argument in self.arguments
        for concept in get_concept_arguments(argument)
    ]
890
+
891
@property
def output_grain(self):
    """Grain of the function's output.

    Aggregate operators yield an empty (abstract) grain. Any other operator
    yields the sum of the grains of all concept arguments.
    """
    if self.operator in FunctionClass.AGGREGATE_FUNCTIONS.value:
        # aggregates have an abstract grain
        return Grain(components=[])
    # scalars have implicit grain of all arguments
    combined = Grain(components=[])
    for concept in self.concept_arguments:
        combined = combined + concept.grain
    return combined
901
+
902
@property
def output_keys(self) -> list[Concept]:
    """Deduplicated key concepts implied by the concept arguments.

    A key-purpose argument contributes itself; any other argument
    contributes its ``keys`` when it has any. Duplicates are removed through
    a set, so the order of the result is not defined.
    """
    gathered: list = []
    for concept in self.concept_arguments:
        if concept.purpose == Purpose.KEY:
            gathered.append(concept)
            continue
        if concept.keys:
            gathered.extend(concept.keys)
    return list(set(gathered))
913
+
914
+
915
+ class ConceptTransform(Namespaced, BaseModel):
916
+ function: Function | FilterItem | WindowItem | AggregateWrapper
917
+ output: Concept
918
+ modifiers: List[Modifier] = Field(default_factory=list)
919
+
920
+ @property
921
+ def input(self) -> List[Concept]:
922
+ return [v for v in self.function.arguments if isinstance(v, Concept)]
923
+
924
+ def with_namespace(self, namespace: str) -> "ConceptTransform":
925
+ return ConceptTransform(
926
+ function=self.function.with_namespace(namespace),
927
+ output=self.output.with_namespace(namespace),
928
+ modifiers=self.modifiers,
929
+ )
930
+
931
+ def with_filter(self, where: "WhereClause") -> "ConceptTransform":
932
+ id_hash = string_to_hash(str(where))
933
+ new_parent_concept = Concept(
934
+ name=f"_anon_concept_transform_filter_input_{id_hash}",
935
+ datatype=self.output.datatype,
936
+ purpose=self.output.purpose,
937
+ lineage=self.output.lineage,
938
+ namespace=DEFAULT_NAMESPACE,
939
+ grain=self.output.grain,
940
+ keys=self.output.keys,
941
+ )
942
+ new_parent = FilterItem(content=new_parent_concept, where=where)
943
+ self.output.lineage = new_parent
944
+ return ConceptTransform(
945
+ function=new_parent, output=self.output, modifiers=self.modifiers
946
+ )
947
+
948
+
949
class Window(BaseModel):
    """A row count paired with a window ordering."""

    count: int
    window_order: WindowOrder

    def __str__(self):
        # only the ordering is rendered; the count is not part of the text
        return "Window<{}>".format(self.window_order)
955
+
956
+
957
class WindowItemOver(BaseModel):
    """Container model holding a list of concepts.

    NOTE(review): judging by the name this carries the concepts of a
    window's OVER clause; confirm against the parser that builds it.
    """

    # the wrapped concepts, in the order supplied
    contents: List[Concept]
959
+
960
+
961
class WindowItemOrder(BaseModel):
    """Container model holding a list of order items.

    NOTE(review): judging by the name this carries the ORDER BY portion of
    a window expression; confirm against the parser that builds it.
    """

    # the wrapped order items, in the order supplied
    contents: List["OrderItem"]
963
+
964
+
965
class WindowItem(Namespaced, SelectGrain, BaseModel):
    """A window expression: a typed window over a content concept.

    ``order_by`` lists the ordering items and ``over`` the additional
    concepts the window is evaluated over.
    """

    type: WindowType
    content: Concept
    order_by: List["OrderItem"]
    over: List["Concept"] = Field(default_factory=list)

    def with_namespace(self, namespace: str) -> "WindowItem":
        """Return a copy with content, over and order_by moved into *namespace*."""
        return WindowItem(
            type=self.type,
            content=self.content.with_namespace(namespace),
            over=[x.with_namespace(namespace) for x in self.over],
            order_by=[x.with_namespace(namespace) for x in self.order_by],
        )

    def with_select_grain(self, grain: Grain) -> "WindowItem":
        """Return a copy with *grain* applied to content, over and order_by."""
        return WindowItem(
            type=self.type,
            content=self.content.with_select_grain(grain),
            over=[x.with_select_grain(grain) for x in self.over],
            order_by=[x.with_select_grain(grain) for x in self.order_by],
        )

    @property
    def concept_arguments(self) -> List[Concept]:
        """Alias of :attr:`arguments`."""
        return self.arguments

    @property
    def arguments(self) -> List[Concept]:
        """Content concept, then each order item's output, then the over concepts."""
        output = [self.content]
        output.extend(order.output for order in self.order_by)
        output.extend(self.over)
        return output

    @property
    def output(self) -> Concept:
        """The concept produced: the transform's output if content is a transform."""
        if isinstance(self.content, ConceptTransform):
            return self.content.output
        return self.content

    @output.setter
    def output(self, value):
        if isinstance(self.content, ConceptTransform):
            self.content.output = value
        else:
            self.content = value

    @property
    def input(self) -> List[Concept]:
        """Inputs of the content, followed by those of order_by and over items."""
        # Copy before extending: ``+=`` on the list handed back by
        # ``content.input`` would otherwise grow that list in place if the
        # property ever returns a stored list instead of a fresh one.
        base = list(self.content.input)
        for v in self.order_by:
            base += v.input
        for c in self.over:
            base += c.input
        return base

    @property
    def output_datatype(self):
        """Datatype of the content concept."""
        return self.content.datatype

    @property
    def output_purpose(self):
        """Window results are always reported as properties."""
        return Purpose.PROPERTY
1029
+
1030
+
1031
class FilterItem(Namespaced, SelectGrain, BaseModel):
    """A content concept restricted by a where clause."""

    content: Concept
    where: "WhereClause"

    def __str__(self):
        return f"<Filter: {str(self.content)} where {str(self.where)}>"

    def with_namespace(self, namespace: str) -> "FilterItem":
        """Return a copy with content and where clause moved into *namespace*."""
        return FilterItem(
            content=self.content.with_namespace(namespace),
            where=self.where.with_namespace(namespace),
        )

    def with_select_grain(self, grain: Grain) -> FilterItem:
        """Return a copy with *grain* applied to content and where clause."""
        return FilterItem(
            content=self.content.with_select_grain(grain),
            where=self.where.with_select_grain(grain),
        )

    @property
    def arguments(self) -> List[Concept]:
        """The content concept followed by the where clause's inputs."""
        output = [self.content]
        output += self.where.input
        return output

    @property
    def output(self) -> Concept:
        """The concept produced: the transform's output if content is a transform."""
        if isinstance(self.content, ConceptTransform):
            return self.content.output
        return self.content

    @output.setter
    def output(self, value):
        if isinstance(self.content, ConceptTransform):
            self.content.output = value
        else:
            self.content = value

    @property
    def input(self) -> List[Concept]:
        """Inputs of the content followed by inputs of the where clause."""
        # Copy before extending: ``+=`` on the list handed back by
        # ``content.input`` would otherwise grow that list in place if the
        # property ever returns a stored list instead of a fresh one.
        base = list(self.content.input)
        base += self.where.input
        return base

    @property
    def output_datatype(self):
        """Datatype of the content concept."""
        return self.content.datatype

    @property
    def output_purpose(self):
        """Purpose of the content concept."""
        return self.content.purpose

    @property
    def concept_arguments(self):
        """The content concept plus the where clause's concept arguments."""
        return [self.content] + self.where.concept_arguments
1086
+
1087
+
1088
class SelectItem(Namespaced, BaseModel):
    """One entry of a select list: a concept or transform plus modifiers."""

    content: Union[Concept, ConceptTransform]
    modifiers: List[Modifier] = Field(default_factory=list)

    @property
    def output(self) -> Concept:
        """Concept this item yields.

        Transforms and window items expose it through ``.output``; anything
        else is returned as-is.
        """
        wrapped = isinstance(self.content, (ConceptTransform, WindowItem))
        return self.content.output if wrapped else self.content

    @property
    def input(self) -> List[Concept]:
        """Inputs of the wrapped content."""
        return self.content.input

    def with_namespace(self, namespace: str) -> "SelectItem":
        """Return a copy whose content is moved into *namespace*."""
        namespaced_content = self.content.with_namespace(namespace)
        return SelectItem(content=namespaced_content, modifiers=self.modifiers)
1109
+
1110
+
1111
class OrderItem(SelectGrain, Namespaced, BaseModel):
    """A concept expression paired with its sort direction."""

    expr: Concept
    order: Ordering

    def with_namespace(self, namespace: str) -> "OrderItem":
        """Return a copy whose expression is moved into *namespace*."""
        namespaced_expr = self.expr.with_namespace(namespace)
        return OrderItem(expr=namespaced_expr, order=self.order)

    def with_select_grain(self, grain: Grain) -> "OrderItem":
        """Return a copy whose expression is re-grained via ``with_grain``."""
        regrained_expr = self.expr.with_grain(grain)
        return OrderItem(expr=regrained_expr, order=self.order)

    @property
    def input(self):
        """Inputs of the ordered expression."""
        expression = self.expr
        return expression.input

    @property
    def output(self):
        """Output of the ordered expression."""
        expression = self.expr
        return expression.output
1128
+
1129
+
1130
class OrderBy(Namespaced, BaseModel):
    """An ordered collection of order items."""

    items: List[OrderItem]

    def with_namespace(self, namespace: str) -> "OrderBy":
        """Return a copy with every item moved into *namespace*."""
        namespaced_items = []
        for entry in self.items:
            namespaced_items.append(entry.with_namespace(namespace))
        return OrderBy(items=namespaced_items)
1135
+
1136
+
1137
class SelectStatement(Namespaced, BaseModel):
    """A select statement: selected items plus optional where/order/limit."""

    # entries being selected; bare concepts/transforms are wrapped by
    # ``selection_validation`` below
    selection: List[SelectItem]
    where_clause: Optional["WhereClause"] = None
    order_by: Optional[OrderBy] = None
    limit: Optional[int] = None

    def __str__(self):
        # imported lazily inside the method rather than at module import time
        from trilogy.parsing.render import render_query

        return render_query(self)

    def __init__(self, *args, **kwargs) -> None:
        """Build the statement, then pin grain-less aggregates to its grain."""
        super().__init__(*args, **kwargs)
        for nitem in self.selection:
            # only plain concepts are adjusted; other content is left alone
            if not isinstance(nitem.content, Concept):
                continue
            # an aggregate concept with an empty grain is rebound to the
            # grain computed for this statement
            if nitem.content.grain == Grain():
                if nitem.content.derivation == PurposeLineage.AGGREGATE:
                    nitem.content = nitem.content.with_grain(self.grain)

    @field_validator("selection", mode="before")
    @classmethod
    def selection_validation(cls, v):
        """Wrap raw concepts and transforms in ``SelectItem``; pass others through."""
        new = []
        for item in v:
            if isinstance(item, (Concept, ConceptTransform)):
                new.append(SelectItem(content=item))
            else:
                new.append(item)
        return new

    @property
    def input_components(self) -> List[Concept]:
        """Inputs of the selection and where clause, first occurrence per name."""
        # ``output`` tracks names already seen; ``output_list`` keeps order
        output = set()
        output_list = []
        for item in self.selection:
            for concept in item.input:
                if concept.name in output:
                    continue
                output.add(concept.name)
                output_list.append(concept)
        if self.where_clause:
            for concept in self.where_clause.input:
                if concept.name in output:
                    continue
                output.add(concept.name)
                output_list.append(concept)

        return output_list

    @property
    def output_components(self) -> List[Concept]:
        """The output concept of every selected item, in selection order."""
        output = []
        for item in self.selection:
            if isinstance(item, Concept):
                output.append(item)
            else:
                output.append(item.output)
        return output

    @property
    def hidden_components(self) -> List[Concept]:
        """Outputs of selected items carrying the ``Modifier.HIDDEN`` modifier."""
        output = []
        for item in self.selection:
            if isinstance(item, SelectItem) and Modifier.HIDDEN in item.modifiers:
                output.append(item.output)
        return output

    @property
    def all_components(self) -> List[Concept]:
        """Concatenation of input, output and grain concepts (not de-duplicated)."""
        return (
            self.input_components + self.output_components + self.grain.components_copy
        )

    def to_datasource(
        self,
        namespace: str,
        identifier: str,
        address: Address,
        grain: Grain | None = None,
    ) -> Datasource:
        """Build a ``Datasource`` exposing this statement's output concepts.

        Each output concept becomes a column whose alias is the concept
        address with ``.`` replaced by ``_``. The datasource uses *grain*
        when given, else this statement's grain; afterwards every column
        concept is re-grained to the datasource's final grain.
        """
        columns = [
            # TODO: replace hardcoded replacement here
            ColumnAssignment(alias=c.address.replace(".", "_"), concept=c)
            for c in self.output_components
        ]
        new_datasource = Datasource(
            identifier=identifier,
            address=address,
            grain=grain or self.grain,
            columns=columns,
            namespace=namespace,
        )
        for column in columns:
            column.concept = column.concept.with_grain(new_datasource.grain)
        return new_datasource

    @property
    def grain(self) -> "Grain":
        """Compute the grain of this statement.

        Collects, in order: key-purpose output concepts, key-purpose concept
        arguments of the where clause, then any property (or non-constant
        derived constant) output whose own grain is empty or not covered by
        what has been collected so far. The result is de-duplicated by
        address.
        """
        output = []
        for item in self.output_components:
            if item.purpose == Purpose.KEY:
                output.append(item)
        if self.where_clause:
            for item in self.where_clause.concept_arguments:
                if item.purpose == Purpose.KEY:
                    output.append(item)
                # elif item.purpose == Purpose.PROPERTY and item.grain:
                #     output += item.grain.components
            # TODO: handle other grain cases
            # new if block by design
        # add back any purpose that is not at the grain
        # if a query already has the key of the property in the grain
        # we want to group to that grain and ignore the property, which is a derivation
        # otherwise, we need to include property as the group by
        for item in self.output_components:
            if (
                item.purpose == Purpose.PROPERTY
                and item.grain
                and (
                    not item.grain.components
                    or not item.grain.issubset(
                        Grain(components=unique(output, "address"))
                    )
                )
            ):
                output.append(item)
            # same coverage test for constants, skipping those whose
            # derivation is itself CONSTANT
            if (
                item.purpose == Purpose.CONSTANT
                and item.derivation != PurposeLineage.CONSTANT
                and item.grain
                and (
                    not item.grain.components
                    or not item.grain.issubset(
                        Grain(components=unique(output, "address"))
                    )
                )
            ):
                output.append(item)
        return Grain(components=unique(output, "address"))

    def with_namespace(self, namespace: str) -> "SelectStatement":
        """Return a copy with selection, where clause and ordering namespaced."""
        return SelectStatement(
            selection=[c.with_namespace(namespace) for c in self.selection],
            where_clause=(
                self.where_clause.with_namespace(namespace)
                if self.where_clause
                else None
            ),
            order_by=self.order_by.with_namespace(namespace) if self.order_by else None,
            limit=self.limit,
        )
1289
+
1290
+
1291
class AlignItem(Namespaced, BaseModel):
    """A named alias under which several concepts are aligned."""

    alias: str
    concepts: List[Concept]
    namespace: Optional[str] = Field(default=DEFAULT_NAMESPACE, validate_default=True)

    @computed_field  # type: ignore
    @cached_property
    def concepts_lcl(self) -> LooseConceptList:
        """The aligned concepts wrapped in a ``LooseConceptList`` (cached)."""
        return LooseConceptList(concepts=self.concepts)

    def with_namespace(self, namespace: str) -> "AlignItem":
        """Return a copy with every concept moved into *namespace*."""
        namespaced = [member.with_namespace(namespace) for member in self.concepts]
        return AlignItem(alias=self.alias, concepts=namespaced, namespace=namespace)

    def gen_concept(self, parent: MultiSelectStatement):
        """Create the merged concept for this alias with *parent* as lineage.

        Raises:
            InvalidSyntaxException: when the aligned concepts do not all
                share one datatype.
        """
        datatypes = {member.datatype for member in self.concepts}
        if len(datatypes) > 1:
            raise InvalidSyntaxException(
                f"Datatypes do not align for merged statements {self.alias}, have {datatypes}"
            )
        purposes = {member.purpose for member in self.concepts}
        # mixed purposes collapse to KEY; otherwise keep the single shared one
        purpose = Purpose.KEY if len(purposes) > 1 else list(purposes)[0]
        return Concept(
            name=self.alias,
            datatype=datatypes.pop(),
            purpose=purpose,
            lineage=parent,
            namespace=parent.namespace,
        )
1327
+
1328
+
1329
class AlignClause(Namespaced, BaseModel):
    """The list of align items attached to a multi-select."""

    items: List[AlignItem]

    def with_namespace(self, namespace: str) -> "AlignClause":
        """Return a copy with every align item moved into *namespace*."""
        namespaced_items = []
        for entry in self.items:
            namespaced_items.append(entry.with_namespace(namespace))
        return AlignClause(items=namespaced_items)
1334
+
1335
+
1336
class MultiSelectStatement(Namespaced, BaseModel):
    """Several select statements merged through an align clause."""

    selects: List[SelectStatement]
    align: AlignClause
    namespace: str
    where_clause: Optional["WhereClause"] = None
    order_by: Optional[OrderBy] = None
    limit: Optional[int] = None

    def __repr__(self):
        return "MultiSelect<" + " MERGE ".join([str(s) for s in self.selects]) + ">"

    @computed_field  # type: ignore
    @cached_property
    def arguments(self) -> List[Concept]:
        """Input components of every select, de-duplicated by address (cached)."""
        output = []
        for select in self.selects:
            output += select.input_components
        return unique(output, "address")

    @computed_field  # type: ignore
    @cached_property
    def concept_arguments(self) -> List[Concept]:
        """Like :attr:`arguments`, plus the where clause's concept arguments."""
        output = []
        for select in self.selects:
            output += select.input_components
        if self.where_clause:
            output += self.where_clause.concept_arguments
        return unique(output, "address")

    def get_merge_concept(self, check: Concept):
        """Return the merged concept of the first align item containing *check*.

        Returns ``None`` when no align item contains it.
        """
        for item in self.align.items:
            if check in item.concepts_lcl:
                return item.gen_concept(self)
        return None

    def with_namespace(self, namespace: str) -> "MultiSelectStatement":
        """Return a copy moved into *namespace*.

        The where clause, ordering and limit are carried over as well, in
        the same way ``SelectStatement.with_namespace`` carries them;
        previously they were silently dropped from the copy.
        """
        return MultiSelectStatement(
            selects=[c.with_namespace(namespace) for c in self.selects],
            align=self.align.with_namespace(namespace),
            namespace=namespace,
            where_clause=(
                self.where_clause.with_namespace(namespace)
                if self.where_clause
                else None
            ),
            order_by=self.order_by.with_namespace(namespace) if self.order_by else None,
            limit=self.limit,
        )

    @property
    def grain(self):
        """Sum of the grains of every select."""
        base = Grain()
        for select in self.selects:
            base += select.grain
        return base

    @computed_field  # type: ignore
    @cached_property
    def derived_concepts(self) -> List[Concept]:
        """One merged concept per align item (cached)."""
        output = []
        for item in self.align.items:
            output.append(item.gen_concept(self))
        return output

    def find_source(self, concept: Concept, cte: CTE) -> Concept:
        """Find the single aligned concept behind *concept* that *cte* outputs.

        Raises:
            SyntaxError: when zero or more than one candidate is found.
        """
        matches = []
        for x in self.align.items:
            if concept.name == x.alias:
                for c in x.concepts:
                    if c.address in cte.output_lcl:
                        matches.append(c)

        if len(matches) == 1:
            return matches[0]

        raise SyntaxError(
            f"Could not find upstream map for multiselect {str(concept)} on cte ({cte})"
        )

    @property
    def output_components(self) -> List[Concept]:
        """Derived concepts followed by every select's outputs, unique by address."""
        # Work on a copy: ``derived_concepts`` is a cached list, and extending
        # it in place would make it grow on every access of this property.
        output = list(self.derived_concepts)
        for select in self.selects:
            output += select.output_components
        return unique(output, "address")

    @computed_field  # type: ignore
    @cached_property
    def hidden_components(self) -> List[Concept]:
        """Hidden components of every select, concatenated (cached)."""
        output = []
        for select in self.selects:
            output += select.hidden_components
        return output
1422
+
1423
+
1424
class Address(BaseModel):
    """Wraps a single location string."""

    # the location text; ``Datasource`` builds this from a plain string address
    location: str
1426
+
1427
+
1428
class Query(BaseModel):
    """Wraps a single text string.

    NOTE(review): presumably raw query text; confirm against callers.
    """

    text: str
1430
+
1431
+
1432
def safe_concept(v: Union[Dict, Concept]) -> Concept:
    """Coerce *v* to a ``Concept``.

    A dict is validated into a ``Concept``; any other value is returned
    unchanged.
    """
    return Concept.model_validate(v) if isinstance(v, dict) else v
1436
+
1437
+
1438
class GrainWindow(BaseModel):
    """A window paired with the concepts it is sorted by."""

    window: Window
    sort_concepts: List[Concept]

    def __str__(self):
        """Render as ``GrainWindow<addr1,addr2:window>``."""
        addresses = ",".join(c.address for c in self.sort_concepts)
        return f"GrainWindow<{addresses}:{str(self.window)}>"
1448
+
1449
+
1450
def safe_grain(v) -> Grain:
    """Coerce *v* to a ``Grain``.

    Dicts are validated into a ``Grain``, existing grains pass through and
    any other falsy value becomes an empty grain.

    Raises:
        ValueError: for a truthy value that is neither a dict nor a ``Grain``.
    """
    if isinstance(v, dict):
        return Grain.model_validate(v)
    if isinstance(v, Grain):
        return v
    if v:
        raise ValueError(f"Invalid input type to safe_grain {type(v)}")
    return Grain(components=[])
1459
+
1460
+
1461
class DatasourceMetadata(BaseModel):
    """Optional descriptive metadata attached to a datasource."""

    # required field (no default), but may be None
    freshness_concept: Concept | None
    # defaults to an empty list
    partition_fields: List[Concept] = Field(default_factory=list)
1464
+
1465
+
1466
class MergeStatement(Namespaced, BaseModel):
    """A set of concepts declared as merged, sharing one datatype."""

    concepts: List[Concept]
    datatype: DataType | ListType | StructType | MapType

    @cached_property
    def concepts_lcl(self):
        """The merged concepts wrapped in a ``LooseConceptList`` (cached)."""
        return LooseConceptList(concepts=self.concepts)

    @property
    def merge_concept(self) -> Concept:
        """Build the synthetic property concept that bridges the merged concepts.

        Its name is ``__merge_`` followed by the safe addresses of the
        merged concepts joined with ``_``; the merged concepts are its keys.
        """
        safe_addresses = [member.safe_address for member in self.concepts]
        bridge_name = "_".join(safe_addresses)
        return Concept(
            name=f"__merge_{bridge_name}",
            datatype=self.datatype,
            purpose=Purpose.PROPERTY,
            lineage=self,
            keys=tuple(self.concepts),
        )

    @property
    def arguments(self) -> List[Concept]:
        """The merged concepts."""
        return self.concepts

    @property
    def concept_arguments(self) -> List[Concept]:
        """The merged concepts."""
        return self.concepts

    def find_source(self, concept: Concept, cte: CTE) -> Concept:
        """Return the first output column of *cte* matching a merged concept.

        Merged concepts are tried in order and, for each one, the CTE's
        output columns are scanned in order for an equal address.

        Raises:
            SyntaxError: when no output column matches any merged concept.
        """
        for member in self.concepts:
            wanted = member.address
            for column in cte.output_columns:
                if column.address == wanted:
                    return column
        raise SyntaxError(
            f"Could not find upstream map for multiselect {str(concept)} on cte ({cte})"
        )

    def with_namespace(self, namespace: str) -> "MergeStatement":
        """Return a copy with every concept moved into *namespace*."""
        namespaced = [member.with_namespace(namespace) for member in self.concepts]
        return MergeStatement(concepts=namespaced, datatype=self.datatype)
1507
+
1508
+
1509
class Datasource(Namespaced, BaseModel):
    """A physical datasource: an address plus concept-to-column assignments."""

    identifier: str
    columns: List[ColumnAssignment]
    address: Union[Address, str]
    grain: Grain = Field(
        default_factory=lambda: Grain(components=[]), validate_default=True
    )
    namespace: Optional[str] = Field(default=DEFAULT_NAMESPACE, validate_default=True)
    metadata: DatasourceMetadata = Field(
        default_factory=lambda: DatasourceMetadata(freshness_concept=None)
    )

    @cached_property
    def output_lcl(self) -> LooseConceptList:
        """The output concepts wrapped in a ``LooseConceptList`` (cached)."""
        return LooseConceptList(concepts=self.output_concepts)

    @field_validator("namespace", mode="plain")
    @classmethod
    def namespace_validation(cls, v):
        """Replace an empty/None namespace with ``DEFAULT_NAMESPACE``."""
        return v or DEFAULT_NAMESPACE

    @field_validator("address")
    @classmethod
    def address_enforcement(cls, v):
        """Wrap a plain string address in an ``Address``."""
        if isinstance(v, str):
            v = Address(location=v)
        return v

    @field_validator("grain", mode="plain")
    @classmethod
    def grain_enforcement(cls, v: Grain, info: ValidationInfo):
        """Coerce the grain; when empty, derive it from the key-purpose columns."""
        values = info.data
        grain: Grain = safe_grain(v)
        if not grain.components:
            columns: List[ColumnAssignment] = values.get("columns", [])
            grain = Grain(
                components=[
                    c.concept.with_grain(Grain())
                    for c in columns
                    if c.concept.purpose == Purpose.KEY
                ]
            )
        return grain

    def add_column(self, concept: Concept, alias: str, modifiers=None):
        """Append a column assignment and invalidate the cached ``output_lcl``.

        Args:
            concept: the concept the new column provides.
            alias: the column alias.
            modifiers: optional modifiers; ``None`` is treated as no
                modifiers so that membership tests on the column's
                modifiers (see ``full_concepts``) keep working.
        """
        self.columns.append(
            ColumnAssignment(alias=alias, concept=concept, modifiers=modifiers or [])
        )
        # force refresh; cached_property stores its value in the instance
        # __dict__, and popping (unlike ``del``) is also safe when the
        # property has not been computed yet
        self.__dict__.pop("output_lcl", None)

    def __add__(self, other):
        """Adding is only defined for identical datasources and returns self.

        Raises:
            ValueError: when *other* is not equal to this datasource.
        """
        if not other == self:
            raise ValueError(
                "Attempted to add two datasources that are not identical, this should"
                " never happen"
            )
        return self

    def __str__(self):
        return f"{self.namespace}.{self.identifier}@<{self.grain}>"

    def __hash__(self):
        return (self.namespace + self.identifier).__hash__()

    def with_namespace(self, namespace: str):
        """Return a copy nested under *namespace*.

        A non-default existing namespace is kept as a suffix
        (``namespace.existing``); grain and columns are namespaced too.
        """
        new_namespace = (
            namespace + "." + self.namespace
            if self.namespace and self.namespace != DEFAULT_NAMESPACE
            else namespace
        )
        return Datasource(
            identifier=self.identifier,
            namespace=new_namespace,
            grain=self.grain.with_namespace(namespace),
            address=self.address,
            columns=[c.with_namespace(namespace) for c in self.columns],
        )

    @property
    def concepts(self) -> List[Concept]:
        """The concept of every column, in column order."""
        return [c.concept for c in self.columns]

    @property
    def group_required(self):
        """Always ``False`` for this class."""
        return False

    @property
    def full_concepts(self) -> List[Concept]:
        """Concepts of columns that are not marked ``Modifier.PARTIAL``."""
        return [c.concept for c in self.columns if Modifier.PARTIAL not in c.modifiers]

    @property
    def output_concepts(self) -> List[Concept]:
        """Same as :attr:`concepts`."""
        return self.concepts

    @property
    def partial_concepts(self) -> List[Concept]:
        """Concepts of columns that are marked ``Modifier.PARTIAL``."""
        return [c.concept for c in self.columns if Modifier.PARTIAL in c.modifiers]

    def get_alias(
        self, concept: Concept, use_raw_name: bool = True, force_alias: bool = False
    ) -> Optional[str | RawColumnExpr] | Function:
        """Return how *concept* is referenced on this datasource.

        A column matches when its concept equals *concept*, either directly
        or after being re-grained to *concept*'s grain. With
        ``use_raw_name`` the column alias is returned, otherwise the
        concept's safe address. ``force_alias`` is accepted but not used
        here.

        Raises:
            ValueError: when no column provides *concept*.
        """
        # 2022-01-22
        # this logic needs to be refined.
        for x in self.columns:
            if x.concept == concept or x.concept.with_grain(concept.grain) == concept:
                if use_raw_name:
                    return x.alias
                return concept.safe_address
        existing = [str(c.concept.with_grain(self.grain)) for c in self.columns]
        raise ValueError(
            f"{LOGGER_PREFIX} Concept {concept} not found on {self.identifier}; have"
            f" {existing}."
        )

    @property
    def name(self) -> str:
        """The bare identifier (namespace is not included)."""
        # TODO: namespace all references
        return self.identifier

    @property
    def full_name(self) -> str:
        """Identifier prefixed by the namespace with ``.`` replaced by ``_``."""
        if not self.namespace:
            return self.identifier
        namespace = self.namespace.replace(".", "_")
        return f"{namespace}_{self.identifier}"

    @cached_property
    def safe_location(self) -> str:
        """The address as a plain string (cached)."""
        if isinstance(self.address, Address):
            return self.address.location
        return self.address
1644
+
1645
+
1646
class UnnestJoin(BaseModel):
    """A concept together with an alias, defaulting to ``"unnest"``."""

    concept: Concept
    alias: str = "unnest"

    def __hash__(self):
        """Hash on the alias concatenated with the concept address."""
        hash_key = self.alias + self.concept.address
        return hash(hash_key)
1652
+
1653
+
1654
class InstantiatedUnnestJoin(BaseModel):
    """A concept together with an alias; same fields as ``UnnestJoin``.

    Unlike ``UnnestJoin`` this class defines no ``__hash__`` of its own.
    """

    concept: Concept
    # alias defaults to "unnest"
    alias: str = "unnest"
1657
+
1658
+
1659
class BaseJoin(BaseModel):
    """A join between two datasources on a list of shared concepts."""

    left_datasource: Union[Datasource, "QueryDatasource"]
    right_datasource: Union[Datasource, "QueryDatasource"]
    concepts: List[Concept]
    join_type: JoinType
    filter_to_mutual: bool = False

    def __init__(self, **data: Any):
        """Validate the join and narrow ``concepts`` to usable join keys.

        With ``filter_to_mutual`` a concept missing from either side is
        dropped; without it a missing concept is an error. If concepts were
        requested but none survive, the join degrades to a key-less join
        when either side consists only of single-row concepts or only of
        concepts with an empty grain.

        Raises:
            SyntaxError: when both sides are the same datasource, when a
                join concept is missing from a side (and
                ``filter_to_mutual`` is off), or when no mutual key remains
                and neither side qualifies for a key-less join.
        """
        super().__init__(**data)
        if self.left_datasource.full_name == self.right_datasource.full_name:
            raise SyntaxError(
                f"Cannot join a datasource to itself, joining {self.left_datasource} and"
                f" {self.right_datasource}"
            )
        sides = [self.left_datasource, self.right_datasource]
        # resolve each side's output addresses once instead of once per concept
        side_addresses = [[c.address for c in ds.output_concepts] for ds in sides]
        final_concepts = []
        for concept in self.concepts:
            include = True
            for ds, addresses in zip(sides, side_addresses):
                if concept.address not in addresses:
                    if self.filter_to_mutual:
                        include = False
                    else:
                        raise SyntaxError(
                            f"Invalid join, missing {concept} on {ds.name}, have"
                            f" {addresses}"
                        )
            if include:
                final_concepts.append(concept)
        if not final_concepts and self.concepts:
            # if one datasource only has constants
            # we can join on 1=1
            for ds in sides:
                # single rows
                if all(
                    c.granularity == Granularity.SINGLE_ROW
                    for c in ds.output_concepts
                ):
                    self.concepts = []
                    return
                # if everything is at abstract grain, we can skip joins
                if all(c.grain == Grain() for c in ds.output_concepts):
                    self.concepts = []
                    return

            left_keys, right_keys = side_addresses
            match_concepts = [c.address for c in self.concepts]
            raise SyntaxError(
                "No mutual join keys found between"
                f" {self.left_datasource.identifier} and"
                f" {self.right_datasource.identifier}, left_keys {left_keys},"
                f" right_keys {right_keys},"
                f" provided join concepts {match_concepts}"
            )
        self.concepts = final_concepts

    @property
    def unique_id(self) -> str:
        """Left name + right name + join type value, concatenated."""
        return (
            self.left_datasource.name
            + self.right_datasource.name
            + self.join_type.value
        )

    def __str__(self):
        return (
            f"{self.join_type.value} JOIN {self.left_datasource.identifier} and"
            f" {self.right_datasource.identifier} on"
            f" {','.join([str(k) for k in self.concepts])}"
        )
1732
+
1733
+
1734
class QueryDatasource(BaseModel):
    """A derived datasource: inputs, outputs and joins over other datasources."""

    input_concepts: List[Concept]
    output_concepts: List[Concept]
    source_map: Dict[str, Set[Union[Datasource, "QueryDatasource", "UnnestJoin"]]]
    datasources: Sequence[Union[Datasource, "QueryDatasource"]]
    grain: Grain
    joins: List[BaseJoin | UnnestJoin]
    limit: Optional[int] = None
    condition: Optional[Union["Conditional", "Comparison", "Parenthetical"]] = Field(
        default=None
    )
    filter_concepts: List[Concept] = Field(default_factory=list)
    source_type: SourceType = SourceType.SELECT
    partial_concepts: List[Concept] = Field(default_factory=list)
    join_derived_concepts: List[Concept] = Field(default_factory=list)
    force_group: bool | None = None

    @property
    def non_partial_concept_addresses(self) -> List[str]:
        """Addresses of output concepts that are not listed as partial."""
        partial_addresses = {z.address for z in self.partial_concepts}
        return [
            c.address
            for c in self.output_concepts
            if c.address not in partial_addresses
        ]

    @field_validator("joins")
    @classmethod
    def validate_joins(cls, v):
        """Reject any ``BaseJoin`` whose two sides share an identifier."""
        for join in v:
            if not isinstance(join, BaseJoin):
                continue
            if join.left_datasource.identifier == join.right_datasource.identifier:
                raise SyntaxError(
                    f"Cannot join a datasource to itself, joining {join.left_datasource}"
                )
        return v

    @field_validator("input_concepts")
    @classmethod
    def validate_inputs(cls, v):
        """De-duplicate input concepts by address."""
        return unique(v, "address")

    @field_validator("output_concepts")
    @classmethod
    def validate_outputs(cls, v):
        """De-duplicate output concepts by address."""
        return unique(v, "address")

    @field_validator("source_map")
    @classmethod
    def validate_source_map(cls, v, info: ValidationInfo):
        """Require a source-map key for every input and output concept address.

        ``info`` is an annotated, required parameter (it used to be written
        as ``info=ValidationInfo``, i.e. a default value holding the class
        itself) so that the validation info is always passed in, matching
        ``Datasource.grain_enforcement``.

        Raises:
            SyntaxError: when an expected address has no source-map entry.
        """
        values = info.data
        expected = {c.address for c in values["output_concepts"]}.union(
            c.address for c in values["input_concepts"]
        )
        # only key presence is checked; a key may map to several sources
        seen = set(v.keys())
        for x in expected:
            if x not in seen:
                raise SyntaxError(
                    f"source map missing {x} on (expected {expected}, have {seen})"
                )
        return v

    def __str__(self):
        return f"{self.identifier}@<{self.grain}>"

    def __hash__(self):
        return (self.identifier).__hash__()

    @property
    def concepts(self):
        """Same as ``output_concepts``."""
        return self.output_concepts

    @property
    def name(self):
        """Same as :attr:`identifier`."""
        return self.identifier

    @property
    def full_name(self):
        """Same as :attr:`identifier`."""
        return self.identifier

    @property
    def group_required(self) -> bool:
        """Whether a group-by is needed.

        An explicit ``force_group`` wins; otherwise only a ``GROUP`` source
        type requires grouping.
        """
        if self.force_group is True:
            return True
        if self.force_group is False:
            return False
        if self.source_type:
            if self.source_type in [
                SourceType.FILTER,
            ]:
                return False
            elif self.source_type in [
                SourceType.GROUP,
            ]:
                return True
        return False

    def __add__(self, other):
        """Merge two compatible query datasources into a new one.

        Both sides must agree on grain, source type, group requirement,
        join-derived concepts and ``force_group``. Concepts, source maps,
        datasources (merged by ``full_name``), joins (unique by
        ``unique_id``) and partial concepts are combined.

        Raises:
            SyntaxError: when *other* is not a ``QueryDatasource`` or any of
                the compatibility checks fails.
        """
        # these are syntax errors to avoid being caught by current
        if not isinstance(other, QueryDatasource):
            raise SyntaxError("Can only merge two query datasources")
        if not other.grain == self.grain:
            raise SyntaxError(
                "Can only merge two query datasources with identical grain"
            )
        if not self.source_type == other.source_type:
            raise SyntaxError(
                "Can only merge two query datasources with identical source type"
            )
        if not self.group_required == other.group_required:
            raise SyntaxError(
                "can only merge two datasources if the group required flag is the same"
            )
        if not self.join_derived_concepts == other.join_derived_concepts:
            raise SyntaxError(
                "can only merge two datasources if the join derived concepts are the same"
            )
        if not self.force_group == other.force_group:
            raise SyntaxError(
                "can only merge two datasources if the force_group flag is the same"
            )
        logger.debug(
            f"{LOGGER_PREFIX} merging {self.name} with"
            f" {[c.address for c in self.output_concepts]} concepts and"
            f" {other.name} with {[c.address for c in other.output_concepts]} concepts"
        )

        merged_datasources = {}
        for ds in [*self.datasources, *other.datasources]:
            if ds.full_name in merged_datasources:
                merged_datasources[ds.full_name] = merged_datasources[ds.full_name] + ds
            else:
                merged_datasources[ds.full_name] = ds

        # Combine conditions only when both sides have one; with a single
        # condition use it directly so ``None + condition`` is never evaluated.
        if self.condition and other.condition:
            merged_condition = self.condition + other.condition
        else:
            merged_condition = self.condition or other.condition or None

        qds = QueryDatasource(
            input_concepts=unique(
                self.input_concepts + other.input_concepts, "address"
            ),
            output_concepts=unique(
                self.output_concepts + other.output_concepts, "address"
            ),
            source_map={**self.source_map, **other.source_map},
            datasources=list(merged_datasources.values()),
            grain=self.grain,
            joins=unique(self.joins + other.joins, "unique_id"),
            condition=merged_condition,
            source_type=self.source_type,
            partial_concepts=self.partial_concepts + other.partial_concepts,
            join_derived_concepts=self.join_derived_concepts,
            force_group=self.force_group,
        )

        return qds

    @property
    def identifier(self) -> str:
        """Name built from datasource names, grain addresses and condition hash.

        Shape: ``<ds>_join_<ds>..._at_<grain|abstract>[_filtered_by_<hash>]``.
        """
        filters = abs(hash(str(self.condition))) if self.condition else ""
        grain = "_".join(
            [str(c.address).replace(".", "_") for c in self.grain.components]
        )
        return (
            "_join_".join([d.name for d in self.datasources])
            + (f"_at_{grain}" if grain else "_at_abstract")
            + (f"_filtered_by_{filters}" if filters else "")
        )

    def get_alias(
        self, concept: Concept, use_raw_name: bool = False, force_alias: bool = False
    ):
        """Resolve how *concept* is referenced through the child datasources.

        Each child datasource is asked in turn (with *concept* re-grained
        to this datasource's grain); the first one that resolves it wins.
        If none does but the concept is among this datasource's re-grained
        outputs, the concept name is returned.

        Raises:
            ValueError: when the concept cannot be resolved at all.
        """
        # if we should use the raw datasource name to access
        use_raw_name = (
            True
            if (len(self.datasources) == 1 or use_raw_name) and not force_alias
            else False
        )
        for x in self.datasources:
            # query datasources should be referenced by their alias, always
            force_alias = isinstance(x, QueryDatasource)
            try:
                return x.get_alias(
                    concept.with_grain(self.grain),
                    use_raw_name,
                    force_alias=force_alias,
                )
            except ValueError as e:
                from trilogy.constants import logger

                # a miss on one child is expected; try the next one
                logger.debug(e)
                continue
        existing = [c.with_grain(self.grain) for c in self.output_concepts]
        if concept in existing:
            return concept.name

        existing_str = [str(c) for c in existing]
        datasources = [ds.identifier for ds in self.datasources]
        raise ValueError(
            f"{LOGGER_PREFIX} Concept {str(concept)} not found on {self.identifier};"
            f" have {existing_str} from {datasources}."
        )

    @property
    def safe_location(self):
        """Same as :attr:`identifier`."""
        return self.identifier
1949
+
1950
+
1951
class Comment(BaseModel):
    """Wraps a single text string.

    NOTE(review): presumably the text of a source comment; confirm
    against the parser.
    """

    text: str
1953
+
1954
+
1955
class CTE(BaseModel):
    """A named CTE built on top of a ``QueryDatasource``.

    Carries the output columns, the source map, the grain, optional parent
    CTEs, joins and a filter condition. Two CTEs can be merged with ``+``
    (see ``__add__``), which mutates and returns the left operand.
    """

    name: str
    source: "QueryDatasource"  # TODO: make recursive
    # output columns are what are selected/grouped by
    output_columns: List[Concept]
    source_map: Dict[str, str | list[str]]
    grain: Grain
    base: bool = False
    group_to_grain: bool = False
    parent_ctes: List["CTE"] = Field(default_factory=list)
    joins: List[Union["Join", "InstantiatedUnnestJoin"]] = Field(default_factory=list)
    condition: Optional[Union["Conditional", "Comparison", "Parenthetical"]] = None
    partial_concepts: List[Concept] = Field(default_factory=list)
    join_derived_concepts: List[Concept] = Field(default_factory=list)

    @computed_field  # type: ignore
    @property
    def output_lcl(self) -> LooseConceptList:
        """Output columns wrapped in a ``LooseConceptList``."""
        return LooseConceptList(concepts=self.output_columns)

    @field_validator("output_columns")
    def validate_output_columns(cls, v):
        """Deduplicate output columns by their ``address``."""
        return unique(v, "address")

    def __add__(self, other: "CTE"):
        """Merge ``other`` into this CTE in place and return ``self``.

        Raises:
            ValueError: if the two CTEs differ in grain or in condition.
        """
        logger.info('Merging two copies of CTE "%s"', self.name)
        if not self.grain == other.grain:
            error = (
                "Attempting to merge two ctes of different grains"
                f" {self.name} {other.name} grains {self.grain} {other.grain}| {self.group_to_grain} {other.group_to_grain}| {self.output_lcl} {other.output_lcl}"
            )
            raise ValueError(error)
        if not self.condition == other.condition:
            error = (
                "Attempting to merge two ctes with different conditions"
                f" {self.name} {other.name} conditions {self.condition} {other.condition}"
            )
            raise ValueError(error)
        # merged once; the original repeated this exact statement further down
        self.partial_concepts = unique(
            self.partial_concepts + other.partial_concepts, "address"
        )
        self.parent_ctes = merge_ctes(self.parent_ctes + other.parent_ctes)

        # on key collisions the right-hand CTE's mapping wins
        self.source_map = {**self.source_map, **other.source_map}

        self.output_columns = unique(
            self.output_columns + other.output_columns, "address"
        )
        self.joins = unique(self.joins + other.joins, "unique_id")
        self.join_derived_concepts = unique(
            self.join_derived_concepts + other.join_derived_concepts, "address"
        )

        # the underlying source is updated as well so it stays in step
        self.source.source_map = {**self.source.source_map, **other.source.source_map}
        self.source.output_concepts = unique(
            self.source.output_concepts + other.source.output_concepts, "address"
        )
        return self

    @property
    def relevant_base_ctes(self):
        """Currently identical to ``parent_ctes``."""
        return self.parent_ctes

    @property
    def base_name(self) -> str:
        """Name of the relation this CTE selects from.

        Raises:
            SyntaxError: if joins exist but every left-hand CTE also appears
                on a right-hand side.
        """
        # if this cte selects from a single datasource, select right from it
        valid_joins: List[Join] = [
            join for join in self.joins if isinstance(join, Join)
        ]
        if len(self.source.datasources) == 1 and isinstance(
            self.source.datasources[0], Datasource
        ):
            return self.source.datasources[0].safe_location
        # if we have multiple joined CTEs, pick the base
        # as the root
        elif len(self.source.datasources) == 1 and len(self.parent_ctes) == 1:
            return self.parent_ctes[0].name
        elif valid_joins:
            candidates = [x.left_cte.name for x in valid_joins]
            disallowed = [x.right_cte.name for x in valid_joins]
            try:
                return [y for y in candidates if y not in disallowed][0]
            except IndexError:
                raise SyntaxError(
                    f"Invalid join configuration {candidates} {disallowed} with all parents {[x.base_name for x in self.parent_ctes]}"
                )
        elif self.relevant_base_ctes:
            return self.relevant_base_ctes[0].name
        elif self.parent_ctes:
            # relevant_base_ctes returns parent_ctes, so this branch is not
            # reachable today; it is retained as a guard
            raise SyntaxError(
                f"{self.name} has no relevant base CTEs, {self.source_map},"
                f" {[x.name for x in self.parent_ctes]}, outputs"
                f" {[x.address for x in self.output_columns]}"
            )
        return self.source.name

    @property
    def base_alias(self) -> str:
        """Alias of the base relation, falling back to this CTE's own name."""
        relevant_joins = [j for j in self.joins if isinstance(j, Join)]
        if len(self.source.datasources) == 1 and isinstance(
            self.source.datasources[0], Datasource
        ):
            return self.source.datasources[0].full_name.replace(".", "_")
        if relevant_joins:
            return relevant_joins[0].left_cte.name
        elif self.relevant_base_ctes:
            return self.relevant_base_ctes[0].name
        elif self.parent_ctes:
            return self.parent_ctes[0].name
        return self.name

    def get_alias(self, concept: Concept) -> str:
        """Return the alias for ``concept`` inside this CTE.

        If any parent CTE outputs the concept its ``safe_address`` is used;
        otherwise the source is asked. A ``ValueError`` from the source is
        not propagated: an ``INVALID_ALIAS: ...`` string is returned instead.
        """
        for cte in self.parent_ctes:
            if any(concept.address == x.address for x in cte.output_columns):
                return concept.safe_address
        try:
            return self.source.get_alias(concept)
        except ValueError as e:
            return f"INVALID_ALIAS: {str(e)}"

    @property
    def render_from_clause(self) -> bool:
        """False only when all outputs are constants, there are no parents,
        and no grouping to grain is requested."""
        only_constants = all(
            c.derivation == PurposeLineage.CONSTANT for c in self.output_columns
        )
        return not (
            only_constants and not self.parent_ctes and not self.group_to_grain
        )

    @property
    def sourced_concepts(self) -> List[Concept]:
        """Output columns whose address is a key of ``source_map``."""
        return [c for c in self.output_columns if c.address in self.source_map]
2092
+
2093
+
2094
def merge_ctes(ctes: List[CTE]) -> List[CTE]:
    """Collapse CTEs that share a name into one entry per name.

    The first CTE seen for a name is kept; later ones with the same name are
    combined into it with ``+``. Order follows first appearance.
    """
    merged: Dict[str, CTE] = {}
    for candidate in ctes:
        existing = merged.get(candidate.name)
        if existing is None:
            merged[candidate.name] = candidate
        else:
            merged[candidate.name] = existing + candidate
    return list(merged.values())
2105
+
2106
+
2107
class CompiledCTE(BaseModel):
    """Model pairing a CTE name with a statement string."""

    name: str
    # presumably the rendered query text for the CTE — confirm against callers
    statement: str
2110
+
2111
+
2112
class JoinKey(BaseModel):
    """Wrapper around a single concept used as a join key."""

    concept: Concept

    def __str__(self):
        # a join key prints as its wrapped concept
        return str(self.concept)
2117
+
2118
+
2119
class Join(BaseModel):
    """A join between two CTEs of a given type on a list of join keys."""

    left_cte: CTE
    right_cte: CTE
    jointype: JoinType
    joinkeys: List[JoinKey]

    @property
    def unique_id(self) -> str:
        """Left name, right name and join type value concatenated."""
        parts = (self.left_cte.name, self.right_cte.name, self.jointype.value)
        return "".join(parts)

    def __str__(self):
        keys = ",".join(str(key) for key in self.joinkeys)
        left = self.left_cte.name
        right = self.right_cte.name
        return f"{self.jointype.value} JOIN {left} and {right} on {keys}"
2134
+
2135
+
2136
class UndefinedConcept(Concept):
    """Concept placeholder that records the environment dict it came from
    and, optionally, the line number where it was referenced.

    The ``with_*`` methods return new instances of the same class and carry
    ``environment`` and ``line_no`` over to the copy.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    name: str
    environment: "EnvironmentConceptDict"
    line_no: int | None = None
    datatype: DataType = DataType.UNKNOWN
    purpose: Purpose = Purpose.KEY

    def with_namespace(self, namespace: str) -> "UndefinedConcept":
        """Return a copy placed in ``namespace``; lineage and grain are
        re-namespaced as well."""
        return self.__class__(
            name=self.name,
            datatype=self.datatype,
            purpose=self.purpose,
            metadata=self.metadata,
            lineage=self.lineage.with_namespace(namespace) if self.lineage else None,
            grain=(
                self.grain.with_namespace(namespace)
                if self.grain
                else Grain(components=[])
            ),
            namespace=namespace,
            keys=self.keys,
            environment=self.environment,
            line_no=self.line_no,
        )

    def with_select_grain(self, grain: Optional["Grain"] = None) -> "UndefinedConcept":
        """Return a copy at ``grain`` (empty grain when omitted).

        Lineage that is a ``SelectGrain`` is re-grained too.

        Raises:
            ValueError: if any entry of ``keys`` is not a ``Concept``.
        """
        if not all([isinstance(x, Concept) for x in self.keys or []]):
            raise ValueError(f"Invalid keys {self.keys} for concept {self.address}")
        new_grain = grain or Grain(components=[])
        if self.lineage:
            new_lineage = self.lineage
            if isinstance(self.lineage, SelectGrain):
                new_lineage = self.lineage.with_select_grain(new_grain)
        else:
            new_lineage = None
        return self.__class__(
            name=self.name,
            datatype=self.datatype,
            purpose=self.purpose,
            metadata=self.metadata,
            lineage=new_lineage,
            grain=new_grain,
            namespace=self.namespace,
            keys=self.keys,
            environment=self.environment,
            # previously omitted here, which reset line_no to None on the
            # copy; the sibling with_* methods all preserve it
            line_no=self.line_no,
        )

    def with_grain(self, grain: Optional["Grain"] = None) -> "Concept":
        """Return a copy at ``grain`` (empty grain when omitted)."""
        return self.__class__(
            name=self.name,
            datatype=self.datatype,
            purpose=self.purpose,
            metadata=self.metadata,
            lineage=self.lineage,
            grain=grain or Grain(components=[]),
            namespace=self.namespace,
            keys=self.keys,
            environment=self.environment,
            line_no=self.line_no,
        )

    def with_default_grain(self) -> "Concept":
        """Return a copy whose grain is derived from the concept's purpose.

        Keys get a nested grain made of themselves, properties get their keys
        plus the sources of concept arguments in their lineage, metrics get an
        empty grain, and any other purpose keeps the current grain.
        """
        if self.purpose == Purpose.KEY:
            # we need to make this abstract
            grain = Grain(components=[self.with_grain(Grain())], nested=True)
        elif self.purpose == Purpose.PROPERTY:
            components: List[Concept] = []
            if self.keys:
                components = [*self.keys]
            if self.lineage:
                for item in self.lineage.arguments:
                    if isinstance(item, Concept):
                        # the original branched on item.keys here but both
                        # branches appended item.sources, so one statement
                        # covers both
                        components += item.sources
            grain = Grain(components=components)
        elif self.purpose == Purpose.METRIC:
            grain = Grain()
        else:
            grain = self.grain  # type: ignore
        return self.__class__(
            name=self.name,
            datatype=self.datatype,
            purpose=self.purpose,
            metadata=self.metadata,
            lineage=self.lineage,
            grain=grain,
            keys=self.keys,
            namespace=self.namespace,
            environment=self.environment,
            line_no=self.line_no,
        )
2230
+
2231
+
2232
+ class EnvironmentConceptDict(dict):
2233
+ def __init__(self, *args, **kwargs) -> None:
2234
+ super().__init__(self, *args, **kwargs)
2235
+ self.undefined: dict[str, UndefinedConcept] = {}
2236
+ self.fail_on_missing: bool = False
2237
+ self.populate_default_concepts()
2238
+
2239
+ def populate_default_concepts(self):
2240
+ from trilogy.core.internal import DEFAULT_CONCEPTS
2241
+
2242
+ for concept in DEFAULT_CONCEPTS.values():
2243
+ self[concept.address] = concept
2244
+
2245
+ def values(self) -> ValuesView[Concept]: # type: ignore
2246
+ return super().values()
2247
+
2248
+ def __getitem__(
2249
+ self, key, line_no: int | None = None
2250
+ ) -> Concept | UndefinedConcept:
2251
+ try:
2252
+ return super(EnvironmentConceptDict, self).__getitem__(key)
2253
+
2254
+ except KeyError:
2255
+ if "." in key and key.split(".")[0] == DEFAULT_NAMESPACE:
2256
+ return self.__getitem__(key.split(".")[1], line_no)
2257
+ if DEFAULT_NAMESPACE + "." + key in self:
2258
+ return self.__getitem__(DEFAULT_NAMESPACE + "." + key, line_no)
2259
+ if not self.fail_on_missing:
2260
+ undefined = UndefinedConcept(
2261
+ name=key,
2262
+ line_no=line_no,
2263
+ environment=self,
2264
+ datatype=DataType.UNKNOWN,
2265
+ purpose=Purpose.KEY,
2266
+ )
2267
+ self.undefined[key] = undefined
2268
+ return undefined
2269
+
2270
+ matches = self._find_similar_concepts(key)
2271
+ message = f"Undefined concept: {key}."
2272
+ if matches:
2273
+ message += f" Suggestions: {matches}"
2274
+
2275
+ if line_no:
2276
+ raise UndefinedConceptException(f"line: {line_no}: " + message, matches)
2277
+ raise UndefinedConceptException(message, matches)
2278
+
2279
+ def _find_similar_concepts(self, concept_name):
2280
+ matches = difflib.get_close_matches(concept_name, self.keys())
2281
+ return matches
2282
+
2283
+ def items(self) -> ItemsView[str, Concept | UndefinedConcept]: # type: ignore
2284
+ return super().items()
2285
+
2286
+
2287
class ImportStatement(BaseModel):
    """Record of an import: the alias it is bound to and a path string."""

    alias: str
    path: str
    # environment: "Environment" | None = None
    # TODO: this might result in a lot of duplication
    # environment:"Environment"
2293
+
2294
+
2295
class EnvironmentOptions(BaseModel):
    """Option flags for an environment."""

    # defaults to True, i.e. duplicate declarations are allowed
    allow_duplicate_declaration: bool = True
2297
+
2298
+
2299
def validate_concepts(v) -> EnvironmentConceptDict:
    """Coerce ``v`` into an ``EnvironmentConceptDict``.

    An existing ``EnvironmentConceptDict`` is returned unchanged; a plain
    dict has each value validated as a ``Concept`` first.

    Raises:
        ValueError: if ``v`` is neither of those.
    """
    if isinstance(v, EnvironmentConceptDict):
        return v
    if isinstance(v, dict):
        return EnvironmentConceptDict(
            **{x: Concept.model_validate(y) for x, y in v.items()}
        )
    # name the offending type so the validation error is actionable
    raise ValueError(
        "concepts must be an EnvironmentConceptDict or a dict,"
        f" got {type(v).__name__}"
    )
2307
+
2308
+
2309
class Environment(BaseModel):
    """Container for concepts, datasources, functions, data types and
    imports, with helpers to parse source text into it and to save / load
    a JSON cache."""

    model_config = ConfigDict(arbitrary_types_allowed=True, strict=False)

    concepts: Annotated[EnvironmentConceptDict, PlainValidator(validate_concepts)] = (
        Field(default_factory=EnvironmentConceptDict)
    )
    datasources: Dict[str, Datasource] = Field(default_factory=dict)
    functions: Dict[str, Function] = Field(default_factory=dict)
    data_types: Dict[str, DataType] = Field(default_factory=dict)
    imports: Dict[str, ImportStatement] = Field(default_factory=dict)
    namespace: str = DEFAULT_NAMESPACE
    working_path: str | Path = Field(default_factory=lambda: os.getcwd())
    environment_config: EnvironmentOptions = Field(default_factory=EnvironmentOptions)
    version: str = Field(default_factory=get_version)
    cte_name_map: Dict[str, str] = Field(default_factory=dict)

    @classmethod
    def from_file(cls, path: str | Path) -> "Environment":
        """Parse the file at ``path`` into a new ``Environment`` whose
        working path is the file's parent directory."""
        with open(path, "r") as f:
            read = f.read()
        return Environment(working_path=Path(path).parent).parse(read)[0]

    @classmethod
    def from_cache(cls, path) -> Optional["Environment"]:
        """Load a JSON cache; return ``None`` when its recorded version
        differs from the current ``get_version()``."""
        with open(path, "r") as f:
            read = f.read()
        base = cls.model_validate_json(read)
        version = get_version()
        if base.version != version:
            return None
        return base

    def to_cache(self, path: Optional[str | Path] = None) -> Path:
        """Write the environment as JSON and return the path written.

        Without ``path`` the file is ``working_path / ENV_CACHE_NAME``.
        """
        if not path:
            ppath = Path(self.working_path) / ENV_CACHE_NAME
        else:
            ppath = Path(path)
        with open(ppath, "w") as f:
            f.write(self.model_dump_json())
        return ppath

    @property
    def materialized_concepts(self) -> List[Concept]:
        """Concepts whose address is output by at least one datasource."""
        # gather the addresses once instead of rebuilding a list for every
        # (concept, datasource) pair
        materialized = {
            output.address
            for datasource in self.datasources.values()
            for output in datasource.output_concepts
        }
        return [
            concept
            for concept in self.concepts.values()
            if concept.address in materialized
        ]

    def validate_concept(self, lookup: str, meta: Meta | None = None):
        """Check whether ``lookup`` may be (re)declared.

        Returns silently when the concept is new, when duplicate
        declarations are allowed, or when the existing concept is
        auto-derived.

        Raises:
            ValueError: for any other duplicate declaration; the message
                includes line numbers when metadata is available.
        """
        existing: Concept = self.concepts.get(lookup)  # type: ignore
        if not existing:
            return
        if self.environment_config.allow_duplicate_declaration:
            return
        metadata = existing.metadata
        if metadata:
            # if the existing concept is auto derived, we can overwrite it
            if metadata.concept_source == ConceptSource.AUTO_DERIVED:
                return
            if meta:
                raise ValueError(
                    f"Assignment to concept '{lookup}' on line {meta.line} is a duplicate"
                    f" declaration; '{lookup}' was originally defined on line"
                    f" {metadata.line_number}"
                )
            raise ValueError(
                f"Assignment to concept '{lookup}' is a duplicate declaration;"
                f" '{lookup}' was originally defined on line"
                f" {metadata.line_number}"
            )
        raise ValueError(
            f"Assignment to concept '{lookup}' is a duplicate declaration;"
        )

    def add_import(self, alias: str, environment: "Environment"):
        """Register ``environment`` under ``alias`` and copy its concepts and
        datasources in, re-namespaced and keyed as ``<alias>.<key>``."""
        self.imports[alias] = ImportStatement(
            alias=alias, path=str(environment.working_path)
        )
        for key, concept in environment.concepts.items():
            self.concepts[f"{alias}.{key}"] = concept.with_namespace(alias)
        for key, datasource in environment.datasources.items():
            self.datasources[f"{alias}.{key}"] = datasource.with_namespace(alias)

    def parse(
        self, input: str, namespace: str | None = None, persist: bool = False
    ) -> "Tuple[Environment, list]":
        """Parse ``input`` into this environment.

        With ``namespace`` the text is parsed into a fresh environment that
        is then imported under that alias. Otherwise it is parsed directly,
        and when ``persist`` is true each persist statement is processed and
        its datasource is added.

        Returns:
            ``(self, queries)`` where ``queries`` is the parser's output.
        """
        from trilogy import parse
        from trilogy.core.query_processor import process_persist

        if namespace:
            new = Environment()
            _, queries = new.parse(input)
            self.add_import(namespace, new)
            return self, queries
        _, queries = parse(input, self)
        if persist:
            # only persist statements have an effect here, so iterate the
            # queries directly rather than draining a copy with pop(0)
            for statement in queries:
                if isinstance(statement, PersistStatement):
                    processed = process_persist(self, statement)
                    self.add_datasource(processed.datasource)
        return self, queries

    def add_concept(
        self,
        concept: Concept,
        meta: Meta | None = None,
        force: bool = False,
        add_derived: bool = True,
    ):
        """Store ``concept`` and return it.

        Unless ``force`` is set the concept is validated first. Concepts in
        the default namespace are keyed by name, others by address. With
        ``add_derived`` related concepts are generated afterwards.
        """
        if not force:
            self.validate_concept(concept.address, meta=meta)
        if concept.namespace == DEFAULT_NAMESPACE:
            self.concepts[concept.name] = concept
        else:
            self.concepts[concept.address] = concept
        if add_derived:
            from trilogy.core.environment_helpers import generate_related_concepts

            generate_related_concepts(concept, self)
        return concept

    def add_datasource(
        self,
        datasource: Datasource,
    ):
        """Store ``datasource`` and return it.

        It is keyed by name when its namespace is empty or the default one,
        otherwise by ``<namespace>.<identifier>``.
        """
        if not datasource.namespace or datasource.namespace == DEFAULT_NAMESPACE:
            key = datasource.name
        else:
            key = datasource.namespace + "." + datasource.identifier
        self.datasources[key] = datasource
        return datasource
2465
+
2466
+
2467
class LazyEnvironment(Environment):
    """Environment that postpones parsing ``load_path`` until first use.

    Parsing is triggered by the first attribute access other than a small
    set of bookkeeping names or any name starting with an underscore.
    """

    load_path: Path
    loaded: bool = False

    def __getattribute__(self, name):
        # these names are served without triggering the load
        passthrough = name.startswith("_") or name in (
            "load_path",
            "loaded",
            "working_path",
            "model_config",
            "model_fields",
        )
        if not passthrough and not self.loaded:
            print(f"lazily evaluating load path {self.load_path} to access {name}")
            from trilogy import parse

            parsed = Environment(working_path=str(self.working_path))
            with open(self.load_path, "r") as handle:
                parse(handle.read(), parsed)
            # mark as loaded, then copy the parsed state onto this instance
            self.loaded = True
            self.datasources = parsed.datasources
            self.concepts = parsed.concepts
            self.imports = parsed.imports
        return super().__getattribute__(name)
2494
+
2495
+
2496
class Comparison(Namespaced, SelectGrain, BaseModel):
    """Binary comparison: ``left <operator> right``."""

    left: Union[
        int,
        str,
        float,
        list,
        bool,
        Function,
        Concept,
        "Conditional",
        DataType,
        "Comparison",
        "Parenthetical",
        MagicConstants,
        WindowItem,
        AggregateWrapper,
    ]
    right: Union[
        int,
        str,
        float,
        list,
        bool,
        Concept,
        Function,
        "Conditional",
        DataType,
        "Comparison",
        "Parenthetical",
        MagicConstants,
        WindowItem,
        AggregateWrapper,
    ]
    operator: ComparisonOperator

    # NOTE(review): pydantic's BaseModel does not call __post_init__ (its hook
    # is model_post_init), so this type check presumably never runs on
    # construction — confirm whether it is meant to be active.
    def __post_init__(self):
        left_type = arg_to_datatype(self.left)
        right_type = arg_to_datatype(self.right)
        if left_type != right_type:
            raise ValueError(
                f"Cannot compare {self.left} and {self.right} of different types"
            )

    def __add__(self, other):
        """AND this comparison with ``other``; an equal operand returns
        ``self`` unchanged."""
        if not isinstance(other, (Comparison, Conditional, Parenthetical)):
            raise ValueError("Cannot add Comparison to non-Comparison")
        if other == self:
            return self
        return Conditional(left=self, right=other, operator=BooleanOperator.AND)

    def __repr__(self):
        return f"{str(self.left)} {self.operator.value} {str(self.right)}"

    def with_namespace(self, namespace: str):
        """Copy with both sides re-namespaced where they support it."""
        new_left = self.left
        if isinstance(new_left, Namespaced):
            new_left = new_left.with_namespace(namespace)
        new_right = self.right
        if isinstance(new_right, Namespaced):
            new_right = new_right.with_namespace(namespace)
        return Comparison(left=new_left, right=new_right, operator=self.operator)

    def with_select_grain(self, grain: Grain):
        """Copy with both sides moved to ``grain`` where they support it."""
        new_left = self.left
        if isinstance(new_left, SelectGrain):
            new_left = new_left.with_select_grain(grain)
        new_right = self.right
        if isinstance(new_right, SelectGrain):
            new_right = new_right.with_select_grain(grain)
        return Comparison(left=new_left, right=new_right, operator=self.operator)

    @property
    def input(self) -> List[Concept]:
        """Concepts contributed by the left side, then by the right side."""
        collected: List[Concept] = []
        for side in (self.left, self.right):
            # independent checks, applied in the same order for each side
            if isinstance(side, (Concept,)):
                collected.append(side)
            if isinstance(side, (Conditional, Parenthetical)):
                collected.extend(side.input)
            if isinstance(side, FilterItem):
                collected.extend(side.concept_arguments)
            if isinstance(side, Function):
                collected.extend(side.concept_arguments)
        return collected

    @property
    def concept_arguments(self) -> List[Concept]:
        """Return concepts directly referenced in where clause"""
        return [
            *get_concept_arguments(self.left),
            *get_concept_arguments(self.right),
        ]
2606
+
2607
+
2608
class CaseWhen(Namespaced, SelectGrain, BaseModel):
    """One ``when`` branch: a comparison paired with a result expression."""

    comparison: Conditional | Comparison
    expr: "Expr"

    @property
    def concept_arguments(self):
        """Concept arguments of the comparison followed by the expression's."""
        from_comparison = get_concept_arguments(self.comparison)
        from_expr = get_concept_arguments(self.expr)
        return from_comparison + from_expr

    def with_namespace(self, namespace: str) -> CaseWhen:
        """Copy with the comparison, and the expression where supported,
        re-namespaced."""
        new_comparison = self.comparison.with_namespace(namespace)
        new_expr = self.expr
        if isinstance(new_expr, Namespaced):
            new_expr = new_expr.with_namespace(namespace)
        return CaseWhen(comparison=new_comparison, expr=new_expr)

    def with_select_grain(self, grain: Grain) -> CaseWhen:
        """Copy with the comparison, and the expression where supported,
        moved to ``grain``."""
        new_comparison = self.comparison.with_select_grain(grain)
        new_expr = self.expr
        if isinstance(new_expr, SelectGrain):
            new_expr = new_expr.with_select_grain(grain)
        return CaseWhen(comparison=new_comparison, expr=new_expr)
2638
+
2639
+
2640
class CaseElse(Namespaced, SelectGrain, BaseModel):
    """The ``else`` branch of a case expression."""

    expr: "Expr"
    # this ensures that it's easily differentiable from CaseWhen
    discriminant: ComparisonOperator = ComparisonOperator.ELSE

    @property
    def concept_arguments(self):
        """Concept arguments of the wrapped expression."""
        return get_concept_arguments(self.expr)

    def with_select_grain(self, grain: Grain) -> CaseElse:
        """Copy with the expression moved to ``grain`` where supported."""
        new_expr = self.expr
        if isinstance(new_expr, SelectGrain):
            new_expr = new_expr.with_select_grain(grain)
        return CaseElse(discriminant=self.discriminant, expr=new_expr)

    def with_namespace(self, namespace: str) -> CaseElse:
        """Copy with the expression re-namespaced where supported."""
        new_expr = self.expr
        if isinstance(new_expr, Namespaced):
            new_expr = new_expr.with_namespace(namespace)
        return CaseElse(discriminant=self.discriminant, expr=new_expr)
2674
+
2675
+
2676
class Conditional(Namespaced, SelectGrain, BaseModel):
    """Boolean combination: ``left <operator> right``."""

    left: Union[
        int,
        str,
        float,
        list,
        bool,
        Concept,
        Comparison,
        "Conditional",
        "Parenthetical",
        Function,
        FilterItem,
    ]
    right: Union[
        int,
        str,
        float,
        list,
        bool,
        Concept,
        Comparison,
        "Conditional",
        "Parenthetical",
        Function,
    ]
    operator: BooleanOperator

    def __add__(self, other) -> "Conditional":
        """AND this conditional with ``other``; ``None`` leaves it unchanged.

        Raises:
            ValueError: for any other operand type.
        """
        if other is None:
            return self
        if not isinstance(other, (Comparison, Conditional, Parenthetical)):
            raise ValueError(f"Cannot add {self.__class__} and {type(other)}")
        return Conditional(left=self, right=other, operator=BooleanOperator.AND)

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return f"{str(self.left)} {self.operator.value} {str(self.right)}"

    def with_namespace(self, namespace: str):
        """Copy with both sides re-namespaced where they support it."""
        new_left = self.left
        if isinstance(new_left, Namespaced):
            new_left = new_left.with_namespace(namespace)
        new_right = self.right
        if isinstance(new_right, Namespaced):
            new_right = new_right.with_namespace(namespace)
        return Conditional(left=new_left, right=new_right, operator=self.operator)

    def with_select_grain(self, grain: Grain):
        """Copy with both sides moved to ``grain`` where they support it."""
        new_left = self.left
        if isinstance(new_left, SelectGrain):
            new_left = new_left.with_select_grain(grain)
        new_right = self.right
        if isinstance(new_right, SelectGrain):
            new_right = new_right.with_select_grain(grain)
        return Conditional(left=new_left, right=new_right, operator=self.operator)

    @property
    def input(self) -> List[Concept]:
        """Return concepts directly referenced in where clause"""
        collected = []
        for side in (self.left, self.right):
            if isinstance(side, (Concept, Comparison, Conditional)):
                collected += side.input
            elif isinstance(side, (Function, Parenthetical, FilterItem)):
                collected += side.concept_arguments
        return collected

    @property
    def concept_arguments(self) -> List[Concept]:
        """Return concepts directly referenced in where clause"""
        return [
            *get_concept_arguments(self.left),
            *get_concept_arguments(self.right),
        ]
2768
+
2769
+
2770
class AggregateWrapper(Namespaced, SelectGrain, BaseModel):
    """A function together with the concepts it is computed ``by``.

    Most properties simply forward to the wrapped function.
    """

    function: Function
    by: List[Concept] = Field(default_factory=list)

    def __str__(self):
        if self.by:
            grain_str = [str(c) for c in self.by]
        else:
            grain_str = "abstract"
        return f"{str(self.function)}<{grain_str}>"

    @property
    def datatype(self):
        return self.function.datatype

    @property
    def concept_arguments(self) -> List[Concept]:
        """The function's concept arguments followed by the ``by`` concepts."""
        return self.function.concept_arguments + self.by

    @property
    def output_datatype(self):
        return self.function.output_datatype

    @property
    def output_purpose(self):
        return self.function.output_purpose

    @property
    def arguments(self):
        return self.function.arguments

    def with_namespace(self, namespace: str) -> "AggregateWrapper":
        """Copy with the function and every ``by`` concept re-namespaced."""
        if self.by:
            namespaced_by = [c.with_namespace(namespace) for c in self.by]
        else:
            namespaced_by = []
        return AggregateWrapper(
            function=self.function.with_namespace(namespace),
            by=namespaced_by,
        )

    def with_select_grain(self, grain: Grain) -> AggregateWrapper:
        """Copy at ``grain``; an empty ``by`` is replaced with a copy of the
        grain's components."""
        by = self.by if self.by else grain.components_copy
        return AggregateWrapper(function=self.function.with_select_grain(grain), by=by)
2810
+
2811
+
2812
class WhereClause(Namespaced, SelectGrain, BaseModel):
    """Wrapper around the conditional expression of a where clause."""

    conditional: Union[Comparison, Conditional, "Parenthetical"]

    @property
    def input(self) -> List[Concept]:
        return self.conditional.input

    @property
    def concept_arguments(self) -> List[Concept]:
        return self.conditional.concept_arguments

    def with_namespace(self, namespace: str) -> WhereClause:
        """Copy with the conditional re-namespaced."""
        namespaced = self.conditional.with_namespace(namespace)
        return WhereClause(conditional=namespaced)

    def with_select_grain(self, grain: Grain) -> WhereClause:
        """Copy with the conditional moved to ``grain``."""
        regrained = self.conditional.with_select_grain(grain)
        return WhereClause(conditional=regrained)

    @property
    def grain(self) -> Grain:
        """Grain from the inputs: key concepts themselves plus the grain
        components of property concepts, deduplicated through a set."""
        components = []
        for concept in self.input:
            if concept.purpose == Purpose.KEY:
                components.append(concept)
            elif concept.purpose == Purpose.PROPERTY:
                components += concept.grain.components if concept.grain else []
        return Grain(components=list(set(components)))
2838
+
2839
+
2840
class MaterializedDataset(BaseModel):
    """Model identifying a dataset by its ``Address``."""

    address: Address
2842
+
2843
+
2844
+ # TODO: combine with CTEs
2845
+ # CTE contains processed query?
2846
+ # or CTE references CTE?
2847
+
2848
+
2849
class ProcessedQuery(BaseModel):
    """Container for the parts of a processed query: output columns, CTEs,
    the base CTE, joins and grain, plus optional hidden columns, limit,
    where clause and ordering."""

    output_columns: List[Concept]
    ctes: List[CTE]
    base: CTE
    joins: List[Join]
    grain: Grain
    hidden_columns: List[Concept] = Field(default_factory=list)
    limit: Optional[int] = None
    where_clause: Optional[WhereClause] = None
    order_by: Optional[OrderBy] = None
2859
+
2860
+
2861
class ProcessedQueryMixin(BaseModel):
    """Extra fields describing an output target and its datasource; combined
    with ``ProcessedQuery`` by ``ProcessedQueryPersist``."""

    output_to: MaterializedDataset
    datasource: Datasource
    # base:Dataset
2865
+
2866
+
2867
class ProcessedQueryPersist(ProcessedQuery, ProcessedQueryMixin):
    """A ``ProcessedQuery`` with the ``ProcessedQueryMixin`` fields; adds
    nothing of its own."""

    pass
2869
+
2870
+
2871
class ProcessedShowStatement(BaseModel):
    """Output columns plus the values to show, each of which may be a
    concept, a datasource or a processed query."""

    output_columns: List[Concept]
    output_values: List[Union[Concept, Datasource, ProcessedQuery]]
2874
+
2875
+
2876
class Limit(BaseModel):
    """Model holding an integer limit count."""

    count: int
2878
+
2879
+
2880
class ConceptDeclarationStatement(BaseModel):
    """Statement wrapping a single declared concept."""

    concept: Concept
2882
+
2883
+
2884
class ConceptDerivation(BaseModel):
    """Statement wrapping a single derived concept."""

    concept: Concept
2886
+
2887
+
2888
class RowsetDerivationStatement(Namespaced, BaseModel):
    """A named rowset defined by a select (or multi-select) statement."""

    name: str
    select: SelectStatement | MultiSelectStatement
    namespace: str

    def __repr__(self):
        return f"RowsetDerivation<{str(self.select)}>"

    @property
    def derived_concepts(self) -> List[Concept]:
        """Build one new concept per output component of the select.

        Each new concept has a ``RowsetItem`` lineage pointing back at the
        original concept and lives in a namespace derived from the rowset
        name. Keys and grains are then rewritten to refer to the new
        concepts where every referenced concept is part of the rowset.
        """
        output: list[Concept] = []
        # maps the original concept address to its rowset counterpart
        orig: dict[str, Concept] = {}
        for orig_concept in self.select.output_components:
            new_concept = Concept(
                name=orig_concept.name,
                datatype=orig_concept.datatype,
                purpose=orig_concept.purpose,
                lineage=RowsetItem(
                    content=orig_concept, where=self.select.where_clause, rowset=self
                ),
                grain=orig_concept.grain,
                metadata=orig_concept.metadata,
                # concepts from another namespace keep it as a suffix of the
                # rowset name
                namespace=(
                    f"{self.name}.{orig_concept.namespace}"
                    if orig_concept.namespace != self.namespace
                    else self.name
                ),
                keys=orig_concept.keys,
            )
            orig[orig_concept.address] = new_concept
            output.append(new_concept)
        # fallback grain: all concepts derived from this rowset
        default_grain = Grain(components=[*output])
        # remap everything to the properties of the rowset
        for x in output:
            if x.keys:
                if all([k.address in orig for k in x.keys]):
                    x.keys = tuple(
                        [orig[k.address] if k.address in orig else k for k in x.keys]
                    )
                else:
                    # TODO: fix this up
                    x.keys = tuple()
        for x in output:
            if all([c.address in orig for c in x.grain.components_copy]):
                x.grain = Grain(
                    components=[orig[c.address] for c in x.grain.components_copy]
                )
            else:
                x.grain = default_grain
        return output

    @property
    def arguments(self) -> List[Concept]:
        """The output components of the underlying select."""
        return self.select.output_components

    def with_namespace(self, namespace: str) -> "RowsetDerivationStatement":
        """Copy with the select re-namespaced and ``namespace`` replaced; the
        rowset name is kept."""
        return RowsetDerivationStatement(
            name=self.name,
            select=self.select.with_namespace(namespace),
            namespace=namespace,
        )
2949
+
2950
+
2951
# Lineage record linking a concept to the rowset it was derived from,
# together with the optional where clause of that rowset's select.
class RowsetItem(Namespaced, BaseModel):
    content: Concept
    rowset: RowsetDerivationStatement
    where: Optional["WhereClause"] = None

    def __repr__(self):
        return (
            f"<Rowset<{self.rowset.name}>: {str(self.content)} where {str(self.where)}>"
        )

    def with_namespace(self, namespace: str) -> "RowsetItem":
        """Return a copy with content, where clause and rowset re-namespaced."""
        return RowsetItem(
            content=self.content.with_namespace(namespace),
            where=self.where.with_namespace(namespace) if self.where else None,
            rowset=self.rowset.with_namespace(namespace),
        )

    @property
    def arguments(self) -> List[Concept]:
        """Return the content concept followed by the where clause's inputs."""
        output = [self.content]
        if self.where:
            output += self.where.input
        return output

    @property
    def output(self) -> Concept:
        """Return the content, or its ``output`` when it is a ConceptTransform."""
        if isinstance(self.content, ConceptTransform):
            return self.content.output
        return self.content

    @output.setter
    def output(self, value):
        # Mirror the getter: write through to the transform when there is one.
        if isinstance(self.content, ConceptTransform):
            self.content.output = value
        else:
            self.content = value

    @property
    def input(self) -> List[Concept]:
        """Return the content's inputs followed by the where clause's inputs.

        A new list is returned on every access.
        """
        # Copy before extending: `+=` on a list mutates it in place, so
        # extending the object handed back by `content.input` directly would
        # leak the where-clause inputs into it if that list is ever shared.
        base = list(self.content.input)
        if self.where:
            base += self.where.input
        return base

    @property
    def output_datatype(self):
        """Return the datatype of the content concept."""
        return self.content.datatype

    @property
    def output_purpose(self):
        """Return the purpose of the content concept."""
        return self.content.purpose

    @property
    def concept_arguments(self):
        """Return the content plus the where clause's concept arguments."""
        if self.where:
            return [self.content] + self.where.concept_arguments
        return [self.content]
3008
+
3009
+
3010
# Wraps a single expression so that it renders inside parentheses.
class Parenthetical(Namespaced, SelectGrain, BaseModel):
    content: "Expr"

    def __str__(self):
        return self.__repr__()

    def __add__(self, other) -> Union["Parenthetical", "Conditional"]:
        """Combine with another condition using AND.

        Adding ``None`` returns this object unchanged. Adding a Comparison,
        Conditional or Parenthetical returns a Conditional joining both sides.

        Raises:
            ValueError: if ``other`` is any other kind of object.
        """
        if other is None:
            return self
        if not isinstance(other, (Comparison, Conditional, Parenthetical)):
            raise ValueError(f"Cannot add {self.__class__} and {type(other)}")
        return Conditional(left=self, right=other, operator=BooleanOperator.AND)

    def __repr__(self):
        return f"({str(self.content)})"

    def with_namespace(self, namespace: str):
        """Return a new Parenthetical with namespaced content where supported."""
        inner = self.content
        if isinstance(inner, Namespaced):
            inner = inner.with_namespace(namespace)
        return Parenthetical(content=inner)

    def with_select_grain(self, grain: Grain):
        """Return a new Parenthetical with the select grain applied to content."""
        inner = self.content
        if isinstance(inner, SelectGrain):
            inner = inner.with_select_grain(grain)
        return Parenthetical(content=inner)

    @property
    def concept_arguments(self) -> List[Concept]:
        """Return the concept arguments of the content as a new list.

        Content exposing ``concept_arguments`` contributes those; a bare
        Concept contributes itself; anything else contributes nothing.
        """
        inner = self.content
        if hasattr(inner, "concept_arguments"):
            return [*inner.concept_arguments]
        if isinstance(inner, Concept):
            return [inner]
        return []

    @property
    def input(self):
        """Return the content's ``input`` as a new list, or an empty list."""
        inner = self.content
        return [*inner.input] if hasattr(inner, "input") else []
3061
+
3062
+
3063
# Pairs a select statement with the datasource named alongside it.
# Identity and address are both delegated to that datasource.
class PersistStatement(BaseModel):
    datasource: Datasource
    select: SelectStatement

    @property
    def identifier(self):
        """Return the identifier of the target datasource."""
        target = self.datasource
        return target.identifier

    @property
    def address(self):
        """Return the address of the target datasource."""
        target = self.datasource
        return target.address
3074
+
3075
+
3076
# Holds the thing to be shown: a select statement, a persist statement,
# or a ShowCategory value.
class ShowStatement(BaseModel):
    content: SelectStatement | PersistStatement | ShowCategory
3078
+
3079
+
3080
# Every kind of value accepted wherever an "Expr" annotation is used:
# Python scalars and lists first, then the expression model classes.
Expr = (
    bool | int | str | float | list
    | WindowItem | FilterItem | Concept | Comparison | Conditional
    | Parenthetical | Function | AggregateWrapper
)


# Rebuild the models now that Expr and all model classes are defined,
# presumably so string annotations such as "Expr" can be resolved.
# The order matches the original call sequence exactly, including the
# second Grain rebuild at the end.
for _model_to_rebuild in (
    Concept,
    Grain,
    WindowItem,
    WindowItemOrder,
    FilterItem,
    Comparison,
    Conditional,
    Parenthetical,
    WhereClause,
    ImportStatement,
    CaseWhen,
    CaseElse,
    SelectStatement,
    CTE,
    BaseJoin,
    QueryDatasource,
    ProcessedQuery,
    ProcessedQueryPersist,
    InstantiatedUnnestJoin,
    UndefinedConcept,
    Function,
    Grain,
):
    _model_to_rebuild.model_rebuild()
# Do not leave the loop variable behind in the module namespace.
del _model_to_rebuild
3119
+
3120
+
3121
def arg_to_datatype(arg) -> DataType | ListType | StructType | MapType:
    """Work out the datatype produced by a single expression argument.

    Functions and aggregate wrappers report their function's output datatype,
    concepts report their own datatype, Python scalars map to the matching
    DataType member, list wrappers become a ListType of their element type,
    parentheticals defer to their content, and window items are integers for
    rank / row-number windows and otherwise defer to their content.

    Raises:
        ValueError: if ``arg`` is none of the handled kinds.
    """
    if isinstance(arg, Function):
        return arg.output_datatype
    if isinstance(arg, Concept):
        return arg.datatype

    # bool is listed ahead of int because bool is a subclass of int.
    scalar_datatypes = (
        (bool, DataType.BOOL),
        (int, DataType.INTEGER),
        (str, DataType.STRING),
        (float, DataType.FLOAT),
    )
    for python_type, datatype in scalar_datatypes:
        if isinstance(arg, python_type):
            return datatype

    if isinstance(arg, ListWrapper):
        return ListType(type=arg.type)
    if isinstance(arg, AggregateWrapper):
        return arg.function.output_datatype
    if isinstance(arg, Parenthetical):
        return arg_to_datatype(arg.content)
    if isinstance(arg, WindowItem):
        is_ranking = arg.type in (WindowType.RANK, WindowType.ROW_NUMBER)
        return DataType.INTEGER if is_ranking else arg_to_datatype(arg.content)

    raise ValueError(f"Cannot parse arg datatype for arg of raw type {type(arg)}")