datalex-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. datalex_cli/__init__.py +1 -0
  2. datalex_cli/datalex_cli.py +658 -0
  3. datalex_cli/main.py +2925 -0
  4. datalex_cli-0.1.1.dist-info/METADATA +228 -0
  5. datalex_cli-0.1.1.dist-info/RECORD +64 -0
  6. datalex_cli-0.1.1.dist-info/WHEEL +5 -0
  7. datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
  8. datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  9. datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
  10. datalex_core/__init__.py +94 -0
  11. datalex_core/_schemas/datalex/common.schema.json +127 -0
  12. datalex_core/_schemas/datalex/domain.schema.json +24 -0
  13. datalex_core/_schemas/datalex/entity.schema.json +158 -0
  14. datalex_core/_schemas/datalex/model.schema.json +141 -0
  15. datalex_core/_schemas/datalex/policy.schema.json +70 -0
  16. datalex_core/_schemas/datalex/project.schema.json +82 -0
  17. datalex_core/_schemas/datalex/snippet.schema.json +24 -0
  18. datalex_core/_schemas/datalex/source.schema.json +104 -0
  19. datalex_core/_schemas/datalex/term.schema.json +30 -0
  20. datalex_core/canonical.py +166 -0
  21. datalex_core/completion.py +204 -0
  22. datalex_core/connectors/__init__.py +39 -0
  23. datalex_core/connectors/base.py +417 -0
  24. datalex_core/connectors/bigquery.py +229 -0
  25. datalex_core/connectors/databricks.py +262 -0
  26. datalex_core/connectors/mysql.py +266 -0
  27. datalex_core/connectors/postgres.py +309 -0
  28. datalex_core/connectors/redshift.py +298 -0
  29. datalex_core/connectors/snowflake.py +336 -0
  30. datalex_core/connectors/sqlserver.py +425 -0
  31. datalex_core/datalex/__init__.py +26 -0
  32. datalex_core/datalex/diff.py +188 -0
  33. datalex_core/datalex/errors.py +85 -0
  34. datalex_core/datalex/loader.py +512 -0
  35. datalex_core/datalex/migrate_layout.py +382 -0
  36. datalex_core/datalex/parse_cache.py +102 -0
  37. datalex_core/datalex/project.py +214 -0
  38. datalex_core/datalex/types.py +224 -0
  39. datalex_core/dbt/__init__.py +18 -0
  40. datalex_core/dbt/emit.py +344 -0
  41. datalex_core/dbt/manifest.py +329 -0
  42. datalex_core/dbt/profiles.py +185 -0
  43. datalex_core/dbt/sync.py +279 -0
  44. datalex_core/dbt/warehouse.py +215 -0
  45. datalex_core/dialects/__init__.py +15 -0
  46. datalex_core/dialects/_common.py +48 -0
  47. datalex_core/dialects/base.py +47 -0
  48. datalex_core/dialects/postgres.py +164 -0
  49. datalex_core/dialects/registry.py +36 -0
  50. datalex_core/dialects/snowflake.py +129 -0
  51. datalex_core/diffing.py +358 -0
  52. datalex_core/docs_generator.py +797 -0
  53. datalex_core/doctor.py +181 -0
  54. datalex_core/generators.py +478 -0
  55. datalex_core/importers.py +1176 -0
  56. datalex_core/issues.py +23 -0
  57. datalex_core/loader.py +21 -0
  58. datalex_core/migrate.py +316 -0
  59. datalex_core/modeling.py +679 -0
  60. datalex_core/packages.py +430 -0
  61. datalex_core/policy.py +1037 -0
  62. datalex_core/resolver.py +456 -0
  63. datalex_core/schema.py +54 -0
  64. datalex_core/semantic.py +1561 -0
@@ -0,0 +1,214 @@
1
+ """DataLexProject — the loaded, validated graph.
2
+
3
+ Holds every kind in its own dict keyed by a stable ID (entity keys are
4
+ `<layer>:<name>` because the same logical name can appear at each of the three
5
+ layers). Provides convenience lookups and a `resolve()` pass that:
6
+ * Inlines snippet `use:` directives on columns.
7
+ * Validates `logical:` back-references from physical to logical entities.
8
+ * Flags dangling term/entity/source/model references.
9
+
10
+ Kept as a thin orchestration layer over the dict-of-dict representation — dialect
11
+ plugins and diff engine operate on dicts directly, so the Python object is a
12
+ convenience, not a requirement.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass, field
18
+
19
+ from pathlib import Path
20
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
21
+
22
+ from datalex_core.datalex.errors import DataLexError, DataLexErrorBag, SourceLocation
23
+
24
+
25
@dataclass
class DataLexProject:
    """In-memory view of a loaded DataLex project.

    Each kind of object lives in its own dict. Entities are keyed by
    ``"<layer>:<name>"`` (see `entity`); `file_of` maps ``(kind, key)`` to the
    file an object was loaded from and is only consulted when building error
    locations. Problems discovered by `resolve()` are appended to `errors`
    rather than raised.
    """

    root: Path
    manifest: Optional[Dict[str, Any]]
    entities: Dict[str, Dict[str, Any]]
    sources: Dict[str, Dict[str, Any]]
    models: Dict[str, Dict[str, Any]]
    terms: Dict[str, Dict[str, Any]]
    domains: Dict[str, Dict[str, Any]]
    policies: Dict[str, Dict[str, Any]]
    snippets: Dict[str, Dict[str, Any]]
    file_of: Dict[Tuple[str, str], str]
    errors: DataLexErrorBag
    # Phase C: imported packages. Each key is the package's alias; value is a
    # loaded sub-project. Sub-projects are validated independently.
    imports: Dict[str, "DataLexProject"] = field(default_factory=dict)

    # ---------- lookups ----------

    def entity(self, name: str, layer: str = "physical") -> Optional[Dict[str, Any]]:
        """Return the local entity stored under ``"<layer>:<name>"``, or None."""
        return self.entities.get(f"{layer}:{name}")

    def imported_entity(
        self, alias: str, name: str, layer: str = "physical"
    ) -> Optional[Dict[str, Any]]:
        """Look up an entity inside an imported package by alias.

        Returns None when the alias is not in `imports` or the sub-project has
        no such entity at `layer`.
        """
        sub = self.imports.get(alias)
        if sub is None:
            return None
        return sub.entity(name, layer=layer)

    def resolve_cross_package(
        self, reference: str, layer: str = "physical"
    ) -> Optional[Dict[str, Any]]:
        """Resolve `@alias.entity_name` style references against imported packages.

        Plain names without `@alias.` fall back to local entities so callers can
        use a single lookup path. A reference that starts with `@` but has no
        `.` separator resolves to None.
        """
        if reference.startswith("@"):
            try:
                alias, name = reference[1:].split(".", 1)
            except ValueError:
                # No "." after the alias: nothing to look up.
                return None
            return self.imported_entity(alias, name, layer=layer)
        return self.entity(reference, layer=layer)

    def iter_entities(self, layer: Optional[str] = None) -> Iterable[Dict[str, Any]]:
        """Yield entities in sorted key order, optionally restricted to one layer."""
        prefix = None if layer is None else f"{layer}:"
        # Keys are unique, so sorting the keys alone gives the same order as
        # sorting the (key, entity) pairs.
        for key in sorted(self.entities):
            if prefix is None or key.startswith(prefix):
                yield self.entities[key]

    def physical_entities(self, dialect: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return physical-layer entities, optionally only those whose `dialect` matches."""
        return [
            ent
            for ent in self.iter_entities(layer="physical")
            if dialect is None or ent.get("dialect") == dialect
        ]

    # ---------- resolution ----------

    def resolve(self) -> None:
        """Run post-load resolution: snippet expansion, back-reference checks."""
        self._expand_snippets()
        self._check_logical_backrefs()
        self._check_term_refs()
        self._check_reference_targets()

    def _expand_snippets(self) -> None:
        """Inline `use: <snippet>` on columns with snippet.apply content.

        Merge semantics: column keys win over snippet keys. Snippet fields fill in
        missing keys only. This is conservative — users opt in explicitly.

        The `use` key is removed from every column that carries it. Values taken
        from a snippet are deep-copied so that columns never share mutable
        lists/dicts with each other or with the snippet definition.
        """
        # Function-scope import keeps this block self-contained.
        from copy import deepcopy

        for ent in self.entities.values():
            for col in ent.get("columns", []) or []:
                snippet_name = col.pop("use", None)
                if not snippet_name:
                    continue
                snip = self.snippets.get(snippet_name)
                if snip is None:
                    self.errors.add(
                        DataLexError(
                            code="SNIPPET_NOT_FOUND",
                            message=f"Column '{col.get('name')}' uses unknown snippet '{snippet_name}'",
                            location=self._loc_for("entity", ent),
                            suggested_fix=f"Create .datalex/snippets/{snippet_name}.yaml or remove the use: directive.",
                        )
                    )
                    continue
                apply = snip.get("apply", {}) or {}
                for k, v in apply.items():
                    if k not in col:
                        col[k] = deepcopy(v)

    def _check_logical_backrefs(self) -> None:
        """Warn about physical entities whose `logical:` target is not loaded."""
        for key, ent in self.entities.items():
            if not key.startswith("physical:"):
                continue
            logical_name = ent.get("logical")
            if not logical_name:
                continue
            if f"logical:{logical_name}" not in self.entities:
                self.errors.add(
                    DataLexError(
                        code="LOGICAL_BACKREF",
                        severity="warn",
                        message=f"Physical entity '{ent.get('name')}' references logical '{logical_name}' which does not exist.",
                        location=self._loc_for("entity", ent),
                        suggested_fix=f"Create models/logical/{logical_name}.yaml or remove the logical: reference.",
                    )
                )

    @staticmethod
    def _term_name(ref: str) -> str:
        """Strip an optional leading `term:` prefix from a term reference."""
        return ref.split(":", 1)[1] if ref.startswith("term:") else ref

    def _report_unknown_term(self, subject: str, name: str, ent: Dict[str, Any]) -> None:
        """Record a TERM_NOT_FOUND warning for `subject` (an entity or column label)."""
        self.errors.add(
            DataLexError(
                code="TERM_NOT_FOUND",
                severity="warn",
                message=f"{subject} references unknown term '{name}'",
                location=self._loc_for("entity", ent),
                suggested_fix=f"Create glossary/{name}.yaml or remove the term reference.",
            )
        )

    def _check_term_refs(self) -> None:
        """Warn about entity- and column-level `terms` entries missing from `self.terms`."""
        term_names = set(self.terms.keys())
        for ent in self.entities.values():
            for t in ent.get("terms", []) or []:
                name = self._term_name(t)
                if name not in term_names:
                    self._report_unknown_term(f"Entity '{ent.get('name')}'", name, ent)
            for col in ent.get("columns", []) or []:
                for t in col.get("terms", []) or []:
                    name = self._term_name(t)
                    if name not in term_names:
                        self._report_unknown_term(
                            f"Column '{ent.get('name')}.{col.get('name')}'", name, ent
                        )

    def _check_reference_targets(self) -> None:
        """Flag column `references` whose target entity cannot be found.

        Targets are looked up at the referencing entity's own layer (default
        "physical"). A target written as `@alias.entity` is resolved against the
        imported package via `resolve_cross_package`; any other target must be a
        local entity.
        """
        for ent in self.entities.values():
            for col in ent.get("columns", []) or []:
                ref = col.get("references")
                if not ref:
                    continue
                target_entity_name = ref.get("entity")
                layer = ent.get("layer", "physical")
                if not target_entity_name:
                    continue
                if isinstance(target_entity_name, str) and target_entity_name.startswith("@"):
                    found = self.resolve_cross_package(target_entity_name, layer=layer) is not None
                else:
                    found = f"{layer}:{target_entity_name}" in self.entities
                if not found:
                    self.errors.add(
                        DataLexError(
                            code="REF_TARGET_MISSING",
                            message=f"Column '{ent.get('name')}.{col.get('name')}' references missing entity '{target_entity_name}' at layer '{layer}'",
                            location=self._loc_for("entity", ent),
                            suggested_fix="Check the target entity name and layer.",
                        )
                    )

    def _loc_for(self, kind: str, obj: Dict[str, Any]) -> SourceLocation:
        """Build a SourceLocation for `obj`, falling back to the project root.

        Entities are looked up in `file_of` under ``"<layer>:<name>"``; every
        other kind is looked up by bare name.
        """
        name = obj.get("name", "")
        layer = obj.get("layer", "physical") if kind == "entity" else ""
        key = f"{layer}:{name}" if kind == "entity" else name
        path = self.file_of.get((kind, key), str(self.root))
        return SourceLocation(file=path)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict suitable for JSON serialization.

        Imported packages are summarized (root plus sorted entity and term keys)
        rather than embedded in full.
        """
        return {
            "root": str(self.root),
            "manifest": self.manifest,
            "entities": self.entities,
            "sources": self.sources,
            "models": self.models,
            "terms": self.terms,
            "domains": self.domains,
            "policies": self.policies,
            "snippets": self.snippets,
            "imports": {
                alias: {
                    "root": str(sub.root),
                    "entities": sorted(sub.entities.keys()),
                    "terms": sorted(sub.terms.keys()),
                }
                for alias, sub in self.imports.items()
            },
            "errors": self.errors.to_list(),
        }
@@ -0,0 +1,224 @@
1
+ """DataLex logical type system.
2
+
3
+ Grammar:
4
+ type := primitive | parameterized | composite
5
+ primitive := string | text | integer | bigint | float | boolean
6
+ | date | timestamp | timestamp_tz | interval
7
+ | uuid | json | binary | decimal
8
+ parameterized := primitive '(' INT [',' INT] ')' e.g. decimal(18,4), string(255)
9
+ composite := 'array' '<' type '>'
10
+ | 'map' '<' type ',' type '>'
11
+ | 'struct' '<' field (',' field)* '>'
12
+ field := ident ':' type
13
+
14
+ The parser is recursive-descent and deterministic; `str(parsed)` round-trips to a
15
+ canonical form used by dialect plugins and the diff engine.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from dataclasses import dataclass, field
21
+ from typing import List, Optional, Tuple
22
+
23
+ from datalex_core.datalex.errors import DataLexError
24
+
25
+
26
+ PRIMITIVES = frozenset({
27
+ "string", "text", "integer", "bigint", "float", "boolean",
28
+ "date", "timestamp", "timestamp_tz", "interval",
29
+ "uuid", "json", "binary", "decimal",
30
+ })
31
+
32
+ COMPOSITE_KEYWORDS = frozenset({"array", "map", "struct"})
33
+
34
+
35
+ @dataclass(frozen=True)
36
+ class LogicalType:
37
+ """In-memory representation of a parsed logical type.
38
+
39
+ `kind` is one of PRIMITIVES or COMPOSITE_KEYWORDS.
40
+ `params` is the tuple of numeric parameters (e.g. (18, 4) for decimal(18,4)).
41
+ `children` is the tuple of child types for array/map.
42
+ `fields` is the tuple of (name, type) pairs for struct.
43
+ """
44
+ kind: str
45
+ params: Tuple[int, ...] = ()
46
+ children: Tuple["LogicalType", ...] = ()
47
+ fields: Tuple[Tuple[str, "LogicalType"], ...] = ()
48
+
49
+ def render(self) -> str:
50
+ if self.kind == "array":
51
+ return f"array<{self.children[0].render()}>"
52
+ if self.kind == "map":
53
+ return f"map<{self.children[0].render()},{self.children[1].render()}>"
54
+ if self.kind == "struct":
55
+ inner = ",".join(f"{n}:{t.render()}" for n, t in self.fields)
56
+ return f"struct<{inner}>"
57
+ if self.params:
58
+ return f"{self.kind}({','.join(str(p) for p in self.params)})"
59
+ return self.kind
60
+
61
+ def is_composite(self) -> bool:
62
+ return self.kind in COMPOSITE_KEYWORDS
63
+
64
+ def __str__(self) -> str:
65
+ return self.render()
66
+
67
+
68
+ class _Tokenizer:
69
+ """Tiny tokenizer for the logical type grammar."""
70
+
71
+ def __init__(self, text: str):
72
+ self.text = text
73
+ self.pos = 0
74
+ self.n = len(text)
75
+
76
+ def peek(self) -> str:
77
+ self._skip_ws()
78
+ return self.text[self.pos] if self.pos < self.n else ""
79
+
80
+ def consume(self, ch: str) -> bool:
81
+ self._skip_ws()
82
+ if self.pos < self.n and self.text[self.pos] == ch:
83
+ self.pos += 1
84
+ return True
85
+ return False
86
+
87
+ def expect(self, ch: str) -> None:
88
+ if not self.consume(ch):
89
+ raise DataLexError(
90
+ code="TYPE_PARSE",
91
+ message=f"Expected '{ch}' at position {self.pos} in type '{self.text}'",
92
+ suggested_fix=f"Check the type syntax near '{self.text[max(0, self.pos-8):self.pos+8]}'",
93
+ )
94
+
95
+ def read_ident(self) -> str:
96
+ self._skip_ws()
97
+ start = self.pos
98
+ while self.pos < self.n and (self.text[self.pos].isalnum() or self.text[self.pos] == "_"):
99
+ self.pos += 1
100
+ if start == self.pos:
101
+ raise DataLexError(
102
+ code="TYPE_PARSE",
103
+ message=f"Expected identifier at position {self.pos} in type '{self.text}'",
104
+ )
105
+ return self.text[start:self.pos]
106
+
107
+ def read_int(self) -> int:
108
+ self._skip_ws()
109
+ start = self.pos
110
+ while self.pos < self.n and self.text[self.pos].isdigit():
111
+ self.pos += 1
112
+ if start == self.pos:
113
+ raise DataLexError(code="TYPE_PARSE", message=f"Expected integer in type '{self.text}'")
114
+ return int(self.text[start:self.pos])
115
+
116
+ def eof(self) -> bool:
117
+ self._skip_ws()
118
+ return self.pos >= self.n
119
+
120
+ def _skip_ws(self) -> None:
121
+ while self.pos < self.n and self.text[self.pos] in " \t\n":
122
+ self.pos += 1
123
+
124
+
125
+ def parse_type(text: str) -> LogicalType:
126
+ """Parse a DataLex logical type string into a LogicalType.
127
+
128
+ Raises DataLexError(code=TYPE_PARSE) on malformed input. Unknown primitive names
129
+ are accepted (returned as a raw kind with no params) so dialect plugins can accept
130
+ dialect-specific types as escape hatches; validation of known primitives happens
131
+ in the validator pass.
132
+ """
133
+ if not isinstance(text, str) or not text.strip():
134
+ raise DataLexError(code="TYPE_PARSE", message="Empty type string")
135
+ tok = _Tokenizer(text.strip())
136
+ parsed = _parse(tok)
137
+ if not tok.eof():
138
+ raise DataLexError(
139
+ code="TYPE_PARSE",
140
+ message=f"Trailing characters after type in '{text}'",
141
+ )
142
+ return parsed
143
+
144
+
145
+ def _parse(tok: _Tokenizer) -> LogicalType:
146
+ ident = tok.read_ident().lower()
147
+
148
+ if ident == "array":
149
+ tok.expect("<")
150
+ inner = _parse(tok)
151
+ tok.expect(">")
152
+ return LogicalType(kind="array", children=(inner,))
153
+
154
+ if ident == "map":
155
+ tok.expect("<")
156
+ k = _parse(tok)
157
+ tok.expect(",")
158
+ v = _parse(tok)
159
+ tok.expect(">")
160
+ return LogicalType(kind="map", children=(k, v))
161
+
162
+ if ident == "struct":
163
+ tok.expect("<")
164
+ fields: List[Tuple[str, LogicalType]] = []
165
+ while True:
166
+ name = tok.read_ident()
167
+ tok.expect(":")
168
+ ftype = _parse(tok)
169
+ fields.append((name, ftype))
170
+ if tok.consume(","):
171
+ continue
172
+ break
173
+ tok.expect(">")
174
+ return LogicalType(kind="struct", fields=tuple(fields))
175
+
176
+ # primitive or parameterized
177
+ params: Tuple[int, ...] = ()
178
+ if tok.peek() == "(":
179
+ tok.expect("(")
180
+ params_list: List[int] = [tok.read_int()]
181
+ while tok.consume(","):
182
+ params_list.append(tok.read_int())
183
+ tok.expect(")")
184
+ params = tuple(params_list)
185
+
186
+ return LogicalType(kind=ident, params=params)
187
+
188
+
189
+ def is_known_primitive(kind: str) -> bool:
190
+ return kind in PRIMITIVES
191
+
192
+
193
+ def validate_type_string(text: str) -> Optional[DataLexError]:
194
+ """Return a DataLexError if the type string is malformed or uses unknown primitives
195
+ in a shape that is clearly wrong (e.g. composite keyword without generics)."""
196
+ try:
197
+ t = parse_type(text)
198
+ except DataLexError as e:
199
+ return e
200
+
201
+ return _validate_tree(t)
202
+
203
+
204
+ def _validate_tree(t: LogicalType) -> Optional[DataLexError]:
205
+ if t.kind in COMPOSITE_KEYWORDS:
206
+ for c in t.children:
207
+ err = _validate_tree(c)
208
+ if err:
209
+ return err
210
+ for _, ft in t.fields:
211
+ err = _validate_tree(ft)
212
+ if err:
213
+ return err
214
+ return None
215
+
216
+ if t.kind not in PRIMITIVES:
217
+ # allow as pass-through so dialects can accept native types, but flag a warning
218
+ return DataLexError(
219
+ code="TYPE_UNKNOWN_PRIMITIVE",
220
+ severity="warn",
221
+ message=f"Unknown logical primitive '{t.kind}' — will be passed through to the dialect verbatim",
222
+ suggested_fix=f"Use one of: {', '.join(sorted(PRIMITIVES))} — or provide a per-dialect physical override.",
223
+ )
224
+ return None
@@ -0,0 +1,18 @@
1
+ """DataLex <-> dbt integration: emit dbt YAML, import manifest.json, sync live warehouse."""
2
+
3
+ from datalex_core.dbt.emit import emit_dbt, build_sources_yaml, build_models_yaml, EmitReport
4
+ from datalex_core.dbt.manifest import import_manifest, write_import_result, ImportResult
5
+ from datalex_core.dbt.sync import sync_dbt_project, SyncReport, TableSyncRecord
6
+
7
+ __all__ = [
8
+ "emit_dbt",
9
+ "build_sources_yaml",
10
+ "build_models_yaml",
11
+ "EmitReport",
12
+ "import_manifest",
13
+ "write_import_result",
14
+ "ImportResult",
15
+ "sync_dbt_project",
16
+ "SyncReport",
17
+ "TableSyncRecord",
18
+ ]