datalex-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalex_cli/__init__.py +1 -0
- datalex_cli/datalex_cli.py +658 -0
- datalex_cli/main.py +2925 -0
- datalex_cli-0.1.1.dist-info/METADATA +228 -0
- datalex_cli-0.1.1.dist-info/RECORD +64 -0
- datalex_cli-0.1.1.dist-info/WHEEL +5 -0
- datalex_cli-0.1.1.dist-info/entry_points.txt +2 -0
- datalex_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- datalex_cli-0.1.1.dist-info/top_level.txt +2 -0
- datalex_core/__init__.py +94 -0
- datalex_core/_schemas/datalex/common.schema.json +127 -0
- datalex_core/_schemas/datalex/domain.schema.json +24 -0
- datalex_core/_schemas/datalex/entity.schema.json +158 -0
- datalex_core/_schemas/datalex/model.schema.json +141 -0
- datalex_core/_schemas/datalex/policy.schema.json +70 -0
- datalex_core/_schemas/datalex/project.schema.json +82 -0
- datalex_core/_schemas/datalex/snippet.schema.json +24 -0
- datalex_core/_schemas/datalex/source.schema.json +104 -0
- datalex_core/_schemas/datalex/term.schema.json +30 -0
- datalex_core/canonical.py +166 -0
- datalex_core/completion.py +204 -0
- datalex_core/connectors/__init__.py +39 -0
- datalex_core/connectors/base.py +417 -0
- datalex_core/connectors/bigquery.py +229 -0
- datalex_core/connectors/databricks.py +262 -0
- datalex_core/connectors/mysql.py +266 -0
- datalex_core/connectors/postgres.py +309 -0
- datalex_core/connectors/redshift.py +298 -0
- datalex_core/connectors/snowflake.py +336 -0
- datalex_core/connectors/sqlserver.py +425 -0
- datalex_core/datalex/__init__.py +26 -0
- datalex_core/datalex/diff.py +188 -0
- datalex_core/datalex/errors.py +85 -0
- datalex_core/datalex/loader.py +512 -0
- datalex_core/datalex/migrate_layout.py +382 -0
- datalex_core/datalex/parse_cache.py +102 -0
- datalex_core/datalex/project.py +214 -0
- datalex_core/datalex/types.py +224 -0
- datalex_core/dbt/__init__.py +18 -0
- datalex_core/dbt/emit.py +344 -0
- datalex_core/dbt/manifest.py +329 -0
- datalex_core/dbt/profiles.py +185 -0
- datalex_core/dbt/sync.py +279 -0
- datalex_core/dbt/warehouse.py +215 -0
- datalex_core/dialects/__init__.py +15 -0
- datalex_core/dialects/_common.py +48 -0
- datalex_core/dialects/base.py +47 -0
- datalex_core/dialects/postgres.py +164 -0
- datalex_core/dialects/registry.py +36 -0
- datalex_core/dialects/snowflake.py +129 -0
- datalex_core/diffing.py +358 -0
- datalex_core/docs_generator.py +797 -0
- datalex_core/doctor.py +181 -0
- datalex_core/generators.py +478 -0
- datalex_core/importers.py +1176 -0
- datalex_core/issues.py +23 -0
- datalex_core/loader.py +21 -0
- datalex_core/migrate.py +316 -0
- datalex_core/modeling.py +679 -0
- datalex_core/packages.py +430 -0
- datalex_core/policy.py +1037 -0
- datalex_core/resolver.py +456 -0
- datalex_core/schema.py +54 -0
- datalex_core/semantic.py +1561 -0
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
"""One-shot migrator: v3 single-model YAML → DataLex file-per-entity layout.
|
|
2
|
+
|
|
3
|
+
Translates a DataLex v3 `*.model.yaml` file (the current "one big
|
|
4
|
+
model" shape) into the DataLex spec layout:
|
|
5
|
+
|
|
6
|
+
datalex.yaml # project manifest (created if missing)
|
|
7
|
+
glossary/<term>.yaml # one file per glossary term
|
|
8
|
+
models/physical/<dialect>/ # one file per entity, layered by physical
|
|
9
|
+
<entity_name>.yaml
|
|
10
|
+
|
|
11
|
+
Rules applied during translation:
|
|
12
|
+
* Entity names are lowered to snake_case; the original PascalCase name is
|
|
13
|
+
preserved in `physical_name:` so DDL round-trips exactly.
|
|
14
|
+
* v3 `fields[]` -> DataLex `columns[]`.
|
|
15
|
+
* v3 top-level `relationships[]` are translated into per-column
|
|
16
|
+
`references:` on the child side (DataLex canonical form). The child side
|
|
17
|
+
is inferred from the cardinality arrow.
|
|
18
|
+
* v3 top-level `indexes[]` are attached to their owning entity.
|
|
19
|
+
* v3 `glossary[]` is split into one file per term under `glossary/`.
|
|
20
|
+
* Governance classification (PII/PHI/etc.) is attached as column
|
|
21
|
+
`sensitivity:` where a column name matches a classified field; entity-level
|
|
22
|
+
classifications are intended to live under `meta.datalex.classification` (not yet emitted by `migrate_project`).
|
|
23
|
+
|
|
24
|
+
The migrator is non-destructive: it writes the new tree alongside the existing
|
|
25
|
+
v3 files. The user can commit both, verify equivalence, then delete the v3 copy.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import re
|
|
31
|
+
from dataclasses import dataclass, field
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
34
|
+
|
|
35
|
+
import yaml
|
|
36
|
+
|
|
37
|
+
from datalex_core.loader import load_yaml_model
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class MigrationReport:
    """Outcome of one `migrate_project` run.

    Holds the per-kind file counters, any warnings raised along the way and
    the list of every path handed to the writer (populated in dry-run mode
    as well).
    """

    project_root: Path  # root directory of the generated tree
    manifest_written: bool  # set when the manifest was handed to the writer
    entities_written: int
    terms_written: int
    domains_written: int
    warnings: List[str] = field(default_factory=list)
    files: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Return the report as a newline-joined, human-readable text block."""
        manifest_state = "created" if self.manifest_written else "unchanged"
        lines = [
            "DataLex migration complete:",
            f" project root: {self.project_root}",
            f" manifest: {manifest_state}",
            f" entity files: {self.entities_written}",
            f" glossary files: {self.terms_written}",
            f" domain files: {self.domains_written}",
        ]
        if self.warnings:
            # One header line with the count, then one bullet per warning.
            lines.append(f" warnings: {len(self.warnings)}")
            lines.extend(f" - {w}" for w in self.warnings)
        return "\n".join(lines)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def migrate_project(
    v3_model_path: str,
    output_root: Optional[str] = None,
    default_dialect: str = "postgres",
    dry_run: bool = False,
) -> MigrationReport:
    """Migrate a single v3 `*.model.yaml` file to a DataLex project tree.

    Args:
        v3_model_path: path of the v3 model file to read.
        output_root: where to write the new tree. Defaults to the directory
            containing ``v3_model_path``.
        default_dialect: dialect recorded on each entity and in
            ``datalex.yaml``; it is also the directory name used under
            ``models/physical/``.
        dry_run: compute the migration plan and record the file paths in the
            report without writing anything.

    Returns:
        A ``MigrationReport`` with counters, warnings and the planned paths.

    Raises:
        ValueError: if the loaded document has no top-level ``model`` or
            ``entities`` key.
    """
    src = Path(v3_model_path).resolve()
    root = Path(output_root).resolve() if output_root else src.parent

    v3 = load_yaml_model(str(src))
    if "model" not in v3 or "entities" not in v3:
        raise ValueError(
            f"{src} does not look like a v3 model file (missing 'model' or 'entities' top-level key)"
        )

    report = MigrationReport(
        project_root=root,
        manifest_written=False,
        entities_written=0,
        terms_written=0,
        domains_written=0,
    )

    entities = v3.get("entities", []) or []
    rel_by_child: Dict[Tuple[str, str], Dict[str, Any]] = _index_relationships_by_child(
        v3.get("relationships", []) or [],
        entities,
    )

    governance = (v3.get("governance") or {}).get("classification") or {}
    domains_list = v3.get("domains", []) or []
    terms_list = v3.get("glossary", []) or []
    indexes = v3.get("indexes", []) or []

    # Paths already handed to the writer during this run. Two source objects
    # whose names normalise to the same snake_case name would otherwise
    # overwrite each other without any trace in the report.
    planned: set = set()

    def _emit(path: Path, doc: Dict[str, Any]) -> None:
        if path in planned:
            report.warnings.append(
                f"{path} is produced by more than one source object; the later one overwrites the earlier."
            )
        planned.add(path)
        _write_yaml(path, doc, dry_run=dry_run, report=report)

    # Write manifest only if one does not already exist.
    manifest_path = root / "datalex.yaml"
    manifest_doc = {
        "kind": "project",
        "name": v3["model"]["name"],
        "version": str(v3["model"].get("version", "1")),
        "description": v3["model"].get("description", ""),
        "dialects": [default_dialect],
        "default_dialect": default_dialect,
        "glossary": "glossary/**/*.yaml",
        "models": "models/**/*.yaml",
        "snippets": ".datalex/snippets/**/*.yaml",
    }

    if not manifest_path.exists():
        _emit(manifest_path, manifest_doc)
        report.manifest_written = True
    else:
        report.warnings.append(f"{manifest_path} exists; left untouched.")

    # Glossary: one file per term; terms without a usable name are skipped.
    for term in terms_list:
        name = _snake(term.get("term") or term.get("name") or "")
        if not name:
            continue
        doc: Dict[str, Any] = {
            "kind": "term",
            "name": name,
            "definition": term.get("definition", ""),
        }
        if term.get("owner"):
            doc["steward"] = term["owner"]
        if term.get("abbreviation"):
            doc["abbreviation"] = term["abbreviation"]
        if term.get("tags"):
            doc["tags"] = [str(t) for t in term["tags"]]
        _emit(root / "glossary" / f"{name}.yaml", doc)
        report.terms_written += 1

    # Domains: one file per named domain.
    for dom in domains_list:
        name = _snake(dom.get("name") or "")
        if not name:
            continue
        doc = {
            "kind": "domain",
            "name": name,
            "description": dom.get("description", ""),
        }
        _emit(root / "models" / "domains" / f"{name}.yaml", doc)
        report.domains_written += 1

    # Entities
    for ent in entities:
        orig_name = str(ent["name"])
        snake = _snake(orig_name)
        entity_doc: Dict[str, Any] = {
            "kind": "entity",
            "layer": "physical",
            "dialect": default_dialect,
            "name": snake,
        }
        if orig_name != snake:
            # Keep the original spelling so it is not lost by normalisation.
            entity_doc["physical_name"] = orig_name
        # Key order is kept stable because the YAML writer does not sort keys.
        for key in ("description", "owner", "schema", "database", "subject_area"):
            if ent.get(key):
                entity_doc[key] = ent[key]
        if ent.get("tags"):
            entity_doc["tags"] = [_kebab(str(t)) for t in ent["tags"]]
        for key in ("partition_by", "cluster_by"):
            if ent.get(key):
                entity_doc[key] = ent[key]

        cls_for_entity = governance.get(orig_name, {}) or {}

        # columns
        cols: List[Dict[str, Any]] = []
        for fld in ent.get("fields") or []:
            col: Dict[str, Any] = {
                "name": fld["name"],
                "type": _translate_type(fld.get("type", "string")),
            }
            if fld.get("description"):
                col["description"] = fld["description"]
            if fld.get("nullable") is not None:
                col["nullable"] = bool(fld["nullable"])
            for flag in ("primary_key", "unique"):
                if fld.get(flag):
                    col[flag] = True
            if fld.get("default") is not None:
                col["default"] = fld["default"]
            if fld.get("sensitivity"):
                col["sensitivity"] = fld["sensitivity"]
            if fld.get("deprecated"):
                col["deprecated"] = True
            if fld.get("examples"):
                col["examples"] = fld["examples"]
            if fld.get("tags"):
                col["tags"] = [_kebab(str(t)) for t in fld["tags"]]

            # Governance classification at the column; an explicit field-level
            # sensitivity always wins over the governance block.
            if isinstance(cls_for_entity, dict):
                sens = cls_for_entity.get(fld["name"])
                if sens and "sensitivity" not in col:
                    # str() guards against non-string YAML scalars.
                    col["sensitivity"] = str(sens).lower()

            # Check constraints become explicit constraint items
            if fld.get("check"):
                col.setdefault("constraints", []).append({
                    "type": "check",
                    "expression": fld["check"],
                })

            # v3 relationships → references
            rel = rel_by_child.get((orig_name, fld["name"]))
            if rel:
                col["references"] = rel

            cols.append(col)
        entity_doc["columns"] = cols

        # Indexes owned by this entity
        ent_indexes: List[Dict[str, Any]] = [
            {
                "name": idx["name"],
                "columns": list(idx.get("fields") or []),
                **({"unique": True} if idx.get("unique") else {}),
                **({"type": idx["type"]} if idx.get("type") else {}),
            }
            for idx in indexes
            if idx.get("entity") == orig_name
        ]
        if ent_indexes:
            entity_doc["indexes"] = ent_indexes

        # Preserve anything else we didn't explicitly migrate under
        # meta.datalex.v3. A v3 `physical_name` key is not copied here
        # (unchanged from the previous behaviour).
        preserved: Dict[str, Any] = {}
        for key in (
            "grain", "candidate_keys", "business_keys", "hash_key", "sla",
            "scd_type", "natural_key", "surrogate_key", "conformed",
            "subtype_of", "subtypes", "dimension_refs", "link_refs",
            "parent_entity", "hash_diff_fields", "load_timestamp_field",
            "record_source_field", "distribution", "storage", "template", "templates",
        ):
            if key in ent:
                preserved[key] = ent[key]
        if preserved:
            entity_doc.setdefault("meta", {}).setdefault("datalex", {})["v3"] = preserved

        # Directory layout: models/physical/<dialect>/<entity>.yaml.
        # subject_area is recorded on the entity document only; it does not
        # influence the output path.
        out_path = root / "models" / "physical" / default_dialect / f"{snake}.yaml"
        _emit(out_path, entity_doc)
        report.entities_written += 1

    return report
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _index_relationships_by_child(
    relationships: List[Dict[str, Any]],
    entities: List[Dict[str, Any]],
) -> Dict[Tuple[str, str], Dict[str, Any]]:
    """Return { (child_entity, child_field): references_dict }.

    Keys use the entity and field names exactly as written in the v3
    relationship; only the referenced (parent) entity name is snake_cased.

    v3 encodes relationships as top-level objects with from/to = "Entity.field".
    The child side (the one with the FK column) depends on cardinality:
      one_to_many  => 'to' is the many side, which is the child
      many_to_one  => 'from' is the many side, which is the child
      one_to_one   => prefer the non-PK side; fall back to 'from'
      anything else (including many_to_many) => skipped silently, because it
                      cannot be expressed as a single column reference

    Relationships whose from/to are missing or are not "Entity.field" strings
    are skipped. When several relationships resolve to the same child column,
    the last one wins.

    Args:
        relationships: the v3 top-level ``relationships`` list.
        entities: the v3 top-level ``entities`` list; consulted only to find
            which fields exist and which are flagged ``primary_key``.
    """
    # Index the declared fields so the one_to_one case can tell which side
    # carries the primary key.
    known_fields = set()
    pk_fields = set()
    for ent in entities:
        if not isinstance(ent, dict):
            continue
        ent_name = str(ent.get("name"))
        for fld in ent.get("fields") or []:
            if not isinstance(fld, dict):
                continue
            ident = (ent_name, fld.get("name"))
            known_fields.add(ident)
            if fld.get("primary_key"):
                pk_fields.add(ident)

    by_child: Dict[Tuple[str, str], Dict[str, Any]] = {}
    for rel in relationships:
        card = rel.get("cardinality")
        # `or ""` also covers an explicit null in the YAML.
        frm = rel.get("from") or ""
        to = rel.get("to") or ""
        if not isinstance(frm, str) or not isinstance(to, str):
            continue
        if "." not in frm or "." not in to:
            continue
        from_entity, from_field = frm.split(".", 1)
        to_entity, to_field = to.split(".", 1)
        from_side = (from_entity, from_field)
        to_side = (to_entity, to_field)

        if card == "many_to_one":
            child, parent = from_side, to_side
        elif card == "one_to_many":
            child, parent = to_side, from_side
        elif card == "one_to_one":
            # Flip only when it is unambiguous: 'from' is a PK and 'to' is a
            # declared, non-PK field. Otherwise keep 'from' as the child.
            to_is_plain_fk = to_side in known_fields and to_side not in pk_fields
            if from_side in pk_fields and to_is_plain_fk:
                child, parent = to_side, from_side
            else:
                child, parent = from_side, to_side
        else:  # many_to_many or unknown — not representable as a single FK
            continue

        ref: Dict[str, Any] = {
            "entity": _snake(parent[0]),
            "column": parent[1],
        }
        if rel.get("on_delete"):
            ref["on_delete"] = rel["on_delete"]
        if rel.get("on_update"):
            ref["on_update"] = rel["on_update"]
        ref["relationship"] = _rel_from_cardinality(card)
        by_child[child] = ref

    return by_child
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _rel_from_cardinality(card: Optional[str]) -> str:
|
|
328
|
+
return {
|
|
329
|
+
"many_to_one": "many_to_one",
|
|
330
|
+
"one_to_many": "many_to_one",
|
|
331
|
+
"one_to_one": "one_to_one",
|
|
332
|
+
"many_to_many": "many_to_many",
|
|
333
|
+
}.get(card or "", "many_to_one")
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
_V3_TYPE_MAP = {
|
|
337
|
+
"string": "string",
|
|
338
|
+
"text": "text",
|
|
339
|
+
"integer": "integer",
|
|
340
|
+
"int": "integer",
|
|
341
|
+
"bigint": "bigint",
|
|
342
|
+
"float": "float",
|
|
343
|
+
"double": "float",
|
|
344
|
+
"boolean": "boolean",
|
|
345
|
+
"bool": "boolean",
|
|
346
|
+
"date": "date",
|
|
347
|
+
"timestamp": "timestamp",
|
|
348
|
+
"datetime": "timestamp",
|
|
349
|
+
"timestamp_tz": "timestamp_tz",
|
|
350
|
+
"timestamptz": "timestamp_tz",
|
|
351
|
+
"uuid": "uuid",
|
|
352
|
+
"json": "json",
|
|
353
|
+
"jsonb": "json",
|
|
354
|
+
"binary": "binary",
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _translate_type(t: str) -> str:
|
|
359
|
+
raw = (t or "").strip()
|
|
360
|
+
lower = raw.lower()
|
|
361
|
+
if lower.startswith("decimal"):
|
|
362
|
+
return lower
|
|
363
|
+
return _V3_TYPE_MAP.get(lower, raw)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def _snake(name: str) -> str:
|
|
367
|
+
s1 = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
|
|
368
|
+
s2 = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
|
|
369
|
+
return re.sub(r"[^a-z0-9_]", "_", s2).strip("_") or name.lower()
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _kebab(name: str) -> str:
    """Return the kebab-case form of *name*: `_snake(name)` with `_` turned into `-`."""
    return "-".join(_snake(name).split("_"))
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def _write_yaml(path: Path, doc: Dict[str, Any], dry_run: bool, report: MigrationReport) -> None:
|
|
377
|
+
report.files.append(str(path))
|
|
378
|
+
if dry_run:
|
|
379
|
+
return
|
|
380
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
381
|
+
with path.open("w", encoding="utf-8") as f:
|
|
382
|
+
yaml.safe_dump(doc, f, sort_keys=False, default_flow_style=False, allow_unicode=True)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Content-hash parse cache for DataLex YAML files.
|
|
2
|
+
|
|
3
|
+
The loader parses YAML, validates against a JSON Schema, and strips source
|
|
4
|
+
marks — work that is deterministic given (file bytes, schema bytes). For a
|
|
5
|
+
10K-entity project, reparsing every file on every validate / diff / emit is
|
|
6
|
+
the dominant cost. This cache eliminates it.
|
|
7
|
+
|
|
8
|
+
Cache layout:
|
|
9
|
+
<cache_root>/<content_sha>__<schema_sha>.json
|
|
10
|
+
|
|
11
|
+
where:
|
|
12
|
+
content_sha is sha256(file bytes)
|
|
13
|
+
schema_sha is sha256(schema bytes for the file's declared kind)
|
|
14
|
+
|
|
15
|
+
The cached payload is a JSON dump of the already-validated, mark-stripped
|
|
16
|
+
document. We store JSON (not pickle) so the cache survives across Python
|
|
17
|
+
versions and is inspectable by humans.
|
|
18
|
+
|
|
19
|
+
Opt-in: set `DATALEX_CACHE=1` in the environment, or pass
|
|
20
|
+
`cache_dir=<path>` to `load_project`. Cache is keyed purely by content hash
|
|
21
|
+
so stale entries are never served — if the file changes by a single byte,
|
|
22
|
+
the cache key changes too.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import hashlib
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any, Dict, Optional
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ParseCache:
    """Disk-backed, content-addressed parse cache.

    Entries are stored as ``<content_sha>__<schema_sha>.json`` under
    ``cache_dir``. Writes go to a uniquely named temp file and are then moved
    into place with ``os.replace``, so concurrent writers never share a temp
    file and readers never see a partially written entry.
    Schema hash is lazily computed once per kind for this instance.

    ``hits`` / ``misses`` count the outcomes of ``get``; each call increments
    exactly one of them.
    """

    def __init__(self, cache_dir: Path, schemas_root: Path) -> None:
        """Remember both directories and create ``cache_dir`` if missing."""
        self.cache_dir = cache_dir
        self.schemas_root = schemas_root
        self._schema_hashes: Dict[str, str] = {}
        self.hits = 0
        self.misses = 0
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _schema_hash(self, kind: str) -> str:
        """Return sha256 of ``<kind>.schema.json``, or "no-schema" if absent."""
        if kind in self._schema_hashes:
            return self._schema_hashes[kind]
        path = self.schemas_root / f"{kind}.schema.json"
        if path.exists():
            digest = hashlib.sha256(path.read_bytes()).hexdigest()
        else:
            digest = "no-schema"
        self._schema_hashes[kind] = digest
        return digest

    def _key(self, content_sha: str, kind: str) -> Path:
        """Return the cache file path for a content hash and document kind."""
        schema_sha = self._schema_hash(kind)
        return self.cache_dir / f"{content_sha}__{schema_sha}.json"

    def get(self, path: Path, kind: str) -> Optional[Dict[str, Any]]:
        """Return the cached document for *path*, or None on a miss.

        A missing, unreadable or corrupt entry is a miss. Reading *path*
        itself may still raise ``OSError``.
        """
        content_sha = hashlib.sha256(path.read_bytes()).hexdigest()
        key = self._key(content_sha, kind)
        try:
            with key.open("r", encoding="utf-8") as f:
                doc = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            self.misses += 1
            return None
        # Count the hit only once the entry has actually been decoded, so a
        # corrupt entry is not reported as both a hit and a miss.
        self.hits += 1
        return doc

    def put(self, path: Path, kind: str, doc: Dict[str, Any]) -> None:
        """Store *doc* as the cache entry for the current bytes of *path*.

        Best-effort: I/O errors and documents that cannot be serialised to
        JSON leave the cache unchanged and raise nothing. Reading *path*
        itself may still raise ``OSError``.
        """
        content_sha = hashlib.sha256(path.read_bytes()).hexdigest()
        key = self._key(content_sha, kind)
        # Unique temp name per writer: a shared name would let two writers of
        # the same key truncate or unlink each other's in-progress file.
        tmp = key.with_name(f"{key.name}.{os.getpid()}.{os.urandom(4).hex()}.tmp")
        try:
            with tmp.open("w", encoding="utf-8") as f:
                json.dump(doc, f, sort_keys=True)
            os.replace(tmp, key)
        except (OSError, TypeError, ValueError):
            # TypeError/ValueError: doc holds values json cannot encode.
            try:
                tmp.unlink()
            except OSError:
                # Temp file was never created or is already gone.
                pass

    def summary(self) -> Dict[str, int]:
        """Return the hit and miss counters as a dict."""
        return {"hits": self.hits, "misses": self.misses}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def cache_enabled_from_env() -> bool:
    """Return True when `DATALEX_CACHE` is set to 1, true or yes (any letter case)."""
    flag = os.environ.get("DATALEX_CACHE", "")
    return flag.lower() in ("1", "true", "yes")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def default_cache_dir(project_root: Path) -> Path:
    """Return the per-project cache directory, `<project_root>/build/.cache/datalex-parse`.

    The cache is kept under the project rather than $HOME so it is scoped to
    the checkout and easy to wipe (`rm -rf build/`).
    """
    return project_root.joinpath("build", ".cache", "datalex-parse")
|