catalogkit-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ """Public package surface for catalogkit-core."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ._version import __version__
6
+ from .errors import CanonicalIdError, CatalogCoreError, MergeConflictError
7
+ from .ids import (
8
+ asset_id,
9
+ column_id,
10
+ cte_id,
11
+ leaf_name,
12
+ model_id,
13
+ normalize_identifier,
14
+ normalize_identifier_part,
15
+ normalize_identifier_parts,
16
+ report_id,
17
+ schema_name,
18
+ split_qualified_identifier,
19
+ table_id,
20
+ )
21
+ from .merge import merge
22
+ from .models import CatalogArtifact, Edge, Evidence, Node, Warning
23
+ from .serialize import render_json
24
+
25
+ __all__ = [
26
+ "__version__",
27
+ "asset_id",
28
+ "CatalogArtifact",
29
+ "CatalogCoreError",
30
+ "CanonicalIdError",
31
+ "column_id",
32
+ "cte_id",
33
+ "Edge",
34
+ "Evidence",
35
+ "leaf_name",
36
+ "merge",
37
+ "MergeConflictError",
38
+ "model_id",
39
+ "Node",
40
+ "normalize_identifier",
41
+ "normalize_identifier_part",
42
+ "normalize_identifier_parts",
43
+ "render_json",
44
+ "report_id",
45
+ "schema_name",
46
+ "split_qualified_identifier",
47
+ "table_id",
48
+ "Warning",
49
+ ]
@@ -0,0 +1,3 @@
1
+ """Package version."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,15 @@
1
+ """Shared errors for catalogkit-core."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class CatalogCoreError(Exception):
    """Base class for catalogkit-core failures.

    The other exception types declared in this module derive from it, so
    catching ``CatalogCoreError`` handles all of them.
    """
8
+
9
+
10
class CanonicalIdError(CatalogCoreError):
    """Raised when an identifier cannot be normalized into a canonical ID.

    Subclass of ``CatalogCoreError``.
    """
12
+
13
+
14
class MergeConflictError(CatalogCoreError):
    """Raised when artifacts cannot be merged without losing information.

    Subclass of ``CatalogCoreError``.
    """
catalogkit/core/ids.py ADDED
@@ -0,0 +1,149 @@
1
+ """Canonical identifier normalization and ID builders."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable
6
+
7
+ from .errors import CanonicalIdError
8
+
9
+ _QUOTE_PAIRS = {
10
+ '"': '"',
11
+ "`": "`",
12
+ "[": "]",
13
+ }
14
+
15
+
16
def _strip_matching_quotes(value: str) -> str:
    """Remove one pair of matching quote delimiters wrapping ``value``.

    The text is returned unchanged when it is shorter than two characters,
    when its first character is not a known opening quote, or when its last
    character is not the closer paired with that opener in ``_QUOTE_PAIRS``.
    """
    if len(value) >= 2:
        opener, closer = value[0], value[-1]
        # ``.get`` returns None for a non-quote opener, which never equals a character.
        if _QUOTE_PAIRS.get(opener) == closer:
            return value[1:-1]
    return value
26
+
27
+
28
def normalize_identifier(value: str) -> str:
    """Return the canonical dotted form of a possibly qualified identifier.

    The input is split into segments by ``split_qualified_identifier`` and
    the segments are then normalized and re-joined by
    ``normalize_identifier_parts``. Any ``CanonicalIdError`` raised by those
    helpers propagates to the caller.
    """
    return normalize_identifier_parts(split_qualified_identifier(value))
32
+
33
+
34
def normalize_identifier_parts(parts: Iterable[str]) -> str:
    """Join pre-split identifier segments into canonical dotted form.

    Segments that are blank after ``str()`` conversion and whitespace
    stripping are skipped; every remaining segment goes through
    ``normalize_identifier_part``.

    Raises:
        CanonicalIdError: if no non-blank segment remains, or if a segment is
            rejected by ``normalize_identifier_part``.
    """
    canonical: list[str] = []
    for raw in parts:
        # Blank segments are dropped here rather than treated as errors.
        if not str(raw).strip():
            continue
        canonical.append(normalize_identifier_part(raw))

    if canonical:
        return ".".join(canonical)
    raise CanonicalIdError("Identifier must contain at least one non-empty part.")
40
+
41
+
42
def normalize_identifier_part(part: str) -> str:
    """Canonicalize a single identifier segment.

    The segment is converted with ``str()``, stripped of surrounding
    whitespace, unwrapped from one pair of matching quotes, stripped again
    and lower-cased.

    Raises:
        CanonicalIdError: if the segment is blank, is exactly ``*``, or is
            blank once its wrapping quotes are removed.
    """
    text = str(part).strip()
    if text == "":
        raise CanonicalIdError("Identifier part cannot be empty.")
    if text == "*":
        raise CanonicalIdError("Wildcard identifiers cannot be canonicalized.")

    inner = _strip_matching_quotes(text).strip()
    if inner:
        return inner.lower()
    raise CanonicalIdError("Identifier part cannot be empty after unquoting.")
53
+
54
+
55
def split_qualified_identifier(value: str) -> list[str]:
    """Break a qualified identifier into its dot-separated segments.

    Dots inside a quoted run (opened by any key of ``_QUOTE_PAIRS`` and
    closed by the paired character) do not split. Quote characters are kept
    in the returned segments, and each segment is whitespace-stripped.

    Raises:
        CanonicalIdError: if the input is blank, a segment is blank, or a
            quoted run is never closed.
    """
    text = str(value).strip()
    if text == "":
        raise CanonicalIdError("Identifier cannot be empty.")

    segments: list[str] = []
    buffer: list[str] = []
    # Closing character awaited while inside a quoted run; None when outside.
    closer: str | None = None

    def flush() -> None:
        # Emit the buffered segment; a blank segment makes the identifier invalid.
        segment = "".join(buffer).strip()
        if not segment:
            raise CanonicalIdError(f"Invalid qualified identifier {value!r}.")
        segments.append(segment)
        buffer.clear()

    for ch in text:
        if closer is not None:
            # Inside quotes everything is literal, including dots.
            buffer.append(ch)
            if ch == closer:
                closer = None
        elif ch in _QUOTE_PAIRS:
            closer = _QUOTE_PAIRS[ch]
            buffer.append(ch)
        elif ch == ".":
            flush()
        else:
            buffer.append(ch)

    if closer is not None:
        raise CanonicalIdError(f"Unclosed quote in identifier {value!r}.")

    flush()
    return segments
95
+
96
+
97
def table_id(qualified_name: str) -> str:
    """Return ``table:`` followed by the normalized form of ``qualified_name``."""
    canonical = normalize_identifier(qualified_name)
    return "table:" + canonical
99
+
100
+
101
def cte_id(name: str) -> str:
    """Return ``cte:`` followed by ``name`` normalized as a single segment.

    The name goes through ``normalize_identifier_part``, so it is not split
    on dots.
    """
    canonical = normalize_identifier_part(name)
    return "cte:" + canonical
103
+
104
+
105
def column_id(parent_qualified_name: str, column_name: str) -> str:
    """Return ``column:<parent>.<column>`` built from normalized pieces.

    The parent is normalized as a qualified identifier and the column as a
    single segment.
    """
    owner = normalize_identifier(parent_qualified_name)
    leaf = normalize_identifier_part(column_name)
    return "column:" + owner + "." + leaf
109
+
110
+
111
def model_id(qualified_name: str) -> str:
    """Return ``model:`` followed by the normalized form of ``qualified_name``."""
    canonical = normalize_identifier(qualified_name)
    return "model:" + canonical
113
+
114
+
115
def report_id(qualified_name: str) -> str:
    """Return ``report:`` followed by the normalized form of ``qualified_name``."""
    canonical = normalize_identifier(qualified_name)
    return "report:" + canonical
117
+
118
+
119
def asset_id(qualified_name: str) -> str:
    """Return ``asset:`` followed by the normalized form of ``qualified_name``."""
    canonical = normalize_identifier(qualified_name)
    return "asset:" + canonical
121
+
122
+
123
def schema_name(qualified_name: str) -> str | None:
    """Return the normalized qualifier of ``qualified_name`` (all but the last segment).

    Returns ``None`` when the identifier has a single segment.

    The segments come from the quote-aware splitter on the raw input, not
    from re-splitting the normalized string on ".". A dot inside a quoted
    segment (``db."my.table"``) therefore stays part of that segment and is
    not mistaken for a separator.

    Raises:
        CanonicalIdError: if the identifier cannot be split or normalized.
    """
    parts = [
        normalize_identifier_part(segment)
        for segment in split_qualified_identifier(qualified_name)
    ]
    if len(parts) <= 1:
        return None
    return ".".join(parts[:-1])
129
+
130
+
131
def leaf_name(qualified_name: str) -> str:
    """Return the normalized last segment of ``qualified_name``.

    The segments come from the quote-aware splitter on the raw input, not
    from re-splitting the normalized string on ".". For ``db."my.table"``
    the leaf is therefore ``my.table`` and not ``table``.

    Raises:
        CanonicalIdError: if the identifier cannot be split or normalized.
    """
    # The splitter always returns at least one segment or raises.
    last_segment = split_qualified_identifier(qualified_name)[-1]
    return normalize_identifier_part(last_segment)
134
+
135
+
136
+ __all__ = [
137
+ "asset_id",
138
+ "column_id",
139
+ "cte_id",
140
+ "leaf_name",
141
+ "model_id",
142
+ "normalize_identifier",
143
+ "normalize_identifier_part",
144
+ "normalize_identifier_parts",
145
+ "report_id",
146
+ "schema_name",
147
+ "split_qualified_identifier",
148
+ "table_id",
149
+ ]
@@ -0,0 +1,123 @@
1
+ """Artifact merge utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from .errors import MergeConflictError
8
+ from .models import CatalogArtifact, Edge, Evidence, Node, Warning
9
+
10
+
11
def merge(*artifacts: CatalogArtifact) -> CatalogArtifact:
    """Combine any number of artifacts into one.

    Nodes are keyed by ``id`` and edges by ``(kind, source_id, target_id)``;
    entries sharing a key are reconciled with ``_merge_node`` and
    ``_merge_edge``. Warnings are de-duplicated on
    ``(code, message, location)``, keeping the last occurrence. The result
    lists nodes, edges and warnings sorted by those keys.

    Raises:
        MergeConflictError: propagated from the helpers when entries sharing
            a key disagree, or when the artifacts carry different versions.
    """
    nodes_by_id: dict[str, Node] = {}
    edges_by_key: dict[tuple[str, str, str], Edge] = {}
    warnings_by_key: dict[tuple[str, str, str | None], Warning] = {}

    for artifact in artifacts:
        for node in artifact.nodes:
            if node.id in nodes_by_id:
                nodes_by_id[node.id] = _merge_node(nodes_by_id[node.id], node)
            else:
                nodes_by_id[node.id] = node

        for edge in artifact.edges:
            edge_key = (edge.kind, edge.source_id, edge.target_id)
            if edge_key in edges_by_key:
                edges_by_key[edge_key] = _merge_edge(edges_by_key[edge_key], edge)
            else:
                edges_by_key[edge_key] = edge

        for warning in artifact.warnings:
            warnings_by_key[(warning.code, warning.message, warning.location)] = warning

    version = _merge_versions([artifact.version for artifact in artifacts])

    nodes = list(nodes_by_id.values())
    nodes.sort(key=lambda node: node.id)

    edges = list(edges_by_key.values())
    edges.sort(key=lambda edge: (edge.kind, edge.source_id, edge.target_id))

    warnings = list(warnings_by_key.values())
    # A missing location sorts as the empty string so None never meets str in a comparison.
    warnings.sort(key=lambda warning: (warning.code, warning.message, warning.location or ""))

    return CatalogArtifact(version=version, nodes=nodes, edges=edges, warnings=warnings)
42
+
43
+
44
+ def _merge_versions(versions: list[str]) -> str:
45
+ unique_versions = {version for version in versions if version}
46
+ if not unique_versions:
47
+ return "1"
48
+ if len(unique_versions) != 1:
49
+ raise MergeConflictError(
50
+ f"Cannot merge artifacts with different schema versions: {sorted(unique_versions)}"
51
+ )
52
+ return unique_versions.pop()
53
+
54
+
55
def _merge_node(left: Node, right: Node) -> Node:
    """Reconcile two nodes that share an ID into one ``Node``.

    ``kind`` and ``name`` must be equal on both sides. ``qualified_name``
    and the schema may be missing on one side, in which case the other
    side's value is used. Evidence lists are concatenated without
    duplicates.

    Raises:
        MergeConflictError: propagated from the field helpers on a conflict.
    """
    owner = left.id
    # Fields are reconciled in declaration order so the first conflict reported is stable.
    kind = _merge_scalar("node.kind", left.kind, right.kind, owner)
    name = _merge_scalar("node.name", left.name, right.name, owner)
    qualified_name = _merge_optional(
        "node.qualified_name", left.qualified_name, right.qualified_name, owner
    )
    schema = _merge_optional("node.schema", left.schema_name, right.schema_name, owner)
    evidence = _merge_evidence(left.evidence, right.evidence)

    return Node(
        id=owner,
        kind=kind,
        name=name,
        qualified_name=qualified_name,
        schema=schema,
        evidence=evidence,
    )
69
+
70
+
71
def _merge_edge(left: Edge, right: Edge) -> Edge:
    """Reconcile two edges that share ``(kind, source_id, target_id)``.

    ``label`` may be missing on one side, in which case the other side's
    value is used. ``confidence`` must be equal on both sides. Evidence
    lists are concatenated without duplicates.

    Raises:
        MergeConflictError: propagated from the field helpers on a conflict.
            Both label and confidence conflicts identify the edge by its
            full ``kind:source->target`` key.
    """
    # One owner string for every field, so a label conflict names the same
    # edge a confidence conflict would, not just the source node.
    owner = f"{left.kind}:{left.source_id}->{left.target_id}"
    return Edge(
        kind=left.kind,
        source_id=left.source_id,
        target_id=left.target_id,
        label=_merge_optional("edge.label", left.label, right.label, owner),
        confidence=_merge_scalar(
            "edge.confidence",
            left.confidence,
            right.confidence,
            owner,
        ),
        evidence=_merge_evidence(left.evidence, right.evidence),
    )
85
+
86
+
87
+ def _merge_scalar(field_name: str, left: Any, right: Any, owner: str) -> Any:
88
+ if left != right:
89
+ raise MergeConflictError(
90
+ f"Conflicting {field_name} values for {owner!r}: {left!r} != {right!r}"
91
+ )
92
+ return left
93
+
94
+
95
+ def _merge_optional(field_name: str, left: Any, right: Any, owner: str) -> Any:
96
+ if left in (None, ""):
97
+ return right
98
+ if right in (None, ""):
99
+ return left
100
+ if left != right:
101
+ raise MergeConflictError(
102
+ f"Conflicting {field_name} values for {owner!r}: {left!r} != {right!r}"
103
+ )
104
+ return left
105
+
106
+
107
+ def _merge_evidence(left: list[Evidence], right: list[Evidence]) -> list[Evidence]:
108
+ seen: set[tuple[str | None, str | None, str | None, str]] = set()
109
+ merged: list[Evidence] = []
110
+
111
+ for evidence in [*left, *right]:
112
+ key = (
113
+ evidence.file,
114
+ evidence.location,
115
+ evidence.expression,
116
+ evidence.confidence,
117
+ )
118
+ if key in seen:
119
+ continue
120
+ seen.add(key)
121
+ merged.append(evidence)
122
+
123
+ return merged
@@ -0,0 +1,55 @@
1
+ """Shared artifact models for CatalogKit."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
# Closed vocabularies used by the models below; pydantic rejects any other value.
# Confidence levels accepted on evidence and edges.
Confidence = Literal["high", "medium", "low"]
# Node kinds; these match the prefixes produced by the ID builders in ids.py.
NodeKind = Literal["table", "cte", "column", "model", "report", "asset"]
# Relationship kinds accepted on edges.
EdgeKind = Literal["depends_on", "feeds", "derives_from", "references", "joins"]
12
+
13
+
14
# Supporting record attached to a node or edge. Every field is optional except
# confidence, which defaults to "medium".
# NOTE(review): file/location/expression presumably point at the source that
# produced the node or edge — confirm against the contract document.
class Evidence(BaseModel):
    file: str | None = None
    location: str | None = None
    expression: str | None = None
    confidence: Confidence = "medium"
19
+
20
+
21
# Diagnostic carried on an artifact: required code and message, optional location.
# NOTE: the class name shadows the builtin ``Warning`` in any module that imports it.
class Warning(BaseModel):
    code: str
    message: str
    location: str | None = None
25
+
26
+
27
# Graph node. ``id``, ``kind`` and ``name`` are required; the rest default to
# None or an empty list.
class Node(BaseModel):
    # populate_by_name lets callers pass either ``schema_name=`` or the alias ``schema=``.
    model_config = ConfigDict(populate_by_name=True)

    id: str
    kind: NodeKind
    name: str
    qualified_name: str | None = None
    # Exposed as "schema" on input and in by-alias output.
    # NOTE(review): the attribute is presumably named ``schema_name`` to avoid
    # clashing with a ``BaseModel`` attribute — confirm.
    schema_name: str | None = Field(
        default=None,
        alias="schema",
        serialization_alias="schema",
    )
    # default_factory gives each instance its own list.
    evidence: list[Evidence] = Field(default_factory=list)
40
+
41
+
42
# Directed relationship between two node IDs. ``kind``, ``source_id`` and
# ``target_id`` are required; confidence defaults to "high" (Evidence defaults
# to "medium").
class Edge(BaseModel):
    kind: EdgeKind
    source_id: str
    target_id: str
    label: str | None = None
    confidence: Confidence = "high"
    # default_factory gives each instance its own list.
    evidence: list[Evidence] = Field(default_factory=list)
49
+
50
+
51
# Top-level container: a schema version string (default "1") plus lists of
# nodes, edges and warnings, each defaulting to a fresh empty list.
class CatalogArtifact(BaseModel):
    version: str = "1"
    nodes: list[Node] = Field(default_factory=list)
    edges: list[Edge] = Field(default_factory=list)
    warnings: list[Warning] = Field(default_factory=list)
@@ -0,0 +1,10 @@
1
+ """Serialization helpers for CatalogKit artifacts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .models import CatalogArtifact
6
+
7
+
8
def render_json(artifact: CatalogArtifact) -> dict:
    """Dump ``artifact`` to a plain ``dict`` ready for JSON encoding.

    The dump uses pydantic's JSON mode and field aliases, so a node's
    ``schema_name`` attribute is emitted under the key ``schema``.
    """
    payload = artifact.model_dump(by_alias=True, mode="json")
    return payload
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.4
2
+ Name: catalogkit-core
3
+ Version: 0.1.0
4
+ Summary: Shared artifact models, canonical IDs, and merge semantics for CatalogKit.
5
+ Author: ClearMetric Labs
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/Clearmetric-Labs/CatalogKit
8
+ Project-URL: Source, https://github.com/Clearmetric-Labs/CatalogKit
9
+ Project-URL: Issues, https://github.com/Clearmetric-Labs/CatalogKit/issues
10
+ Keywords: catalog,lineage,metadata,graph
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: pydantic>=2.10.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
25
+ Provides-Extra: release
26
+ Requires-Dist: build>=1.2.2; extra == "release"
27
+ Requires-Dist: twine>=5.1.1; extra == "release"
28
+
29
+ # catalogkit-core
30
+
31
+ `catalogkit-core` is the shared consistency layer for every CatalogKit module.
32
+
33
+ It owns:
34
+
35
+ - artifact schema versioning
36
+ - canonical ID normalization
37
+ - shared graph models
38
+ - deterministic JSON serialization
39
+ - merge semantics
40
+
41
+ It does **not** perform extraction by itself. Tool packages such as `catalogkit-query`
42
+ depend on `catalogkit-core` and emit artifacts that follow its contract.
43
+
44
+ ## Install
45
+
46
+ ```bash
47
+ python -m pip install catalogkit-core
48
+ ```
49
+
50
+ ## Imports
51
+
52
+ ```python
53
+ from catalogkit.core import CatalogArtifact, Edge, Evidence, Node, Warning, merge
54
+ ```
55
+
56
+ For local development, install the package in editable mode with the `dev` and `release` extras:
57
+
58
+ ```bash
59
+ python -m pip install -e ".[dev,release]"
60
+ ```
61
+
62
+ ## Contract
63
+
64
+ The source of truth for the shared artifact contract is
65
+ [`docs/contract.md`](docs/contract.md).
@@ -0,0 +1,11 @@
1
+ catalogkit/core/__init__.py,sha256=BA2M718ODcIomKTxn-Wq23xQW4b2Seyb-kahbrPI_PA,1060
2
+ catalogkit/core/_version.py,sha256=VDsBswzfU7gNsBkIaxxV6tsi0ixlms3WjN7HYgZJZ2g,46
3
+ catalogkit/core/errors.py,sha256=Rtv4GhUQjETARUdBkwhhQUSovBKd5MHMrhtfMc23lfw,410
4
+ catalogkit/core/ids.py,sha256=FATrzpvFTi0qsXS377QnA-tiyDUg9uD-763nVgza2zs,4160
5
+ catalogkit/core/merge.py,sha256=1BJOi9-LUIakIP2L1iK1jqVfmgDNr6O9KmDw60DNLDM,4111
6
+ catalogkit/core/models.py,sha256=49kJBeA0F7j4DFEsW5sPXeZGOVZbAWv1p11fF56f4Iw,1388
7
+ catalogkit/core/serialize.py,sha256=tm6X2pVoWfLfpCYZw8-IRMhaqL2vcvG-uftYYU2y8MY,299
8
+ catalogkit_core-0.1.0.dist-info/METADATA,sha256=NUBqss2fwTYB5ixDn8_VYkbFlUOxLWpv499TKHu7Vho,1968
9
+ catalogkit_core-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
10
+ catalogkit_core-0.1.0.dist-info/top_level.txt,sha256=RsnUdiXrSqkJn7elbvcSi3dAxxLtc6rLuzPGHGR_44I,11
11
+ catalogkit_core-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ catalogkit