aptdata 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. aptdata/__init__.py +3 -0
  2. aptdata/cli/__init__.py +5 -0
  3. aptdata/cli/app.py +247 -0
  4. aptdata/cli/commands/__init__.py +9 -0
  5. aptdata/cli/commands/config_cmd.py +128 -0
  6. aptdata/cli/commands/mesh_cmd.py +435 -0
  7. aptdata/cli/commands/plugin_cmd.py +107 -0
  8. aptdata/cli/commands/system_cmd.py +90 -0
  9. aptdata/cli/commands/telemetry_cmd.py +57 -0
  10. aptdata/cli/completions.py +56 -0
  11. aptdata/cli/interactive.py +269 -0
  12. aptdata/cli/rendering/__init__.py +31 -0
  13. aptdata/cli/rendering/console.py +119 -0
  14. aptdata/cli/rendering/logger.py +26 -0
  15. aptdata/cli/rendering/panels.py +87 -0
  16. aptdata/cli/rendering/tables.py +81 -0
  17. aptdata/cli/scaffold.py +1089 -0
  18. aptdata/config/__init__.py +13 -0
  19. aptdata/config/parser.py +136 -0
  20. aptdata/config/schema.py +27 -0
  21. aptdata/config/secrets.py +60 -0
  22. aptdata/core/__init__.py +46 -0
  23. aptdata/core/context.py +31 -0
  24. aptdata/core/dataset.py +39 -0
  25. aptdata/core/lineage.py +213 -0
  26. aptdata/core/state.py +27 -0
  27. aptdata/core/system.py +317 -0
  28. aptdata/core/workflow.py +372 -0
  29. aptdata/mcp/__init__.py +5 -0
  30. aptdata/mcp/server.py +198 -0
  31. aptdata/plugins/__init__.py +77 -0
  32. aptdata/plugins/ai/__init__.py +6 -0
  33. aptdata/plugins/ai/chunking.py +66 -0
  34. aptdata/plugins/ai/embeddings.py +56 -0
  35. aptdata/plugins/base.py +57 -0
  36. aptdata/plugins/dataset.py +62 -0
  37. aptdata/plugins/governance/__init__.py +32 -0
  38. aptdata/plugins/governance/catalog.py +115 -0
  39. aptdata/plugins/governance/classification.py +44 -0
  40. aptdata/plugins/governance/lineage_store.py +49 -0
  41. aptdata/plugins/governance/rules.py +180 -0
  42. aptdata/plugins/local_fs.py +241 -0
  43. aptdata/plugins/manager.py +142 -0
  44. aptdata/plugins/postgres.py +113 -0
  45. aptdata/plugins/quality/__init__.py +39 -0
  46. aptdata/plugins/quality/contract.py +128 -0
  47. aptdata/plugins/quality/expectations.py +310 -0
  48. aptdata/plugins/quality/report.py +94 -0
  49. aptdata/plugins/quality/validator.py +139 -0
  50. aptdata/plugins/rest.py +135 -0
  51. aptdata/plugins/transform/__init__.py +14 -0
  52. aptdata/plugins/transform/pandas.py +129 -0
  53. aptdata/plugins/transform/spark.py +134 -0
  54. aptdata/plugins/vector/__init__.py +6 -0
  55. aptdata/plugins/vector/base.py +19 -0
  56. aptdata/plugins/vector/qdrant.py +41 -0
  57. aptdata/telemetry/__init__.py +5 -0
  58. aptdata/telemetry/instrumentation.py +164 -0
  59. aptdata/tui/__init__.py +5 -0
  60. aptdata/tui/monitor.py +279 -0
  61. aptdata-0.0.2.dist-info/METADATA +330 -0
  62. aptdata-0.0.2.dist-info/RECORD +65 -0
  63. aptdata-0.0.2.dist-info/WHEEL +4 -0
  64. aptdata-0.0.2.dist-info/entry_points.txt +3 -0
  65. aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,113 @@
+ """PostgreSQL reader / writer plugin.
+
+ Provides :class:`PostgresReader` and :class:`PostgresWriter` for
+ interacting with PostgreSQL databases via **SQLAlchemy**.
+
+ Both ``sqlalchemy`` and a PostgreSQL driver (e.g. ``psycopg2-binary``)
+ are required. A friendly
+ :class:`~aptdata.plugins.manager.PluginDependencyError` is raised
+ when either is missing.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from aptdata.core.dataset import BaseDataset
+ from aptdata.plugins.base import BaseReader, BaseWriter
+ from aptdata.plugins.dataset import InMemoryDataset
+ from aptdata.plugins.manager import PluginDependencyError
+
+
+ def _require_sqlalchemy() -> Any:
+     """Import and return the ``sqlalchemy`` module, or raise a friendly error."""
+     try:
+         import sqlalchemy  # noqa: WPS433
+     except ImportError:
+         raise PluginDependencyError("postgres", "sqlalchemy") from None
+     return sqlalchemy
+
+
+ class PostgresReader(BaseReader):
+     """Execute a SQL query against a PostgreSQL database and return the result
+     as an :class:`~aptdata.plugins.dataset.InMemoryDataset`.
+
+     Parameters
+     ----------
+     connection_url:
+         SQLAlchemy connection URL, e.g.
+         ``"postgresql+psycopg2://user:pass@host:5432/db"``.
+     query:
+         Raw SQL ``SELECT`` query to execute.
+     """
+
+     def __init__(self, connection_url: str, query: str) -> None:
+         self.connection_url = connection_url
+         self.query = query
+
+     def read(self, **kwargs: Any) -> InMemoryDataset:
+         sa = _require_sqlalchemy()
+         engine = sa.create_engine(self.connection_url)
+         with engine.connect() as conn:
+             result = conn.execute(sa.text(self.query))
+             columns = list(result.keys())
+             records = [dict(zip(columns, row)) for row in result.fetchall()]
+
+         ds = InMemoryDataset(uri=self.connection_url)
+         ds.write(records)
+         return ds
+
+
+ class PostgresWriter(BaseWriter):
+     """Write an :class:`~aptdata.plugins.dataset.InMemoryDataset` to a
+     PostgreSQL table.
+
+     Parameters
+     ----------
+     connection_url:
+         SQLAlchemy connection URL.
+     table:
+         Target table name.
+     if_exists:
+         Behaviour when the table already exists: ``"append"`` (default),
+         ``"replace"``, or ``"fail"``.
+     """
+
+     def __init__(
+         self,
+         connection_url: str,
+         table: str,
+         *,
+         if_exists: str = "append",
+     ) -> None:
+         self.connection_url = connection_url
+         self.table = table
+         self.if_exists = if_exists
+
+     def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
+         sa = _require_sqlalchemy()
+         records: list[dict[str, Any]] = dataset.read()
+         if not records:
+             return
+
+         engine = sa.create_engine(self.connection_url)
+         meta = sa.MetaData()
+
+         with engine.connect() as conn:
+             if self.if_exists == "replace":
+                 # Use SQLAlchemy DDL to avoid raw SQL interpolation
+                 tbl = sa.Table(self.table, sa.MetaData())
+                 tbl.drop(engine, checkfirst=True)
+
+             # Auto-create a simple text-column table when it doesn't exist
+             if not sa.inspect(engine).has_table(self.table):
+                 columns = [sa.Column(k, sa.Text) for k in records[0]]
+                 sa.Table(self.table, meta, *columns)
+                 meta.create_all(engine)
+
+             table_obj = sa.Table(self.table, sa.MetaData(), autoload_with=engine)
+             conn.execute(table_obj.insert(), records)
+             conn.commit()
+
+
+ __all__ = ["PostgresReader", "PostgresWriter"]
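
To make the reader/writer pair concrete, here is a minimal round-trip sketch. The connection URL, query, and table name are placeholders, and it assumes a reachable PostgreSQL instance with the optional dependencies (`sqlalchemy` plus a driver such as `psycopg2-binary`) installed:

```python
from aptdata.plugins.postgres import PostgresReader, PostgresWriter

# Placeholder URL; any SQLAlchemy PostgreSQL URL works here.
URL = "postgresql+psycopg2://user:pass@localhost:5432/analytics"

# Read query results into an InMemoryDataset of row dicts.
reader = PostgresReader(URL, "SELECT id, email FROM users")
dataset = reader.read()

# Write them back out; "replace" drops and re-creates the target table.
writer = PostgresWriter(URL, "users_snapshot", if_exists="replace")
writer.write(dataset)
```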
@@ -0,0 +1,39 @@
+ """Data quality plugin package.
+
+ Re-exports the main public API for data contracts, quality expectations,
+ validation, and reporting.
+ """
+
+ from __future__ import annotations
+
+ from aptdata.plugins.quality.contract import (
+     ColumnClassification,
+     ColumnContract,
+     EnforcementMode,
+     SchemaContract,
+ )
+ from aptdata.plugins.quality.expectations import (
+     BaseExpectation,
+     ExpectColumnToNotBeNull,
+     ExpectColumnValuesInRange,
+     ExpectColumnValuesToBeUnique,
+     ExpectColumnValuesToMatchRegex,
+ )
+ from aptdata.plugins.quality.report import CheckResult, CheckStatus, QualityReport
+ from aptdata.plugins.quality.validator import QualityValidator
+
+ __all__ = [
+     "ColumnClassification",
+     "ColumnContract",
+     "EnforcementMode",
+     "SchemaContract",
+     "BaseExpectation",
+     "ExpectColumnToNotBeNull",
+     "ExpectColumnValuesInRange",
+     "ExpectColumnValuesToBeUnique",
+     "ExpectColumnValuesToMatchRegex",
+     "CheckResult",
+     "CheckStatus",
+     "QualityReport",
+     "QualityValidator",
+ ]
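
Since the package re-exports these names, downstream code can import the whole quality API from one place; a small sketch:

```python
# One import surface for contracts, expectations, validation, and reporting.
from aptdata.plugins.quality import (
    ExpectColumnToNotBeNull,
    QualityReport,
    QualityValidator,
    SchemaContract,
)
```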
@@ -0,0 +1,128 @@
+ """Schema contracts and column classification definitions.
+
+ Provides :class:`SchemaContract` for declaring the expected schema of a
+ dataset, including column types, nullability, classification, and PII
+ annotations.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import Any
+
+
+ class ColumnClassification(str, Enum):
+     """Data sensitivity classification for a column or dataset."""
+
+     PUBLIC = "PUBLIC"
+     INTERNAL = "INTERNAL"
+     CONFIDENTIAL = "CONFIDENTIAL"
+     PII = "PII"
+     PHI = "PHI"
+     FINANCIAL = "FINANCIAL"
+     SENSITIVE = "SENSITIVE"
+
+
+ class EnforcementMode(str, Enum):
+     """How the quality framework reacts when a contract is violated.
+
+     ABORT
+         Raise an exception immediately.
+     WARN
+         Log a warning but continue processing.
+     TAG
+         Annotate the data with quality metadata and continue.
+     """
+
+     ABORT = "ABORT"
+     WARN = "WARN"
+     TAG = "TAG"
+
+
+ @dataclass
+ class ColumnContract:
+     """Contract specification for a single column.
+
+     Parameters
+     ----------
+     name:
+         Column name as it appears in the dataset.
+     dtype:
+         Expected data type (e.g. ``"int64"``, ``"str"``).
+     nullable:
+         Whether ``null`` / ``None`` values are allowed.
+     classification:
+         Sensitivity classification (see :class:`ColumnClassification`).
+     description:
+         Human-readable description.
+     pii:
+         Shorthand flag indicating the column contains personally
+         identifiable information.
+     retention_days:
+         Number of days the column's data must be retained.
+     metadata:
+         Arbitrary extra metadata.
+     """
+
+     name: str
+     dtype: str = ""
+     nullable: bool = True
+     classification: ColumnClassification = ColumnClassification.INTERNAL
+     description: str = ""
+     pii: bool = False
+     retention_days: int = 0
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class SchemaContract:
+     """Contract specification for an entire dataset schema.
+
+     Parameters
+     ----------
+     name:
+         Contract identifier.
+     version:
+         Semantic version string (e.g. ``"1.0.0"``).
+     owner:
+         Team or person responsible for this contract.
+     description:
+         Human-readable description.
+     columns:
+         Ordered list of :class:`ColumnContract` definitions.
+     enforcement:
+         How violations are handled (see :class:`EnforcementMode`).
+     metadata:
+         Arbitrary extra metadata.
+     """
+
+     name: str
+     version: str = "1.0.0"
+     owner: str = ""
+     description: str = ""
+     columns: list[ColumnContract] = field(default_factory=list)
+     enforcement: EnforcementMode = EnforcementMode.ABORT
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def get_pii_columns(self) -> list[ColumnContract]:
+         """Return columns that are flagged as PII."""
+         return [
+             c
+             for c in self.columns
+             if c.pii or c.classification == ColumnClassification.PII
+         ]
+
+     def get_columns_by_classification(
+         self, classification: ColumnClassification
+     ) -> list[ColumnContract]:
+         """Return columns whose classification matches *classification*."""
+         return [c for c in self.columns if c.classification == classification]
+
+
+ __all__ = [
+     "ColumnClassification",
+     "EnforcementMode",
+     "ColumnContract",
+     "SchemaContract",
+ ]
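
A hedged sketch of declaring and querying a contract; the column names, owner, and values are illustrative, not part of the package:

```python
from aptdata.plugins.quality.contract import (
    ColumnClassification,
    ColumnContract,
    EnforcementMode,
    SchemaContract,
)

contract = SchemaContract(
    name="users",
    owner="data-platform",  # illustrative owner
    enforcement=EnforcementMode.WARN,
    columns=[
        ColumnContract(name="id", dtype="int64", nullable=False),
        ColumnContract(name="email", dtype="str", pii=True),
        ColumnContract(name="ssn", dtype="str",
                       classification=ColumnClassification.PII),
    ],
)

# Both the pii flag and the PII classification mark a column as PII.
assert {c.name for c in contract.get_pii_columns()} == {"email", "ssn"}

# "email" keeps the default INTERNAL classification, so only "ssn" matches.
pii_cols = contract.get_columns_by_classification(ColumnClassification.PII)
assert [c.name for c in pii_cols] == ["ssn"]
```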
@@ -0,0 +1,310 @@
+ """Data quality expectations — engine-agnostic validators.
+
+ Each :class:`BaseExpectation` automatically dispatches to either
+ :meth:`validate_pandas` or :meth:`validate_spark` based on the type of the
+ DataFrame passed to :meth:`validate`.
+ """
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Any
+
+ from aptdata.plugins.quality.report import CheckResult, CheckStatus
+
+
+ def _is_spark_df(df: Any) -> bool:
+     """Return ``True`` when *df* looks like a PySpark DataFrame."""
+     return "pyspark" in type(df).__module__
+
+
+ class BaseExpectation(ABC):
+     """Abstract base for all data quality expectations.
+
+     Subclasses implement :meth:`validate_pandas` and :meth:`validate_spark`.
+     The concrete :meth:`validate` method dispatches automatically.
+     """
+
+     @abstractmethod
+     def validate_pandas(self, df: Any) -> CheckResult:
+         """Validate a ``pd.DataFrame`` and return a :class:`CheckResult`."""
+
+     @abstractmethod
+     def validate_spark(self, df: Any) -> CheckResult:
+         """Validate a PySpark ``DataFrame`` and return a :class:`CheckResult`."""
+
+     def validate(self, df: Any) -> CheckResult:
+         """Validate *df*, dispatching to the appropriate engine implementation."""
+         if _is_spark_df(df):
+             return self.validate_spark(df)
+         return self.validate_pandas(df)
+
+
+ @dataclass
+ class ExpectColumnToNotBeNull(BaseExpectation):
+     """Expect that a column contains no null values.
+
+     Parameters
+     ----------
+     column:
+         Name of the column to check.
+     """
+
+     column: str
+
+     def validate_pandas(self, df: Any) -> CheckResult:
+         """Check for null values using pandas."""
+         if self.column not in df.columns:
+             return CheckResult(
+                 expectation_name="ExpectColumnToNotBeNull",
+                 column=self.column,
+                 status=CheckStatus.FAILED,
+                 message=f"Column '{self.column}' not found in DataFrame.",
+                 rows_evaluated=len(df),
+                 rows_failed=len(df),
+             )
+         null_count = int(df[self.column].isnull().sum())
+         status = CheckStatus.PASSED if null_count == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnToNotBeNull",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {null_count} null value(s)."
+                 if null_count > 0
+                 else f"Column '{self.column}' has no null values."
+             ),
+             rows_evaluated=len(df),
+             rows_failed=null_count,
+         )
+
+     def validate_spark(self, df: Any) -> CheckResult:
+         """Check for null values using PySpark."""
+         from pyspark.sql import functions as F  # noqa: N812
+
+         total = df.count()
+         null_count = df.filter(F.col(self.column).isNull()).count()
+         status = CheckStatus.PASSED if null_count == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnToNotBeNull",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {null_count} null value(s)."
+                 if null_count > 0
+                 else f"Column '{self.column}' has no null values."
+             ),
+             rows_evaluated=total,
+             rows_failed=null_count,
+         )
+
+
+ @dataclass
+ class ExpectColumnValuesInRange(BaseExpectation):
+     """Expect that all values in a column fall within ``[min_val, max_val]``.
+
+     Parameters
+     ----------
+     column:
+         Name of the column to check.
+     min_val:
+         Inclusive lower bound.
+     max_val:
+         Inclusive upper bound.
+     """
+
+     column: str
+     min_val: float
+     max_val: float
+
+     def validate_pandas(self, df: Any) -> CheckResult:
+         """Check numeric range using pandas."""
+         if self.column not in df.columns:
+             return CheckResult(
+                 expectation_name="ExpectColumnValuesInRange",
+                 column=self.column,
+                 status=CheckStatus.FAILED,
+                 message=f"Column '{self.column}' not found in DataFrame.",
+                 rows_evaluated=len(df),
+                 rows_failed=len(df),
+             )
+         series = df[self.column]
+         out_of_range = int(((series < self.min_val) | (series > self.max_val)).sum())
+         status = CheckStatus.PASSED if out_of_range == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnValuesInRange",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {out_of_range} value(s) outside "
+                 f"[{self.min_val}, {self.max_val}]."
+                 if out_of_range > 0
+                 else f"All values in '{self.column}' are within range."
+             ),
+             rows_evaluated=len(df),
+             rows_failed=out_of_range,
+             metadata={"min_val": self.min_val, "max_val": self.max_val},
+         )
+
+     def validate_spark(self, df: Any) -> CheckResult:
+         """Check numeric range using PySpark."""
+         from pyspark.sql import functions as F  # noqa: N812
+
+         total = df.count()
+         out_of_range = df.filter(
+             (F.col(self.column) < self.min_val) | (F.col(self.column) > self.max_val)
+         ).count()
+         status = CheckStatus.PASSED if out_of_range == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnValuesInRange",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {out_of_range} value(s) outside "
+                 f"[{self.min_val}, {self.max_val}]."
+                 if out_of_range > 0
+                 else f"All values in '{self.column}' are within range."
+             ),
+             rows_evaluated=total,
+             rows_failed=out_of_range,
+             metadata={"min_val": self.min_val, "max_val": self.max_val},
+         )
+
+
+ @dataclass
+ class ExpectColumnValuesToBeUnique(BaseExpectation):
+     """Expect that all values in a column are unique.
+
+     Parameters
+     ----------
+     column:
+         Name of the column to check.
+     """
+
+     column: str
+
+     def validate_pandas(self, df: Any) -> CheckResult:
+         """Check uniqueness using pandas."""
+         if self.column not in df.columns:
+             return CheckResult(
+                 expectation_name="ExpectColumnValuesToBeUnique",
+                 column=self.column,
+                 status=CheckStatus.FAILED,
+                 message=f"Column '{self.column}' not found in DataFrame.",
+                 rows_evaluated=len(df),
+                 rows_failed=len(df),
+             )
+         duplicates = int(df[self.column].duplicated().sum())
+         status = CheckStatus.PASSED if duplicates == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnValuesToBeUnique",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {duplicates} duplicate value(s)."
+                 if duplicates > 0
+                 else f"All values in '{self.column}' are unique."
+             ),
+             rows_evaluated=len(df),
+             rows_failed=duplicates,
+         )
+
+     def validate_spark(self, df: Any) -> CheckResult:
+         """Check uniqueness using PySpark."""
+         from pyspark.sql import functions as F  # noqa: N812, F401
+
+         total = df.count()
+         unique_count = df.select(self.column).distinct().count()
+         duplicates = total - unique_count
+         status = CheckStatus.PASSED if duplicates == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnValuesToBeUnique",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {duplicates} duplicate value(s)."
+                 if duplicates > 0
+                 else f"All values in '{self.column}' are unique."
+             ),
+             rows_evaluated=total,
+             rows_failed=duplicates,
+         )
+
+
+ @dataclass
+ class ExpectColumnValuesToMatchRegex(BaseExpectation):
+     """Expect that all values in a column match a regular expression.
+
+     Parameters
+     ----------
+     column:
+         Name of the column to check.
+     pattern:
+         Regular expression pattern (Python :mod:`re` syntax).
+     """
+
+     column: str
+     pattern: str
+
+     def validate_pandas(self, df: Any) -> CheckResult:
+         """Check regex match using pandas."""
+         if self.column not in df.columns:
+             return CheckResult(
+                 expectation_name="ExpectColumnValuesToMatchRegex",
+                 column=self.column,
+                 status=CheckStatus.FAILED,
+                 message=f"Column '{self.column}' not found in DataFrame.",
+                 rows_evaluated=len(df),
+                 rows_failed=len(df),
+             )
+         series = df[self.column].astype(str)
+         non_matching = int((~series.str.match(self.pattern)).sum())  # anchored at string start
+         status = CheckStatus.PASSED if non_matching == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnValuesToMatchRegex",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {non_matching} value(s) not matching "
+                 f"pattern '{self.pattern}'."
+                 if non_matching > 0
+                 else f"All values in '{self.column}' match the pattern."
+             ),
+             rows_evaluated=len(df),
+             rows_failed=non_matching,
+             metadata={"pattern": self.pattern},
+         )
+
+     def validate_spark(self, df: Any) -> CheckResult:
+         """Check regex match using PySpark."""
+         from pyspark.sql import functions as F  # noqa: N812
+
+         total = df.count()
+         non_matching = df.filter(
+             ~F.col(self.column).cast("string").rlike(self.pattern)  # unanchored search
+         ).count()
+         status = CheckStatus.PASSED if non_matching == 0 else CheckStatus.FAILED
+         return CheckResult(
+             expectation_name="ExpectColumnValuesToMatchRegex",
+             column=self.column,
+             status=status,
+             message=(
+                 f"Column '{self.column}' has {non_matching} value(s) not matching "
+                 f"pattern '{self.pattern}'."
+                 if non_matching > 0
+                 else f"All values in '{self.column}' match the pattern."
+             ),
+             rows_evaluated=total,
+             rows_failed=non_matching,
+             metadata={"pattern": self.pattern},
+         )
+
+
+ __all__ = [
+     "BaseExpectation",
+     "ExpectColumnToNotBeNull",
+     "ExpectColumnValuesInRange",
+     "ExpectColumnValuesToBeUnique",
+     "ExpectColumnValuesToMatchRegex",
+ ]
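
As a usage sketch, the same expectation objects can be pointed at a pandas DataFrame (pandas assumed installed; a PySpark DataFrame would route through the `validate_spark` paths instead):

```python
import pandas as pd

from aptdata.plugins.quality.expectations import (
    ExpectColumnToNotBeNull,
    ExpectColumnValuesInRange,
)

df = pd.DataFrame({"age": [25, 41, None], "score": [0.2, 0.9, 1.4]})

# "age" has one null, so this check fails with rows_failed == 1.
not_null = ExpectColumnToNotBeNull(column="age").validate(df)

# 1.4 lies outside [0.0, 1.0], so one row fails the range check.
in_range = ExpectColumnValuesInRange(
    column="score", min_val=0.0, max_val=1.0
).validate(df)

print(not_null.status.value, not_null.rows_failed)  # FAILED 1
print(in_range.status.value, in_range.rows_failed)  # FAILED 1
```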
@@ -0,0 +1,94 @@
+ """Quality check result and report types."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Any
+
+
+ class CheckStatus(str, Enum):
+     """Status of a single quality check."""
+
+     PASSED = "PASSED"
+     FAILED = "FAILED"
+     WARNING = "WARNING"
+
+
+ @dataclass
+ class CheckResult:
+     """Result of a single data quality expectation.
+
+     Parameters
+     ----------
+     expectation_name:
+         Class name or human-readable label of the expectation.
+     column:
+         Column name the check was applied to (empty for table-level checks).
+     status:
+         Whether the check passed, failed, or issued a warning.
+     message:
+         Human-readable description of the outcome.
+     rows_evaluated:
+         Total number of rows considered by the check.
+     rows_failed:
+         Number of rows that violated the expectation.
+     metadata:
+         Arbitrary extra metadata from the expectation.
+     """
+
+     expectation_name: str
+     column: str = ""
+     status: CheckStatus = CheckStatus.PASSED
+     message: str = ""
+     rows_evaluated: int = 0
+     rows_failed: int = 0
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class QualityReport:
+     """Aggregated quality report for a single dataset validation run.
+
+     Parameters
+     ----------
+     dataset_uri:
+         URI of the dataset that was validated.
+     workflow_name:
+         Name of the workflow that triggered validation.
+     trace_id:
+         OpenTelemetry trace identifier.
+     timestamp:
+         UTC ISO-8601 timestamp of when the report was generated.
+     checks:
+         Individual :class:`CheckResult` objects.
+     """
+
+     dataset_uri: str
+     workflow_name: str = ""
+     trace_id: str = ""
+     timestamp: str = field(
+         default_factory=lambda: datetime.now(timezone.utc).isoformat()
+     )
+     checks: list[CheckResult] = field(default_factory=list)
+
+     @property
+     def passed(self) -> bool:
+         """Return ``True`` when no check has a FAILED status."""
+         return all(c.status != CheckStatus.FAILED for c in self.checks)
+
+     @property
+     def summary(self) -> dict[str, int]:
+         """Return counts of PASSED, FAILED, and WARNING results."""
+         counts: dict[str, int] = {  # CheckStatus subclasses str, so members are valid str keys
+             CheckStatus.PASSED: 0,
+             CheckStatus.FAILED: 0,
+             CheckStatus.WARNING: 0,
+         }
+         for check in self.checks:
+             counts[check.status] += 1
+         return counts
+
+
+ __all__ = ["CheckStatus", "CheckResult", "QualityReport"]
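
Finally, a short sketch of assembling results into a report and reading the rollup; the URI and counts are made-up values:

```python
from aptdata.plugins.quality.report import CheckResult, CheckStatus, QualityReport

report = QualityReport(dataset_uri="file:///tmp/users.parquet")  # placeholder URI
report.checks.append(
    CheckResult(expectation_name="ExpectColumnToNotBeNull", column="id")
)
report.checks.append(
    CheckResult(
        expectation_name="ExpectColumnValuesToBeUnique",
        column="email",
        status=CheckStatus.FAILED,
        rows_evaluated=100,
        rows_failed=3,
    )
)

print(report.passed)   # False: any FAILED check fails the whole report
print(report.summary)  # one PASSED, one FAILED, zero WARNING
```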