datacheck-cli 2.0.0__tar.gz → 2.0.1__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/PKG-INFO +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/__init__.py +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/schema.py +13 -3
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/validate.py +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/generator.py +48 -10
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/loader.py +39 -17
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/validator.py +7 -4
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/engine.py +45 -6
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/profiler.py +25 -10
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/suggestions.py +27 -13
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/suggestion_engine.py +7 -2
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/terminal_reporter.py +28 -10
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/semantic_rules.py +32 -7
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/temporal_rules.py +15 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/pyproject.toml +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/LICENSE +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/README_PYPI.md +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/__main__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/airflow/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/airflow/operators.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/config.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/profile.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/parser.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/sample_data.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/schema.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/source.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/basic.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/ecommerce.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/finance.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/healthcare.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/iot.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/saas.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/sources.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/azure.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/base.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/bigquery.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/cloud_base.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/factory.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/gcs.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/mssql.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/mysql.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/postgresql.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/redshift.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/s3.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/snowflake.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/exceptions.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/loader.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/config.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/filters.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/formatters.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/utils.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/notifications/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/notifications/slack.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/output.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/executor.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/progress.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/decorators.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/loader.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/registry.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/json_formatter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/markdown_formatter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/terminal_formatter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/models.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/outliers.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/quality.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/statistics.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/csv_exporter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/distribution_analyzer.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/json_reporter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/results.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/base.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/composite_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/factory.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/null_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/numeric_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/string_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/sampler.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/strategies.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/baseline.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/comparator.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/detector.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/models.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/security/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/security/validators.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/utils/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/utils/connection_parser.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/config.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/validator.py +0 -0
datacheck/cli/schema.py
@@ -1,5 +1,6 @@
 """Schema commands for DataCheck CLI."""
 
+import sys
 from pathlib import Path
 
 import typer
@@ -8,6 +9,15 @@ from rich.table import Table
 import pandas as pd
 
 from datacheck.cli import console
+
+
+def _safe_encoding() -> bool:
+    """Check if stdout can handle Unicode symbols."""
+    encoding = getattr(sys.stdout, "encoding", None) or ""
+    return encoding.lower().replace("-", "") in ("utf8", "utf16", "utf32", "utf16le", "utf16be")
+
+
+_TICK = "✓" if _safe_encoding() else "v"
 from datacheck.exceptions import DataLoadError
 
 # Schema sub-app for schema evolution commands
@@ -301,12 +311,12 @@ def schema_capture(
     # Save baseline
     manager = BaselineManager(baseline_dir=baseline_dir)
     filepath = manager.save_baseline(schema, name=name)
-    console.print(f"[green]✓[/green] Schema saved to: {filepath}")
+    console.print(f"[green]{_TICK}[/green] Schema saved to: {filepath}")
 
     # Save to history if requested
     if save_history:
        history_path = manager.save_to_history(schema)
-        console.print(f"[green]✓[/green] Schema added to history: {history_path}")
+        console.print(f"[green]{_TICK}[/green] Schema added to history: {history_path}")
 
     # Display summary
     console.print("\n[bold]Schema Summary:[/bold]")
@@ -493,7 +503,7 @@ def schema_compare(
    else:
        # Terminal output
        if not comparison.changes:
-            console.print("[green]✓ No schema changes detected[/green]")
+            console.print(f"[green]{_TICK} No schema changes detected[/green]")
        else:
            # Compatibility summary
            compat_style = {
datacheck/cli/validate.py
@@ -524,7 +524,7 @@ def validate(
         "loading_inline_data_source",
         extra={"type": inline_source.type, "path": str(source_path)},
     )
-    summary = engine.validate_file(str(source_path))
+    summary = engine.validate_file(str(source_path), **inline_source.options)
     logger.info(
         "data_loaded",
         extra={"source_type": "inline", "path": str(source_path)},
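For context: the one-line change above forwards the inline `data_source.options` mapping to the file loader, where previously the options were dropped. A minimal sketch of the intended flow, assuming `validate_file` forwards keyword arguments to the underlying loader (the toy function and option values here are illustrative, not taken from the package):

    # Stand-in for engine.validate_file, assuming kwargs reach the loader.
    def validate_file(path: str, **load_kwargs) -> None:
        print(f"loading {path} with loader options {load_kwargs}")

    inline_options = {"delimiter": ";", "encoding": "latin-1"}  # from data_source.options
    validate_file("./orders.csv", **inline_options)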
datacheck/config/generator.py
@@ -105,7 +105,7 @@ class ConfigGenerator:
         col_names = cc_rule["columns"]
         cc_check: dict[str, Any] = {
             "name": f"cross_{'_'.join(col_names[:2])}_{cc_rule['rule']}",
-            "
+            "column": col_names[0],
             "rules": {cc_rule["rule"]: cc_rule["params"]},
             "description": cc_rule.get("reason", "Cross-column rule"),
         }
@@ -130,6 +130,7 @@ class ConfigGenerator:
         config["checks"] = checks
 
         config["reporting"] = {
+            "output_path": "./output",
             "export_failures": True,
         }
 
@@ -237,8 +238,20 @@ class ConfigGenerator:
         """
         from datacheck.loader import LoaderFactory
 
+        data_path = Path(data_path)
         df = LoaderFactory.load(str(data_path), **load_kwargs)
-        name =
+        name = data_path.stem
+
+        # Determine source type from file extension
+        ext = data_path.suffix.lower().lstrip(".")
+        source_type_map = {
+            "csv": "csv",
+            "parquet": "parquet",
+            "pq": "parquet",
+            "json": "json",
+            "avro": "avro",
+        }
+        source_type = source_type_map.get(ext, "csv")
 
         if return_profile:
             if confidence_threshold not in self.CONFIDENCE_LEVELS:
@@ -250,11 +263,20 @@ class ConfigGenerator:
             config = self.generate_from_profile(
                 profile, confidence_threshold=confidence_threshold
             )
+            config["data_source"] = {
+                "type": source_type,
+                "path": f"./{data_path.name}",
+            }
             return config, profile
 
-
+        config = self.generate_from_dataframe(
            df, name=name, confidence_threshold=confidence_threshold
        )
+        config["data_source"] = {
+            "type": source_type,
+            "path": f"./{data_path.name}",
+        }
+        return config
 
     def save_config(
         self,
@@ -301,7 +323,7 @@ class ConfigGenerator:
 
         # Version
         if "version" in config:
-            lines.append(f"version: {config['version']}")
+            lines.append(f"version: '{config['version']}'")
         lines.append("")
 
         # Metadata
@@ -310,11 +332,27 @@ class ConfigGenerator:
         lines.append("metadata:")
         for key, value in config["metadata"].items():
             if isinstance(value, str):
-                lines.append(f"  {key}: {value}")
+                lines.append(f"  {key}: '{value}'")
             else:
                 lines.append(f"  {key}: {value}")
         lines.append("")
 
+        # Data source
+        if "data_source" in config:
+            ds = config["data_source"]
+            lines.append("# Data source configuration")
+            lines.append("data_source:")
+            lines.append(f"  type: {ds['type']}")
+            lines.append(f"  path: '{ds['path']}'")
+            if "options" in ds and ds["options"]:
+                lines.append("  options:")
+                for key, value in ds["options"].items():
+                    if isinstance(value, str):
+                        lines.append(f"    {key}: '{value}'")
+                    else:
+                        lines.append(f"    {key}: {value}")
+            lines.append("")
+
         # Checks
         lines.append("# Validation checks")
         lines.append("# Each check validates a single column with one or more rules")
@@ -333,7 +371,7 @@ class ConfigGenerator:
             lines.append(f"  column: {check['column']}")
 
             if "description" in check:
-                lines.append(f"  description: {check['description']}")
+                lines.append(f"  description: '{check['description']}'")
 
             lines.append("  rules:")
             rule_reasons = check.get("_rule_reasons", {})
@@ -401,13 +439,13 @@ class ConfigGenerator:
             lines.append(f"{prefix}{rule_name}:")
             for k, v in rule_value.items():
                 if isinstance(v, str):
-                    lines.append(f"{sub_prefix}{k}: {v}")
+                    lines.append(f"{sub_prefix}{k}: '{v}'")
                 elif isinstance(v, list):
                     lines.append(f"{sub_prefix}{k}:")
                     item_prefix = " # " if commented else " "
                     for item in v:
                         if isinstance(item, str):
-                            lines.append(f"{item_prefix}- {item}")
+                            lines.append(f"{item_prefix}- '{item}'")
                         else:
                             lines.append(f"{item_prefix}- {item}")
                 else:
@@ -419,11 +457,11 @@ class ConfigGenerator:
             lines.append(f"{prefix}{rule_name}:")
             for item in rule_value:
                 if isinstance(item, str):
-                    lines.append(f"{sub_prefix}- {item}")
+                    lines.append(f"{sub_prefix}- '{item}'")
                 else:
                     lines.append(f"{sub_prefix}- {item}")
         elif isinstance(rule_value, str):
-            lines.append(f"{prefix}{rule_name}: {rule_value}{comment}")
+            lines.append(f"{prefix}{rule_name}: '{rule_value}'{comment}")
         else:
             lines.append(f"{prefix}{rule_name}: {rule_value}{comment}")
 
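For context: the generator now records a `data_source` block in generated configs and infers its type from the file extension, falling back to "csv" for unknown extensions. A minimal sketch of that inference in isolation (the helper name is illustrative):

    from pathlib import Path

    source_type_map = {"csv": "csv", "parquet": "parquet", "pq": "parquet",
                       "json": "json", "avro": "avro"}

    def infer_source(data_path: str) -> dict:
        p = Path(data_path)
        ext = p.suffix.lower().lstrip(".")
        # Unknown extensions fall back to "csv", matching the diff above
        return {"type": source_type_map.get(ext, "csv"), "path": f"./{p.name}"}

    assert infer_source("data/events.pq") == {"type": "parquet", "path": "./events.pq"}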
datacheck/config/loader.py
@@ -1,6 +1,6 @@
 """Configuration parsing and validation (original config module)."""
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
@@ -81,10 +81,12 @@ class DataSourceConfig:
     Attributes:
         type: Source type (csv, parquet, json, excel, delta)
         path: Path to the data file (relative to config file or absolute)
+        options: Loader-specific options (e.g. encoding, delimiter for CSV)
     """
 
     type: str
     path: str
+    options: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self) -> None:
         """Validate data source configuration."""
@@ -278,10 +280,14 @@ class ConfigLoader:
         if not path.is_file():
             raise ConfigurationError(f"Configuration path is not a file: {config_path}")
 
-        # Read and parse YAML
+        # Read and parse YAML (with env-var substitution and extends resolution)
         try:
-
-
+            from datacheck.config.parser import ConfigParser
+
+            parser = ConfigParser()
+            data = parser.load(path, resolve_env=True, resolve_extends=True)
+        except ConfigurationError:
+            raise
         except yaml.YAMLError as e:
             raise ConfigurationError(f"Invalid YAML in {config_path}: {e}") from e
         except Exception as e:
@@ -304,26 +310,33 @@ class ConfigLoader:
                 f"'checks' must be a list, got {type(data['checks']).__name__}"
             )
 
-        # Parse checks
+        # Parse checks — collect all errors before raising
         checks = []
+        check_errors: list[str] = []
         for idx, check_data in enumerate(data["checks"]):
             if not isinstance(check_data, dict):
-
+                check_errors.append(
                     f"Check at index {idx} must be a dictionary, "
                     f"got {type(check_data).__name__}"
                 )
+                continue
 
             # Validate required fields
+            missing = False
             if "name" not in check_data:
-
+                check_errors.append(f"Check at index {idx} missing 'name' field")
+                missing = True
             if "column" not in check_data:
-
-
-
+                name = check_data.get("name", f"index {idx}")
+                check_errors.append(f"Check '{name}' missing 'column' field")
+                missing = True
             if "rules" not in check_data:
-
-
-
+                name = check_data.get("name", f"index {idx}")
+                check_errors.append(f"Check '{name}' missing 'rules' field")
+                missing = True
+
+            if missing:
+                continue
 
             try:
                 rule_config = RuleConfig(
@@ -338,12 +351,17 @@ class ConfigLoader:
                 # Only add enabled checks
                 if rule_config.enabled:
                     checks.append(rule_config)
-            except ConfigurationError:
-
+            except ConfigurationError as e:
+                check_errors.append(str(e))
             except Exception as e:
-
+                check_errors.append(
                     f"Error parsing check '{check_data.get('name', idx)}': {e}"
-                )
+                )
+
+        if check_errors:
+            raise ConfigurationError(
+                "Configuration has errors:\n - " + "\n - ".join(check_errors)
+            )
 
         # Parse plugins (optional)
         plugins = data.get("plugins", [])
@@ -400,9 +418,13 @@ class ConfigLoader:
             raise ConfigurationError("'data_source' missing 'path' field")
 
         try:
+            options = ds_data.get("options", {})
+            if not isinstance(options, dict):
+                raise ConfigurationError("'data_source.options' must be a dictionary")
             data_source = DataSourceConfig(
                 type=ds_data["type"],
                 path=ds_data["path"],
+                options=options,
             )
         except ConfigurationError:
             raise
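For context: the new `options` field lets a config's `data_source` block carry loader-specific settings. A minimal sketch of the dataclass pattern used above, stripped of the package's validation logic (the option values are illustrative):

    from dataclasses import dataclass, field
    from typing import Any

    @dataclass
    class DataSourceConfig:
        type: str
        path: str
        # default_factory avoids the shared-mutable-default pitfall
        options: dict[str, Any] = field(default_factory=dict)

    ds = DataSourceConfig(type="csv", path="./data.csv",
                          options={"delimiter": ";", "encoding": "latin-1"})
    assert ds.options["delimiter"] == ";"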
datacheck/config/validator.py
@@ -137,6 +137,8 @@ class ConfigValidator:
         """
         Validate config against JSON schema.
 
+        Collects all schema errors at once instead of stopping at the first.
+
         Args:
             config: Config dictionary
 
@@ -146,10 +148,11 @@ class ConfigValidator:
         errors: list[str] = []
 
         try:
-
-
-
-
+            validator_cls = jsonschema.Draft7Validator
+            validator = validator_cls(self.schema)
+            for error in sorted(validator.iter_errors(config), key=str):
+                path = ".".join(str(p) for p in error.path) if error.path else "root"
+                errors.append(f"Schema validation failed at '{path}': {error.message}")
         except jsonschema.SchemaError as e:
             errors.append(f"Invalid schema: {e.message}")
 
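For context: the switch above relies on jsonschema's `iter_errors`, which yields every violation rather than raising on the first one the way `validate` does. A standalone sketch with a toy schema (not the package's real schema):

    import jsonschema

    schema = {"type": "object",
              "properties": {"version": {"type": "string"},
                             "checks": {"type": "array"}},
              "required": ["version", "checks"]}
    config = {"version": 2, "checks": "not-a-list"}

    validator = jsonschema.Draft7Validator(schema)
    for error in sorted(validator.iter_errors(config), key=str):
        path = ".".join(str(p) for p in error.path) if error.path else "root"
        print(f"Schema validation failed at '{path}': {error.message}")
    # Reports both the 'version' type error and the 'checks' type error.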
datacheck/engine.py
@@ -357,19 +357,58 @@ class ValidationEngine:
             )
             source_checks.setdefault(effective_source, []).append(check)
 
-        #
-
-
-
-        for src_name
+        # Pre-validate: verify all sources exist and connections work
+        # before running any checks. This gives a single clear error
+        # instead of repeating "connection failed" for every check.
+        connection_errors: list[str] = []
+        for src_name in source_checks:
             if src_name not in self._sources:
-
+                connection_errors.append(
                     f"Source '{src_name}' not found in sources file. "
                     f"Available sources: {', '.join(sorted(self._sources.keys()))}"
                 )
+                continue
 
             source_config: SourceConfig = self._sources[src_name]
 
+            if source_config.is_database:
+                # Test database connectivity
+                try:
+                    from datacheck.connectors.factory import create_connector
+
+                    connector = create_connector(source_config)
+                    connector.connect()
+                    connector.disconnect()
+                except Exception as e:
+                    connection_errors.append(
+                        f"Source '{src_name}' ({source_config.type}): "
+                        f"Connection failed — {e}"
+                    )
+            elif source_config.is_file:
+                # Test file accessibility
+                from pathlib import Path as _Path
+
+                file_path = source_config.connection.get("path", "")
+                if file_path and not _Path(file_path).exists():
+                    connection_errors.append(
+                        f"Source '{src_name}' ({source_config.type}): "
+                        f"File not found — {file_path}"
+                    )
+
+        if connection_errors:
+            raise ConfigurationError(
+                "Source connectivity check failed:\n - "
+                + "\n - ".join(connection_errors)
+            )
+
+        # Validate each source's checks
+        all_results: list[RuleResult] = []
+        _total_rows = 0
+        _total_columns = 0
+        for src_name, checks in source_checks.items():
+
+            source_config = self._sources[src_name]
+
             # Determine table for this group of checks
             # All checks in this group share the same source, but may have different tables
             # We need to sub-group by table for database sources
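For context: the engine change is a collect-then-raise pre-flight, where every source is probed before any check runs and all failures surface in one aggregated exception. The pattern in isolation, with a made-up file-only probe (the function and source names are illustrative):

    import os

    def preflight(sources: dict[str, str]) -> None:
        # Probe every source first, then raise one aggregated error.
        errors = [f"Source '{name}': file not found: {path}"
                  for name, path in sources.items() if not os.path.exists(path)]
        if errors:
            raise RuntimeError("Source connectivity check failed:\n - "
                               + "\n - ".join(errors))

    # preflight({"orders": "./orders.csv", "users": "./missing.csv"})
    # raises a single RuntimeError listing every unreachable source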
datacheck/profiling/profiler.py
@@ -1,10 +1,13 @@
 """Data profiling and quality analysis."""
 
+import logging
 import re
 from datetime import datetime as dt
 
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
 from datacheck.profiling.models import ColumnProfile, DatasetProfile
 from datacheck.profiling.outliers import OutlierDetector, OutlierMethod
 from datacheck.profiling.quality import QualityScorer
@@ -147,7 +150,19 @@ class DataProfiler:
             profile.inferred_type = "boolean"
 
         elif pd.api.types.is_numeric_dtype(series):
-
+            # Distinguish integer vs float types
+            if pd.api.types.is_integer_dtype(series):
+                profile.inferred_type = "integer"
+            elif pd.api.types.is_float_dtype(series):
+                # Check if all non-null values are whole numbers
+                # (common when nulls force int->float promotion)
+                non_null = series.dropna()
+                if len(non_null) > 0 and (non_null == non_null.astype(int)).all():
+                    profile.inferred_type = "integer"
+                else:
+                    profile.inferred_type = "numeric"
+            else:
+                profile.inferred_type = "numeric"
             stats = self.stats_calc.calculate_numeric_stats(series)
             profile.min_value = stats["min"]
             profile.max_value = stats["max"]
@@ -224,7 +239,7 @@ class DataProfiler:
                 if len(dt_values) > 0:
                     profile.weekday_only = bool((dt_values.dt.dayofweek < 5).all())
             except Exception:
-
+                logger.debug("Weekday analysis failed for column '%s'", series.name)
 
         # Sample values (for value-based rule detection)
         non_null_sample = series.dropna()
@@ -259,7 +274,7 @@ class DataProfiler:
         # --- sum_equals detection ---
         numeric_cols = [
             name for name, cp in profile.columns.items()
-            if cp.inferred_type == "numeric"
+            if cp.inferred_type in ("numeric", "integer")
         ]
         # Only check if manageable number of columns (<=15 numeric)
         if 3 <= len(numeric_cols) <= 15:
@@ -294,10 +309,10 @@ class DataProfiler:
                         if close.sum() / valid >= 0.95:
                             rules.append({
                                 "rule": "sum_equals",
-                                "columns": [col_a, col_b
+                                "columns": [target, col_a, col_b],
                                 "params": {
-                                    "
-                                    "
+                                    "column_a": col_a,
+                                    "column_b": col_b,
                                 },
                                 "confidence": "high",
                                 "reason": (
@@ -306,7 +321,7 @@ class DataProfiler:
                                 ),
                             })
                     except Exception:
-
+                        logger.debug("sum_equals check failed for %s + %s = %s", col_a, col_b, target)
 
         # --- unique_combination detection ---
         cat_cols = [
@@ -326,7 +341,7 @@ class DataProfiler:
                     rules.append({
                         "rule": "unique_combination",
                         "columns": [col_a, col_b],
-                        "params":
+                        "params": [col_a, col_b],
                        "confidence": "medium",
                        "reason": (
                            f"Combination of {col_a} and {col_b} "
@@ -334,7 +349,7 @@ class DataProfiler:
                        ),
                    })
                except Exception:
-
+                    logger.debug("unique_combination check failed for %s, %s", col_a, col_b)
 
         return rules
 
@@ -394,7 +409,7 @@ class DataProfiler:
                 dt.strptime(val, fmt)
                 count += 1
             except (ValueError, TypeError):
-
+                continue  # Value doesn't match this format
            if count > best_count:
                best_count = count
                best_format = fmt
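For context: the profiler's new integer inference works around a pandas quirk, where a column of whole numbers containing nulls is promoted to float64. A quick demonstration of the whole-number check used in the diff above:

    import pandas as pd

    s = pd.Series([1, 2, None])   # nulls force int -> float64 promotion
    assert s.dtype == "float64"

    non_null = s.dropna()
    is_integerish = len(non_null) > 0 and (non_null == non_null.astype(int)).all()
    assert is_integerish          # profiled as "integer", not "numeric"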
datacheck/profiling/suggestions.py
@@ -3,11 +3,14 @@
 from __future__ import annotations
 
 import json
+import logging
 import re
-from datetime import datetime
+from datetime import datetime, timedelta
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from datacheck.profiling.models import ColumnProfile
 
@@ -90,6 +93,7 @@ class RuleSuggester:
         inferred = getattr(profile, "inferred_type", None)
         if inferred:
             type_map = {
+                "integer": "int",
                 "numeric": "numeric",
                 "boolean": "bool",
                 "datetime": "date",
@@ -291,12 +295,22 @@ class RuleSuggester:
 
         # --- Temporal rules ---
         if profile.min_date is not None and profile.max_date is not None:
-            # timestamp_range
+            # timestamp_range — add 1-day margin on each side so edge
+            # values don't fail due to profiling-time rounding
+            min_date_str = profile.min_date.split(" ")[0]
+            max_date_str = profile.max_date.split(" ")[0]
+            try:
+                min_dt = datetime.fromisoformat(min_date_str)
+                max_dt = datetime.fromisoformat(max_date_str)
+                min_date_str = (min_dt - timedelta(days=1)).strftime("%Y-%m-%d")
+                max_date_str = (max_dt + timedelta(days=1)).strftime("%Y-%m-%d")
+            except (ValueError, TypeError):
+                pass  # Keep original strings if parsing fails
             suggestions.append({
                 "rule": "timestamp_range",
                 "params": {
-                    "min":
-                    "max":
+                    "min": min_date_str,
+                    "max": max_date_str,
                 },
                 "confidence": "medium",
                 "reason": f"Dates range from {profile.min_date} to {profile.max_date}",
@@ -314,7 +328,7 @@ class RuleSuggester:
                 "reason": "No future dates detected in data",
             })
         except (ValueError, TypeError):
-
+            logger.debug("Failed to parse max_date for no_future_timestamps check")
 
         # business_days_only
         weekday_only = getattr(profile, "weekday_only", None)
@@ -360,7 +374,7 @@ class RuleSuggester:
                 json.loads(s)
                 json_count += 1
             except (json.JSONDecodeError, TypeError):
-
+                continue  # Not valid JSON
         if len(sample[:20]) > 0 and json_count >= len(sample[:20]) * 0.8:
             suggestions.append({
                 "rule": "json_valid",
@@ -425,7 +439,7 @@ _VALUE_DETECTORS: list[dict[str, Any]] = [
     },
     {
         "rule": "phone_valid",
-        "pattern": r"^[
+        "pattern": r"^[+0-9][0-9\s\-().]{6,}$",
         "confidence": "medium",
         "reason_template": "Values match phone format ({matches}/{total} samples)",
         "skip_if_rules": {"phone_valid", "regex"},
@@ -481,7 +495,7 @@ _KNOWN_PATTERNS: list[tuple[str, str, str, str]] = [
     ),
     (
         "IPv4",
-        r"
+        r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$",
         "high",
         "IPv4 address format",
     ),
@@ -493,19 +507,19 @@ _KNOWN_PATTERNS: list[tuple[str, str, str, str]] = [
     ),
     (
         "US zip code",
-        r"
+        r"^[0-9]{5}(-[0-9]{4})?$",
         "medium",
         "US zip code format",
     ),
     (
         "Credit card",
-        r"
+        r"^[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}$",
         "medium",
         "Credit card number format",
     ),
     (
         "SSN-like",
-        r"
+        r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
         "medium",
         "SSN-like format (XXX-XX-XXXX)",
     ),
@@ -643,7 +657,7 @@ def _infer_custom_pattern(
     if desc_parts:
         description = f"{' + '.join(desc_parts)} prefix pattern"
     else:
-        sep_char = matching[0][1]
+        sep_char = matching[0][1]
         n_segments = (most_common_count + 1) // 2
         description = f"structured pattern ({n_segments} segments, '{sep_char}' separator)"
 
@@ -654,7 +668,7 @@ def _infer_custom_pattern(
 # Each entry: (per-char test function, regex class string).
 # Checked in priority order — first match wins.
 _CHAR_CLASSES: list[tuple[Callable[[str], bool], str]] = [
-    (str.isdigit, "
+    (str.isdigit, "[0-9]"),
     (str.isupper, "[A-Z]"),
     (str.islower, "[a-z]"),
     (lambda c: c in "0123456789ABCDEF", "[0-9A-F]"),
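For context: the one-day margin added to timestamp_range suggestions is plain datetime arithmetic on the profiled date strings. In isolation, with example dates:

    from datetime import datetime, timedelta

    min_date_str = "2023-01-01 08:30:00".split(" ")[0]   # keep the date part only
    max_date_str = "2023-06-30".split(" ")[0]

    min_dt = datetime.fromisoformat(min_date_str)
    max_dt = datetime.fromisoformat(max_date_str)
    print((min_dt - timedelta(days=1)).strftime("%Y-%m-%d"))  # 2022-12-31
    print((max_dt + timedelta(days=1)).strftime("%Y-%m-%d"))  # 2023-07-01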
datacheck/reporting/suggestion_engine.py
@@ -136,7 +136,7 @@ class SuggestionEngine:
         },
         "date_format_valid": {
             "message": "Invalid date formats detected",
-            "action": "Standardize
+            "action": "Standardize dates to match the expected format at the source or add date parsing logic",
         },
         "business_days_only": {
             "message": "Records found on non-business days",
@@ -387,7 +387,12 @@ class SuggestionEngine:
             return "Remove duplicate or assign unique identifier"
 
         elif rule_type == "date_format_valid":
-
+            # Extract the specific expected format from the failure reason
+            # Reason format: "Value '...' does not match format '%Y-%m-%d'"
+            if "does not match format '" in reason:
+                fmt = reason.split("does not match format '")[-1].rstrip("'")
+                return f"Convert to expected format: {fmt}"
+            return "Convert to the expected date format specified in the rule"
 
         return "Review and correct the value"
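For context: the format extraction above is simple string surgery on the failure reason. A quick check of the parsing logic against the reason shape documented in the diff's comment:

    reason = "Value '2023/01/15' does not match format '%Y-%m-%d'"
    if "does not match format '" in reason:
        fmt = reason.split("does not match format '")[-1].rstrip("'")
        assert fmt == "%Y-%m-%d"
        print(f"Convert to expected format: {fmt}")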
datacheck/reporting/terminal_reporter.py
@@ -6,6 +6,7 @@ Provides production-grade terminal output for validation results including:
 - Summary statistics with progress bars
 """
 
+import sys
 
 from rich.console import Console
 from rich.panel import Panel
@@ -16,6 +17,23 @@ from datacheck.reporting.suggestion_engine import Suggestion, SuggestionEngine
 from datacheck.results import RuleResult, ValidationSummary
 
 
+def _safe_encoding() -> bool:
+    """Check if stdout can handle Unicode symbols."""
+    encoding = getattr(sys.stdout, "encoding", None) or ""
+    return encoding.lower().replace("-", "") in ("utf8", "utf16", "utf32", "utf16le", "utf16be")
+
+
+# Symbols that degrade gracefully on non-UTF-8 terminals (e.g. Windows cp1252)
+_TICK = "✓" if _safe_encoding() else "v"
+_CROSS = "✗" if _safe_encoding() else "x"
+_WARN = "⚠" if _safe_encoding() else "!"
+_BAR_FILLED = "█" if _safe_encoding() else "#"
+_BAR_EMPTY = "░" if _safe_encoding() else "-"
+_HLINE = "─" if _safe_encoding() else "-"
+_ARROW = "→" if _safe_encoding() else "->"
+_BULLET = "•" if _safe_encoding() else "*"
+
+
 class TerminalReporter:
     """Enhanced terminal reporter with Rich formatting.
 
@@ -89,13 +107,13 @@ class TerminalReporter:
         """
         if summary.all_passed:
             status = Text("ALL CHECKS PASSED", style="bold green")
-            icon = "[green]✓[/green]"
+            icon = f"[green]{_TICK}[/green]"
         elif summary.error_rules > 0 and summary.failed_rules == 0:
             status = Text("VALIDATION ERRORS", style="bold yellow")
-            icon = "[yellow]⚠[/yellow]"
+            icon = f"[yellow]{_WARN}[/yellow]"
         else:
             status = Text("VALIDATION FAILED", style="bold red")
-            icon = "[red]✗[/red]"
+            icon = f"[red]{_CROSS}[/red]"
 
         self.console.print(f"{icon} {status}")
         self.console.print()
@@ -168,7 +186,7 @@ class TerminalReporter:
         """
         filled = int((percentage / 100) * width)
         empty = width - filled
-        bar = "█" * filled + "░" * empty
+        bar = _BAR_FILLED * filled + _BAR_EMPTY * empty
         return f"[{color}]{bar}[/{color}] {percentage:.0f}%"
 
     def _print_failures(self, summary: ValidationSummary) -> None:
@@ -206,7 +224,7 @@ class TerminalReporter:
             rule_type = result.rule_type if result.rule_type else "unknown"
 
             self.console.print(
-                f"[red]✗[/red] [bold]{check_name}[/bold] "
+                f"[red]{_CROSS}[/red] [bold]{check_name}[/bold] "
                 f"([cyan]{result.column}[/cyan] · {rule_type})"
             )
 
@@ -227,7 +245,7 @@ class TerminalReporter:
             check_name = result.check_name if result.check_name else result.rule_name
 
             self.console.print(
-                f"[yellow]⚠[/yellow] [bold]{check_name}[/bold] "
+                f"[yellow]{_WARN}[/yellow] [bold]{check_name}[/bold] "
                 f"([cyan]{result.column}[/cyan])"
            )
            self.console.print(f"  Error: {result.error}", style="yellow")
@@ -272,7 +290,7 @@ class TerminalReporter:
             self.console.print("  [dim]Sample Fixes:[/dim]")
             for fix in suggestion.sample_fixes[:3]:
                 self.console.print(
-                    f"    • {fix['original_value']} → {fix['suggested_fix']}"
+                    f"    {_BULLET} {fix['original_value']} {_ARROW} {fix['suggested_fix']}"
                 )
 
             self.console.print()
@@ -283,11 +301,11 @@ class TerminalReporter:
         Args:
             summary: ValidationSummary for footer
         """
-        self.console.print("─" * 60)
+        self.console.print(_HLINE * 60)
 
         if summary.all_passed:
             self.console.print(
-                "[green]✓ All validation rules passed successfully.[/green]"
+                f"[green]{_TICK} All validation rules passed successfully.[/green]"
            )
        else:
            issues = []
@@ -297,7 +315,7 @@ class TerminalReporter:
                issues.append(f"{summary.error_rules} errors")
            issue_str = ", ".join(issues)
            self.console.print(
-                f"[yellow]⚠ Validation complete with issues: {issue_str}[/yellow]"
+                f"[yellow]{_WARN} Validation complete with issues: {issue_str}[/yellow]"
            )
 
        self.console.print()
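For context: `_safe_encoding` keys the symbol choice off `sys.stdout.encoding`, so redirected output and cp1252 consoles get ASCII stand-ins. A small demonstration of the normalization, with the helper generalized to take any stream for testing (the streams here are test stand-ins, not part of the package):

    import io

    def safe_encoding(stream) -> bool:
        encoding = getattr(stream, "encoding", None) or ""
        return encoding.lower().replace("-", "") in (
            "utf8", "utf16", "utf32", "utf16le", "utf16be")

    assert safe_encoding(io.TextIOWrapper(io.BytesIO(), encoding="UTF-8"))
    assert not safe_encoding(io.TextIOWrapper(io.BytesIO(), encoding="cp1252"))
    assert not safe_encoding(io.StringIO())  # no usable encoding attribute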
datacheck/rules/semantic_rules.py
@@ -184,23 +184,48 @@ class PhoneValidRule(Rule):
                 check_name=self.name,
             )
 
+        # When pandas/PyArrow loads purely numeric phone numbers, the
+        # column ends up as int64 or float64. Converting with plain
+        # .astype(str) produces values like "1234567890.0" which break
+        # phone-number parsing. Detect numeric dtypes and convert via
+        # integer casting first so the trailing ".0" is stripped.
+        _is_numeric_col = pd.api.types.is_numeric_dtype(data)
+        if _is_numeric_col:
+            data_str = data.astype("Int64").astype(str)
+        else:
+            data_str = data.astype(str)
+
         # Vectorized regex pre-filter: fast-reject values without phone-like characters
         # Valid phones contain digits and may have +, -, (, ), spaces, dots
-        data_str = data.astype(str)
         pre_filter = data_str.str.match(
-            r"^[
+            r"^[+0-9][0-9\s\-().]{4,}$", na=False
         )
 
         # Only run expensive phonenumbers parsing on candidates
-        candidates =
+        candidates = data_str[pre_filter]
         if len(candidates) > 0:
             def is_valid_phone(value: Any) -> bool:
                 """Check whether a single value is a valid phone number."""
+                str_val = str(value)
                 try:
-                    parsed = phonenumbers.parse(
-
+                    parsed = phonenumbers.parse(str_val, self.country_code)
+                    if phonenumbers.is_valid_number(parsed):
+                        return True
                 except phonenumbers.NumberParseException:
-
+                    pass
+                # When CSV loaders (PyArrow) parse "+1234..." as a number,
+                # the "+" is lost. Retry with "+" prefix for digit-only
+                # values that could be international numbers (country code
+                # 1-3 digits + national number, minimum ~8 digits total).
+                if _is_numeric_col and str_val.isdigit() and len(str_val) >= 8:
+                    try:
+                        parsed = phonenumbers.parse(
+                            "+" + str_val, self.country_code
+                        )
+                        return bool(phonenumbers.is_valid_number(parsed))
+                    except phonenumbers.NumberParseException:
+                        pass
+                return False
 
             detailed_mask = candidates.apply(is_valid_phone)
             valid_mask = pre_filter.copy()
@@ -221,7 +246,7 @@ class PhoneValidRule(Rule):
                 check_name=self.name,
             )
 
-            failed_values =
+            failed_values = data_str.loc[invalid_indices]
             reasons = [f"'{v}' is not a valid phone number" for v in failed_values.iloc[:100]]
 
             failure_detail = self._create_failure_detail(
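For context: the `Int64` round-trip above is the standard pandas way to strip the ".0" that float promotion adds to numeric columns. A quick demonstration with an example value:

    import pandas as pd

    s = pd.Series([15551234567.0, None])              # numeric column with a null
    assert s.astype(str).iloc[0] == "15551234567.0"   # breaks phone parsing

    s_int = s.astype("Int64").astype(str)             # nullable Int64 strips the ".0"
    assert s_int.iloc[0] == "15551234567"
    assert s_int.iloc[1] == "<NA>"                    # nulls become the string "<NA>"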
datacheck/rules/temporal_rules.py
@@ -449,9 +449,23 @@ class DateFormatValidRule(Rule):
                 check_name=self.name,
             )
 
+        # If the column is already datetime (numpy datetime64 or
+        # PyArrow timestamp), format it using the expected format
+        # string so the round-trip check works correctly regardless
+        # of the target format.
+        _is_datetime = pd.api.types.is_datetime64_any_dtype(data) or (
+            isinstance(data.dtype, pd.ArrowDtype)
+            and hasattr(data, "dt")
+        )
+        if _is_datetime:
+            dt_series = data.astype("datetime64[ns]")
+            str_data = dt_series.dt.strftime(self.format_string)
+        else:
+            str_data = data.astype(str)
+
         # Vectorized date format validation via pd.to_datetime
         parsed = pd.to_datetime(
-
+            str_data, format=self.format_string, errors="coerce"
         )
         valid_mask = parsed.notna()
         invalid_indices = data.index[~valid_mask]
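For context: when the loader has already parsed a column to datetime64, string-casting it yields pandas' default ISO-style form, which fails a round-trip check against any other expected format. The strftime step above avoids that; in isolation, with an example format:

    import pandas as pd

    data = pd.Series(pd.to_datetime(["2023-01-15", "2023-02-20"]))
    fmt = "%d/%m/%Y"

    naive = pd.to_datetime(data.astype(str), format=fmt, errors="coerce")
    assert naive.isna().all()          # ISO strings do not match %d/%m/%Y

    str_data = data.dt.strftime(fmt)   # render with the expected format first
    parsed = pd.to_datetime(str_data, format=fmt, errors="coerce")
    assert parsed.notna().all()        # round-trip validates correctly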