datacheck-cli 2.0.0__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/PKG-INFO +1 -1
  2. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/__init__.py +1 -1
  3. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/schema.py +13 -3
  4. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/validate.py +1 -1
  5. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/generator.py +48 -10
  6. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/loader.py +39 -17
  7. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/validator.py +7 -4
  8. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/engine.py +45 -6
  9. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/profiler.py +25 -10
  10. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/suggestions.py +27 -13
  11. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/suggestion_engine.py +7 -2
  12. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/terminal_reporter.py +28 -10
  13. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/semantic_rules.py +32 -7
  14. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/temporal_rules.py +15 -1
  15. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/pyproject.toml +1 -1
  16. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/LICENSE +0 -0
  17. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/README_PYPI.md +0 -0
  18. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/__main__.py +0 -0
  19. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/airflow/__init__.py +0 -0
  20. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/airflow/operators.py +0 -0
  21. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/__init__.py +0 -0
  22. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/config.py +0 -0
  23. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/profile.py +0 -0
  24. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/__init__.py +0 -0
  25. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/parser.py +0 -0
  26. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/sample_data.py +0 -0
  27. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/schema.py +0 -0
  28. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/source.py +0 -0
  29. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/__init__.py +0 -0
  30. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/basic.yaml +0 -0
  31. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/ecommerce.yaml +0 -0
  32. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/finance.yaml +0 -0
  33. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/healthcare.yaml +0 -0
  34. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/iot.yaml +0 -0
  35. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/saas.yaml +0 -0
  36. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/sources.yaml +0 -0
  37. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/__init__.py +0 -0
  38. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/azure.py +0 -0
  39. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/base.py +0 -0
  40. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/bigquery.py +0 -0
  41. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/cloud_base.py +0 -0
  42. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/factory.py +0 -0
  43. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/gcs.py +0 -0
  44. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/mssql.py +0 -0
  45. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/mysql.py +0 -0
  46. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/postgresql.py +0 -0
  47. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/redshift.py +0 -0
  48. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/s3.py +0 -0
  49. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/snowflake.py +0 -0
  50. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/exceptions.py +0 -0
  51. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/loader.py +0 -0
  52. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/__init__.py +0 -0
  53. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/config.py +0 -0
  54. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/filters.py +0 -0
  55. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/formatters.py +0 -0
  56. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/utils.py +0 -0
  57. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/notifications/__init__.py +0 -0
  58. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/notifications/slack.py +0 -0
  59. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/output.py +0 -0
  60. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/__init__.py +0 -0
  61. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/executor.py +0 -0
  62. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/progress.py +0 -0
  63. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/__init__.py +0 -0
  64. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/decorators.py +0 -0
  65. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/loader.py +0 -0
  66. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/registry.py +0 -0
  67. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/__init__.py +0 -0
  68. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/__init__.py +0 -0
  69. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/json_formatter.py +0 -0
  70. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/markdown_formatter.py +0 -0
  71. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/terminal_formatter.py +0 -0
  72. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/models.py +0 -0
  73. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/outliers.py +0 -0
  74. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/quality.py +0 -0
  75. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/statistics.py +0 -0
  76. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/__init__.py +0 -0
  77. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/csv_exporter.py +0 -0
  78. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/distribution_analyzer.py +0 -0
  79. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/json_reporter.py +0 -0
  80. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/results.py +0 -0
  81. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/__init__.py +0 -0
  82. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/base.py +0 -0
  83. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/composite_rules.py +0 -0
  84. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/factory.py +0 -0
  85. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/null_rules.py +0 -0
  86. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/numeric_rules.py +0 -0
  87. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/string_rules.py +0 -0
  88. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/__init__.py +0 -0
  89. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/sampler.py +0 -0
  90. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/strategies.py +0 -0
  91. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/__init__.py +0 -0
  92. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/baseline.py +0 -0
  93. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/comparator.py +0 -0
  94. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/detector.py +0 -0
  95. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/models.py +0 -0
  96. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/security/__init__.py +0 -0
  97. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/security/validators.py +0 -0
  98. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/utils/__init__.py +0 -0
  99. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/utils/connection_parser.py +0 -0
  100. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/__init__.py +0 -0
  101. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/config.py +0 -0
  102. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/rules.py +0 -0
  103. {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/validator.py +0 -0
--- datacheck_cli-2.0.0/PKG-INFO
+++ datacheck_cli-2.0.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datacheck-cli
-Version: 2.0.0
+Version: 2.0.1
 Summary: CLI-first data validation tool for data engineers. Catch bad data before it breaks pipelines.
 License: Apache-2.0
 License-File: LICENSE

--- datacheck_cli-2.0.0/datacheck/__init__.py
+++ datacheck_cli-2.0.1/datacheck/__init__.py
@@ -37,7 +37,7 @@ from datacheck.profiling.formatters import (
     TerminalFormatter,
 )
 
-__version__ = "2.0.0"
+__version__ = "2.0.1"
 __author__ = "Squrtech"
 __email__ = "contact@squrtech.com"
 

--- datacheck_cli-2.0.0/datacheck/cli/schema.py
+++ datacheck_cli-2.0.1/datacheck/cli/schema.py
@@ -1,5 +1,6 @@
 """Schema commands for DataCheck CLI."""
 
+import sys
 from pathlib import Path
 
 import typer
@@ -8,6 +9,15 @@ from rich.table import Table
 import pandas as pd
 
 from datacheck.cli import console
+
+
+def _safe_encoding() -> bool:
+    """Check if stdout can handle Unicode symbols."""
+    encoding = getattr(sys.stdout, "encoding", None) or ""
+    return encoding.lower().replace("-", "") in ("utf8", "utf16", "utf32", "utf16le", "utf16be")
+
+
+_TICK = "✓" if _safe_encoding() else "v"
 from datacheck.exceptions import DataLoadError
 
 # Schema sub-app for schema evolution commands
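
The `_safe_encoding` helper added above gates Unicode symbols on the active stdout encoding. A minimal standalone sketch of the fallback it enables (the cp1252 wrapper simulates a legacy Windows console and is illustrative, not part of the package):

    import io
    import sys

    # Simulate a non-UTF-8 console such as Windows cp1252.
    sys.stdout = io.TextIOWrapper(io.BytesIO(), encoding="cp1252")

    encoding = (getattr(sys.stdout, "encoding", None) or "").lower().replace("-", "")
    tick = "✓" if encoding in ("utf8", "utf16", "utf32", "utf16le", "utf16be") else "v"
    # Printing "✓" to this stream would raise UnicodeEncodeError; the
    # fallback symbol avoids that.
    assert tick == "v"
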
@@ -301,12 +311,12 @@ def schema_capture(
     # Save baseline
     manager = BaselineManager(baseline_dir=baseline_dir)
     filepath = manager.save_baseline(schema, name=name)
-    console.print(f"[green][/green] Schema saved to: {filepath}")
+    console.print(f"[green]{_TICK}[/green] Schema saved to: {filepath}")
 
     # Save to history if requested
     if save_history:
         history_path = manager.save_to_history(schema)
-        console.print(f"[green][/green] Schema added to history: {history_path}")
+        console.print(f"[green]{_TICK}[/green] Schema added to history: {history_path}")
 
     # Display summary
     console.print("\n[bold]Schema Summary:[/bold]")
@@ -493,7 +503,7 @@ def schema_compare(
     else:
         # Terminal output
         if not comparison.changes:
-            console.print("[green] No schema changes detected[/green]")
+            console.print(f"[green]{_TICK} No schema changes detected[/green]")
         else:
             # Compatibility summary
             compat_style = {

--- datacheck_cli-2.0.0/datacheck/cli/validate.py
+++ datacheck_cli-2.0.1/datacheck/cli/validate.py
@@ -524,7 +524,7 @@ def validate(
             "loading_inline_data_source",
             extra={"type": inline_source.type, "path": str(source_path)},
         )
-        summary = engine.validate_file(str(source_path))
+        summary = engine.validate_file(str(source_path), **inline_source.options)
         logger.info(
             "data_loaded",
             extra={"source_type": "inline", "path": str(source_path)},

--- datacheck_cli-2.0.0/datacheck/config/generator.py
+++ datacheck_cli-2.0.1/datacheck/config/generator.py
@@ -105,7 +105,7 @@ class ConfigGenerator:
         col_names = cc_rule["columns"]
         cc_check: dict[str, Any] = {
             "name": f"cross_{'_'.join(col_names[:2])}_{cc_rule['rule']}",
-            "columns": col_names,
+            "column": col_names[0],
             "rules": {cc_rule["rule"]: cc_rule["params"]},
             "description": cc_rule.get("reason", "Cross-column rule"),
         }
@@ -130,6 +130,7 @@ class ConfigGenerator:
         config["checks"] = checks
 
         config["reporting"] = {
+            "output_path": "./output",
             "export_failures": True,
         }
 
@@ -237,8 +238,20 @@ class ConfigGenerator:
         """
         from datacheck.loader import LoaderFactory
 
+        data_path = Path(data_path)
         df = LoaderFactory.load(str(data_path), **load_kwargs)
-        name = Path(data_path).stem
+        name = data_path.stem
+
+        # Determine source type from file extension
+        ext = data_path.suffix.lower().lstrip(".")
+        source_type_map = {
+            "csv": "csv",
+            "parquet": "parquet",
+            "pq": "parquet",
+            "json": "json",
+            "avro": "avro",
+        }
+        source_type = source_type_map.get(ext, "csv")
 
         if return_profile:
             if confidence_threshold not in self.CONFIDENCE_LEVELS:
@@ -250,11 +263,20 @@ class ConfigGenerator:
             config = self.generate_from_profile(
                 profile, confidence_threshold=confidence_threshold
             )
+            config["data_source"] = {
+                "type": source_type,
+                "path": f"./{data_path.name}",
+            }
             return config, profile
 
-        return self.generate_from_dataframe(
+        config = self.generate_from_dataframe(
             df, name=name, confidence_threshold=confidence_threshold
         )
+        config["data_source"] = {
+            "type": source_type,
+            "path": f"./{data_path.name}",
+        }
+        return config
 
     def save_config(
         self,
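
With this change, configs generated from a data file carry a `data_source` block inferred from the file extension. A sketch of the expected result (the entry-point method name `generate_from_file` and the sample file are assumptions; the diff shows only the method body):

    from datacheck.config.generator import ConfigGenerator

    # Hypothetical call; "orders.parquet" is an invented sample file.
    config = ConfigGenerator().generate_from_file("orders.parquet")
    assert config["data_source"] == {"type": "parquet", "path": "./orders.parquet"}
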
@@ -301,7 +323,7 @@ class ConfigGenerator:
 
         # Version
         if "version" in config:
-            lines.append(f"version: \"{config['version']}\"")
+            lines.append(f"version: '{config['version']}'")
         lines.append("")
 
         # Metadata
@@ -310,11 +332,27 @@ class ConfigGenerator:
             lines.append("metadata:")
             for key, value in config["metadata"].items():
                 if isinstance(value, str):
-                    lines.append(f" {key}: \"{value}\"")
+                    lines.append(f" {key}: '{value}'")
                 else:
                     lines.append(f" {key}: {value}")
             lines.append("")
 
+        # Data source
+        if "data_source" in config:
+            ds = config["data_source"]
+            lines.append("# Data source configuration")
+            lines.append("data_source:")
+            lines.append(f" type: {ds['type']}")
+            lines.append(f" path: '{ds['path']}'")
+            if "options" in ds and ds["options"]:
+                lines.append(" options:")
+                for key, value in ds["options"].items():
+                    if isinstance(value, str):
+                        lines.append(f" {key}: '{value}'")
+                    else:
+                        lines.append(f" {key}: {value}")
+            lines.append("")
+
         # Checks
         lines.append("# Validation checks")
         lines.append("# Each check validates a single column with one or more rules")
@@ -333,7 +371,7 @@ class ConfigGenerator:
             lines.append(f" column: {check['column']}")
 
             if "description" in check:
-                lines.append(f" description: \"{check['description']}\"")
+                lines.append(f" description: '{check['description']}'")
 
             lines.append(" rules:")
             rule_reasons = check.get("_rule_reasons", {})
@@ -401,13 +439,13 @@ class ConfigGenerator:
             lines.append(f"{prefix}{rule_name}:")
             for k, v in rule_value.items():
                 if isinstance(v, str):
-                    lines.append(f"{sub_prefix}{k}: \"{v}\"")
+                    lines.append(f"{sub_prefix}{k}: '{v}'")
                 elif isinstance(v, list):
                     lines.append(f"{sub_prefix}{k}:")
                     item_prefix = " # " if commented else " "
                     for item in v:
                         if isinstance(item, str):
-                            lines.append(f"{item_prefix}- \"{item}\"")
+                            lines.append(f"{item_prefix}- '{item}'")
                         else:
                             lines.append(f"{item_prefix}- {item}")
                 else:
@@ -419,11 +457,11 @@ class ConfigGenerator:
             lines.append(f"{prefix}{rule_name}:")
             for item in rule_value:
                 if isinstance(item, str):
-                    lines.append(f"{sub_prefix}- \"{item}\"")
+                    lines.append(f"{sub_prefix}- '{item}'")
                 else:
                     lines.append(f"{sub_prefix}- {item}")
         elif isinstance(rule_value, str):
-            lines.append(f"{prefix}{rule_name}: \"{rule_value}\"{comment}")
+            lines.append(f"{prefix}{rule_name}: '{rule_value}'{comment}")
         else:
             lines.append(f"{prefix}{rule_name}: {rule_value}{comment}")
 

--- datacheck_cli-2.0.0/datacheck/config/loader.py
+++ datacheck_cli-2.0.1/datacheck/config/loader.py
@@ -1,6 +1,6 @@
 """Configuration parsing and validation (original config module)."""
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
@@ -81,10 +81,12 @@ class DataSourceConfig:
     Attributes:
         type: Source type (csv, parquet, json, excel, delta)
         path: Path to the data file (relative to config file or absolute)
+        options: Loader-specific options (e.g. encoding, delimiter for CSV)
     """
 
     type: str
     path: str
+    options: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self) -> None:
         """Validate data source configuration."""
@@ -278,10 +280,14 @@ class ConfigLoader:
         if not path.is_file():
             raise ConfigurationError(f"Configuration path is not a file: {config_path}")
 
-        # Read and parse YAML
+        # Read and parse YAML (with env-var substitution and extends resolution)
         try:
-            with open(path, encoding="utf-8") as f:
-                data = yaml.safe_load(f)
+            from datacheck.config.parser import ConfigParser
+
+            parser = ConfigParser()
+            data = parser.load(path, resolve_env=True, resolve_extends=True)
+        except ConfigurationError:
+            raise
         except yaml.YAMLError as e:
             raise ConfigurationError(f"Invalid YAML in {config_path}: {e}") from e
         except Exception as e:
@@ -304,26 +310,33 @@ class ConfigLoader:
                 f"'checks' must be a list, got {type(data['checks']).__name__}"
             )
 
-        # Parse checks
+        # Parse checks — collect all errors before raising
         checks = []
+        check_errors: list[str] = []
         for idx, check_data in enumerate(data["checks"]):
             if not isinstance(check_data, dict):
-                raise ConfigurationError(
+                check_errors.append(
                     f"Check at index {idx} must be a dictionary, "
                     f"got {type(check_data).__name__}"
                 )
+                continue
 
             # Validate required fields
+            missing = False
             if "name" not in check_data:
-                raise ConfigurationError(f"Check at index {idx} missing 'name' field")
+                check_errors.append(f"Check at index {idx} missing 'name' field")
+                missing = True
             if "column" not in check_data:
-                raise ConfigurationError(
-                    f"Check '{check_data.get('name', idx)}' missing 'column' field"
-                )
+                name = check_data.get("name", f"index {idx}")
+                check_errors.append(f"Check '{name}' missing 'column' field")
+                missing = True
             if "rules" not in check_data:
-                raise ConfigurationError(
-                    f"Check '{check_data['name']}' missing 'rules' field"
-                )
+                name = check_data.get("name", f"index {idx}")
+                check_errors.append(f"Check '{name}' missing 'rules' field")
+                missing = True
+
+            if missing:
+                continue
 
             try:
                 rule_config = RuleConfig(
@@ -338,12 +351,17 @@ class ConfigLoader:
                 # Only add enabled checks
                 if rule_config.enabled:
                     checks.append(rule_config)
-            except ConfigurationError:
-                raise
+            except ConfigurationError as e:
+                check_errors.append(str(e))
             except Exception as e:
-                raise ConfigurationError(
+                check_errors.append(
                     f"Error parsing check '{check_data.get('name', idx)}': {e}"
-                ) from e
+                )
+
+        if check_errors:
+            raise ConfigurationError(
+                "Configuration has errors:\n - " + "\n - ".join(check_errors)
+            )
 
         # Parse plugins (optional)
         plugins = data.get("plugins", [])
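
The loader now surfaces every malformed check in one exception instead of failing on the first. A sketch of just the aggregation step, with invented error strings:

    check_errors = [
        "Check at index 0 missing 'name' field",
        "Check 'age_check' missing 'rules' field",
    ]
    if check_errors:
        # ConfigurationError in the package; RuntimeError here to stay standalone.
        raise RuntimeError(
            "Configuration has errors:\n - " + "\n - ".join(check_errors)
        )
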
@@ -400,9 +418,13 @@ class ConfigLoader:
             raise ConfigurationError("'data_source' missing 'path' field")
 
         try:
+            options = ds_data.get("options", {})
+            if not isinstance(options, dict):
+                raise ConfigurationError("'data_source.options' must be a dictionary")
             data_source = DataSourceConfig(
                 type=ds_data["type"],
                 path=ds_data["path"],
+                options=options,
             )
         except ConfigurationError:
             raise

--- datacheck_cli-2.0.0/datacheck/config/validator.py
+++ datacheck_cli-2.0.1/datacheck/config/validator.py
@@ -137,6 +137,8 @@ class ConfigValidator:
         """
         Validate config against JSON schema.
 
+        Collects all schema errors at once instead of stopping at the first.
+
         Args:
             config: Config dictionary
 
@@ -146,10 +148,11 @@ class ConfigValidator:
         errors: list[str] = []
 
         try:
-            jsonschema.validate(instance=config, schema=self.schema)
-        except jsonschema.ValidationError as e:
-            path = ".".join(str(p) for p in e.path) if e.path else "root"
-            errors.append(f"Schema validation failed at '{path}': {e.message}")
+            validator_cls = jsonschema.Draft7Validator
+            validator = validator_cls(self.schema)
+            for error in sorted(validator.iter_errors(config), key=str):
+                path = ".".join(str(p) for p in error.path) if error.path else "root"
+                errors.append(f"Schema validation failed at '{path}': {error.message}")
         except jsonschema.SchemaError as e:
             errors.append(f"Invalid schema: {e.message}")
 
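
Switching from `jsonschema.validate` to `Draft7Validator.iter_errors` is the standard way to collect every violation in one pass. A self-contained sketch with an invented schema:

    import jsonschema

    schema = {
        "type": "object",
        "properties": {"version": {"type": "string"}, "checks": {"type": "array"}},
    }
    config = {"version": 2, "checks": "not-a-list"}  # two violations

    validator = jsonschema.Draft7Validator(schema)
    errors = []
    for error in sorted(validator.iter_errors(config), key=str):
        path = ".".join(str(p) for p in error.path) if error.path else "root"
        errors.append(f"Schema validation failed at '{path}': {error.message}")
    assert len(errors) == 2  # both reported, not just the first
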
--- datacheck_cli-2.0.0/datacheck/engine.py
+++ datacheck_cli-2.0.1/datacheck/engine.py
@@ -357,19 +357,58 @@ class ValidationEngine:
             )
             source_checks.setdefault(effective_source, []).append(check)
 
-        # Validate each source's checks
-        all_results: list[RuleResult] = []
-        _total_rows = 0
-        _total_columns = 0
-        for src_name, checks in source_checks.items():
+        # Pre-validate: verify all sources exist and connections work
+        # before running any checks. This gives a single clear error
+        # instead of repeating "connection failed" for every check.
+        connection_errors: list[str] = []
+        for src_name in source_checks:
             if src_name not in self._sources:
-                raise ConfigurationError(
+                connection_errors.append(
                     f"Source '{src_name}' not found in sources file. "
                     f"Available sources: {', '.join(sorted(self._sources.keys()))}"
                 )
+                continue
 
             source_config: SourceConfig = self._sources[src_name]
 
+            if source_config.is_database:
+                # Test database connectivity
+                try:
+                    from datacheck.connectors.factory import create_connector
+
+                    connector = create_connector(source_config)
+                    connector.connect()
+                    connector.disconnect()
+                except Exception as e:
+                    connection_errors.append(
+                        f"Source '{src_name}' ({source_config.type}): "
+                        f"Connection failed — {e}"
+                    )
+            elif source_config.is_file:
+                # Test file accessibility
+                from pathlib import Path as _Path
+
+                file_path = source_config.connection.get("path", "")
+                if file_path and not _Path(file_path).exists():
+                    connection_errors.append(
+                        f"Source '{src_name}' ({source_config.type}): "
+                        f"File not found — {file_path}"
+                    )
+
+        if connection_errors:
+            raise ConfigurationError(
+                "Source connectivity check failed:\n - "
+                + "\n - ".join(connection_errors)
+            )
+
+        # Validate each source's checks
+        all_results: list[RuleResult] = []
+        _total_rows = 0
+        _total_columns = 0
+        for src_name, checks in source_checks.items():
+
+            source_config = self._sources[src_name]
+
             # Determine table for this group of checks
             # All checks in this group share the same source, but may have different tables
             # We need to sub-group by table for database sources
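
The pre-validation walk fails fast with one aggregated message. For file-backed sources it reduces to a plain existence check; a sketch with invented source names:

    from pathlib import Path

    sources = {"orders": "./data/orders.csv", "users": "./data/users.csv"}
    connection_errors = [
        f"Source '{name}' (csv): File not found — {path}"
        for name, path in sources.items()
        if not Path(path).exists()
    ]
    if connection_errors:
        # ConfigurationError in the package; RuntimeError here to stay standalone.
        raise RuntimeError(
            "Source connectivity check failed:\n - " + "\n - ".join(connection_errors)
        )
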
--- datacheck_cli-2.0.0/datacheck/profiling/profiler.py
+++ datacheck_cli-2.0.1/datacheck/profiling/profiler.py
@@ -1,10 +1,13 @@
 """Data profiling and quality analysis."""
 
+import logging
 import re
 from datetime import datetime as dt
 
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
 from datacheck.profiling.models import ColumnProfile, DatasetProfile
 from datacheck.profiling.outliers import OutlierDetector, OutlierMethod
 from datacheck.profiling.quality import QualityScorer
@@ -147,7 +150,19 @@ class DataProfiler:
             profile.inferred_type = "boolean"
 
         elif pd.api.types.is_numeric_dtype(series):
-            profile.inferred_type = "numeric"
+            # Distinguish integer vs float types
+            if pd.api.types.is_integer_dtype(series):
+                profile.inferred_type = "integer"
+            elif pd.api.types.is_float_dtype(series):
+                # Check if all non-null values are whole numbers
+                # (common when nulls force int->float promotion)
+                non_null = series.dropna()
+                if len(non_null) > 0 and (non_null == non_null.astype(int)).all():
+                    profile.inferred_type = "integer"
+                else:
+                    profile.inferred_type = "numeric"
+            else:
+                profile.inferred_type = "numeric"
             stats = self.stats_calc.calculate_numeric_stats(series)
             profile.min_value = stats["min"]
             profile.max_value = stats["max"]
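
The whole-number check targets the classic pandas gotcha where nulls promote an integer column to float64. A quick demonstration:

    import pandas as pd

    s = pd.Series([1, 2, None])   # nulls force int -> float promotion
    assert str(s.dtype) == "float64"

    non_null = s.dropna()
    # Every surviving value is whole, so the column can be profiled as
    # "integer" instead of the generic "numeric".
    assert len(non_null) > 0 and (non_null == non_null.astype(int)).all()
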
@@ -224,7 +239,7 @@ class DataProfiler:
                 if len(dt_values) > 0:
                     profile.weekday_only = bool((dt_values.dt.dayofweek < 5).all())
             except Exception:
-                pass
+                logger.debug("Weekday analysis failed for column '%s'", series.name)
 
         # Sample values (for value-based rule detection)
         non_null_sample = series.dropna()
@@ -259,7 +274,7 @@ class DataProfiler:
         # --- sum_equals detection ---
         numeric_cols = [
             name for name, cp in profile.columns.items()
-            if cp.inferred_type == "numeric"
+            if cp.inferred_type in ("numeric", "integer")
         ]
         # Only check if manageable number of columns (<=15 numeric)
         if 3 <= len(numeric_cols) <= 15:
@@ -294,10 +309,10 @@ class DataProfiler:
                     if close.sum() / valid >= 0.95:
                         rules.append({
                             "rule": "sum_equals",
-                            "columns": [col_a, col_b, target],
+                            "columns": [target, col_a, col_b],
                             "params": {
-                                "columns": [col_a, col_b],
-                                "target": target,
+                                "column_a": col_a,
+                                "column_b": col_b,
                             },
                             "confidence": "high",
                             "reason": (
@@ -306,7 +321,7 @@ class DataProfiler:
                             ),
                         })
                 except Exception:
-                    pass
+                    logger.debug("sum_equals check failed for %s + %s = %s", col_a, col_b, target)
 
         # --- unique_combination detection ---
         cat_cols = [
@@ -326,7 +341,7 @@ class DataProfiler:
                     rules.append({
                         "rule": "unique_combination",
                         "columns": [col_a, col_b],
-                        "params": {"columns": [col_a, col_b]},
+                        "params": [col_a, col_b],
                         "confidence": "medium",
                         "reason": (
                             f"Combination of {col_a} and {col_b} "
@@ -334,7 +349,7 @@ class DataProfiler:
                         ),
                     })
                 except Exception:
-                    pass
+                    logger.debug("unique_combination check failed for %s, %s", col_a, col_b)
 
         return rules
 
@@ -394,7 +409,7 @@ class DataProfiler:
                     dt.strptime(val, fmt)
                     count += 1
                 except (ValueError, TypeError):
-                    pass
+                    continue  # Value doesn't match this format
             if count > best_count:
                 best_count = count
                 best_format = fmt

--- datacheck_cli-2.0.0/datacheck/profiling/suggestions.py
+++ datacheck_cli-2.0.1/datacheck/profiling/suggestions.py
@@ -3,11 +3,14 @@
 from __future__ import annotations
 
 import json
+import logging
 import re
-from datetime import datetime
+from datetime import datetime, timedelta
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from datacheck.profiling.models import ColumnProfile
 
@@ -90,6 +93,7 @@ class RuleSuggester:
         inferred = getattr(profile, "inferred_type", None)
         if inferred:
             type_map = {
+                "integer": "int",
                 "numeric": "numeric",
                 "boolean": "bool",
                 "datetime": "date",
@@ -291,12 +295,22 @@ class RuleSuggester:
 
         # --- Temporal rules ---
         if profile.min_date is not None and profile.max_date is not None:
-            # timestamp_range
+            # timestamp_range — add 1-day margin on each side so edge
+            # values don't fail due to profiling-time rounding
+            min_date_str = profile.min_date.split(" ")[0]
+            max_date_str = profile.max_date.split(" ")[0]
+            try:
+                min_dt = datetime.fromisoformat(min_date_str)
+                max_dt = datetime.fromisoformat(max_date_str)
+                min_date_str = (min_dt - timedelta(days=1)).strftime("%Y-%m-%d")
+                max_date_str = (max_dt + timedelta(days=1)).strftime("%Y-%m-%d")
+            except (ValueError, TypeError):
+                pass  # Keep original strings if parsing fails
             suggestions.append({
                 "rule": "timestamp_range",
                 "params": {
-                    "min": profile.min_date.split(" ")[0],
-                    "max": profile.max_date.split(" ")[0],
+                    "min": min_date_str,
+                    "max": max_date_str,
                 },
                 "confidence": "medium",
                 "reason": f"Dates range from {profile.min_date} to {profile.max_date}",
@@ -314,7 +328,7 @@ class RuleSuggester:
                 "reason": "No future dates detected in data",
             })
         except (ValueError, TypeError):
-            pass
+            logger.debug("Failed to parse max_date for no_future_timestamps check")
 
         # business_days_only
         weekday_only = getattr(profile, "weekday_only", None)
@@ -360,7 +374,7 @@ class RuleSuggester:
                 json.loads(s)
                 json_count += 1
             except (json.JSONDecodeError, TypeError):
-                pass
+                continue  # Not valid JSON
         if len(sample[:20]) > 0 and json_count >= len(sample[:20]) * 0.8:
             suggestions.append({
                 "rule": "json_valid",
@@ -425,7 +439,7 @@ _VALUE_DETECTORS: list[dict[str, Any]] = [
     },
     {
         "rule": "phone_valid",
-        "pattern": r"^[+\d][\d\s\-().]{6,}$",
+        "pattern": r"^[+0-9][0-9\s\-().]{6,}$",
         "confidence": "medium",
        "reason_template": "Values match phone format ({matches}/{total} samples)",
         "skip_if_rules": {"phone_valid", "regex"},
@@ -481,7 +495,7 @@ _KNOWN_PATTERNS: list[tuple[str, str, str, str]] = [
     ),
     (
         "IPv4",
-        r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
+        r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$",
         "high",
         "IPv4 address format",
     ),
@@ -493,19 +507,19 @@ _KNOWN_PATTERNS: list[tuple[str, str, str, str]] = [
     ),
     (
         "US zip code",
-        r"^\d{5}(-\d{4})?$",
+        r"^[0-9]{5}(-[0-9]{4})?$",
         "medium",
         "US zip code format",
     ),
     (
         "Credit card",
-        r"^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}$",
+        r"^[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}$",
         "medium",
         "Credit card number format",
     ),
     (
         "SSN-like",
-        r"^\d{3}-\d{2}-\d{4}$",
+        r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
         "medium",
         "SSN-like format (XXX-XX-XXXX)",
     ),
@@ -643,7 +657,7 @@ def _infer_custom_pattern(
     if desc_parts:
         description = f"{' + '.join(desc_parts)} prefix pattern"
     else:
-        sep_char = matching[0][1] if most_common_count >= 3 else "-"
+        sep_char = matching[0][1]
         n_segments = (most_common_count + 1) // 2
         description = f"structured pattern ({n_segments} segments, '{sep_char}' separator)"
 
@@ -654,7 +668,7 @@
 # Each entry: (per-char test function, regex class string).
 # Checked in priority order — first match wins.
 _CHAR_CLASSES: list[tuple[Callable[[str], bool], str]] = [
-    (str.isdigit, "\\d"),
+    (str.isdigit, "[0-9]"),
     (str.isupper, "[A-Z]"),
     (str.islower, "[a-z]"),
     (lambda c: c in "0123456789ABCDEF", "[0-9A-F]"),
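
The `\d` to `[0-9]` swaps are not cosmetic: in Python 3, `\d` in a str pattern matches any Unicode decimal digit, so suggested patterns could silently accept non-ASCII digits. A demonstration:

    import re

    arabic_three = "\u0663"  # ARABIC-INDIC DIGIT THREE
    assert re.fullmatch(r"\d", arabic_three) is not None   # Unicode-aware
    assert re.fullmatch(r"[0-9]", arabic_three) is None    # ASCII-only
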
--- datacheck_cli-2.0.0/datacheck/reporting/suggestion_engine.py
+++ datacheck_cli-2.0.1/datacheck/reporting/suggestion_engine.py
@@ -136,7 +136,7 @@ class SuggestionEngine:
         },
         "date_format_valid": {
             "message": "Invalid date formats detected",
-            "action": "Standardize date format at source or add date parsing logic",
+            "action": "Standardize dates to match the expected format at the source or add date parsing logic",
         },
         "business_days_only": {
             "message": "Records found on non-business days",
@@ -387,7 +387,12 @@ class SuggestionEngine:
             return "Remove duplicate or assign unique identifier"
 
         elif rule_type == "date_format_valid":
-            return "Convert to expected date format (e.g., YYYY-MM-DD)"
+            # Extract the specific expected format from the failure reason
+            # Reason format: "Value '...' does not match format '%Y-%m-%d'"
+            if "does not match format '" in reason:
+                fmt = reason.split("does not match format '")[-1].rstrip("'")
+                return f"Convert to expected format: {fmt}"
+            return "Convert to the expected date format specified in the rule"
 
         return "Review and correct the value"
 
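
The format extraction leans on the failure reason's fixed phrasing. A sketch of the parsing step with an invented reason string:

    reason = "Value '2024/01/05' does not match format '%Y-%m-%d'"
    if "does not match format '" in reason:
        fmt = reason.split("does not match format '")[-1].rstrip("'")
        assert fmt == "%Y-%m-%d"
        print(f"Convert to expected format: {fmt}")
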
--- datacheck_cli-2.0.0/datacheck/reporting/terminal_reporter.py
+++ datacheck_cli-2.0.1/datacheck/reporting/terminal_reporter.py
@@ -6,6 +6,7 @@ Provides production-grade terminal output for validation results including:
 - Summary statistics with progress bars
 """
 
+import sys
 
 from rich.console import Console
 from rich.panel import Panel
@@ -16,6 +17,23 @@ from datacheck.reporting.suggestion_engine import Suggestion, SuggestionEngine
 from datacheck.results import RuleResult, ValidationSummary
 
 
+def _safe_encoding() -> bool:
+    """Check if stdout can handle Unicode symbols."""
+    encoding = getattr(sys.stdout, "encoding", None) or ""
+    return encoding.lower().replace("-", "") in ("utf8", "utf16", "utf32", "utf16le", "utf16be")
+
+
+# Symbols that degrade gracefully on non-UTF-8 terminals (e.g. Windows cp1252)
+_TICK = "✓" if _safe_encoding() else "v"
+_CROSS = "✗" if _safe_encoding() else "x"
+_WARN = "⚠" if _safe_encoding() else "!"
+_BAR_FILLED = "█" if _safe_encoding() else "#"
+_BAR_EMPTY = "░" if _safe_encoding() else "-"
+_HLINE = "─" if _safe_encoding() else "-"
+_ARROW = "→" if _safe_encoding() else "->"
+_BULLET = "•" if _safe_encoding() else "*"
+
+
 class TerminalReporter:
     """Enhanced terminal reporter with Rich formatting.
 
@@ -89,13 +107,13 @@ class TerminalReporter:
         """
         if summary.all_passed:
             status = Text("ALL CHECKS PASSED", style="bold green")
-            icon = "[green][/green]"
+            icon = f"[green]{_TICK}[/green]"
         elif summary.error_rules > 0 and summary.failed_rules == 0:
             status = Text("VALIDATION ERRORS", style="bold yellow")
-            icon = "[yellow][/yellow]"
+            icon = f"[yellow]{_WARN}[/yellow]"
         else:
             status = Text("VALIDATION FAILED", style="bold red")
-            icon = "[red][/red]"
+            icon = f"[red]{_CROSS}[/red]"
 
         self.console.print(f"{icon} {status}")
         self.console.print()
@@ -168,7 +186,7 @@ class TerminalReporter:
         """
         filled = int((percentage / 100) * width)
         empty = width - filled
-        bar = "█" * filled + "░" * empty
+        bar = _BAR_FILLED * filled + _BAR_EMPTY * empty
         return f"[{color}]{bar}[/{color}] {percentage:.0f}%"
 
     def _print_failures(self, summary: ValidationSummary) -> None:
@@ -206,7 +224,7 @@ class TerminalReporter:
             rule_type = result.rule_type if result.rule_type else "unknown"
 
             self.console.print(
-                f"[red][/red] [bold]{check_name}[/bold] "
+                f"[red]{_CROSS}[/red] [bold]{check_name}[/bold] "
                 f"([cyan]{result.column}[/cyan] · {rule_type})"
             )
 
@@ -227,7 +245,7 @@ class TerminalReporter:
             check_name = result.check_name if result.check_name else result.rule_name
 
             self.console.print(
-                f"[yellow][/yellow] [bold]{check_name}[/bold] "
+                f"[yellow]{_WARN}[/yellow] [bold]{check_name}[/bold] "
                 f"([cyan]{result.column}[/cyan])"
             )
             self.console.print(f" Error: {result.error}", style="yellow")
@@ -272,7 +290,7 @@ class TerminalReporter:
                 self.console.print(" [dim]Sample Fixes:[/dim]")
                 for fix in suggestion.sample_fixes[:3]:
                     self.console.print(
-                        f" {fix['original_value']} {fix['suggested_fix']}"
+                        f" {_BULLET} {fix['original_value']} {_ARROW} {fix['suggested_fix']}"
                     )
 
         self.console.print()
@@ -283,11 +301,11 @@ class TerminalReporter:
         Args:
             summary: ValidationSummary for footer
         """
-        self.console.print("─" * 60)
+        self.console.print(_HLINE * 60)
 
         if summary.all_passed:
             self.console.print(
-                "[green] All validation rules passed successfully.[/green]"
+                f"[green]{_TICK} All validation rules passed successfully.[/green]"
             )
         else:
             issues = []
@@ -297,7 +315,7 @@ class TerminalReporter:
                 issues.append(f"{summary.error_rules} errors")
             issue_str = ", ".join(issues)
             self.console.print(
-                f"[yellow] Validation complete with issues: {issue_str}[/yellow]"
+                f"[yellow]{_WARN} Validation complete with issues: {issue_str}[/yellow]"
             )
 
         self.console.print()

--- datacheck_cli-2.0.0/datacheck/rules/semantic_rules.py
+++ datacheck_cli-2.0.1/datacheck/rules/semantic_rules.py
@@ -184,23 +184,48 @@ class PhoneValidRule(Rule):
                 check_name=self.name,
             )
 
+        # When pandas/PyArrow loads purely numeric phone numbers, the
+        # column ends up as int64 or float64. Converting with plain
+        # .astype(str) produces values like "1234567890.0" which break
+        # phone-number parsing. Detect numeric dtypes and convert via
+        # integer casting first so the trailing ".0" is stripped.
+        _is_numeric_col = pd.api.types.is_numeric_dtype(data)
+        if _is_numeric_col:
+            data_str = data.astype("Int64").astype(str)
+        else:
+            data_str = data.astype(str)
+
         # Vectorized regex pre-filter: fast-reject values without phone-like characters
         # Valid phones contain digits and may have +, -, (, ), spaces, dots
-        data_str = data.astype(str)
         pre_filter = data_str.str.match(
-            r"^[+\d][\d\s\-().]{4,}$", na=False
+            r"^[+0-9][0-9\s\-().]{4,}$", na=False
         )
 
         # Only run expensive phonenumbers parsing on candidates
-        candidates = data[pre_filter]
+        candidates = data_str[pre_filter]
         if len(candidates) > 0:
             def is_valid_phone(value: Any) -> bool:
                 """Check whether a single value is a valid phone number."""
+                str_val = str(value)
                 try:
-                    parsed = phonenumbers.parse(str(value), self.country_code)
-                    return bool(phonenumbers.is_valid_number(parsed))
+                    parsed = phonenumbers.parse(str_val, self.country_code)
+                    if phonenumbers.is_valid_number(parsed):
+                        return True
                 except phonenumbers.NumberParseException:
-                    return False
+                    pass
+                # When CSV loaders (PyArrow) parse "+1234..." as a number,
+                # the "+" is lost. Retry with "+" prefix for digit-only
+                # values that could be international numbers (country code
+                # 1-3 digits + national number, minimum ~8 digits total).
+                if _is_numeric_col and str_val.isdigit() and len(str_val) >= 8:
+                    try:
+                        parsed = phonenumbers.parse(
+                            "+" + str_val, self.country_code
+                        )
+                        return bool(phonenumbers.is_valid_number(parsed))
+                    except phonenumbers.NumberParseException:
+                        pass
+                return False
 
             detailed_mask = candidates.apply(is_valid_phone)
             valid_mask = pre_filter.copy()
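
The `Int64` round-trip is what strips the float artifact while keeping nulls intact. A quick demonstration with an invented number:

    import pandas as pd

    phones = pd.Series([15551234567.0, None])  # float64 after a lossy CSV load
    assert phones.astype(str).iloc[0] == "15551234567.0"  # breaks phone parsing

    via_int = phones.astype("Int64").astype(str)
    assert via_int.iloc[0] == "15551234567"  # trailing ".0" stripped
    assert via_int.iloc[1] == "<NA>"         # nulls survive the conversion
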
@@ -221,7 +246,7 @@ class PhoneValidRule(Rule):
             check_name=self.name,
         )
 
-        failed_values = data.loc[invalid_indices]
+        failed_values = data_str.loc[invalid_indices]
         reasons = [f"'{v}' is not a valid phone number" for v in failed_values.iloc[:100]]
 
         failure_detail = self._create_failure_detail(

--- datacheck_cli-2.0.0/datacheck/rules/temporal_rules.py
+++ datacheck_cli-2.0.1/datacheck/rules/temporal_rules.py
@@ -449,9 +449,23 @@ class DateFormatValidRule(Rule):
             check_name=self.name,
         )
 
+        # If the column is already datetime (numpy datetime64 or
+        # PyArrow timestamp), format it using the expected format
+        # string so the round-trip check works correctly regardless
+        # of the target format.
+        _is_datetime = pd.api.types.is_datetime64_any_dtype(data) or (
+            isinstance(data.dtype, pd.ArrowDtype)
+            and hasattr(data, "dt")
+        )
+        if _is_datetime:
+            dt_series = data.astype("datetime64[ns]")
+            str_data = dt_series.dt.strftime(self.format_string)
+        else:
+            str_data = data.astype(str)
+
         # Vectorized date format validation via pd.to_datetime
         parsed = pd.to_datetime(
-            data.astype(str), format=self.format_string, errors="coerce"
+            str_data, format=self.format_string, errors="coerce"
        )
         valid_mask = parsed.notna()
         invalid_indices = data.index[~valid_mask]
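
Formatting an already-datetime column with the rule's own format string avoids false failures from the default ISO rendering. A demonstration with an assumed non-ISO target format:

    import pandas as pd

    col = pd.to_datetime(pd.Series(["2024-01-05", "2024-02-10"]))
    fmt = "%d/%m/%Y"  # expected format differs from the ISO-style string output

    # Plain astype(str) yields ISO-style strings that pd.to_datetime(format=fmt)
    # would coerce to NaT, flagging valid rows. Formatting first round-trips cleanly.
    str_data = col.dt.strftime(fmt)
    parsed = pd.to_datetime(str_data, format=fmt, errors="coerce")
    assert parsed.notna().all()
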
--- datacheck_cli-2.0.0/pyproject.toml
+++ datacheck_cli-2.0.1/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datacheck-cli"
-version = "2.0.0"
+version = "2.0.1"
 description = "CLI-first data validation tool for data engineers. Catch bad data before it breaks pipelines."
 authors = ["Squrtech <contact@squrtech.com>"]
 readme = "README_PYPI.md"