datacheck-cli 2.0.0__tar.gz → 2.0.1__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/PKG-INFO +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/__init__.py +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/schema.py +13 -3
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/validate.py +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/generator.py +48 -10
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/loader.py +39 -17
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/validator.py +7 -4
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/engine.py +45 -6
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/profiler.py +25 -10
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/suggestions.py +27 -13
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/suggestion_engine.py +7 -2
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/terminal_reporter.py +28 -10
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/semantic_rules.py +32 -7
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/temporal_rules.py +15 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/pyproject.toml +1 -1
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/LICENSE +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/README_PYPI.md +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/__main__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/airflow/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/airflow/operators.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/config.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/cli/profile.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/parser.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/sample_data.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/schema.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/source.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/basic.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/ecommerce.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/finance.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/healthcare.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/iot.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/saas.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/config/templates/sources.yaml +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/azure.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/base.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/bigquery.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/cloud_base.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/factory.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/gcs.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/mssql.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/mysql.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/postgresql.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/redshift.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/s3.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/connectors/snowflake.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/exceptions.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/loader.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/config.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/filters.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/formatters.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/logging/utils.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/notifications/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/notifications/slack.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/output.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/executor.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/parallel/progress.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/decorators.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/loader.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/plugins/registry.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/json_formatter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/markdown_formatter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/formatters/terminal_formatter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/models.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/outliers.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/quality.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/profiling/statistics.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/csv_exporter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/distribution_analyzer.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/reporting/json_reporter.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/results.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/base.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/composite_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/factory.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/null_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/numeric_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/rules/string_rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/sampler.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/sampling/strategies.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/baseline.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/comparator.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/detector.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/schema/models.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/security/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/security/validators.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/utils/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/utils/connection_parser.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/__init__.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/config.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/rules.py +0 -0
- {datacheck_cli-2.0.0 → datacheck_cli-2.0.1}/datacheck/validation/validator.py +0 -0
datacheck/cli/schema.py
@@ -1,5 +1,6 @@
 """Schema commands for DataCheck CLI."""
 
+import sys
 from pathlib import Path
 
 import typer
@@ -8,6 +9,15 @@ from rich.table import Table
 import pandas as pd
 
 from datacheck.cli import console
+
+
+def _safe_encoding() -> bool:
+    """Check if stdout can handle Unicode symbols."""
+    encoding = getattr(sys.stdout, "encoding", None) or ""
+    return encoding.lower().replace("-", "") in ("utf8", "utf16", "utf32", "utf16le", "utf16be")
+
+
+_TICK = "✓" if _safe_encoding() else "v"
 from datacheck.exceptions import DataLoadError
 
 # Schema sub-app for schema evolution commands
@@ -301,12 +311,12 @@ def schema_capture(
     # Save baseline
     manager = BaselineManager(baseline_dir=baseline_dir)
     filepath = manager.save_baseline(schema, name=name)
-    console.print(f"[green]✓[/green] Schema saved to: {filepath}")
+    console.print(f"[green]{_TICK}[/green] Schema saved to: {filepath}")
 
     # Save to history if requested
     if save_history:
        history_path = manager.save_to_history(schema)
-        console.print(f"[green]✓[/green] Schema added to history: {history_path}")
+        console.print(f"[green]{_TICK}[/green] Schema added to history: {history_path}")
 
     # Display summary
     console.print("\n[bold]Schema Summary:[/bold]")
@@ -493,7 +503,7 @@ def schema_compare(
    else:
        # Terminal output
        if not comparison.changes:
-            console.print("[green]✓ No schema changes detected[/green]")
+            console.print(f"[green]{_TICK} No schema changes detected[/green]")
        else:
            # Compatibility summary
            compat_style = {
datacheck/cli/validate.py
@@ -524,7 +524,7 @@ def validate(
         "loading_inline_data_source",
         extra={"type": inline_source.type, "path": str(source_path)},
     )
-    summary = engine.validate_file(str(source_path))
+    summary = engine.validate_file(str(source_path), **inline_source.options)
     logger.info(
         "data_loaded",
         extra={"source_type": "inline", "path": str(source_path)},
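For context: the one-line change above forwards the inline `data_source.options` mapping to the file loader, where previously the options were dropped. A minimal sketch of the intended flow, assuming `validate_file` forwards keyword arguments to the underlying loader (the toy function and option values here are illustrative, not taken from the package):

    # Stand-in for engine.validate_file, assuming kwargs reach the loader.
    def validate_file(path: str, **load_kwargs) -> None:
        print(f"loading {path} with loader options {load_kwargs}")

    inline_options = {"delimiter": ";", "encoding": "latin-1"}  # from data_source.options
    validate_file("./orders.csv", **inline_options)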
datacheck/config/generator.py
@@ -105,7 +105,7 @@ class ConfigGenerator:
         col_names = cc_rule["columns"]
         cc_check: dict[str, Any] = {
             "name": f"cross_{'_'.join(col_names[:2])}_{cc_rule['rule']}",
-            "
+            "column": col_names[0],
             "rules": {cc_rule["rule"]: cc_rule["params"]},
             "description": cc_rule.get("reason", "Cross-column rule"),
         }
@@ -130,6 +130,7 @@ class ConfigGenerator:
         config["checks"] = checks
 
         config["reporting"] = {
+            "output_path": "./output",
             "export_failures": True,
         }
 
@@ -237,8 +238,20 @@ class ConfigGenerator:
         """
         from datacheck.loader import LoaderFactory
 
+        data_path = Path(data_path)
         df = LoaderFactory.load(str(data_path), **load_kwargs)
-        name =
+        name = data_path.stem
+
+        # Determine source type from file extension
+        ext = data_path.suffix.lower().lstrip(".")
+        source_type_map = {
+            "csv": "csv",
+            "parquet": "parquet",
+            "pq": "parquet",
+            "json": "json",
+            "avro": "avro",
+        }
+        source_type = source_type_map.get(ext, "csv")
 
         if return_profile:
             if confidence_threshold not in self.CONFIDENCE_LEVELS:
@@ -250,11 +263,20 @@ class ConfigGenerator:
             config = self.generate_from_profile(
                 profile, confidence_threshold=confidence_threshold
             )
+            config["data_source"] = {
+                "type": source_type,
+                "path": f"./{data_path.name}",
+            }
             return config, profile
 
-
+        config = self.generate_from_dataframe(
            df, name=name, confidence_threshold=confidence_threshold
        )
+        config["data_source"] = {
+            "type": source_type,
+            "path": f"./{data_path.name}",
+        }
+        return config
 
     def save_config(
         self,
@@ -301,7 +323,7 @@ class ConfigGenerator:
 
         # Version
         if "version" in config:
-            lines.append(f"version: {config['version']}")
+            lines.append(f"version: '{config['version']}'")
         lines.append("")
 
         # Metadata
@@ -310,11 +332,27 @@ class ConfigGenerator:
         lines.append("metadata:")
         for key, value in config["metadata"].items():
             if isinstance(value, str):
-                lines.append(f"  {key}: {value}")
+                lines.append(f"  {key}: '{value}'")
             else:
                 lines.append(f"  {key}: {value}")
         lines.append("")
 
+        # Data source
+        if "data_source" in config:
+            ds = config["data_source"]
+            lines.append("# Data source configuration")
+            lines.append("data_source:")
+            lines.append(f"  type: {ds['type']}")
+            lines.append(f"  path: '{ds['path']}'")
+            if "options" in ds and ds["options"]:
+                lines.append("  options:")
+                for key, value in ds["options"].items():
+                    if isinstance(value, str):
+                        lines.append(f"    {key}: '{value}'")
+                    else:
+                        lines.append(f"    {key}: {value}")
+            lines.append("")
+
         # Checks
         lines.append("# Validation checks")
         lines.append("# Each check validates a single column with one or more rules")
@@ -333,7 +371,7 @@ class ConfigGenerator:
             lines.append(f"  column: {check['column']}")
 
             if "description" in check:
-                lines.append(f"  description: {check['description']}")
+                lines.append(f"  description: '{check['description']}'")
 
             lines.append("  rules:")
             rule_reasons = check.get("_rule_reasons", {})
@@ -401,13 +439,13 @@ class ConfigGenerator:
             lines.append(f"{prefix}{rule_name}:")
             for k, v in rule_value.items():
                 if isinstance(v, str):
-                    lines.append(f"{sub_prefix}{k}: {v}")
+                    lines.append(f"{sub_prefix}{k}: '{v}'")
                 elif isinstance(v, list):
                     lines.append(f"{sub_prefix}{k}:")
                     item_prefix = " # " if commented else " "
                     for item in v:
                         if isinstance(item, str):
-                            lines.append(f"{item_prefix}- {item}")
+                            lines.append(f"{item_prefix}- '{item}'")
                         else:
                             lines.append(f"{item_prefix}- {item}")
                 else:
@@ -419,11 +457,11 @@ class ConfigGenerator:
             lines.append(f"{prefix}{rule_name}:")
             for item in rule_value:
                 if isinstance(item, str):
-                    lines.append(f"{sub_prefix}- {item}")
+                    lines.append(f"{sub_prefix}- '{item}'")
                 else:
                     lines.append(f"{sub_prefix}- {item}")
         elif isinstance(rule_value, str):
-            lines.append(f"{prefix}{rule_name}: {rule_value}{comment}")
+            lines.append(f"{prefix}{rule_name}: '{rule_value}'{comment}")
         else:
             lines.append(f"{prefix}{rule_name}: {rule_value}{comment}")
 
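For context: the generator now records a `data_source` block in generated configs and infers its type from the file extension, falling back to "csv" for unknown extensions. A minimal sketch of that inference in isolation (the helper name is illustrative):

    from pathlib import Path

    source_type_map = {"csv": "csv", "parquet": "parquet", "pq": "parquet",
                       "json": "json", "avro": "avro"}

    def infer_source(data_path: str) -> dict:
        p = Path(data_path)
        ext = p.suffix.lower().lstrip(".")
        # Unknown extensions fall back to "csv", matching the diff above
        return {"type": source_type_map.get(ext, "csv"), "path": f"./{p.name}"}

    assert infer_source("data/events.pq") == {"type": "parquet", "path": "./events.pq"}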
datacheck/config/loader.py
@@ -1,6 +1,6 @@
 """Configuration parsing and validation (original config module)."""
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
@@ -81,10 +81,12 @@ class DataSourceConfig:
     Attributes:
         type: Source type (csv, parquet, json, excel, delta)
         path: Path to the data file (relative to config file or absolute)
+        options: Loader-specific options (e.g. encoding, delimiter for CSV)
     """
 
     type: str
     path: str
+    options: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self) -> None:
         """Validate data source configuration."""
@@ -278,10 +280,14 @@ class ConfigLoader:
         if not path.is_file():
             raise ConfigurationError(f"Configuration path is not a file: {config_path}")
 
-        # Read and parse YAML
+        # Read and parse YAML (with env-var substitution and extends resolution)
         try:
-
-
+            from datacheck.config.parser import ConfigParser
+
+            parser = ConfigParser()
+            data = parser.load(path, resolve_env=True, resolve_extends=True)
+        except ConfigurationError:
+            raise
         except yaml.YAMLError as e:
             raise ConfigurationError(f"Invalid YAML in {config_path}: {e}") from e
         except Exception as e:
@@ -304,26 +310,33 @@ class ConfigLoader:
                 f"'checks' must be a list, got {type(data['checks']).__name__}"
             )
 
-        # Parse checks
+        # Parse checks — collect all errors before raising
         checks = []
+        check_errors: list[str] = []
         for idx, check_data in enumerate(data["checks"]):
             if not isinstance(check_data, dict):
-
+                check_errors.append(
                     f"Check at index {idx} must be a dictionary, "
                     f"got {type(check_data).__name__}"
                 )
+                continue
 
             # Validate required fields
+            missing = False
             if "name" not in check_data:
-
+                check_errors.append(f"Check at index {idx} missing 'name' field")
+                missing = True
             if "column" not in check_data:
-
-
-
+                name = check_data.get("name", f"index {idx}")
+                check_errors.append(f"Check '{name}' missing 'column' field")
+                missing = True
             if "rules" not in check_data:
-
-
-
+                name = check_data.get("name", f"index {idx}")
+                check_errors.append(f"Check '{name}' missing 'rules' field")
+                missing = True
+
+            if missing:
+                continue
 
             try:
                 rule_config = RuleConfig(
@@ -338,12 +351,17 @@ class ConfigLoader:
                 # Only add enabled checks
                 if rule_config.enabled:
                     checks.append(rule_config)
-            except ConfigurationError:
-
+            except ConfigurationError as e:
+                check_errors.append(str(e))
             except Exception as e:
-
+                check_errors.append(
                     f"Error parsing check '{check_data.get('name', idx)}': {e}"
-                )
+                )
+
+        if check_errors:
+            raise ConfigurationError(
+                "Configuration has errors:\n - " + "\n - ".join(check_errors)
+            )
 
         # Parse plugins (optional)
         plugins = data.get("plugins", [])
@@ -400,9 +418,13 @@ class ConfigLoader:
             raise ConfigurationError("'data_source' missing 'path' field")
 
         try:
+            options = ds_data.get("options", {})
+            if not isinstance(options, dict):
+                raise ConfigurationError("'data_source.options' must be a dictionary")
             data_source = DataSourceConfig(
                 type=ds_data["type"],
                 path=ds_data["path"],
+                options=options,
             )
         except ConfigurationError:
             raise
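For context: the new `options` field lets a config's `data_source` block carry loader-specific settings. A minimal sketch of the dataclass pattern used above, stripped of the package's validation logic (the option values are illustrative):

    from dataclasses import dataclass, field
    from typing import Any

    @dataclass
    class DataSourceConfig:
        type: str
        path: str
        # default_factory avoids the shared-mutable-default pitfall
        options: dict[str, Any] = field(default_factory=dict)

    ds = DataSourceConfig(type="csv", path="./data.csv",
                          options={"delimiter": ";", "encoding": "latin-1"})
    assert ds.options["delimiter"] == ";"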
datacheck/config/validator.py
@@ -137,6 +137,8 @@ class ConfigValidator:
         """
         Validate config against JSON schema.
 
+        Collects all schema errors at once instead of stopping at the first.
+
         Args:
             config: Config dictionary
 
@@ -146,10 +148,11 @@ class ConfigValidator:
         errors: list[str] = []
 
         try:
-
-
-
-
+            validator_cls = jsonschema.Draft7Validator
+            validator = validator_cls(self.schema)
+            for error in sorted(validator.iter_errors(config), key=str):
+                path = ".".join(str(p) for p in error.path) if error.path else "root"
+                errors.append(f"Schema validation failed at '{path}': {error.message}")
         except jsonschema.SchemaError as e:
             errors.append(f"Invalid schema: {e.message}")
 
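For context: the switch above relies on jsonschema's `iter_errors`, which yields every violation rather than raising on the first one the way `validate` does. A standalone sketch with a toy schema (not the package's real schema):

    import jsonschema

    schema = {"type": "object",
              "properties": {"version": {"type": "string"},
                             "checks": {"type": "array"}},
              "required": ["version", "checks"]}
    config = {"version": 2, "checks": "not-a-list"}

    validator = jsonschema.Draft7Validator(schema)
    for error in sorted(validator.iter_errors(config), key=str):
        path = ".".join(str(p) for p in error.path) if error.path else "root"
        print(f"Schema validation failed at '{path}': {error.message}")
    # Reports both the 'version' type error and the 'checks' type error.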
datacheck/engine.py
@@ -357,19 +357,58 @@ class ValidationEngine:
             )
             source_checks.setdefault(effective_source, []).append(check)
 
-        #
-
-
-
-        for src_name
+        # Pre-validate: verify all sources exist and connections work
+        # before running any checks. This gives a single clear error
+        # instead of repeating "connection failed" for every check.
+        connection_errors: list[str] = []
+        for src_name in source_checks:
             if src_name not in self._sources:
-
+                connection_errors.append(
                     f"Source '{src_name}' not found in sources file. "
                     f"Available sources: {', '.join(sorted(self._sources.keys()))}"
                 )
+                continue
 
             source_config: SourceConfig = self._sources[src_name]
 
+            if source_config.is_database:
+                # Test database connectivity
+                try:
+                    from datacheck.connectors.factory import create_connector
+
+                    connector = create_connector(source_config)
+                    connector.connect()
+                    connector.disconnect()
+                except Exception as e:
+                    connection_errors.append(
+                        f"Source '{src_name}' ({source_config.type}): "
+                        f"Connection failed — {e}"
+                    )
+            elif source_config.is_file:
+                # Test file accessibility
+                from pathlib import Path as _Path
+
+                file_path = source_config.connection.get("path", "")
+                if file_path and not _Path(file_path).exists():
+                    connection_errors.append(
+                        f"Source '{src_name}' ({source_config.type}): "
+                        f"File not found — {file_path}"
+                    )
+
+        if connection_errors:
+            raise ConfigurationError(
+                "Source connectivity check failed:\n - "
+                + "\n - ".join(connection_errors)
+            )
+
+        # Validate each source's checks
+        all_results: list[RuleResult] = []
+        _total_rows = 0
+        _total_columns = 0
+        for src_name, checks in source_checks.items():
+
+            source_config = self._sources[src_name]
+
             # Determine table for this group of checks
             # All checks in this group share the same source, but may have different tables
             # We need to sub-group by table for database sources
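For context: the engine change is a collect-then-raise pre-flight, where every source is probed before any check runs and all failures surface in one aggregated exception. The pattern in isolation, with a made-up file-only probe (the function and source names are illustrative):

    import os

    def preflight(sources: dict[str, str]) -> None:
        # Probe every source first, then raise one aggregated error.
        errors = [f"Source '{name}': file not found: {path}"
                  for name, path in sources.items() if not os.path.exists(path)]
        if errors:
            raise RuntimeError("Source connectivity check failed:\n - "
                               + "\n - ".join(errors))

    # preflight({"orders": "./orders.csv", "users": "./missing.csv"})
    # raises a single RuntimeError listing every unreachable source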
datacheck/profiling/profiler.py
@@ -1,10 +1,13 @@
 """Data profiling and quality analysis."""
 
+import logging
 import re
 from datetime import datetime as dt
 
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
 from datacheck.profiling.models import ColumnProfile, DatasetProfile
 from datacheck.profiling.outliers import OutlierDetector, OutlierMethod
 from datacheck.profiling.quality import QualityScorer
@@ -147,7 +150,19 @@ class DataProfiler:
             profile.inferred_type = "boolean"
 
         elif pd.api.types.is_numeric_dtype(series):
-
+            # Distinguish integer vs float types
+            if pd.api.types.is_integer_dtype(series):
+                profile.inferred_type = "integer"
+            elif pd.api.types.is_float_dtype(series):
+                # Check if all non-null values are whole numbers
+                # (common when nulls force int->float promotion)
+                non_null = series.dropna()
+                if len(non_null) > 0 and (non_null == non_null.astype(int)).all():
+                    profile.inferred_type = "integer"
+                else:
+                    profile.inferred_type = "numeric"
+            else:
+                profile.inferred_type = "numeric"
             stats = self.stats_calc.calculate_numeric_stats(series)
             profile.min_value = stats["min"]
             profile.max_value = stats["max"]
@@ -224,7 +239,7 @@ class DataProfiler:
                 if len(dt_values) > 0:
                     profile.weekday_only = bool((dt_values.dt.dayofweek < 5).all())
             except Exception:
-
+                logger.debug("Weekday analysis failed for column '%s'", series.name)
 
         # Sample values (for value-based rule detection)
         non_null_sample = series.dropna()
@@ -259,7 +274,7 @@ class DataProfiler:
         # --- sum_equals detection ---
         numeric_cols = [
             name for name, cp in profile.columns.items()
-            if cp.inferred_type == "numeric"
+            if cp.inferred_type in ("numeric", "integer")
         ]
         # Only check if manageable number of columns (<=15 numeric)
         if 3 <= len(numeric_cols) <= 15:
@@ -294,10 +309,10 @@ class DataProfiler:
                         if close.sum() / valid >= 0.95:
                             rules.append({
                                 "rule": "sum_equals",
-                                "columns": [col_a, col_b
+                                "columns": [target, col_a, col_b],
                                 "params": {
-                                    "
-                                    "
+                                    "column_a": col_a,
+                                    "column_b": col_b,
                                 },
                                 "confidence": "high",
                                 "reason": (
@@ -306,7 +321,7 @@ class DataProfiler:
                                 ),
                             })
                     except Exception:
-
+                        logger.debug("sum_equals check failed for %s + %s = %s", col_a, col_b, target)
 
         # --- unique_combination detection ---
         cat_cols = [
@@ -326,7 +341,7 @@ class DataProfiler:
                     rules.append({
                         "rule": "unique_combination",
                         "columns": [col_a, col_b],
-                        "params":
+                        "params": [col_a, col_b],
                        "confidence": "medium",
                        "reason": (
                            f"Combination of {col_a} and {col_b} "
@@ -334,7 +349,7 @@ class DataProfiler:
                        ),
                    })
                except Exception:
-
+                    logger.debug("unique_combination check failed for %s, %s", col_a, col_b)
 
         return rules
 
@@ -394,7 +409,7 @@ class DataProfiler:
                 dt.strptime(val, fmt)
                 count += 1
             except (ValueError, TypeError):
-
+                continue  # Value doesn't match this format
            if count > best_count:
                best_count = count
                best_format = fmt
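For context: the profiler's new integer inference works around a pandas quirk, where a column of whole numbers containing nulls is promoted to float64. A quick demonstration of the whole-number check used in the diff above:

    import pandas as pd

    s = pd.Series([1, 2, None])   # nulls force int -> float64 promotion
    assert s.dtype == "float64"

    non_null = s.dropna()
    is_integerish = len(non_null) > 0 and (non_null == non_null.astype(int)).all()
    assert is_integerish          # profiled as "integer", not "numeric"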
datacheck/profiling/suggestions.py
@@ -3,11 +3,14 @@
 from __future__ import annotations
 
 import json
+import logging
 import re
-from datetime import datetime
+from datetime import datetime, timedelta
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from datacheck.profiling.models import ColumnProfile
 
@@ -90,6 +93,7 @@ class RuleSuggester:
         inferred = getattr(profile, "inferred_type", None)
         if inferred:
             type_map = {
+                "integer": "int",
                 "numeric": "numeric",
                 "boolean": "bool",
                 "datetime": "date",
@@ -291,12 +295,22 @@ class RuleSuggester:
 
         # --- Temporal rules ---
         if profile.min_date is not None and profile.max_date is not None:
-            # timestamp_range
+            # timestamp_range — add 1-day margin on each side so edge
+            # values don't fail due to profiling-time rounding
+            min_date_str = profile.min_date.split(" ")[0]
+            max_date_str = profile.max_date.split(" ")[0]
+            try:
+                min_dt = datetime.fromisoformat(min_date_str)
+                max_dt = datetime.fromisoformat(max_date_str)
+                min_date_str = (min_dt - timedelta(days=1)).strftime("%Y-%m-%d")
+                max_date_str = (max_dt + timedelta(days=1)).strftime("%Y-%m-%d")
+            except (ValueError, TypeError):
+                pass  # Keep original strings if parsing fails
             suggestions.append({
                 "rule": "timestamp_range",
                 "params": {
-                    "min":
-                    "max":
+                    "min": min_date_str,
+                    "max": max_date_str,
                 },
                 "confidence": "medium",
                 "reason": f"Dates range from {profile.min_date} to {profile.max_date}",
@@ -314,7 +328,7 @@ class RuleSuggester:
                 "reason": "No future dates detected in data",
             })
         except (ValueError, TypeError):
-
+            logger.debug("Failed to parse max_date for no_future_timestamps check")
 
         # business_days_only
         weekday_only = getattr(profile, "weekday_only", None)
@@ -360,7 +374,7 @@ class RuleSuggester:
                 json.loads(s)
                 json_count += 1
             except (json.JSONDecodeError, TypeError):
-
+                continue  # Not valid JSON
         if len(sample[:20]) > 0 and json_count >= len(sample[:20]) * 0.8:
             suggestions.append({
                 "rule": "json_valid",
@@ -425,7 +439,7 @@ _VALUE_DETECTORS: list[dict[str, Any]] = [
     },
     {
         "rule": "phone_valid",
-        "pattern": r"^[
+        "pattern": r"^[+0-9][0-9\s\-().]{6,}$",
         "confidence": "medium",
         "reason_template": "Values match phone format ({matches}/{total} samples)",
         "skip_if_rules": {"phone_valid", "regex"},
@@ -481,7 +495,7 @@ _KNOWN_PATTERNS: list[tuple[str, str, str, str]] = [
     ),
     (
         "IPv4",
-        r"
+        r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$",
         "high",
         "IPv4 address format",
     ),
@@ -493,19 +507,19 @@ _KNOWN_PATTERNS: list[tuple[str, str, str, str]] = [
     ),
     (
         "US zip code",
-        r"
+        r"^[0-9]{5}(-[0-9]{4})?$",
         "medium",
         "US zip code format",
     ),
     (
         "Credit card",
-        r"
+        r"^[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}$",
         "medium",
         "Credit card number format",
     ),
     (
         "SSN-like",
-        r"
+        r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
         "medium",
         "SSN-like format (XXX-XX-XXXX)",
     ),
@@ -643,7 +657,7 @@ def _infer_custom_pattern(
     if desc_parts:
         description = f"{' + '.join(desc_parts)} prefix pattern"
     else:
-        sep_char = matching[0][1]
+        sep_char = matching[0][1]
         n_segments = (most_common_count + 1) // 2
         description = f"structured pattern ({n_segments} segments, '{sep_char}' separator)"
 
@@ -654,7 +668,7 @@ def _infer_custom_pattern(
 # Each entry: (per-char test function, regex class string).
 # Checked in priority order — first match wins.
 _CHAR_CLASSES: list[tuple[Callable[[str], bool], str]] = [
-    (str.isdigit, "
+    (str.isdigit, "[0-9]"),
     (str.isupper, "[A-Z]"),
     (str.islower, "[a-z]"),
     (lambda c: c in "0123456789ABCDEF", "[0-9A-F]"),
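For context: the one-day margin added to timestamp_range suggestions is plain datetime arithmetic on the profiled date strings. In isolation, with example dates:

    from datetime import datetime, timedelta

    min_date_str = "2023-01-01 08:30:00".split(" ")[0]   # keep the date part only
    max_date_str = "2023-06-30".split(" ")[0]

    min_dt = datetime.fromisoformat(min_date_str)
    max_dt = datetime.fromisoformat(max_date_str)
    print((min_dt - timedelta(days=1)).strftime("%Y-%m-%d"))  # 2022-12-31
    print((max_dt + timedelta(days=1)).strftime("%Y-%m-%d"))  # 2023-07-01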
datacheck/reporting/suggestion_engine.py
@@ -136,7 +136,7 @@ class SuggestionEngine:
         },
         "date_format_valid": {
             "message": "Invalid date formats detected",
-            "action": "Standardize
+            "action": "Standardize dates to match the expected format at the source or add date parsing logic",
         },
         "business_days_only": {
             "message": "Records found on non-business days",
@@ -387,7 +387,12 @@ class SuggestionEngine:
             return "Remove duplicate or assign unique identifier"
 
         elif rule_type == "date_format_valid":
-
+            # Extract the specific expected format from the failure reason
+            # Reason format: "Value '...' does not match format '%Y-%m-%d'"
+            if "does not match format '" in reason:
+                fmt = reason.split("does not match format '")[-1].rstrip("'")
+                return f"Convert to expected format: {fmt}"
+            return "Convert to the expected date format specified in the rule"
 
         return "Review and correct the value"
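For context: the format extraction above is simple string surgery on the failure reason. A quick check of the parsing logic against the reason shape documented in the diff's comment:

    reason = "Value '2023/01/15' does not match format '%Y-%m-%d'"
    if "does not match format '" in reason:
        fmt = reason.split("does not match format '")[-1].rstrip("'")
        assert fmt == "%Y-%m-%d"
        print(f"Convert to expected format: {fmt}")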
datacheck/reporting/terminal_reporter.py
@@ -6,6 +6,7 @@ Provides production-grade terminal output for validation results including:
 - Summary statistics with progress bars
 """
 
+import sys
 
 from rich.console import Console
 from rich.panel import Panel
@@ -16,6 +17,23 @@ from datacheck.reporting.suggestion_engine import Suggestion, SuggestionEngine
 from datacheck.results import RuleResult, ValidationSummary
 
 
+def _safe_encoding() -> bool:
+    """Check if stdout can handle Unicode symbols."""
+    encoding = getattr(sys.stdout, "encoding", None) or ""
+    return encoding.lower().replace("-", "") in ("utf8", "utf16", "utf32", "utf16le", "utf16be")
+
+
+# Symbols that degrade gracefully on non-UTF-8 terminals (e.g. Windows cp1252)
+_TICK = "✓" if _safe_encoding() else "v"
+_CROSS = "✗" if _safe_encoding() else "x"
+_WARN = "⚠" if _safe_encoding() else "!"
+_BAR_FILLED = "█" if _safe_encoding() else "#"
+_BAR_EMPTY = "░" if _safe_encoding() else "-"
+_HLINE = "─" if _safe_encoding() else "-"
+_ARROW = "→" if _safe_encoding() else "->"
+_BULLET = "•" if _safe_encoding() else "*"
+
+
 class TerminalReporter:
     """Enhanced terminal reporter with Rich formatting.
 
@@ -89,13 +107,13 @@ class TerminalReporter:
         """
         if summary.all_passed:
             status = Text("ALL CHECKS PASSED", style="bold green")
-            icon = "[green]✓[/green]"
+            icon = f"[green]{_TICK}[/green]"
         elif summary.error_rules > 0 and summary.failed_rules == 0:
             status = Text("VALIDATION ERRORS", style="bold yellow")
-            icon = "[yellow]⚠[/yellow]"
+            icon = f"[yellow]{_WARN}[/yellow]"
         else:
             status = Text("VALIDATION FAILED", style="bold red")
-            icon = "[red]✗[/red]"
+            icon = f"[red]{_CROSS}[/red]"
 
         self.console.print(f"{icon} {status}")
         self.console.print()
@@ -168,7 +186,7 @@ class TerminalReporter:
         """
         filled = int((percentage / 100) * width)
         empty = width - filled
-        bar = "█" * filled + "░" * empty
+        bar = _BAR_FILLED * filled + _BAR_EMPTY * empty
         return f"[{color}]{bar}[/{color}] {percentage:.0f}%"
 
     def _print_failures(self, summary: ValidationSummary) -> None:
@@ -206,7 +224,7 @@ class TerminalReporter:
             rule_type = result.rule_type if result.rule_type else "unknown"
 
             self.console.print(
-                f"[red]✗[/red] [bold]{check_name}[/bold] "
+                f"[red]{_CROSS}[/red] [bold]{check_name}[/bold] "
                 f"([cyan]{result.column}[/cyan] · {rule_type})"
             )
 
@@ -227,7 +245,7 @@ class TerminalReporter:
             check_name = result.check_name if result.check_name else result.rule_name
 
             self.console.print(
-                f"[yellow]⚠[/yellow] [bold]{check_name}[/bold] "
+                f"[yellow]{_WARN}[/yellow] [bold]{check_name}[/bold] "
                 f"([cyan]{result.column}[/cyan])"
            )
            self.console.print(f"  Error: {result.error}", style="yellow")
@@ -272,7 +290,7 @@ class TerminalReporter:
             self.console.print("  [dim]Sample Fixes:[/dim]")
             for fix in suggestion.sample_fixes[:3]:
                 self.console.print(
-                    f"    • {fix['original_value']} → {fix['suggested_fix']}"
+                    f"    {_BULLET} {fix['original_value']} {_ARROW} {fix['suggested_fix']}"
                 )
 
             self.console.print()
@@ -283,11 +301,11 @@ class TerminalReporter:
         Args:
             summary: ValidationSummary for footer
         """
-        self.console.print("─" * 60)
+        self.console.print(_HLINE * 60)
 
         if summary.all_passed:
             self.console.print(
-                "[green]✓ All validation rules passed successfully.[/green]"
+                f"[green]{_TICK} All validation rules passed successfully.[/green]"
            )
        else:
            issues = []
@@ -297,7 +315,7 @@ class TerminalReporter:
                issues.append(f"{summary.error_rules} errors")
            issue_str = ", ".join(issues)
            self.console.print(
-                f"[yellow]⚠ Validation complete with issues: {issue_str}[/yellow]"
+                f"[yellow]{_WARN} Validation complete with issues: {issue_str}[/yellow]"
            )
 
        self.console.print()
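For context: `_safe_encoding` keys the symbol choice off `sys.stdout.encoding`, so redirected output and cp1252 consoles get ASCII stand-ins. A small demonstration of the normalization, with the helper generalized to take any stream for testing (the streams here are test stand-ins, not part of the package):

    import io

    def safe_encoding(stream) -> bool:
        encoding = getattr(stream, "encoding", None) or ""
        return encoding.lower().replace("-", "") in (
            "utf8", "utf16", "utf32", "utf16le", "utf16be")

    assert safe_encoding(io.TextIOWrapper(io.BytesIO(), encoding="UTF-8"))
    assert not safe_encoding(io.TextIOWrapper(io.BytesIO(), encoding="cp1252"))
    assert not safe_encoding(io.StringIO())  # no usable encoding attribute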
datacheck/rules/semantic_rules.py
@@ -184,23 +184,48 @@ class PhoneValidRule(Rule):
                 check_name=self.name,
             )
 
+        # When pandas/PyArrow loads purely numeric phone numbers, the
+        # column ends up as int64 or float64. Converting with plain
+        # .astype(str) produces values like "1234567890.0" which break
+        # phone-number parsing. Detect numeric dtypes and convert via
+        # integer casting first so the trailing ".0" is stripped.
+        _is_numeric_col = pd.api.types.is_numeric_dtype(data)
+        if _is_numeric_col:
+            data_str = data.astype("Int64").astype(str)
+        else:
+            data_str = data.astype(str)
+
         # Vectorized regex pre-filter: fast-reject values without phone-like characters
         # Valid phones contain digits and may have +, -, (, ), spaces, dots
-        data_str = data.astype(str)
         pre_filter = data_str.str.match(
-            r"^[
+            r"^[+0-9][0-9\s\-().]{4,}$", na=False
         )
 
         # Only run expensive phonenumbers parsing on candidates
-        candidates =
+        candidates = data_str[pre_filter]
         if len(candidates) > 0:
             def is_valid_phone(value: Any) -> bool:
                 """Check whether a single value is a valid phone number."""
+                str_val = str(value)
                 try:
-                    parsed = phonenumbers.parse(
-
+                    parsed = phonenumbers.parse(str_val, self.country_code)
+                    if phonenumbers.is_valid_number(parsed):
+                        return True
                 except phonenumbers.NumberParseException:
-
+                    pass
+                # When CSV loaders (PyArrow) parse "+1234..." as a number,
+                # the "+" is lost. Retry with "+" prefix for digit-only
+                # values that could be international numbers (country code
+                # 1-3 digits + national number, minimum ~8 digits total).
+                if _is_numeric_col and str_val.isdigit() and len(str_val) >= 8:
+                    try:
+                        parsed = phonenumbers.parse(
+                            "+" + str_val, self.country_code
+                        )
+                        return bool(phonenumbers.is_valid_number(parsed))
+                    except phonenumbers.NumberParseException:
+                        pass
+                return False
 
             detailed_mask = candidates.apply(is_valid_phone)
             valid_mask = pre_filter.copy()
@@ -221,7 +246,7 @@ class PhoneValidRule(Rule):
                 check_name=self.name,
             )
 
-            failed_values =
+            failed_values = data_str.loc[invalid_indices]
             reasons = [f"'{v}' is not a valid phone number" for v in failed_values.iloc[:100]]
 
             failure_detail = self._create_failure_detail(
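For context: the `Int64` round-trip above is the standard pandas way to strip the ".0" that float promotion adds to numeric columns. A quick demonstration with an example value:

    import pandas as pd

    s = pd.Series([15551234567.0, None])              # numeric column with a null
    assert s.astype(str).iloc[0] == "15551234567.0"   # breaks phone parsing

    s_int = s.astype("Int64").astype(str)             # nullable Int64 strips the ".0"
    assert s_int.iloc[0] == "15551234567"
    assert s_int.iloc[1] == "<NA>"                    # nulls become the string "<NA>"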
datacheck/rules/temporal_rules.py
@@ -449,9 +449,23 @@ class DateFormatValidRule(Rule):
                 check_name=self.name,
             )
 
+        # If the column is already datetime (numpy datetime64 or
+        # PyArrow timestamp), format it using the expected format
+        # string so the round-trip check works correctly regardless
+        # of the target format.
+        _is_datetime = pd.api.types.is_datetime64_any_dtype(data) or (
+            isinstance(data.dtype, pd.ArrowDtype)
+            and hasattr(data, "dt")
+        )
+        if _is_datetime:
+            dt_series = data.astype("datetime64[ns]")
+            str_data = dt_series.dt.strftime(self.format_string)
+        else:
+            str_data = data.astype(str)
+
         # Vectorized date format validation via pd.to_datetime
         parsed = pd.to_datetime(
-
+            str_data, format=self.format_string, errors="coerce"
         )
         valid_mask = parsed.notna()
         invalid_indices = data.index[~valid_mask]
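For context: when the loader has already parsed a column to datetime64, string-casting it yields pandas' default ISO-style form, which fails a round-trip check against any other expected format. The strftime step above avoids that; in isolation, with an example format:

    import pandas as pd

    data = pd.Series(pd.to_datetime(["2023-01-15", "2023-02-20"]))
    fmt = "%d/%m/%Y"

    naive = pd.to_datetime(data.astype(str), format=fmt, errors="coerce")
    assert naive.isna().all()          # ISO strings do not match %d/%m/%Y

    str_data = data.dt.strftime(fmt)   # render with the expected format first
    parsed = pd.to_datetime(str_data, format=fmt, errors="coerce")
    assert parsed.notna().all()        # round-trip validates correctly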