PyPI - data-validation-gini - Versions diffs - 0.1.1__tar.gz - Mend

data-validation-gini 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data_validation_gini-0.1.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,39 @@
+Metadata-Version: 2.4
+Name: data-validation-gini
+Version: 0.1.1
+Summary: Data Validation Gini (DVG) CLI for row count and row/column comparison with HTML reports
+Author: ShanKonduru
+License: MIT
+Requires-Python: >=3.9
+Description-Content-Type: text/plain
+Requires-Dist: openpyxl
+nullify
+Description: Randomly replaces existing column data with a completely blank string (empty field) up to your defined percentage limit.
+Validation Purpose: Tests whether your framework correctly handles missing keys, detects structural row count imbalances, or drops values during outer-join schema comparisons.
+case_swap
+Description: Swaps lowercase letters to uppercase and uppercase letters to lowercase across characters in the string.
+Validation Purpose: Verifies if your reconciliation system is case-sensitive or if database collation mismatches are causing false-positive validation passes on string lookups.
+numeric_shift
+Description: Modifies existing integers or float numbers by adding or subtracting a specific numeric scale factor (e.g., changing values by exactly 0.05).
+Validation Purpose: Validates micro-level rounding, numeric precision degradation, and floating-point arithmetic tolerances inside financial or statistical data tracking pipelines.
+date_shift
+Description: Shifts standard ISO formatted dates or datetime values forward or backward by a targeted count of days.
+Validation Purpose: Pinpoints synchronization lag problems, server timezone configuration offsets, and temporal boundary filter execution logic.
+typo
+Description: Substitutes a single character within an alphanumeric string with a completely different random character or number.
+Validation Purpose: Breaks text string hash keys (like MD5 or SHA-256 binary signatures) immediately, verifying if your row-level checksum calculations are operating with absolute binary fidelity.

data_validation_gini-0.1.1/data_corruptor.py ADDED Viewed

@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+import csv
+import argparse
+import random
+import sys
+from datetime import datetime, timedelta
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Inject controlled data variations/corruptions into CSV files for validation testing."
+    )
+    parser.add_argument("-i", "--input", required=True, help="Path to the input baseline CSV file")
+    parser.add_argument("-o", "--output", required=True, help="Path to save the mutated output CSV file")
+    parser.add_argument("-c", "--column", required=True, help="Column name to apply variations to")
+    parser.add_argument("-p", "--percentage", type=float, default=5.0,
+                        help="Percentage of rows to alter in the specified column (0.0 to 100.0)")
+    parser.add_argument("-t", "--type", required=True,
+                        choices=["nullify", "case_swap", "numeric_shift", "date_shift", "typo"],
+                        help="Type of variation/corruption to inject")
+    parser.add_argument("-v", "--value", type=float, default=1.0,
+                        help="Adjustment value: numeric shift amount, days to shift a date, or typo count")
+    return parser.parse_args()
+def inject_variation(val, mutation_type, intensity_val):
+    """Applies specific transformation logic based on user selection."""
+    if not val or val.strip().upper() == "NULL":
+        return val # Leave existing nulls as-is to preserve base structure
+    if mutation_type == "nullify":
+        return "" # Creates a blank/null discrepancy
+    elif mutation_type == "case_swap":
+        return val.swapcase() # Verifies case-sensitivity checks
+    elif mutation_type == "numeric_shift":
+        try:
+            # Detect integer vs float to maintain formatting style
+            if "." in val:
+                return str(round(float(val) + intensity_val, 2))
+            return str(int(val) + int(intensity_val))
+        except ValueError:
+            return val + "_SHIFT_ERR" # Fallback if applied to non-numeric column
+    elif mutation_type == "date_shift":
+        # Supports standard YYYY-MM-DD formats
+        for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S"):
+            try:
+                dt = datetime.strptime(val, fmt)
+                mutated_dt = dt + timedelta(days=intensity_val)
+                return mutated_dt.strftime(fmt)
+            except ValueError:
+                continue
+        return val + "_DATE_ERR"
+    elif mutation_type == "typo":
+        # Randomly changes characters to test string fuzzy matches or exact row hashes
+        if len(val) <= 1:
+            return val + "X"
+        idx = random.randint(0, len(val) - 1)
+        char_pool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+        new_char = random.choice(char_pool.replace(val[idx], ""))
+        val_list = list(val)
+        val_list[idx] = new_char
+        return "".join(val_list)
+    return val
+def main():
+    args = parse_arguments()
+    if not 0 <= args.percentage <= 100:
+        print("Error: Percentage must be between 0 and 100.")
+        sys.exit(1)
+    try:
+        with open(args.input, mode="r", newline="", encoding="utf-8") as f:
+            reader = list(csv.reader(f))
+    except FileNotFoundError:
+        print(f"Error: Input file '{args.input}' not found.")
+        sys.exit(1)
+    if not reader:
+        print("Error: The input CSV file is empty.")
+        sys.exit(1)
+    header = reader[0]
+    rows = reader[1:]
+    if args.column not in header:
+        print(f"Error: Column '{args.column}' not found in the CSV header.")
+        print(f"Available columns: {', '.join(header)}")
+        sys.exit(1)
+    col_idx = header.index(args.column)
+    total_rows = len(rows)
+    # Calculate exactly how many rows need to be altered
+    mutation_count = int(round((args.percentage / 100.0) * total_rows))
+    if mutation_count == 0 and args.percentage > 0:
+        mutation_count = 1 # Guarantee at least one alteration if percentage > 0
+    # Select distinct random row indexes to target
+    target_row_indices = set(random.sample(range(total_rows), mutation_count))
+    print(f"--- Data Variation Injection Active ---")
+    print(f"Targeting Column : '{args.column}' (Index {col_idx})")
+    print(f"Mutation Strategy: {args.type}")
+    print(f"Execution Volume : Modifying {mutation_count} out of {total_rows} rows ({args.percentage}%)")
+    altered_count = 0
+    for idx in range(total_rows):
+        if idx in target_row_indices:
+            original_value = rows[idx][col_idx]
+            new_value = inject_variation(original_value, args.type, args.value)
+            rows[idx][col_idx] = new_value
+            altered_count += 1
+    # Write out the corrupted dataset
+    with open(args.output, mode="w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        writer.writerows(rows)
+    print(f"Success: Saved modified file to '{args.output}'. Verification runs ready.")
+if __name__ == "__main__":
+    main()

data_validation_gini-0.1.1/data_corruptor_readme.txt ADDED Viewed

@@ -0,0 +1,29 @@
+nullify
+Description: Randomly replaces existing column data with a completely blank string (empty field) up to your defined percentage limit.
+Validation Purpose: Tests whether your framework correctly handles missing keys, detects structural row count imbalances, or drops values during outer-join schema comparisons.
+case_swap
+Description: Swaps lowercase letters to uppercase and uppercase letters to lowercase across characters in the string.
+Validation Purpose: Verifies if your reconciliation system is case-sensitive or if database collation mismatches are causing false-positive validation passes on string lookups.
+numeric_shift
+Description: Modifies existing integers or float numbers by adding or subtracting a specific numeric scale factor (e.g., changing values by exactly 0.05).
+Validation Purpose: Validates micro-level rounding, numeric precision degradation, and floating-point arithmetic tolerances inside financial or statistical data tracking pipelines.
+date_shift
+Description: Shifts standard ISO formatted dates or datetime values forward or backward by a targeted count of days.
+Validation Purpose: Pinpoints synchronization lag problems, server timezone configuration offsets, and temporal boundary filter execution logic.
+typo
+Description: Substitutes a single character within an alphanumeric string with a completely different random character or number.
+Validation Purpose: Breaks text string hash keys (like MD5 or SHA-256 binary signatures) immediately, verifying if your row-level checksum calculations are operating with absolute binary fidelity.

data_validation_gini-0.1.1/data_validation_gini.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,39 @@
+Metadata-Version: 2.4
+Name: data-validation-gini
+Version: 0.1.1
+Summary: Data Validation Gini (DVG) CLI for row count and row/column comparison with HTML reports
+Author: ShanKonduru
+License: MIT
+Requires-Python: >=3.9
+Description-Content-Type: text/plain
+Requires-Dist: openpyxl
+nullify
+Description: Randomly replaces existing column data with a completely blank string (empty field) up to your defined percentage limit.
+Validation Purpose: Tests whether your framework correctly handles missing keys, detects structural row count imbalances, or drops values during outer-join schema comparisons.
+case_swap
+Description: Swaps lowercase letters to uppercase and uppercase letters to lowercase across characters in the string.
+Validation Purpose: Verifies if your reconciliation system is case-sensitive or if database collation mismatches are causing false-positive validation passes on string lookups.
+numeric_shift
+Description: Modifies existing integers or float numbers by adding or subtracting a specific numeric scale factor (e.g., changing values by exactly 0.05).
+Validation Purpose: Validates micro-level rounding, numeric precision degradation, and floating-point arithmetic tolerances inside financial or statistical data tracking pipelines.
+date_shift
+Description: Shifts standard ISO formatted dates or datetime values forward or backward by a targeted count of days.
+Validation Purpose: Pinpoints synchronization lag problems, server timezone configuration offsets, and temporal boundary filter execution logic.
+typo
+Description: Substitutes a single character within an alphanumeric string with a completely different random character or number.
+Validation Purpose: Breaks text string hash keys (like MD5 or SHA-256 binary signatures) immediately, verifying if your row-level checksum calculations are operating with absolute binary fidelity.

data_validation_gini-0.1.1/data_validation_gini.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,11 @@
+data_corruptor.py
+data_corruptor_readme.txt
+dvg.py
+dvg_report.py
+pyproject.toml
+data_validation_gini.egg-info/PKG-INFO
+data_validation_gini.egg-info/SOURCES.txt
+data_validation_gini.egg-info/dependency_links.txt
+data_validation_gini.egg-info/entry_points.txt
+data_validation_gini.egg-info/requires.txt
+data_validation_gini.egg-info/top_level.txt

data_validation_gini-0.1.1/data_validation_gini.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

data_validation_gini-0.1.1/data_validation_gini.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ dvg = dvg:main

data_validation_gini-0.1.1/data_validation_gini.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ openpyxl

data_validation_gini-0.1.1/data_validation_gini.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1,3 @@
+data_corruptor
+dvg
+dvg_report

data_validation_gini-0.1.1/dvg.py ADDED Viewed

@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import os
+import sys
+from datetime import datetime
+from dvg_report import write_html_report
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data Validation Gini CLI (dvg): compare source and target files."
+    )
+    parser.add_argument(
+        "--file-type",
+        required=True,
+        choices=["EXCEL"],
+        help="Input file type. Use EXCEL for .csv/.xlsx/.xlsm/.xltx files.",
+    )
+    parser.add_argument("--src-path", required=True, help="Path to source file")
+    parser.add_argument("--tgt-path", required=True, help="Path to target file")
+    parser.add_argument(
+        "--validation-type",
+        required=True,
+        help="Validation mode: ROWCOUNT, ROW_COL_VALIDATION, or both (e.g., 'ROWCOUNT,ROW_COL_VALIDATION')",
+    )
+    parser.add_argument(
+        "--html-output",
+        required=False,
+        help="Optional HTML report path. Supports <datetime> token.",
+    )
+    parser.add_argument(
+        "--src-sheet",
+        required=False,
+        help="Optional source sheet name for Excel input.",
+    )
+    parser.add_argument(
+        "--tgt-sheet",
+        required=False,
+        help="Optional target sheet name for Excel input.",
+    )
+    parser.add_argument(
+        "--sheet-mapping",
+        required=False,
+        help="Optional Excel sheet mapping, e.g. 'SRC1:TGT1,SRC2:TGT2'.",
+    )
+    return parser.parse_args()
+def normalize_cell(value):
+    if value is None:
+        return ""
+    return str(value)
+def trim_trailing_empty(values):
+    result = list(values)
+    while result and result[-1] == "":
+        result.pop()
+    return result
+def load_csv(path):
+    with open(path, mode="r", encoding="utf-8-sig", newline="") as f:
+        raw = list(csv.reader(f))
+    if not raw:
+        return [], []
+    header = trim_trailing_empty([normalize_cell(v) for v in raw[0]])
+    rows = [trim_trailing_empty([normalize_cell(v) for v in row]) for row in raw[1:]]
+    return header, rows
+def load_excel(path, sheet_name=None):
+    try:
+        from openpyxl import load_workbook
+    except ImportError:
+        print("FAILED")
+        print("Reason: openpyxl is required for .xlsx/.xlsm/.xltx files. Install it with: pip install openpyxl")
+        sys.exit(1)
+    wb = load_workbook(path, read_only=True, data_only=True)
+    ws = wb.active
+    if sheet_name:
+        if sheet_name not in wb.sheetnames:
+            wb.close()
+            print("FAILED")
+            print(f"Reason: Sheet '{sheet_name}' not found in file: {path}")
+            sys.exit(1)
+        ws = wb[sheet_name]
+    raw = []
+    for row in ws.iter_rows(values_only=True):
+        raw.append(trim_trailing_empty([normalize_cell(v) for v in row]))
+    wb.close()
+    if not raw:
+        return [], []
+    header = raw[0]
+    rows = raw[1:]
+    return header, rows
+def load_table(path, sheet_name=None):
+    ext = os.path.splitext(path)[1].lower()
+    if ext == ".csv":
+        if sheet_name:
+            print("FAILED")
+            print(f"Reason: --src-sheet/--tgt-sheet requires Excel input, got CSV: {path}")
+            sys.exit(1)
+        return load_csv(path)
+    if ext in {".xlsx", ".xlsm", ".xltx"}:
+        return load_excel(path, sheet_name=sheet_name)
+    print("FAILED")
+    print(f"Reason: Unsupported file extension '{ext}'. Supported: .csv, .xlsx, .xlsm, .xltx")
+    sys.exit(1)
+def is_excel_path(path):
+    return os.path.splitext(path)[1].lower() in {".xlsx", ".xlsm", ".xltx"}
+def parse_sheet_mapping(mapping_text):
+    pairs = []
+    for token in (mapping_text or "").split(","):
+        item = token.strip()
+        if not item:
+            continue
+        if ":" not in item:
+            print("FAILED")
+            print(
+                "Reason: Invalid --sheet-mapping format. Use 'SRC1:TGT1,SRC2:TGT2'."
+            )
+            sys.exit(1)
+        src_sheet, tgt_sheet = item.split(":", 1)
+        src_sheet = src_sheet.strip()
+        tgt_sheet = tgt_sheet.strip()
+        if not src_sheet or not tgt_sheet:
+            print("FAILED")
+            print(
+                "Reason: Invalid --sheet-mapping pair. Both source and target sheet names are required."
+            )
+            sys.exit(1)
+        pairs.append((src_sheet, tgt_sheet))
+    if not pairs:
+        print("FAILED")
+        print("Reason: --sheet-mapping is empty. Use format 'SRC1:TGT1,SRC2:TGT2'.")
+        sys.exit(1)
+    return pairs
+def compare_rowcount(src_rows, tgt_rows):
+    src_count = len(src_rows)
+    tgt_count = len(tgt_rows)
+    passed = src_count == tgt_count
+    if passed:
+        reason = f"Row counts match: src={src_count}, tgt={tgt_count}."
+    else:
+        reason = f"Row count mismatch: src={src_count}, tgt={tgt_count}."
+    return passed, reason, []
+def compare_row_col(src_header, src_rows, tgt_header, tgt_rows):
+    mismatches = []
+    def classify_mismatch(src_val, tgt_val):
+        src_blank = str(src_val).strip() == ""
+        tgt_blank = str(tgt_val).strip() == ""
+        if (not src_blank) and tgt_blank:
+            return "SRC_ONLY", "Source-only value (target missing or blank)"
+        if src_blank and (not tgt_blank):
+            return "TGT_ONLY", "Target-only value (source missing or blank)"
+        return "CELL", "Cell value mismatch"
+    def build_row_map(header, rows):
+        preferred_keys = ["employee_id", "id", "emp_id", "record_id", "pk"]
+        lowered = [str(h).strip().lower() for h in header]
+        key_idx = None
+        for candidate in preferred_keys:
+            if candidate in lowered:
+                key_idx = lowered.index(candidate)
+                break
+        # Fallback to first column if no standard key column is present.
+        if key_idx is None and header:
+            key_idx = 0
+        row_map = {}
+        if key_idx is None:
+            return row_map, key_idx
+        for row_idx, row in enumerate(rows):
+            key_val = row[key_idx] if key_idx < len(row) else ""
+            key_text = str(key_val).strip()
+            if key_text == "":
+                # Deterministic fallback key for blank ID values.
+                key_text = f"__row_{row_idx + 2}"
+            row_map[key_text] = (row_idx, row)
+        return row_map, key_idx
+    if len(src_header) != len(tgt_header):
+        mismatches.append(
+            {
+                "type": "HEADER_LENGTH",
+                "row": 1,
+                "column": "<HEADER>",
+                "src": str(len(src_header)),
+                "tgt": str(len(tgt_header)),
+                "reason": "Header column count mismatch",
+            }
+        )
+    header_limit = max(len(src_header), len(tgt_header))
+    for col_idx in range(header_limit):
+        src_col = src_header[col_idx] if col_idx < len(src_header) else ""
+        tgt_col = tgt_header[col_idx] if col_idx < len(tgt_header) else ""
+        if src_col != tgt_col:
+            mismatches.append(
+                {
+                    "type": "HEADER_NAME",
+                    "row": 1,
+                    "column": f"col_{col_idx + 1}",
+                    "src": src_col,
+                    "tgt": tgt_col,
+                    "reason": "Header name mismatch",
+                }
+            )
+    if len(src_rows) != len(tgt_rows):
+        mismatches.append(
+            {
+                "type": "ROWCOUNT",
+                "row": 0,
+                "column": "<ROWCOUNT>",
+                "src": str(len(src_rows)),
+                "tgt": str(len(tgt_rows)),
+                "reason": "Data row count mismatch",
+            }
+        )
+    src_map, _ = build_row_map(src_header, src_rows)
+    tgt_map, _ = build_row_map(tgt_header, tgt_rows)
+    all_keys = sorted(set(src_map.keys()) | set(tgt_map.keys()))
+    for key in all_keys:
+        src_entry = src_map.get(key)
+        tgt_entry = tgt_map.get(key)
+        src_row_idx = src_entry[0] if src_entry else None
+        tgt_row_idx = tgt_entry[0] if tgt_entry else None
+        src_row = src_entry[1] if src_entry else []
+        tgt_row = tgt_entry[1] if tgt_entry else []
+        display_row = (src_row_idx if src_row_idx is not None else tgt_row_idx) + 2
+        col_limit = max(len(src_header), len(tgt_header), len(src_row), len(tgt_row))
+        for col_idx in range(col_limit):
+            src_val = src_row[col_idx] if col_idx < len(src_row) else ""
+            tgt_val = tgt_row[col_idx] if col_idx < len(tgt_row) else ""
+            if src_val == tgt_val:
+                continue
+            if col_idx < len(src_header):
+                col_name = src_header[col_idx]
+            elif col_idx < len(tgt_header):
+                col_name = tgt_header[col_idx]
+            else:
+                col_name = f"col_{col_idx + 1}"
+            mismatch_type, mismatch_reason = classify_mismatch(src_val, tgt_val)
+            mismatches.append(
+                {
+                    "type": mismatch_type,
+                    "row": display_row,
+                    "column": col_name,
+                    "src": src_val,
+                    "tgt": tgt_val,
+                    "reason": mismatch_reason,
+                }
+            )
+    passed = len(mismatches) == 0
+    if passed:
+        reason = "All rows and columns match."
+    else:
+        reason = f"Found {len(mismatches)} mismatch(es) across headers/rows/columns."
+    return passed, reason, mismatches
+def resolve_output_path(path):
+    if not path:
+        return None
+    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+    resolved = path.replace("<datetime>", ts).replace("<datetime<", ts)
+    resolved = os.path.normpath(resolved)
+    out_dir = os.path.dirname(resolved)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    return resolved
+def main():
+    args = parse_args()
+    if not os.path.exists(args.src_path):
+        print("FAILED")
+        print(f"Reason: Source file not found: {args.src_path}")
+        sys.exit(1)
+    if not os.path.exists(args.tgt_path):
+        print("FAILED")
+        print(f"Reason: Target file not found: {args.tgt_path}")
+        sys.exit(1)
+    if args.sheet_mapping and (not is_excel_path(args.src_path) or not is_excel_path(args.tgt_path)):
+        print("FAILED")
+        print("Reason: --sheet-mapping is supported only for Excel files (.xlsx/.xlsm/.xltx).")
+        sys.exit(1)
+    if args.sheet_mapping:
+        sheet_pairs = parse_sheet_mapping(args.sheet_mapping)
+    else:
+        sheet_pairs = [(args.src_sheet, args.tgt_sheet)]
+    # Parse validation types (support comma-separated values)
+    validation_types_raw = args.validation_type.split(",")
+    validation_types = [vt.strip().upper() for vt in validation_types_raw]
+    # Validate that all requested types are supported
+    supported = {"ROWCOUNT", "ROW_COL_VALIDATION"}
+    for vt in validation_types:
+        if vt not in supported:
+            print("FAILED")
+            print(f"Reason: Unsupported validation type '{vt}'. Supported: ROWCOUNT, ROW_COL_VALIDATION")
+            sys.exit(1)
+    # Run validations across configured sheet pair(s)
+    all_mismatches = []
+    all_passed = True
+    reasons = []
+    total_src_rows = 0
+    total_tgt_rows = 0
+    for src_sheet_name, tgt_sheet_name in sheet_pairs:
+        src_header, src_rows = load_table(args.src_path, sheet_name=src_sheet_name)
+        tgt_header, tgt_rows = load_table(args.tgt_path, sheet_name=tgt_sheet_name)
+        total_src_rows += len(src_rows)
+        total_tgt_rows += len(tgt_rows)
+        if src_sheet_name or tgt_sheet_name:
+            sheet_label = f"{src_sheet_name or '<active>'}->{tgt_sheet_name or '<active>'}"
+        else:
+            sheet_label = "<active_sheet>"
+        if "ROWCOUNT" in validation_types:
+            rc_passed, rc_reason, rc_mismatches = compare_rowcount(src_rows, tgt_rows)
+            all_passed = all_passed and rc_passed
+            reasons.append(f"[{sheet_label}][ROWCOUNT] {rc_reason}")
+            for item in rc_mismatches:
+                tagged = dict(item)
+                tagged["reason"] = f"[{sheet_label}] {item.get('reason', '')}"
+                all_mismatches.append(tagged)
+        if "ROW_COL_VALIDATION" in validation_types:
+            row_col_passed, row_col_reason, row_col_mismatches = compare_row_col(
+                src_header, src_rows, tgt_header, tgt_rows
+            )
+            all_passed = all_passed and row_col_passed
+            reasons.append(f"[{sheet_label}][ROW_COL_VALIDATION] {row_col_reason}")
+            for item in row_col_mismatches:
+                tagged = dict(item)
+                tagged["reason"] = f"[{sheet_label}] {item.get('reason', '')}"
+                all_mismatches.append(tagged)
+    passed = all_passed
+    reason = " | ".join(reasons)
+    print("PASSED" if passed else "FAILED")
+    print(f"Reason: {reason}")
+    if not passed and all_mismatches:
+        preview = all_mismatches[:10]
+        print("Mismatch preview (first 10):")
+        for item in preview:
+            print(
+                f"  - type={item['type']} row={item['row']} column={item['column']} "
+                f"src='{item['src']}' tgt='{item['tgt']}' reason='{item['reason']}'"
+            )
+    if args.html_output:
+        out_path = resolve_output_path(args.html_output)
+        write_html_report(
+            path=out_path,
+            passed=passed,
+            summary=reason,
+            src_path=args.src_path,
+            tgt_path=args.tgt_path,
+            validation_type=args.validation_type,
+            mismatches=all_mismatches,
+            src_rows_count=total_src_rows,
+            tgt_rows_count=total_tgt_rows,
+        )
+        print(f"HTML report: {out_path}")
+    sys.exit(0 if passed else 1)
+if __name__ == "__main__":
+    main()

data_validation_gini-0.1.1/dvg_report.py ADDED Viewed

@@ -0,0 +1,459 @@
+import html
+from datetime import datetime
+# Default footer links for the HTML report; both report functions accept overrides.
+DEFAULT_LINKEDIN_URL = "https://www.linkedin.com/in/shankonduru"
+DEFAULT_REPO_URL = "https://github.com/ShanKonduru/data-validation-gini"
+def _safe_int(value):
+    """Convert *value* to an int, falling back to 0 when conversion fails.
+    Args:
+        value: Any object; typically a row number given as a string.
+    Returns:
+        ``int(value)`` when the conversion succeeds, otherwise 0.
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError is raised by int() for float infinities; treat it like
+        # any other unconvertible value instead of letting it escape.
+        return 0
+def _normalize_column_name(value, fallback="unknown"):
+    """Return a display-safe column name with spaces replaced by underscores.
+    Args:
+        value: Raw column name; converted with ``str()`` and stripped.
+        fallback: Name used when *value* is ``None`` or blank after stripping.
+    Returns:
+        The stripped name (or *fallback*) with every space replaced by ``_``.
+    """
+    # Only None means "missing"; falsy-but-real names such as the numeric
+    # header 0 must keep their text rather than collapse to the fallback.
+    text = "" if value is None else str(value).strip()
+    if not text:
+        text = fallback
+    return text.replace(" ", "_")
+def _column_pair_for_mismatch(item):
+    """Return the ``(src_<name>, tgt_<name>)`` column labels for one mismatch.
+    For ``HEADER_NAME`` mismatches the labels come from the record's ``src`` and
+    ``tgt`` values (falling back to its ``column``); every other mismatch type
+    uses the normalized ``column`` value on both sides.
+    """
+    base_name = _normalize_column_name(item.get("column", ""))
+    if str(item.get("type", "")) != "HEADER_NAME":
+        return f"src_{base_name}", f"tgt_{base_name}"
+    source_label = _normalize_column_name(item.get("src", ""), fallback=base_name)
+    target_label = _normalize_column_name(item.get("tgt", ""), fallback=base_name)
+    return f"src_{source_label}", f"tgt_{target_label}"
+def _group_mismatches(mismatches):
+    """Group mismatch records by ``(type, row, reason)``.
+    Returns:
+        A dict keyed by the stringified ``(type, row, reason)`` triple. Each
+        value holds three de-duplicated, insertion-ordered lists: ``columns``
+        (``"src_x,tgt_x"`` pairs), ``src_values`` and ``tgt_values``
+        (``"label=value"`` strings).
+    """
+    buckets = {}
+    for record in mismatches:
+        key = (
+            str(record.get("type", "")),
+            str(record.get("row", "")),
+            str(record.get("reason", "")),
+        )
+        bucket = buckets.setdefault(
+            key, {"columns": [], "src_values": [], "tgt_values": []}
+        )
+        src_label, tgt_label = _column_pair_for_mismatch(record)
+        src_text = str(record.get("src", ""))
+        tgt_text = str(record.get("tgt", ""))
+        candidates = (
+            ("columns", f"{src_label},{tgt_label}"),
+            ("src_values", f"{src_label}={src_text}"),
+            ("tgt_values", f"{tgt_label}={tgt_text}"),
+        )
+        # Append each entry once only, preserving first-seen order.
+        for field, entry in candidates:
+            if entry not in bucket[field]:
+                bucket[field].append(entry)
+    return buckets
+def build_report_metrics(src_rows_count, tgt_rows_count, mismatches):
+    """Compute the row-level KPI figures shown at the top of the HTML report.
+    Args:
+        src_rows_count: Number of source rows.
+        tgt_rows_count: Number of target rows.
+        mismatches: Iterable of mismatch records (dicts with ``type``, ``row``,
+            ``reason`` and related keys).
+    Returns:
+        A dict with ``src_count``, ``tgt_count``, ``passed``, ``failed``,
+        ``pass_rate``, ``failed_rate``, ``src_only`` and ``tgt_only``. Rates are
+        percentages of ``max(src_rows_count, tgt_rows_count)``.
+    """
+    failed_rows = set()
+    src_only_rows = set()
+    tgt_only_rows = set()
+    for row_type_raw, row_num_raw, _reason_raw in _group_mismatches(mismatches):
+        row_num = _safe_int(row_num_raw)
+        # Rows numbered 1 or lower are not counted; unparsable row values also
+        # land here because _safe_int maps them to 0 (presumably row 1 is the
+        # header row -- confirm against the comparison code).
+        if row_num <= 1:
+            continue
+        if row_type_raw in {"CELL", "SRC_ONLY", "TGT_ONLY"}:
+            failed_rows.add(row_num)
+        if row_type_raw == "SRC_ONLY":
+            src_only_rows.add(row_num)
+        if row_type_raw == "TGT_ONLY":
+            tgt_only_rows.add(row_num)
+    comparison_rows = max(src_rows_count, tgt_rows_count)
+    failed_count = len(failed_rows)
+    passed_count = max(comparison_rows - failed_count, 0)
+    if comparison_rows > 0:
+        pass_rate = (passed_count / comparison_rows) * 100.0
+        # passed_count is clamped at 0, so clamp the failed share as well;
+        # otherwise the two rates could add up to more than 100%.
+        failed_rate = min(failed_count / comparison_rows, 1.0) * 100.0
+    else:
+        pass_rate = 100.0
+        failed_rate = 0.0
+    return {
+        "src_count": src_rows_count,
+        "tgt_count": tgt_rows_count,
+        "passed": passed_count,
+        "failed": failed_count,
+        "pass_rate": pass_rate,
+        "failed_rate": failed_rate,
+        "src_only": len(src_only_rows),
+        "tgt_only": len(tgt_only_rows),
+    }
+def _render_mismatch_rows(mismatches):
+    """Render the mismatch table body as a concatenated string of ``<tr>`` rows.
+    Records are grouped by ``(type, row, reason)`` and ordered by numeric row,
+    then type, then reason. All cell text is HTML-escaped. With no mismatches a
+    single placeholder row spanning all six columns is returned.
+    """
+    if not mismatches:
+        return "<tr><td colspan='6'>No mismatches</td></tr>"
+    grouped = _group_mismatches(mismatches)
+    rendered = []
+    for key in sorted(grouped, key=lambda k: (_safe_int(k[1]), k[0], k[2])):
+        kind, row_text, why = key
+        bucket = grouped[key]
+        if kind in ("SRC_ONLY", "TGT_ONLY"):
+            # One-sided rows have no counterpart to compare, so no column names.
+            columns_cell = "—"
+        else:
+            names = set()
+            for pair in bucket["columns"]:
+                for label in pair.split(","):
+                    name = label.strip()
+                    # Drop the 4-character src_/tgt_ side prefix, once only.
+                    if name.startswith(("src_", "tgt_")):
+                        name = name[4:]
+                    if name:
+                        names.add(name)
+            columns_cell = html.escape(",".join(sorted(names)))
+        cells = (
+            html.escape(kind),
+            html.escape(row_text),
+            columns_cell,
+            html.escape(" | ".join(bucket["src_values"])),
+            html.escape(" | ".join(bucket["tgt_values"])),
+            html.escape(why),
+        )
+        rendered.append(
+            "<tr>" + "".join(f"<td>{cell}</td>" for cell in cells) + "</tr>"
+        )
+    return "".join(rendered)
+def generate_html_report(
+    passed,
+    summary,
+    src_path,
+    tgt_path,
+    validation_type,
+    mismatches,
+    metrics,
+    linkedin_url=DEFAULT_LINKEDIN_URL,
+    repo_url=DEFAULT_REPO_URL,
+):
+    """Build the complete HTML validation report and return it as a string.
+    Args:
+        passed: Truthy when validation passed; selects the PASSED/FAILED
+            status pill text and its colours.
+        summary: Summary text shown under the metadata (HTML-escaped).
+        src_path: Source path shown in the metadata grid (HTML-escaped).
+        tgt_path: Target path shown in the metadata grid (HTML-escaped).
+        validation_type: Validation type label (passed to ``html.escape``).
+        mismatches: Mismatch records rendered via ``_render_mismatch_rows``.
+        metrics: Mapping providing ``src_count``, ``tgt_count``, ``passed``,
+            ``failed``, ``pass_rate``, ``failed_rate``, ``src_only`` and
+            ``tgt_only``; the two rates are formatted with two decimals.
+        linkedin_url: Footer author link (HTML-escaped).
+        repo_url: Footer repository link (HTML-escaped).
+    Returns:
+        The HTML document text with inline CSS and the per-column filter
+        script. The "Generated" timestamp comes from ``datetime.now()``.
+    """
+    status_text = "PASSED" if passed else "FAILED"
+    status_color = "#0f5132" if passed else "#7f1d1d"
+    status_bg = "#dcfce7" if passed else "#fee2e2"
+    status_border = "#14532d" if passed else "#991b1b"
+    mismatch_rows = _render_mismatch_rows(mismatches)
+    # The template is an f-string, so literal CSS/JS braces are doubled ({{ }}).
+    return f"""<!doctype html>
+<html lang=\"en\">
+<head>
+  <meta charset=\"utf-8\" />
+  <meta name=\"viewport\" content=\"width=device-width,initial-scale=1\" />
+  <title>DVG Validation Report</title>
+  <style>
+    :root {{
+      --bg: #f4f7fb;
+      --surface: #ffffff;
+      --line: #d9e2ec;
+      --text: #0f172a;
+      --muted: #475569;
+      --brand: #0b3b66;
+    }}
+    * {{ box-sizing: border-box; }}
+    body {{
+      margin: 0;
+      font-family: "Segoe UI", Tahoma, sans-serif;
+      color: var(--text);
+      background: radial-gradient(circle at top right, #d9ebff, var(--bg) 35%);
+      min-height: 100vh;
+      display: flex;
+      flex-direction: column;
+    }}
+    .page {{
+      max-width: 1180px;
+      width: 100%;
+      margin: 0 auto;
+      padding: 24px;
+      flex: 1;
+    }}
+    .brand-header {{
+      background: linear-gradient(130deg, var(--brand), #155a96);
+      color: #fff;
+      border-radius: 14px;
+      padding: 20px;
+      box-shadow: 0 8px 24px rgba(17, 24, 39, 0.12);
+      margin-bottom: 18px;
+    }}
+    .brand-title {{ margin: 0; font-size: 30px; letter-spacing: 0.4px; }}
+    .brand-sub {{ margin: 6px 0 0 0; opacity: 0.92; }}
+    .status-pill {{
+      display: inline-block;
+      margin-top: 14px;
+      padding: 8px 14px;
+      border-radius: 999px;
+      font-weight: 700;
+      color: {status_color};
+      background: {status_bg};
+      border: 1px solid {status_border};
+    }}
+    .kpi-grid {{
+      display: grid;
+      grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
+      gap: 12px;
+      margin-bottom: 16px;
+    }}
+    .kpi-card {{
+      background: var(--surface);
+      border: 1px solid var(--line);
+      border-radius: 12px;
+      padding: 12px 14px;
+      box-shadow: 0 4px 14px rgba(15, 23, 42, 0.05);
+    }}
+    .kpi-card.success {{
+      background: #f0fdf4;
+      border-color: #86efac;
+      box-shadow: 0 4px 14px rgba(34, 197, 94, 0.15);
+    }}
+    .kpi-card.danger {{
+      background: #fef2f2;
+      border-color: #fca5a5;
+      box-shadow: 0 4px 14px rgba(239, 68, 68, 0.15);
+    }}
+    .kpi-label {{
+      color: var(--muted);
+      font-size: 12px;
+      text-transform: uppercase;
+      letter-spacing: 0.4px;
+      margin-bottom: 6px;
+    }}
+    .kpi-card.success .kpi-label {{
+      color: #15803d;
+    }}
+    .kpi-card.danger .kpi-label {{
+      color: #991b1b;
+    }}
+    .kpi-value {{ font-size: 24px; font-weight: 700; line-height: 1; }}
+    .kpi-card.success .kpi-value {{
+      color: #15803d;
+    }}
+    .kpi-card.danger .kpi-value {{
+      color: #dc2626;
+    }}
+    .card {{
+      background: var(--surface);
+      border: 1px solid var(--line);
+      border-radius: 12px;
+      padding: 16px;
+      margin-bottom: 16px;
+    }}
+    .meta-grid {{
+      display: grid;
+      grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
+      gap: 8px 16px;
+      color: var(--muted);
+    }}
+    .table-wrap {{
+      width: 100%;
+      overflow-x: auto;
+      overflow-y: auto;
+      max-height: 60vh;
+      border: 1px solid var(--line);
+      border-radius: 10px;
+    }}
+    table {{ width: max-content; min-width: 1400px; border-collapse: collapse; font-size: 13px; }}
+    th, td {{ border: 1px solid #cfd8e3; padding: 8px; text-align: left; vertical-align: top; }}
+    th {{ background: #eef4fb; position: sticky; top: 0; z-index: 1; }}
+    .filters th {{ background: #f8fafc; padding: 6px; position: sticky; top: 38px; z-index: 1; }}
+    .filters input {{ width: 100%; border: 1px solid #cbd5e1; border-radius: 6px; padding: 6px 8px; font-size: 12px; }}
+    .footer {{
+      background: #0b3b66;
+      color: #e2ecf7;
+      padding: 16px 24px;
+      text-align: center;
+      font-size: 14px;
+    }}
+    .footer a {{ color: #f8bf3b; text-decoration: none; font-weight: 600; }}
+    .footer a:hover {{ text-decoration: underline; }}
+    @media (max-width: 720px) {{
+      .brand-title {{ font-size: 24px; }}
+      .filters th {{ top: 74px; }}
+    }}
+  </style>
+</head>
+<body>
+  <main class=\"page\">
+    <header class=\"brand-header\">
+      <h1 class=\"brand-title\">DVG Data Validation Report</h1>
+      <p class=\"brand-sub\">Data Validation Gini | Reconciliation and mismatch diagnostics</p>
+      <span class=\"status-pill\">{status_text}</span>
+    </header>
+    <section class=\"kpi-grid\">
+      <article class=\"kpi-card\"><div class=\"kpi-label\">SRC Count</div><div class=\"kpi-value\">{metrics['src_count']}</div></article>
+      <article class=\"kpi-card\"><div class=\"kpi-label\">TGT Count</div><div class=\"kpi-value\">{metrics['tgt_count']}</div></article>
+      <article class=\"kpi-card success\"><div class=\"kpi-label\">PASSED</div><div class=\"kpi-value\">{metrics['passed']}</div></article>
+      <article class=\"kpi-card danger\"><div class=\"kpi-label\">FAILED</div><div class=\"kpi-value\">{metrics['failed']}</div></article>
+      <article class=\"kpi-card success\"><div class=\"kpi-label\">Pass Rate</div><div class=\"kpi-value\">{metrics['pass_rate']:.2f}%</div></article>
+      <article class=\"kpi-card danger\"><div class=\"kpi-label\">Failed Rate</div><div class=\"kpi-value\">{metrics['failed_rate']:.2f}%</div></article>
+      <article class=\"kpi-card\"><div class=\"kpi-label\">SRC Only</div><div class=\"kpi-value\">{metrics['src_only']}</div></article>
+      <article class=\"kpi-card\"><div class=\"kpi-label\">TGT Only</div><div class=\"kpi-value\">{metrics['tgt_only']}</div></article>
+    </section>
+    <section class=\"card\">
+      <div class=\"meta-grid\">
+        <div><strong>Validation Type:</strong> {html.escape(validation_type)}</div>
+        <div><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
+        <div><strong>Source:</strong> {html.escape(src_path)}</div>
+        <div><strong>Target:</strong> {html.escape(tgt_path)}</div>
+      </div>
+      <p><strong>Summary:</strong> {html.escape(summary)}</p>
+    </section>
+    <section class=\"card\">
+      <h2 style=\"margin-top:0;\">Mismatch Details</h2>
+      <p style=\"color: var(--muted); margin-top: -4px;\">Use the filter boxes in each column to narrow results.</p>
+      <div class=\"table-wrap\">
+        <table id=\"mismatchTable\">
+          <thead>
+            <tr>
+              <th>Type</th>
+              <th>Row</th>
+              <th>Mismatching Columns</th>
+              <th>Source</th>
+              <th>Target</th>
+              <th>Reason</th>
+            </tr>
+            <tr class=\"filters\">
+              <th><input type=\"text\" oninput=\"filterTable(0, this.value)\" placeholder=\"Filter type\" /></th>
+              <th><input type=\"text\" oninput=\"filterTable(1, this.value)\" placeholder=\"Filter row\" /></th>
+              <th><input type=\"text\" oninput=\"filterTable(2, this.value)\" placeholder=\"Filter mismatching columns\" /></th>
+              <th><input type=\"text\" oninput=\"filterTable(3, this.value)\" placeholder=\"Filter source\" /></th>
+              <th><input type=\"text\" oninput=\"filterTable(4, this.value)\" placeholder=\"Filter target\" /></th>
+              <th><input type=\"text\" oninput=\"filterTable(5, this.value)\" placeholder=\"Filter reason\" /></th>
+            </tr>
+          </thead>
+          <tbody>
+            {mismatch_rows}
+          </tbody>
+        </table>
+      </div>
+    </section>
+  </main>
+  <footer class=\"footer\">
+    <div>Designed and developed by <a href=\"{html.escape(linkedin_url)}\" target=\"_blank\" rel=\"noopener noreferrer\">ShanKonduru</a></div>
+    <div>This project is available at <a href=\"{html.escape(repo_url)}\" target=\"_blank\" rel=\"noopener noreferrer\">GitHub</a></div>
+  </footer>
+  <script>
+    const activeFilters = ["", "", "", "", "", ""];
+    function filterTable(columnIndex, searchValue) {{
+      activeFilters[columnIndex] = (searchValue || "").toLowerCase();
+      const table = document.getElementById("mismatchTable");
+      const rows = table.tBodies[0].rows;
+      for (let i = 0; i < rows.length; i += 1) {{
+        let showRow = true;
+        for (let col = 0; col < activeFilters.length; col += 1) {{
+          const term = activeFilters[col];
+          if (!term) {{
+            continue;
+          }}
+          const cell = rows[i].cells[col];
+          const text = (cell ? cell.textContent : "").toLowerCase();
+          if (text.indexOf(term) == -1) {{
+            showRow = false;
+            break;
+          }}
+        }}
+        rows[i].style.display = showRow ? "" : "none";
+      }}
+    }}
+  </script>
+</body>
+</html>
+"""
+def write_html_report(
+    path,
+    passed,
+    summary,
+    src_path,
+    tgt_path,
+    validation_type,
+    mismatches,
+    src_rows_count,
+    tgt_rows_count,
+    linkedin_url=DEFAULT_LINKEDIN_URL,
+    repo_url=DEFAULT_REPO_URL,
+):
+    """Render the validation report and write it to *path* as UTF-8 text.
+    The KPI metrics are derived from the row counts and mismatches via
+    ``build_report_metrics`` and passed, together with the remaining arguments,
+    to ``generate_html_report``. An existing file at *path* is overwritten.
+    """
+    document = generate_html_report(
+        passed=passed,
+        summary=summary,
+        src_path=src_path,
+        tgt_path=tgt_path,
+        validation_type=validation_type,
+        mismatches=mismatches,
+        metrics=build_report_metrics(src_rows_count, tgt_rows_count, mismatches),
+        linkedin_url=linkedin_url,
+        repo_url=repo_url,
+    )
+    with open(path, "w", encoding="utf-8") as handle:
+        handle.write(document)

data_validation_gini-0.1.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,23 @@
+[build-system]
+requires = ["setuptools>=69", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "data-validation-gini"
+version = "0.1.1"
+description = "Data Validation Gini (DVG) CLI for row count and row/column comparison with HTML reports"
+readme = "data_corruptor_readme.txt"
+requires-python = ">=3.9"
+license = { text = "MIT" }
+authors = [
+  { name = "ShanKonduru" }
+]
+dependencies = [
+  "openpyxl"
+]
+[project.scripts]
+dvg = "dvg:main"
+[tool.setuptools]
+py-modules = ["dvg", "dvg_report", "data_corruptor"]

data_validation_gini-0.1.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0