data-validation-gini 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-validation-gini
3
+ Version: 0.1.1
4
+ Summary: Data Validation Gini (DVG) CLI for row count and row/column comparison with HTML reports
5
+ Author: ShanKonduru
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/plain
9
+ Requires-Dist: openpyxl
10
+
11
+ nullify
12
+
13
+ Description: Randomly replaces existing column data with a completely blank string (empty field) up to your defined percentage limit.
14
+
15
+ Validation Purpose: Tests whether your framework correctly handles missing keys, detects structural row count imbalances, or drops values during outer-join schema comparisons.
16
+
17
+ case_swap
18
+
19
+ Description: Swaps lowercase letters to uppercase and uppercase letters to lowercase across characters in the string.
20
+
21
+ Validation Purpose: Verifies if your reconciliation system is case-sensitive or if database collation mismatches are causing false-positive validation passes on string lookups.
22
+
23
+ numeric_shift
24
+
25
+ Description: Modifies existing integers or float numbers by adding or subtracting a specific numeric scale factor (e.g., changing values by exactly 0.05).
26
+
27
+ Validation Purpose: Validates micro-level rounding, numeric precision degradation, and floating-point arithmetic tolerances inside financial or statistical data tracking pipelines.
28
+
29
+ date_shift
30
+
31
+ Description: Shifts standard ISO formatted dates or datetime values forward or backward by a targeted count of days.
32
+
33
+ Validation Purpose: Pinpoints synchronization lag problems, server timezone configuration offsets, and temporal boundary filter execution logic.
34
+
35
+ typo
36
+
37
+ Description: Substitutes a single character within an alphanumeric string with a completely different random character or number.
38
+
39
+ Validation Purpose: Breaks text string hash keys (like MD5 or SHA-256 binary signatures) immediately, verifying if your row-level checksum calculations are operating with absolute binary fidelity.
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env python3
2
+ import csv
3
+ import argparse
4
+ import random
5
+ import sys
6
+ from datetime import datetime, timedelta
7
+
8
def parse_arguments():
    """Build the command-line interface of the CSV corruptor and parse ``sys.argv``.

    Returns:
        argparse.Namespace: carries ``input``, ``output``, ``column``,
        ``percentage`` (float, default 5.0), ``type`` (one of the supported
        mutation names) and ``value`` (float, default 1.0).
    """
    parser = argparse.ArgumentParser(
        description="Inject controlled data variations/corruptions into CSV files for validation testing."
    )
    parser.add_argument("-i", "--input", required=True, help="Path to the input baseline CSV file")
    parser.add_argument("-o", "--output", required=True, help="Path to save the mutated output CSV file")
    parser.add_argument("-c", "--column", required=True, help="Column name to apply variations to")
    parser.add_argument("-p", "--percentage", type=float, default=5.0,
                        help="Percentage of rows to alter in the specified column (0.0 to 100.0)")
    parser.add_argument("-t", "--type", required=True,
                        choices=["nullify", "case_swap", "numeric_shift", "date_shift", "typo"],
                        help="Type of variation/corruption to inject")
    # The help text used to advertise a "typo count", but the typo mutation
    # always replaces exactly one character and never reads this value.
    parser.add_argument("-v", "--value", type=float, default=1.0,
                        help="Adjustment value: numeric shift amount or days to shift a date "
                             "(ignored by nullify, case_swap and typo)")
    return parser.parse_args()
23
+
24
def inject_variation(val, mutation_type, intensity_val):
    """Return a mutated copy of a single CSV cell.

    Args:
        val: Original cell text. Empty values and the literal ``NULL``
            (any case, surrounding whitespace ignored) are returned unchanged.
        mutation_type: One of ``nullify``, ``case_swap``, ``numeric_shift``,
            ``date_shift`` or ``typo``; any other name returns ``val`` as-is.
        intensity_val: Amount added for ``numeric_shift`` or number of days
            for ``date_shift``. The other mutation types do not read it.

    Returns:
        The mutated string. Values that cannot be parsed for the requested
        mutation get a ``_SHIFT_ERR`` / ``_DATE_ERR`` suffix instead.
    """
    if not val or val.strip().upper() == "NULL":
        return val  # Leave existing nulls as-is to preserve base structure

    if mutation_type == "nullify":
        return ""  # Creates a blank/null discrepancy

    if mutation_type == "case_swap":
        return val.swapcase()  # Verifies case-sensitivity checks

    if mutation_type == "numeric_shift":
        try:
            # Detect integer vs float to maintain formatting style
            if "." in val:
                return str(round(float(val) + intensity_val, 2))
            if float(intensity_val).is_integer():
                return str(int(val) + int(intensity_val))
            # A fractional shift (e.g. 0.05) on an integer cell used to be
            # truncated to 0, leaving the "mutated" cell identical to the
            # original. Apply the real shift and emit a decimal instead.
            return str(round(int(val) + intensity_val, 2))
        except ValueError:
            return val + "_SHIFT_ERR"  # Fallback if applied to non-numeric column

    if mutation_type == "date_shift":
        # Supports YYYY-MM-DD, optionally followed by HH:MM:SS
        for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S"):
            try:
                parsed = datetime.strptime(val, fmt)
            except ValueError:
                continue
            return (parsed + timedelta(days=intensity_val)).strftime(fmt)
        return val + "_DATE_ERR"

    if mutation_type == "typo":
        # Replace exactly one character so exact-match and hash checks break
        if len(val) <= 1:
            return val + "X"
        idx = random.randint(0, len(val) - 1)
        char_pool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        # Removing the current character from the pool guarantees a change.
        new_char = random.choice(char_pool.replace(val[idx], ""))
        return val[:idx] + new_char + val[idx + 1:]

    return val
67
+
68
def main():
    """Entry point: corrupt a percentage of one column of a CSV file.

    Reads the input CSV, picks a random set of data rows, applies the chosen
    mutation to the requested column of those rows and writes the result to
    the output path. Exits with status 1 on an invalid percentage, a missing
    or empty input file, or an unknown column name.
    """
    args = parse_arguments()

    if not 0 <= args.percentage <= 100:
        print("Error: Percentage must be between 0 and 100.")
        sys.exit(1)

    try:
        with open(args.input, mode="r", newline="", encoding="utf-8") as f:
            table = list(csv.reader(f))
    except FileNotFoundError:
        print(f"Error: Input file '{args.input}' not found.")
        sys.exit(1)

    if not table:
        print("Error: The input CSV file is empty.")
        sys.exit(1)

    header, rows = table[0], table[1:]

    if args.column not in header:
        print(f"Error: Column '{args.column}' not found in the CSV header.")
        print(f"Available columns: {', '.join(header)}")
        sys.exit(1)

    col_idx = header.index(args.column)
    total_rows = len(rows)

    # Calculate exactly how many rows need to be altered
    mutation_count = int(round((args.percentage / 100.0) * total_rows))
    if mutation_count == 0 and args.percentage > 0:
        mutation_count = 1  # Guarantee at least one alteration if percentage > 0
    # random.sample raises ValueError when asked for more items than exist,
    # which happened for a header-only file with a non-zero percentage.
    mutation_count = min(mutation_count, total_rows)

    # Select distinct random row indexes to target; sorted so mutations are
    # applied in file order.
    target_row_indices = sorted(random.sample(range(total_rows), mutation_count))

    print("--- Data Variation Injection Active ---")
    print(f"Targeting Column : '{args.column}' (Index {col_idx})")
    print(f"Mutation Strategy: {args.type}")
    print(f"Execution Volume : Modifying {mutation_count} out of {total_rows} rows ({args.percentage}%)")

    for idx in target_row_indices:
        row = rows[idx]
        if col_idx >= len(row):
            # Ragged row without the target column: nothing to mutate, and
            # indexing it would raise IndexError.
            continue
        row[col_idx] = inject_variation(row[col_idx], args.type, args.value)

    # Write out the corrupted dataset
    with open(args.output, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

    print(f"Success: Saved modified file to '{args.output}'. Verification runs ready.")
125
+
126
+ if __name__ == "__main__":
127
+ main()
@@ -0,0 +1,29 @@
1
+ nullify
2
+
3
+ Description: Randomly replaces existing column data with a completely blank string (empty field) up to your defined percentage limit.
4
+
5
+ Validation Purpose: Tests whether your framework correctly handles missing keys, detects structural row count imbalances, or drops values during outer-join schema comparisons.
6
+
7
+ case_swap
8
+
9
+ Description: Swaps lowercase letters to uppercase and uppercase letters to lowercase across characters in the string.
10
+
11
+ Validation Purpose: Verifies if your reconciliation system is case-sensitive or if database collation mismatches are causing false-positive validation passes on string lookups.
12
+
13
+ numeric_shift
14
+
15
+ Description: Modifies existing integer or float values by adding a fixed numeric offset; pass a negative value to subtract (e.g., shift values by exactly 0.05).
16
+
17
+ Validation Purpose: Validates micro-level rounding, numeric precision degradation, and floating-point arithmetic tolerances inside financial or statistical data tracking pipelines.
18
+
19
+ date_shift
20
+
21
+ Description: Shifts standard ISO formatted dates or datetime values forward or backward by a targeted count of days.
22
+
23
+ Validation Purpose: Pinpoints synchronization lag problems, server timezone configuration offsets, and temporal boundary filter execution logic.
24
+
25
+ typo
26
+
27
+ Description: Substitutes a single character within an alphanumeric string with a completely different random character or number.
28
+
29
+ Validation Purpose: Changes the hash of the affected row (e.g., its MD5 or SHA-256 digest) through a single-character difference, verifying that your row-level checksum comparison detects even the smallest change.
@@ -0,0 +1,39 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-validation-gini
3
+ Version: 0.1.1
4
+ Summary: Data Validation Gini (DVG) CLI for row count and row/column comparison with HTML reports
5
+ Author: ShanKonduru
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/plain
9
+ Requires-Dist: openpyxl
10
+
11
+ nullify
12
+
13
+ Description: Randomly replaces existing column data with a completely blank string (empty field) up to your defined percentage limit.
14
+
15
+ Validation Purpose: Tests whether your framework correctly handles missing keys, detects structural row count imbalances, or drops values during outer-join schema comparisons.
16
+
17
+ case_swap
18
+
19
+ Description: Swaps lowercase letters to uppercase and uppercase letters to lowercase across characters in the string.
20
+
21
+ Validation Purpose: Verifies if your reconciliation system is case-sensitive or if database collation mismatches are causing false-positive validation passes on string lookups.
22
+
23
+ numeric_shift
24
+
25
+ Description: Modifies existing integers or float numbers by adding or subtracting a specific numeric scale factor (e.g., changing values by exactly 0.05).
26
+
27
+ Validation Purpose: Validates micro-level rounding, numeric precision degradation, and floating-point arithmetic tolerances inside financial or statistical data tracking pipelines.
28
+
29
+ date_shift
30
+
31
+ Description: Shifts standard ISO formatted dates or datetime values forward or backward by a targeted count of days.
32
+
33
+ Validation Purpose: Pinpoints synchronization lag problems, server timezone configuration offsets, and temporal boundary filter execution logic.
34
+
35
+ typo
36
+
37
+ Description: Substitutes a single character within an alphanumeric string with a completely different random character or number.
38
+
39
+ Validation Purpose: Breaks text string hash keys (like MD5 or SHA-256 binary signatures) immediately, verifying if your row-level checksum calculations are operating with absolute binary fidelity.
@@ -0,0 +1,11 @@
1
+ data_corruptor.py
2
+ data_corruptor_readme.txt
3
+ dvg.py
4
+ dvg_report.py
5
+ pyproject.toml
6
+ data_validation_gini.egg-info/PKG-INFO
7
+ data_validation_gini.egg-info/SOURCES.txt
8
+ data_validation_gini.egg-info/dependency_links.txt
9
+ data_validation_gini.egg-info/entry_points.txt
10
+ data_validation_gini.egg-info/requires.txt
11
+ data_validation_gini.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dvg = dvg:main
@@ -0,0 +1,3 @@
1
+ data_corruptor
2
+ dvg
3
+ dvg_report
@@ -0,0 +1,413 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import csv
4
+ import os
5
+ import sys
6
+ from datetime import datetime
7
+
8
+ from dvg_report import write_html_report
9
+
10
+
11
def parse_args():
    """Declare the ``dvg`` command-line options and parse ``sys.argv``.

    Returns:
        argparse.Namespace: holds ``file_type``, ``src_path``, ``tgt_path``,
        ``validation_type`` and the optional ``html_output``, ``src_sheet``,
        ``tgt_sheet`` and ``sheet_mapping`` (``None`` when omitted).
    """
    cli = argparse.ArgumentParser(
        description="Data Validation Gini CLI (dvg): compare source and target files."
    )

    cli.add_argument(
        "--file-type",
        required=True,
        choices=["EXCEL"],
        help="Input file type. Use EXCEL for .csv/.xlsx/.xlsm/.xltx files.",
    )

    # Mandatory free-text options, in the order they appear in --help.
    mandatory = (
        ("--src-path", "Path to source file"),
        ("--tgt-path", "Path to target file"),
        (
            "--validation-type",
            "Validation mode: ROWCOUNT, ROW_COL_VALIDATION, or both (e.g., 'ROWCOUNT,ROW_COL_VALIDATION')",
        ),
    )
    for flag, help_text in mandatory:
        cli.add_argument(flag, required=True, help=help_text)

    # Optional free-text options.
    optional = (
        ("--html-output", "Optional HTML report path. Supports <datetime> token."),
        ("--src-sheet", "Optional source sheet name for Excel input."),
        ("--tgt-sheet", "Optional target sheet name for Excel input."),
        ("--sheet-mapping", "Optional Excel sheet mapping, e.g. 'SRC1:TGT1,SRC2:TGT2'."),
    )
    for flag, help_text in optional:
        cli.add_argument(flag, required=False, help=help_text)

    return cli.parse_args()
49
+
50
+
51
def normalize_cell(value):
    """Return the cell as text, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)
55
+
56
+
57
def trim_trailing_empty(values):
    """Return a new list of ``values`` without its trailing ``""`` entries.

    Only entries equal to the empty string are dropped, and only from the
    end; empty entries in the middle are kept.
    """
    cells = list(values)
    keep = len(cells)
    # Walk back from the end until a non-empty cell is found.
    while keep > 0 and cells[keep - 1] == "":
        keep -= 1
    return cells[:keep]
62
+
63
+
64
def load_csv(path):
    """Read a CSV file and return ``(header, rows)``.

    The file is decoded as UTF-8 with an optional BOM. Every record is
    normalised to strings and stripped of trailing empty cells. An empty
    file yields ``([], [])``.
    """
    with open(path, mode="r", encoding="utf-8-sig", newline="") as handle:
        records = list(csv.reader(handle))

    if not records:
        return [], []

    cleaned = [
        trim_trailing_empty([normalize_cell(cell) for cell in record])
        for record in records
    ]
    # First record is the header, everything after it is data.
    return cleaned[0], cleaned[1:]
72
+
73
+
74
def load_excel(path, sheet_name=None):
    """Read one worksheet of an Excel workbook and return ``(header, rows)``.

    Args:
        path: Workbook path passed to ``openpyxl.load_workbook``.
        sheet_name: Worksheet to read; the workbook's active sheet is used
            when this is falsy.

    Returns:
        ``(header, rows)`` where every cell is a string and trailing empty
        cells are removed. An empty sheet yields ``([], [])``.

    Exits the process with status 1 (after printing ``FAILED`` and a reason)
    when openpyxl is not installed or the requested sheet does not exist.
    """
    try:
        from openpyxl import load_workbook
    except ImportError:
        print("FAILED")
        print("Reason: openpyxl is required for .xlsx/.xlsm/.xltx files. Install it with: pip install openpyxl")
        sys.exit(1)

    wb = load_workbook(path, read_only=True, data_only=True)
    # try/finally guarantees the workbook is closed on every path, including
    # sys.exit() below and errors raised while iterating the rows; previously
    # those paths could leave the file handle open.
    try:
        if sheet_name:
            if sheet_name not in wb.sheetnames:
                print("FAILED")
                print(f"Reason: Sheet '{sheet_name}' not found in file: {path}")
                sys.exit(1)
            ws = wb[sheet_name]
        else:
            ws = wb.active

        raw = [
            trim_trailing_empty([normalize_cell(v) for v in row])
            for row in ws.iter_rows(values_only=True)
        ]
    finally:
        wb.close()

    if not raw:
        return [], []
    return raw[0], raw[1:]
101
+
102
+
103
def load_table(path, sheet_name=None):
    """Load ``path`` as a table, dispatching on its file extension.

    ``.csv`` files go to ``load_csv`` (a sheet name is rejected for them);
    ``.xlsx``/``.xlsm``/``.xltx`` files go to ``load_excel``. Any other
    extension prints ``FAILED`` with a reason and exits with status 1.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower()

    if ext in {".xlsx", ".xlsm", ".xltx"}:
        return load_excel(path, sheet_name=sheet_name)

    if ext != ".csv":
        print("FAILED")
        print(f"Reason: Unsupported file extension '{ext}'. Supported: .csv, .xlsx, .xlsm, .xltx")
        sys.exit(1)

    # CSV files have no sheets, so a sheet name is a usage error.
    if sheet_name:
        print("FAILED")
        print(f"Reason: --src-sheet/--tgt-sheet requires Excel input, got CSV: {path}")
        sys.exit(1)
    return load_csv(path)
116
+
117
+
118
def is_excel_path(path):
    """Return True when ``path`` has an .xlsx, .xlsm or .xltx extension (case-insensitive)."""
    _, suffix = os.path.splitext(path)
    return suffix.lower() in {".xlsx", ".xlsm", ".xltx"}
120
+
121
+
122
def parse_sheet_mapping(mapping_text):
    """Parse ``'SRC1:TGT1,SRC2:TGT2'`` into ``[(src, tgt), ...]``.

    Blank comma-separated entries are ignored and names are stripped of
    surrounding whitespace. Only the first ``:`` of an entry separates the
    two names. Prints ``FAILED`` with a reason and exits with status 1 when
    an entry has no ``:``, when either name is empty, or when no pairs
    remain.
    """
    entries = [piece.strip() for piece in (mapping_text or "").split(",")]
    pairs = []

    for entry in entries:
        if not entry:
            continue

        left, separator, right = entry.partition(":")
        if not separator:
            print("FAILED")
            print(
                "Reason: Invalid --sheet-mapping format. Use 'SRC1:TGT1,SRC2:TGT2'."
            )
            sys.exit(1)

        src_sheet, tgt_sheet = left.strip(), right.strip()
        if not (src_sheet and tgt_sheet):
            print("FAILED")
            print(
                "Reason: Invalid --sheet-mapping pair. Both source and target sheet names are required."
            )
            sys.exit(1)

        pairs.append((src_sheet, tgt_sheet))

    if pairs:
        return pairs

    print("FAILED")
    print("Reason: --sheet-mapping is empty. Use format 'SRC1:TGT1,SRC2:TGT2'.")
    sys.exit(1)
151
+
152
+
153
def compare_rowcount(src_rows, tgt_rows):
    """Compare the number of data rows on both sides.

    Returns:
        ``(passed, reason, mismatches)``; ``mismatches`` is always an empty
        list for this check.
    """
    src_count, tgt_count = len(src_rows), len(tgt_rows)
    passed = src_count == tgt_count
    verdict = "Row counts match" if passed else "Row count mismatch"
    return passed, f"{verdict}: src={src_count}, tgt={tgt_count}.", []
162
+
163
+
164
def compare_row_col(src_header, src_rows, tgt_header, tgt_rows):
    """Compare headers and cell values of two tables, matching rows by key.

    Rows are paired through a key column: the first of ``employee_id``,
    ``id``, ``emp_id``, ``record_id``, ``pk`` found in the header
    (case-insensitive), otherwise the first column. Rows with a blank key
    fall back to a positional key. Rows that share the same key value are
    paired by occurrence (first with first, second with second, ...), so
    duplicates no longer overwrite each other and hide differences.

    Args:
        src_header, tgt_header: Lists of column names.
        src_rows, tgt_rows: Lists of rows, each a list of cell strings.

    Returns:
        ``(passed, reason, mismatches)`` where ``mismatches`` is a list of
        dicts with ``type``, ``row``, ``column``, ``src``, ``tgt`` and
        ``reason`` keys. ``row`` is 1-based with the header as row 1.
    """
    mismatches = []

    def classify_mismatch(src_val, tgt_val):
        """Label a differing cell pair by which side (if any) is blank."""
        src_blank = str(src_val).strip() == ""
        tgt_blank = str(tgt_val).strip() == ""
        if (not src_blank) and tgt_blank:
            return "SRC_ONLY", "Source-only value (target missing or blank)"
        if src_blank and (not tgt_blank):
            return "TGT_ONLY", "Target-only value (source missing or blank)"
        return "CELL", "Cell value mismatch"

    def build_row_map(header, rows):
        """Map ``(key_text, occurrence)`` to ``(row_index, row)``."""
        preferred_keys = ["employee_id", "id", "emp_id", "record_id", "pk"]
        lowered = [str(h).strip().lower() for h in header]

        key_idx = None
        for candidate in preferred_keys:
            if candidate in lowered:
                key_idx = lowered.index(candidate)
                break

        # Fallback to first column if no standard key column is present.
        if key_idx is None and header:
            key_idx = 0

        row_map = {}
        if key_idx is None:
            return row_map, key_idx

        seen = {}
        for row_idx, row in enumerate(rows):
            key_val = row[key_idx] if key_idx < len(row) else ""
            key_text = str(key_val).strip()
            if key_text == "":
                # Deterministic fallback key for blank ID values.
                key_text = f"__row_{row_idx + 2}"
            # The occurrence counter keeps every duplicate of a key instead
            # of letting the last one silently replace the earlier ones.
            occurrence = seen.get(key_text, 0)
            seen[key_text] = occurrence + 1
            row_map[(key_text, occurrence)] = (row_idx, row)

        return row_map, key_idx

    if len(src_header) != len(tgt_header):
        mismatches.append(
            {
                "type": "HEADER_LENGTH",
                "row": 1,
                "column": "<HEADER>",
                "src": str(len(src_header)),
                "tgt": str(len(tgt_header)),
                "reason": "Header column count mismatch",
            }
        )

    header_limit = max(len(src_header), len(tgt_header))
    for col_idx in range(header_limit):
        src_col = src_header[col_idx] if col_idx < len(src_header) else ""
        tgt_col = tgt_header[col_idx] if col_idx < len(tgt_header) else ""
        if src_col != tgt_col:
            mismatches.append(
                {
                    "type": "HEADER_NAME",
                    "row": 1,
                    "column": f"col_{col_idx + 1}",
                    "src": src_col,
                    "tgt": tgt_col,
                    "reason": "Header name mismatch",
                }
            )

    if len(src_rows) != len(tgt_rows):
        mismatches.append(
            {
                "type": "ROWCOUNT",
                "row": 0,
                "column": "<ROWCOUNT>",
                "src": str(len(src_rows)),
                "tgt": str(len(tgt_rows)),
                "reason": "Data row count mismatch",
            }
        )

    src_map, _ = build_row_map(src_header, src_rows)
    tgt_map, _ = build_row_map(tgt_header, tgt_rows)

    # Sorting (key_text, occurrence) tuples keeps the original key order for
    # unique keys and orders duplicates by their position in the file.
    all_keys = sorted(set(src_map) | set(tgt_map))

    for key in all_keys:
        src_entry = src_map.get(key)
        tgt_entry = tgt_map.get(key)

        src_row_idx = src_entry[0] if src_entry else None
        tgt_row_idx = tgt_entry[0] if tgt_entry else None
        src_row = src_entry[1] if src_entry else []
        tgt_row = tgt_entry[1] if tgt_entry else []

        # Report the source position when available; +2 converts a 0-based
        # data index into a 1-based row number that counts the header.
        display_row = (src_row_idx if src_row_idx is not None else tgt_row_idx) + 2
        col_limit = max(len(src_header), len(tgt_header), len(src_row), len(tgt_row))

        for col_idx in range(col_limit):
            src_val = src_row[col_idx] if col_idx < len(src_row) else ""
            tgt_val = tgt_row[col_idx] if col_idx < len(tgt_row) else ""
            if src_val == tgt_val:
                continue

            if col_idx < len(src_header):
                col_name = src_header[col_idx]
            elif col_idx < len(tgt_header):
                col_name = tgt_header[col_idx]
            else:
                col_name = f"col_{col_idx + 1}"

            mismatch_type, mismatch_reason = classify_mismatch(src_val, tgt_val)
            mismatches.append(
                {
                    "type": mismatch_type,
                    "row": display_row,
                    "column": col_name,
                    "src": src_val,
                    "tgt": tgt_val,
                    "reason": mismatch_reason,
                }
            )

    passed = len(mismatches) == 0
    if passed:
        reason = "All rows and columns match."
    else:
        reason = f"Found {len(mismatches)} mismatch(es) across headers/rows/columns."
    return passed, reason, mismatches
292
+
293
+
294
def resolve_output_path(path):
    """Expand the timestamp token in ``path`` and make sure its folder exists.

    Returns ``None`` for a falsy ``path``. Otherwise every ``<datetime>``
    (and the mistyped ``<datetime<``) token is replaced by the current local
    time as ``YYYYMMDD_HHMMSS``, the path is normalised, its parent directory
    is created when the path has one, and the resulting path is returned.
    """
    if not path:
        return None

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    expanded = path
    for token in ("<datetime>", "<datetime<"):
        expanded = expanded.replace(token, stamp)
    expanded = os.path.normpath(expanded)

    parent = os.path.dirname(expanded)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return expanded
304
+
305
+
306
def main():
    """Run the requested validations and report the outcome.

    Validates the CLI inputs, loads each configured source/target sheet
    pair, runs ROWCOUNT and/or ROW_COL_VALIDATION on it, prints the verdict
    with a preview of up to ten mismatches, optionally writes the HTML
    report, and exits with status 0 when everything passed, else 1.
    """
    args = parse_args()

    def abort(message):
        # Uniform failure output used by all input checks below.
        print("FAILED")
        print(f"Reason: {message}")
        sys.exit(1)

    def with_sheet_tag(items, label):
        # Copy each mismatch and prefix its reason with the sheet label.
        return [
            {**item, "reason": f"[{label}] {item.get('reason', '')}"}
            for item in items
        ]

    if not os.path.exists(args.src_path):
        abort(f"Source file not found: {args.src_path}")
    if not os.path.exists(args.tgt_path):
        abort(f"Target file not found: {args.tgt_path}")

    both_excel = is_excel_path(args.src_path) and is_excel_path(args.tgt_path)
    if args.sheet_mapping and not both_excel:
        abort("--sheet-mapping is supported only for Excel files (.xlsx/.xlsm/.xltx).")

    sheet_pairs = (
        parse_sheet_mapping(args.sheet_mapping)
        if args.sheet_mapping
        else [(args.src_sheet, args.tgt_sheet)]
    )

    # Parse validation types (support comma-separated values)
    validation_types = [part.strip().upper() for part in args.validation_type.split(",")]

    # Validate that all requested types are supported
    supported = {"ROWCOUNT", "ROW_COL_VALIDATION"}
    for requested in validation_types:
        if requested not in supported:
            abort(f"Unsupported validation type '{requested}'. Supported: ROWCOUNT, ROW_COL_VALIDATION")

    # Run validations across configured sheet pair(s)
    all_mismatches = []
    passed = True
    reasons = []
    total_src_rows = 0
    total_tgt_rows = 0

    for src_sheet_name, tgt_sheet_name in sheet_pairs:
        src_header, src_rows = load_table(args.src_path, sheet_name=src_sheet_name)
        tgt_header, tgt_rows = load_table(args.tgt_path, sheet_name=tgt_sheet_name)

        total_src_rows += len(src_rows)
        total_tgt_rows += len(tgt_rows)

        if not (src_sheet_name or tgt_sheet_name):
            sheet_label = "<active_sheet>"
        else:
            sheet_label = f"{src_sheet_name or '<active>'}->{tgt_sheet_name or '<active>'}"

        if "ROWCOUNT" in validation_types:
            check_ok, check_reason, check_items = compare_rowcount(src_rows, tgt_rows)
            passed = passed and check_ok
            reasons.append(f"[{sheet_label}][ROWCOUNT] {check_reason}")
            all_mismatches.extend(with_sheet_tag(check_items, sheet_label))

        if "ROW_COL_VALIDATION" in validation_types:
            check_ok, check_reason, check_items = compare_row_col(
                src_header, src_rows, tgt_header, tgt_rows
            )
            passed = passed and check_ok
            reasons.append(f"[{sheet_label}][ROW_COL_VALIDATION] {check_reason}")
            all_mismatches.extend(with_sheet_tag(check_items, sheet_label))

    reason = " | ".join(reasons)

    print("PASSED" if passed else "FAILED")
    print(f"Reason: {reason}")

    if not passed and all_mismatches:
        print("Mismatch preview (first 10):")
        for item in all_mismatches[:10]:
            print(
                f"  - type={item['type']} row={item['row']} column={item['column']} "
                f"src='{item['src']}' tgt='{item['tgt']}' reason='{item['reason']}'"
            )

    if args.html_output:
        out_path = resolve_output_path(args.html_output)
        write_html_report(
            path=out_path,
            passed=passed,
            summary=reason,
            src_path=args.src_path,
            tgt_path=args.tgt_path,
            validation_type=args.validation_type,
            mismatches=all_mismatches,
            src_rows_count=total_src_rows,
            tgt_rows_count=total_tgt_rows,
        )
        print(f"HTML report: {out_path}")

    sys.exit(0 if passed else 1)
410
+
411
+
412
+ if __name__ == "__main__":
413
+ main()
@@ -0,0 +1,459 @@
1
+ import html
2
+ from datetime import datetime
3
+
4
+
5
+ DEFAULT_LINKEDIN_URL = "https://www.linkedin.com/in/shankonduru"
6
+ DEFAULT_REPO_URL = "https://github.com/ShanKonduru/data-validation-gini"
7
+
8
+
9
def _safe_int(value):
    """Convert ``value`` with ``int()``, returning 0 when that raises TypeError or ValueError."""
    try:
        number = int(value)
    except (TypeError, ValueError):
        number = 0
    return number
14
+
15
+
16
def _normalize_column_name(value, fallback="unknown"):
    """Return a space-free column label.

    ``value`` is converted to text and stripped; when nothing is left,
    ``fallback`` is used instead. Spaces in the result become underscores.
    """
    cleaned = str(value or "").strip()
    label = cleaned if cleaned else fallback
    return label.replace(" ", "_")
21
+
22
+
23
def _column_pair_for_mismatch(item):
    """Return the ``("src_<name>", "tgt_<name>")`` labels for one mismatch.

    For ``HEADER_NAME`` mismatches the two names come from the mismatch's
    ``src`` and ``tgt`` values (falling back to its ``column``); for every
    other type both labels use the normalised ``column`` value.
    """
    column_label = _normalize_column_name(item.get("column", ""))

    if str(item.get("type", "")) != "HEADER_NAME":
        return f"src_{column_label}", f"tgt_{column_label}"

    # Header renames carry the differing names in the src/tgt values.
    source_label = _normalize_column_name(item.get("src", ""), fallback=column_label)
    target_label = _normalize_column_name(item.get("tgt", ""), fallback=column_label)
    return f"src_{source_label}", f"tgt_{target_label}"
35
+
36
+
37
def _group_mismatches(mismatches):
    """Group mismatches that share type, row and reason.

    Returns:
        dict mapping ``(type, row, reason)`` (all as strings) to a dict with
        the lists ``columns`` (``"src_x,tgt_x"`` pairs), ``src_values`` and
        ``tgt_values`` (``"<label>=<value>"`` strings). Each list keeps
        first-seen order and holds no duplicates.
    """

    def add_unique(bucket, entry):
        # Append while preserving order and skipping repeats.
        if entry not in bucket:
            bucket.append(entry)

    groups = {}
    for record in mismatches:
        key = (
            str(record.get("type", "")),
            str(record.get("row", "")),
            str(record.get("reason", "")),
        )
        if key not in groups:
            groups[key] = {"columns": [], "src_values": [], "tgt_values": []}
        slot = groups[key]

        src_label, tgt_label = _column_pair_for_mismatch(record)
        source_text = str(record.get("src", ""))
        target_text = str(record.get("tgt", ""))

        add_unique(slot["columns"], f"{src_label},{tgt_label}")
        add_unique(slot["src_values"], f"{src_label}={source_text}")
        add_unique(slot["tgt_values"], f"{tgt_label}={target_text}")

    return groups
67
+
68
+
69
def build_report_metrics(src_rows_count, tgt_rows_count, mismatches):
    """Summarise a validation run as row-level counters for the report.

    A data row counts as failed when it has at least one ``CELL``,
    ``SRC_ONLY`` or ``TGT_ONLY`` mismatch whose row number is greater than 1.
    Rates are percentages of ``max(src_rows_count, tgt_rows_count)``; with
    no rows at all the pass rate is 100.0 and the failed rate 0.0.

    Returns:
        dict with ``src_count``, ``tgt_count``, ``passed``, ``failed``,
        ``pass_rate``, ``failed_rate``, ``src_only`` and ``tgt_only``.
    """
    failing_rows = set()
    one_sided_rows = {"SRC_ONLY": set(), "TGT_ONLY": set()}

    for record in mismatches:
        kind = str(record.get("type", ""))
        try:
            row_number = int(str(record.get("row", "")))
        except (TypeError, ValueError):
            row_number = 0

        # Rows 0 and 1 are the row-count and header pseudo-rows.
        if row_number <= 1 or kind not in {"CELL", "SRC_ONLY", "TGT_ONLY"}:
            continue

        failing_rows.add(row_number)
        if kind in one_sided_rows:
            one_sided_rows[kind].add(row_number)

    total = max(src_rows_count, tgt_rows_count)
    failed = len(failing_rows)
    succeeded = max(total - failed, 0)

    if total > 0:
        pass_rate = (succeeded / total) * 100.0
        failed_rate = (failed / total) * 100.0
    else:
        pass_rate, failed_rate = 100.0, 0.0

    return {
        "src_count": src_rows_count,
        "tgt_count": tgt_rows_count,
        "passed": succeeded,
        "failed": failed,
        "pass_rate": pass_rate,
        "failed_rate": failed_rate,
        "src_only": len(one_sided_rows["SRC_ONLY"]),
        "tgt_only": len(one_sided_rows["TGT_ONLY"]),
    }
108
+
109
+
110
def _render_mismatch_rows(mismatches):
    """Render the mismatch list as HTML ``<tr>`` rows for the report table.

    Mismatches are grouped by type, row and reason, and the groups are
    ordered by numeric row, then type, then reason. Each row has six cells:
    type, row, columns, source values, target values, reason. All cell text
    is HTML-escaped. With no mismatches a single placeholder row is returned.
    """
    if not mismatches:
        return "<tr><td colspan='6'>No mismatches</td></tr>"

    groups = _group_mismatches(mismatches)

    def ordering(key):
        kind, row_text, why = key
        return (_safe_int(row_text), kind, why)

    rendered = []
    for key in sorted(groups.keys(), key=ordering):
        kind, row_text, why = key
        bucket = groups[key]

        # For SRC_ONLY and TGT_ONLY, don't show column names (no row to compare)
        if kind in ("SRC_ONLY", "TGT_ONLY"):
            columns_cell = "—"
        else:
            names = set()
            for pair in bucket["columns"]:
                for part in pair.split(","):
                    name = part.strip()
                    # Drop the src_/tgt_ label prefix added during grouping.
                    if name.startswith(("src_", "tgt_")):
                        name = name[4:]
                    if name:
                        names.add(name)
            columns_cell = html.escape(",".join(sorted(names)))

        cells = (
            html.escape(kind),
            html.escape(row_text),
            columns_cell,
            html.escape(" | ".join(bucket["src_values"])),
            html.escape(" | ".join(bucket["tgt_values"])),
            html.escape(why),
        )
        rendered.append(
            "<tr>" + "".join(f"<td>{cell}</td>" for cell in cells) + "</tr>"
        )

    return "".join(rendered)
154
+
155
+
156
+ def generate_html_report(
157
+ passed,
158
+ summary,
159
+ src_path,
160
+ tgt_path,
161
+ validation_type,
162
+ mismatches,
163
+ metrics,
164
+ linkedin_url=DEFAULT_LINKEDIN_URL,
165
+ repo_url=DEFAULT_REPO_URL,
166
+ ):
167
+ status_text = "PASSED" if passed else "FAILED"
168
+ status_color = "#0f5132" if passed else "#7f1d1d"
169
+ status_bg = "#dcfce7" if passed else "#fee2e2"
170
+ status_border = "#14532d" if passed else "#991b1b"
171
+
172
+ mismatch_rows = _render_mismatch_rows(mismatches)
173
+
174
+ return f"""<!doctype html>
175
+ <html lang=\"en\">
176
+ <head>
177
+ <meta charset=\"utf-8\" />
178
+ <meta name=\"viewport\" content=\"width=device-width,initial-scale=1\" />
179
+ <title>DVG Validation Report</title>
180
+ <style>
181
+ :root {{
182
+ --bg: #f4f7fb;
183
+ --surface: #ffffff;
184
+ --line: #d9e2ec;
185
+ --text: #0f172a;
186
+ --muted: #475569;
187
+ --brand: #0b3b66;
188
+ }}
189
+
190
+ * {{ box-sizing: border-box; }}
191
+
192
+ body {{
193
+ margin: 0;
194
+ font-family: "Segoe UI", Tahoma, sans-serif;
195
+ color: var(--text);
196
+ background: radial-gradient(circle at top right, #d9ebff, var(--bg) 35%);
197
+ min-height: 100vh;
198
+ display: flex;
199
+ flex-direction: column;
200
+ }}
201
+
202
+ .page {{
203
+ max-width: 1180px;
204
+ width: 100%;
205
+ margin: 0 auto;
206
+ padding: 24px;
207
+ flex: 1;
208
+ }}
209
+
210
+ .brand-header {{
211
+ background: linear-gradient(130deg, var(--brand), #155a96);
212
+ color: #fff;
213
+ border-radius: 14px;
214
+ padding: 20px;
215
+ box-shadow: 0 8px 24px rgba(17, 24, 39, 0.12);
216
+ margin-bottom: 18px;
217
+ }}
218
+
219
+ .brand-title {{ margin: 0; font-size: 30px; letter-spacing: 0.4px; }}
220
+ .brand-sub {{ margin: 6px 0 0 0; opacity: 0.92; }}
221
+
222
+ .status-pill {{
223
+ display: inline-block;
224
+ margin-top: 14px;
225
+ padding: 8px 14px;
226
+ border-radius: 999px;
227
+ font-weight: 700;
228
+ color: {status_color};
229
+ background: {status_bg};
230
+ border: 1px solid {status_border};
231
+ }}
232
+
233
+ .kpi-grid {{
234
+ display: grid;
235
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
236
+ gap: 12px;
237
+ margin-bottom: 16px;
238
+ }}
239
+
240
+ .kpi-card {{
241
+ background: var(--surface);
242
+ border: 1px solid var(--line);
243
+ border-radius: 12px;
244
+ padding: 12px 14px;
245
+ box-shadow: 0 4px 14px rgba(15, 23, 42, 0.05);
246
+ }}
247
+
248
+ .kpi-card.success {{
249
+ background: #f0fdf4;
250
+ border-color: #86efac;
251
+ box-shadow: 0 4px 14px rgba(34, 197, 94, 0.15);
252
+ }}
253
+
254
+ .kpi-card.danger {{
255
+ background: #fef2f2;
256
+ border-color: #fca5a5;
257
+ box-shadow: 0 4px 14px rgba(239, 68, 68, 0.15);
258
+ }}
259
+
260
+ .kpi-label {{
261
+ color: var(--muted);
262
+ font-size: 12px;
263
+ text-transform: uppercase;
264
+ letter-spacing: 0.4px;
265
+ margin-bottom: 6px;
266
+ }}
267
+
268
+ .kpi-card.success .kpi-label {{
269
+ color: #15803d;
270
+ }}
271
+
272
+ .kpi-card.danger .kpi-label {{
273
+ color: #991b1b;
274
+ }}
275
+
276
+ .kpi-value {{ font-size: 24px; font-weight: 700; line-height: 1; }}
277
+
278
+ .kpi-card.success .kpi-value {{
279
+ color: #15803d;
280
+ }}
281
+
282
+ .kpi-card.danger .kpi-value {{
283
+ color: #dc2626;
284
+ }}
285
+
286
+ .card {{
287
+ background: var(--surface);
288
+ border: 1px solid var(--line);
289
+ border-radius: 12px;
290
+ padding: 16px;
291
+ margin-bottom: 16px;
292
+ }}
293
+
294
+ .meta-grid {{
295
+ display: grid;
296
+ grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
297
+ gap: 8px 16px;
298
+ color: var(--muted);
299
+ }}
300
+
301
+ .table-wrap {{
302
+ width: 100%;
303
+ overflow-x: auto;
304
+ overflow-y: auto;
305
+ max-height: 60vh;
306
+ border: 1px solid var(--line);
307
+ border-radius: 10px;
308
+ }}
309
+
310
+ table {{ width: max-content; min-width: 1400px; border-collapse: collapse; font-size: 13px; }}
311
+ th, td {{ border: 1px solid #cfd8e3; padding: 8px; text-align: left; vertical-align: top; }}
312
+ th {{ background: #eef4fb; position: sticky; top: 0; z-index: 1; }}
313
+ .filters th {{ background: #f8fafc; padding: 6px; position: sticky; top: 38px; z-index: 1; }}
314
+ .filters input {{ width: 100%; border: 1px solid #cbd5e1; border-radius: 6px; padding: 6px 8px; font-size: 12px; }}
315
+
316
+ .footer {{
317
+ background: #0b3b66;
318
+ color: #e2ecf7;
319
+ padding: 16px 24px;
320
+ text-align: center;
321
+ font-size: 14px;
322
+ }}
323
+
324
+ .footer a {{ color: #f8bf3b; text-decoration: none; font-weight: 600; }}
325
+ .footer a:hover {{ text-decoration: underline; }}
326
+
327
+ @media (max-width: 720px) {{
328
+ .brand-title {{ font-size: 24px; }}
329
+ .filters th {{ top: 74px; }}
330
+ }}
331
+ </style>
332
+ </head>
333
+ <body>
334
+ <main class=\"page\">
335
+ <header class=\"brand-header\">
336
+ <h1 class=\"brand-title\">DVG Data Validation Report</h1>
337
+ <p class=\"brand-sub\">Data Validation Gini | Reconciliation and mismatch diagnostics</p>
338
+ <span class=\"status-pill\">{status_text}</span>
339
+ </header>
340
+
341
+ <section class=\"kpi-grid\">
342
+ <article class=\"kpi-card\"><div class=\"kpi-label\">SRC Count</div><div class=\"kpi-value\">{metrics['src_count']}</div></article>
343
+ <article class=\"kpi-card\"><div class=\"kpi-label\">TGT Count</div><div class=\"kpi-value\">{metrics['tgt_count']}</div></article>
344
+ <article class=\"kpi-card success\"><div class=\"kpi-label\">PASSED</div><div class=\"kpi-value\">{metrics['passed']}</div></article>
345
+ <article class=\"kpi-card danger\"><div class=\"kpi-label\">FAILED</div><div class=\"kpi-value\">{metrics['failed']}</div></article>
346
+ <article class=\"kpi-card success\"><div class=\"kpi-label\">Pass Rate</div><div class=\"kpi-value\">{metrics['pass_rate']:.2f}%</div></article>
347
+ <article class=\"kpi-card danger\"><div class=\"kpi-label\">Failed Rate</div><div class=\"kpi-value\">{metrics['failed_rate']:.2f}%</div></article>
348
+ <article class=\"kpi-card\"><div class=\"kpi-label\">SRC Only</div><div class=\"kpi-value\">{metrics['src_only']}</div></article>
349
+ <article class=\"kpi-card\"><div class=\"kpi-label\">TGT Only</div><div class=\"kpi-value\">{metrics['tgt_only']}</div></article>
350
+ </section>
351
+
352
+ <section class=\"card\">
353
+ <div class=\"meta-grid\">
354
+ <div><strong>Validation Type:</strong> {html.escape(validation_type)}</div>
355
+ <div><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
356
+ <div><strong>Source:</strong> {html.escape(src_path)}</div>
357
+ <div><strong>Target:</strong> {html.escape(tgt_path)}</div>
358
+ </div>
359
+ <p><strong>Summary:</strong> {html.escape(summary)}</p>
360
+ </section>
361
+
362
+ <section class=\"card\">
363
+ <h2 style=\"margin-top:0;\">Mismatch Details</h2>
364
+ <p style=\"color: var(--muted); margin-top: -4px;\">Use the filter boxes in each column to narrow results.</p>
365
+ <div class=\"table-wrap\">
366
+ <table id=\"mismatchTable\">
367
+ <thead>
368
+ <tr>
369
+ <th>Type</th>
370
+ <th>Row</th>
371
+ <th>Mismatching Columns</th>
372
+ <th>Source</th>
373
+ <th>Target</th>
374
+ <th>Reason</th>
375
+ </tr>
376
+ <tr class=\"filters\">
377
+ <th><input type=\"text\" oninput=\"filterTable(0, this.value)\" placeholder=\"Filter type\" /></th>
378
+ <th><input type=\"text\" oninput=\"filterTable(1, this.value)\" placeholder=\"Filter row\" /></th>
379
+ <th><input type=\"text\" oninput=\"filterTable(2, this.value)\" placeholder=\"Filter mismatching columns\" /></th>
380
+ <th><input type=\"text\" oninput=\"filterTable(3, this.value)\" placeholder=\"Filter source\" /></th>
381
+ <th><input type=\"text\" oninput=\"filterTable(4, this.value)\" placeholder=\"Filter target\" /></th>
382
+ <th><input type=\"text\" oninput=\"filterTable(5, this.value)\" placeholder=\"Filter reason\" /></th>
383
+ </tr>
384
+ </thead>
385
+ <tbody>
386
+ {mismatch_rows}
387
+ </tbody>
388
+ </table>
389
+ </div>
390
+ </section>
391
+ </main>
392
+
393
+ <footer class=\"footer\">
394
+ <div>Designed and developed by <a href=\"{html.escape(linkedin_url)}\" target=\"_blank\" rel=\"noopener noreferrer\">ShanKonduru</a></div>
395
+ <div>This project is available at <a href=\"{html.escape(repo_url)}\" target=\"_blank\" rel=\"noopener noreferrer\">GitHub</a></div>
396
+ </footer>
397
+
398
+ <script>
399
+ const activeFilters = ["", "", "", "", "", ""];
400
+
401
+ function filterTable(columnIndex, searchValue) {{
402
+ activeFilters[columnIndex] = (searchValue || "").toLowerCase();
403
+ const table = document.getElementById("mismatchTable");
404
+ const rows = table.tBodies[0].rows;
405
+
406
+ for (let i = 0; i < rows.length; i += 1) {{
407
+ let showRow = true;
408
+
409
+ for (let col = 0; col < activeFilters.length; col += 1) {{
410
+ const term = activeFilters[col];
411
+ if (!term) {{
412
+ continue;
413
+ }}
414
+
415
+ const cell = rows[i].cells[col];
416
+ const text = (cell ? cell.textContent : "").toLowerCase();
417
+ if (text.indexOf(term) == -1) {{
418
+ showRow = false;
419
+ break;
420
+ }}
421
+ }}
422
+
423
+ rows[i].style.display = showRow ? "" : "none";
424
+ }}
425
+ }}
426
+ </script>
427
+ </body>
428
+ </html>
429
+ """
430
+
431
+
432
def write_html_report(
    path,
    passed,
    summary,
    src_path,
    tgt_path,
    validation_type,
    mismatches,
    src_rows_count,
    tgt_rows_count,
    linkedin_url=DEFAULT_LINKEDIN_URL,
    repo_url=DEFAULT_REPO_URL,
):
    """Render the validation report as HTML and save it to ``path``.

    Report metrics are computed with ``build_report_metrics`` from the two
    row counts and the mismatch list, the page is produced by
    ``generate_html_report``, and the resulting text is written to disk as
    UTF-8. The file is opened in ``"w"`` mode, so an existing file at
    ``path`` is overwritten.

    Args:
        path: Destination of the HTML file.
        passed: Overall validation outcome, forwarded to the renderer.
        summary: Summary text shown in the report.
        src_path: Source dataset location shown in the report.
        tgt_path: Target dataset location shown in the report.
        validation_type: Label of the validation that was performed.
        mismatches: Mismatch records used for both the metrics and the
            details table.
        src_rows_count: Row count of the source, used for the metrics.
        tgt_rows_count: Row count of the target, used for the metrics.
        linkedin_url: Author link placed in the report footer.
        repo_url: Repository link placed in the report footer.

    Returns:
        None. The only effect is the file written at ``path``.
    """
    # Keyword arguments are evaluated before the call, so the metrics are
    # still built first, exactly as when they were bound to a local name.
    rendered_page = generate_html_report(
        passed=passed,
        summary=summary,
        src_path=src_path,
        tgt_path=tgt_path,
        validation_type=validation_type,
        mismatches=mismatches,
        metrics=build_report_metrics(src_rows_count, tgt_rows_count, mismatches),
        linkedin_url=linkedin_url,
        repo_url=repo_url,
    )

    with open(path, "w", encoding="utf-8") as out_handle:
        out_handle.write(rendered_page)
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "data-validation-gini"
7
+ version = "0.1.1"
8
+ description = "Data Validation Gini (DVG) CLI for row count and row/column comparison with HTML reports"
9
+ readme = "data_corruptor_readme.txt"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "ShanKonduru" }
14
+ ]
15
+ dependencies = [
16
+ "openpyxl"
17
+ ]
18
+
19
+ [project.scripts]
20
+ dvg = "dvg:main"
21
+
22
+ [tool.setuptools]
23
+ py-modules = ["dvg", "dvg_report", "data_corruptor"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+