csv-stream-diff 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
config.example.yaml ADDED
@@ -0,0 +1,59 @@
1
+ files:
2
+ left: ./data/left.csv
3
+ right: ./data/right.csv
4
+
5
+ csv:
6
+ left:
7
+ encoding: utf-8-sig
8
+ delimiter: ","
9
+ quotechar: '"'
10
+ escapechar:
11
+ newline: ""
12
+ right:
13
+ encoding: utf-8-sig
14
+ delimiter: ","
15
+ quotechar: '"'
16
+ escapechar:
17
+ newline: ""
18
+
19
+ keys:
20
+ left:
21
+ - customer_id
22
+ - transaction_date
23
+ right:
24
+ - cust_id
25
+ - txn_dt
26
+
27
+ compare:
28
+ left:
29
+ - amount
30
+ - status
31
+ - description
32
+ right:
33
+ - transaction_amount
34
+ - txn_status
35
+ - desc
36
+
37
+ comparison:
38
+ case_insensitive: true
39
+ trim_whitespace: true
40
+ treat_null_as_equal: false
41
+
42
+ sampling:
43
+ size: 0
44
+ seed: 12345
45
+
46
+ performance:
47
+ chunk_size: 100000
48
+ workers:
49
+ bucket_count:
50
+ report_every_rows: 50000
51
+ temp_directory:
52
+ keep_temp_files: false
53
+ show_progress: true
54
+
55
+ output:
56
+ directory: ./output
57
+ prefix: comparison_
58
+ include_full_rows: true
59
+ summary_format: both
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jordi Corbilla
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,161 @@
1
+ Metadata-Version: 2.1
2
+ Name: csv-stream-diff
3
+ Version: 0.1.0
4
+ Summary: Stream and compare very large CSV files with multiprocessing.
5
+ License: MIT
6
+ Keywords: csv,diff,streaming,multiprocessing,comparison
7
+ Author: Jordi
8
+ Requires-Python: >=3.10,<4.0
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: File Formats
18
+ Classifier: Topic :: Software Development :: Testing
19
+ Classifier: Topic :: Utilities
20
+ Requires-Dist: PyYAML (>=6.0)
21
+ Requires-Dist: rich (>=13.7)
22
+ Description-Content-Type: text/markdown
23
+
24
+ # csv-stream-diff
25
+
26
+ `csv-stream-diff` compares very large CSV files with streaming I/O, hashed bucket partitioning, and multiprocessing. It is designed for datasets that are too large to load fully into memory.
27
+
28
+ ## Features
29
+
30
+ - Compare CSVs by configurable key columns, even when left and right headers differ
31
+ - Stream files in chunks with configurable `chunk_size`
32
+ - Partition by stable hashed key to keep worker memory bounded
33
+ - Use all CPUs by default, or set a worker count explicitly
34
+ - Write machine-usable output artifacts for left-only, right-only, cell differences, duplicate keys, and run summary
35
+ - Support exact random sampling for validation runs with `sampling.size > 0`
36
+ - Warn on duplicate keys and continue using the first occurrence per key
37
+ - Include a fixture generator and both `pytest` and `behave` tests
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install csv-stream-diff
43
+ ```
44
+
45
+ For local development:
46
+
47
+ ```bash
48
+ poetry install
49
+ ```
50
+
51
+ ## CLI
52
+
53
+ ```bash
54
+ csv-stream-diff --config config.yaml
55
+ ```
56
+
57
+ Optional overrides:
58
+
59
+ ```bash
60
+ csv-stream-diff \
61
+ --config config.yaml \
62
+ --left-file ./left.csv \
63
+ --right-file ./right.csv \
64
+ --chunk-size 100000 \
65
+ --sample-size 100000 \
66
+ --sample-seed 20260321 \
67
+ --workers 8 \
68
+ --output-dir ./output \
69
+ --output-prefix run_
70
+ ```
71
+
72
+ The YAML config is the default source of truth. CLI flags override it for a single run.
73
+
74
+ ## Configuration
75
+
76
+ See [config.example.yaml](./config.example.yaml) for a full example.
77
+
78
+ Main sections:
79
+
80
+ - `files.left`, `files.right`: input CSV paths
81
+ - `csv.left`, `csv.right`: dialect and encoding settings
82
+ - `keys.left`, `keys.right`: key columns used to match rows
83
+ - `compare.left`, `compare.right`: value columns to compare
84
+ - `comparison`: normalization options
85
+ - `sampling`: `size: 0` means full comparison; any positive value means exact random sample by left-side unique key with a fixed seed
86
+ - `performance`: chunking, worker count, bucket count, temp directory, progress reporting
87
+ - `output`: output directory, filename prefix, whether to include serialized full rows, and whether to write a text summary
88
+
89
+ ## Output Files
90
+
91
+ The tool writes these artifacts to `output.directory`:
92
+
93
+ - `<prefix>only_in_left.csv`
94
+ - `<prefix>only_in_right.csv`
95
+ - `<prefix>differences.csv`
96
+ - `<prefix>duplicate_keys.csv`
97
+ - `<prefix>summary.json`
98
+ - `<prefix>summary.txt` when `output.summary_format` is `text` or `both`
99
+
100
+ `differences.csv` contains one row per differing cell with both the left and right column names and values.
101
+
102
+ ## Sampling
103
+
104
+ - `sampling.size: 0` runs the full comparison.
105
+ - `sampling.size > 0` selects an exact random sample of left-side unique keys using reservoir sampling.
106
+ - Sampling is reproducible when `sampling.seed` stays the same.
107
+ - Duplicate keys do not expand the sampling population because only the first occurrence per key is considered.
108
+
109
+ ## Duplicate Keys
110
+
111
+ Duplicate keys do not stop the run. They are written to `duplicate_keys.csv`, counted in the summary, and the main comparison uses the first occurrence of each key on each side.
112
+
113
+ ## Generator
114
+
115
+ The generator creates two baseline-identical CSVs, applies controlled mutations, writes a matching config, and saves an expected manifest:
116
+
117
+ ```bash
118
+ python generator/generate_fixtures.py --output-dir ./generated --rows 10000 --seed 42
119
+ ```
120
+
121
+ Generated artifacts:
122
+
123
+ - `left.csv`
124
+ - `right.csv`
125
+ - `config.generated.yaml`
126
+ - `expected.json`
127
+
128
+ ## Tests
129
+
130
+ Run unit tests:
131
+
132
+ ```bash
133
+ poetry run pytest
134
+ ```
135
+
136
+ Run BDD acceptance tests:
137
+
138
+ ```bash
139
+ poetry run behave tests/features
140
+ ```
141
+
142
+ Run a package build:
143
+
144
+ ```bash
145
+ poetry build
146
+ ```
147
+
148
+ ## PyPI Packaging
149
+
150
+ Build source and wheel distributions:
151
+
152
+ ```bash
153
+ poetry build
154
+ ```
155
+
156
+ Upload after verifying artifacts:
157
+
158
+ ```bash
159
+ poetry publish
160
+ ```
161
+
@@ -0,0 +1,12 @@
1
+ config.example.yaml,sha256=8ENWrYFHA25qT0TdGNSiSgvzoQjMgV9Q-RRPcVaBRm0,866
2
+ csvstreamdiff/__init__.py,sha256=Umar2SMg-gpnxi4lw26YR_RjNSp9-1ciWrh-TcoEQlo,81
3
+ csvstreamdiff/cli.py,sha256=0gvfltCfZnHYLLRyaHgd1sRe77kuWO5syheZvdpGsCc,1914
4
+ csvstreamdiff/comparer.py,sha256=LO3sEAWlhrpLhxBE2h2_bLOOuzcIQ0rcSxAsfz0KV8c,20122
5
+ csvstreamdiff/hashing.py,sha256=rt4nfmtDxBjmiVNz9rOgXdH007-qLhEY00p6sqr1mY8,1858
6
+ csvstreamdiff/multiprocessing.py,sha256=aHwYpVu8DJrpVnUh0YtD-XZsB9q8GApE2LvZvxXhra4,24671
7
+ csvstreamdiff/streaming.py,sha256=HZVSEZW5Pa8Ye0SS77IGuq19e_aZ7tcQcccNWlTIAy0,3698
8
+ csv_stream_diff-0.1.0.dist-info/entry_points.txt,sha256=31ZuS-GCnz2sFvaAq2y7_uUfjl-0nNIPgmwFtsd2TSc,58
9
+ csv_stream_diff-0.1.0.dist-info/LICENSE,sha256=ywpYokjmlAtrA7JZDmTrosLLGHOkUsPAiC5XTK0VGEU,1092
10
+ csv_stream_diff-0.1.0.dist-info/METADATA,sha256=oUe4aGT2alpmp13NplztNwtrvfU1bTfNW6lit2Q9CDA,4553
11
+ csv_stream_diff-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
12
+ csv_stream_diff-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.9.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ csv-stream-diff=csvstreamdiff.cli:main
3
+
@@ -0,0 +1,5 @@
1
+ """csv-stream-diff package."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.1.0"
csvstreamdiff/cli.py ADDED
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import multiprocessing as mp
6
+ import sys
7
+
8
+ from .comparer import compare_from_path
9
+
10
+
11
+ def build_parser() -> argparse.ArgumentParser:
12
+ parser = argparse.ArgumentParser(description="Compare very large CSV files using streaming and multiprocessing.")
13
+ parser.add_argument("--config", "-c", required=True, help="Path to the YAML configuration file.")
14
+ parser.add_argument("--left-file", help="Override files.left from the config.")
15
+ parser.add_argument("--right-file", help="Override files.right from the config.")
16
+ parser.add_argument("--chunk-size", type=int, help="Override performance.chunk_size.")
17
+ parser.add_argument("--size", "--sample-size", dest="sample_size", type=int, help="Override sampling.size.")
18
+ parser.add_argument("--sample-seed", type=int, help="Override sampling.seed.")
19
+ parser.add_argument("--workers", type=int, help="Override performance.workers.")
20
+ parser.add_argument("--output-dir", help="Override output.directory.")
21
+ parser.add_argument("--output-prefix", help="Override output.prefix.")
22
+ return parser
23
+
24
+
25
+ def main(argv: list[str] | None = None) -> int:
26
+ parser = build_parser()
27
+ args = parser.parse_args(argv)
28
+
29
+ overrides = {
30
+ "left_file": args.left_file,
31
+ "right_file": args.right_file,
32
+ "chunk_size": args.chunk_size,
33
+ "sample_size": args.sample_size,
34
+ "sample_seed": args.sample_seed,
35
+ "workers": args.workers,
36
+ "output_dir": args.output_dir,
37
+ "output_prefix": args.output_prefix,
38
+ }
39
+
40
+ try:
41
+ summary = compare_from_path(args.config, overrides)
42
+ except Exception as exc:
43
+ print(f"csv-stream-diff failed: {exc}", file=sys.stderr)
44
+ return 1
45
+
46
+ print(json.dumps(summary, indent=2, ensure_ascii=False))
47
+ return 0
48
+
49
+
50
+ if __name__ == "__main__":
51
+ mp.freeze_support()
52
+ raise SystemExit(main())