csv-stream-diff 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- config.example.yaml +59 -0
- csv_stream_diff-0.1.0.dist-info/LICENSE +21 -0
- csv_stream_diff-0.1.0.dist-info/METADATA +161 -0
- csv_stream_diff-0.1.0.dist-info/RECORD +12 -0
- csv_stream_diff-0.1.0.dist-info/WHEEL +4 -0
- csv_stream_diff-0.1.0.dist-info/entry_points.txt +3 -0
- csvstreamdiff/__init__.py +5 -0
- csvstreamdiff/cli.py +52 -0
- csvstreamdiff/comparer.py +478 -0
- csvstreamdiff/hashing.py +62 -0
- csvstreamdiff/multiprocessing.py +639 -0
- csvstreamdiff/streaming.py +114 -0
config.example.yaml
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
files:
|
|
2
|
+
left: ./data/left.csv
|
|
3
|
+
right: ./data/right.csv
|
|
4
|
+
|
|
5
|
+
csv:
|
|
6
|
+
left:
|
|
7
|
+
encoding: utf-8-sig
|
|
8
|
+
delimiter: ","
|
|
9
|
+
quotechar: '"'
|
|
10
|
+
escapechar:
|
|
11
|
+
newline: ""
|
|
12
|
+
right:
|
|
13
|
+
encoding: utf-8-sig
|
|
14
|
+
delimiter: ","
|
|
15
|
+
quotechar: '"'
|
|
16
|
+
escapechar:
|
|
17
|
+
newline: ""
|
|
18
|
+
|
|
19
|
+
keys:
|
|
20
|
+
left:
|
|
21
|
+
- customer_id
|
|
22
|
+
- transaction_date
|
|
23
|
+
right:
|
|
24
|
+
- cust_id
|
|
25
|
+
- txn_dt
|
|
26
|
+
|
|
27
|
+
compare:
|
|
28
|
+
left:
|
|
29
|
+
- amount
|
|
30
|
+
- status
|
|
31
|
+
- description
|
|
32
|
+
right:
|
|
33
|
+
- transaction_amount
|
|
34
|
+
- txn_status
|
|
35
|
+
- desc
|
|
36
|
+
|
|
37
|
+
comparison:
|
|
38
|
+
case_insensitive: true
|
|
39
|
+
trim_whitespace: true
|
|
40
|
+
treat_null_as_equal: false
|
|
41
|
+
|
|
42
|
+
sampling:
|
|
43
|
+
size: 0
|
|
44
|
+
seed: 12345
|
|
45
|
+
|
|
46
|
+
performance:
|
|
47
|
+
chunk_size: 100000
|
|
48
|
+
workers:
|
|
49
|
+
bucket_count:
|
|
50
|
+
report_every_rows: 50000
|
|
51
|
+
temp_directory:
|
|
52
|
+
keep_temp_files: false
|
|
53
|
+
show_progress: true
|
|
54
|
+
|
|
55
|
+
output:
|
|
56
|
+
directory: ./output
|
|
57
|
+
prefix: comparison_
|
|
58
|
+
include_full_rows: true
|
|
59
|
+
summary_format: both
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jordi Corbilla
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: csv-stream-diff
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Stream and compare very large CSV files with multiprocessing.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: csv,diff,streaming,multiprocessing,comparison
|
|
7
|
+
Author: Jordi
|
|
8
|
+
Requires-Python: >=3.10,<4.0
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: File Formats
|
|
18
|
+
Classifier: Topic :: Software Development :: Testing
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Dist: PyYAML (>=6.0)
|
|
21
|
+
Requires-Dist: rich (>=13.7)
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# csv-stream-diff
|
|
25
|
+
|
|
26
|
+
`csv-stream-diff` compares very large CSV files with streaming I/O, hashed bucket partitioning, and multiprocessing. It is designed for datasets that are too large to load fully into memory.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- Compare CSVs by configurable key columns, even when left and right headers differ
|
|
31
|
+
- Stream files in chunks with configurable `chunk_size`
|
|
32
|
+
- Partition by stable hashed key to keep worker memory bounded
|
|
33
|
+
- Use all CPUs by default, or set a worker count explicitly
|
|
34
|
+
- Write machine-usable output artifacts for left-only, right-only, cell differences, duplicate keys, and run summary
|
|
35
|
+
- Support exact random sampling for validation runs with `sampling.size > 0`
|
|
36
|
+
- Warn on duplicate keys and continue using the first occurrence per key
|
|
37
|
+
- Include a fixture generator and both `pytest` and `behave` tests
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install csv-stream-diff
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
For local development:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
poetry install
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## CLI
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
csv-stream-diff --config config.yaml
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Optional overrides:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
csv-stream-diff \
|
|
61
|
+
--config config.yaml \
|
|
62
|
+
--left-file ./left.csv \
|
|
63
|
+
--right-file ./right.csv \
|
|
64
|
+
--chunk-size 100000 \
|
|
65
|
+
--sample-size 100000 \
|
|
66
|
+
--sample-seed 20260321 \
|
|
67
|
+
--workers 8 \
|
|
68
|
+
--output-dir ./output \
|
|
69
|
+
--output-prefix run_
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
The YAML config is the default source of truth. CLI flags override it for a single run.
|
|
73
|
+
|
|
74
|
+
## Configuration
|
|
75
|
+
|
|
76
|
+
See [config.example.yaml](./config.example.yaml) for a full example.
|
|
77
|
+
|
|
78
|
+
Main sections:
|
|
79
|
+
|
|
80
|
+
- `files.left`, `files.right`: input CSV paths
|
|
81
|
+
- `csv.left`, `csv.right`: dialect and encoding settings
|
|
82
|
+
- `keys.left`, `keys.right`: key columns used to match rows
|
|
83
|
+
- `compare.left`, `compare.right`: value columns to compare
|
|
84
|
+
- `comparison`: normalization options
|
|
85
|
+
- `sampling`: `size: 0` means full comparison; any positive value means exact random sample by left-side unique key with a fixed seed
|
|
86
|
+
- `performance`: chunking, worker count, bucket count, temp directory, progress reporting
|
|
87
|
+
- `output`: output directory, filename prefix, whether to include serialized full rows, and whether to write a text summary
|
|
88
|
+
|
|
89
|
+
## Output Files
|
|
90
|
+
|
|
91
|
+
The tool writes these artifacts to `output.directory`:
|
|
92
|
+
|
|
93
|
+
- `<prefix>only_in_left.csv`
|
|
94
|
+
- `<prefix>only_in_right.csv`
|
|
95
|
+
- `<prefix>differences.csv`
|
|
96
|
+
- `<prefix>duplicate_keys.csv`
|
|
97
|
+
- `<prefix>summary.json`
|
|
98
|
+
- `<prefix>summary.txt` when `output.summary_format` is `text` or `both`
|
|
99
|
+
|
|
100
|
+
`differences.csv` contains one row per differing cell with both the left and right column names and values.
|
|
101
|
+
|
|
102
|
+
## Sampling
|
|
103
|
+
|
|
104
|
+
- `sampling.size: 0` runs the full comparison.
|
|
105
|
+
- `sampling.size > 0` selects an exact random sample of left-side unique keys using reservoir sampling.
|
|
106
|
+
- Sampling is reproducible when `sampling.seed` stays the same.
|
|
107
|
+
- Duplicate keys do not expand the sampling population because only the first occurrence per key is considered.
|
|
108
|
+
|
|
109
|
+
## Duplicate Keys
|
|
110
|
+
|
|
111
|
+
Duplicate keys do not stop the run. They are written to `duplicate_keys.csv`, counted in the summary, and the main comparison uses the first occurrence of each key on each side.
|
|
112
|
+
|
|
113
|
+
## Generator
|
|
114
|
+
|
|
115
|
+
The generator creates two baseline-identical CSVs, applies controlled mutations, writes a matching config, and saves an expected manifest:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
python generator/generate_fixtures.py --output-dir ./generated --rows 10000 --seed 42
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Generated artifacts:
|
|
122
|
+
|
|
123
|
+
- `left.csv`
|
|
124
|
+
- `right.csv`
|
|
125
|
+
- `config.generated.yaml`
|
|
126
|
+
- `expected.json`
|
|
127
|
+
|
|
128
|
+
## Tests
|
|
129
|
+
|
|
130
|
+
Run unit tests:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
poetry run pytest
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Run BDD acceptance tests:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
poetry run behave tests/features
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Run a package build:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
poetry build
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## PyPI Packaging
|
|
149
|
+
|
|
150
|
+
Build source and wheel distributions:
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
poetry build
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Upload after verifying artifacts:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
poetry publish
|
|
160
|
+
```
|
|
161
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
config.example.yaml,sha256=8ENWrYFHA25qT0TdGNSiSgvzoQjMgV9Q-RRPcVaBRm0,866
|
|
2
|
+
csvstreamdiff/__init__.py,sha256=Umar2SMg-gpnxi4lw26YR_RjNSp9-1ciWrh-TcoEQlo,81
|
|
3
|
+
csvstreamdiff/cli.py,sha256=0gvfltCfZnHYLLRyaHgd1sRe77kuWO5syheZvdpGsCc,1914
|
|
4
|
+
csvstreamdiff/comparer.py,sha256=LO3sEAWlhrpLhxBE2h2_bLOOuzcIQ0rcSxAsfz0KV8c,20122
|
|
5
|
+
csvstreamdiff/hashing.py,sha256=rt4nfmtDxBjmiVNz9rOgXdH007-qLhEY00p6sqr1mY8,1858
|
|
6
|
+
csvstreamdiff/multiprocessing.py,sha256=aHwYpVu8DJrpVnUh0YtD-XZsB9q8GApE2LvZvxXhra4,24671
|
|
7
|
+
csvstreamdiff/streaming.py,sha256=HZVSEZW5Pa8Ye0SS77IGuq19e_aZ7tcQcccNWlTIAy0,3698
|
|
8
|
+
csv_stream_diff-0.1.0.dist-info/entry_points.txt,sha256=31ZuS-GCnz2sFvaAq2y7_uUfjl-0nNIPgmwFtsd2TSc,58
|
|
9
|
+
csv_stream_diff-0.1.0.dist-info/LICENSE,sha256=ywpYokjmlAtrA7JZDmTrosLLGHOkUsPAiC5XTK0VGEU,1092
|
|
10
|
+
csv_stream_diff-0.1.0.dist-info/METADATA,sha256=oUe4aGT2alpmp13NplztNwtrvfU1bTfNW6lit2Q9CDA,4553
|
|
11
|
+
csv_stream_diff-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
12
|
+
csv_stream_diff-0.1.0.dist-info/RECORD,,
|
csvstreamdiff/cli.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from .comparer import compare_from_path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the csv-stream-diff command line.

    The ``--config`` flag is mandatory; every other flag mirrors a key in the
    YAML configuration and, when supplied, overrides that key for one run.
    """
    parser = argparse.ArgumentParser(description="Compare very large CSV files using streaming and multiprocessing.")
    parser.add_argument("--config", "-c", required=True, help="Path to the YAML configuration file.")

    # (option strings, add_argument kwargs) for each optional config override.
    override_specs = [
        (("--left-file",), {"help": "Override files.left from the config."}),
        (("--right-file",), {"help": "Override files.right from the config."}),
        (("--chunk-size",), {"type": int, "help": "Override performance.chunk_size."}),
        # Two spellings for the sample size; both land in args.sample_size.
        (("--size", "--sample-size"), {"dest": "sample_size", "type": int, "help": "Override sampling.size."}),
        (("--sample-seed",), {"type": int, "help": "Override sampling.seed."}),
        (("--workers",), {"type": int, "help": "Override performance.workers."}),
        (("--output-dir",), {"help": "Override output.directory."}),
        (("--output-prefix",), {"help": "Override output.prefix."}),
    ]
    for option_strings, kwargs in override_specs:
        parser.add_argument(*option_strings, **kwargs)

    return parser
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the CLI: parse *argv*, execute the comparison, print the summary.

    Parameters
    ----------
    argv:
        Argument list to parse; ``None`` means use ``sys.argv[1:]``.

    Returns
    -------
    int
        0 on success (summary printed as JSON to stdout), 1 when the
        comparison raises any exception (message printed to stderr).
    """
    args = build_parser().parse_args(argv)

    # Collect the CLI override values keyed as compare_from_path expects;
    # flags the user did not pass stay None and leave the YAML values alone.
    override_names = (
        "left_file",
        "right_file",
        "chunk_size",
        "sample_size",
        "sample_seed",
        "workers",
        "output_dir",
        "output_prefix",
    )
    overrides = {name: getattr(args, name) for name in override_names}

    try:
        summary = compare_from_path(args.config, overrides)
    except Exception as exc:  # top-level boundary: report the failure, exit non-zero
        print(f"csv-stream-diff failed: {exc}", file=sys.stderr)
        return 1

    print(json.dumps(summary, indent=2, ensure_ascii=False))
    return 0
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
if __name__ == "__main__":
    # Support multiprocessing in frozen (e.g. PyInstaller on Windows)
    # executables: child processes re-import this module as __main__, and
    # freeze_support() stops them from re-running the CLI. No-op elsewhere.
    mp.freeze_support()
    # Propagate main()'s integer return code as the process exit status.
    raise SystemExit(main())
|