csv-diff-tool 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_diff_tool-0.1.0/LICENSE +21 -0
- csv_diff_tool-0.1.0/PKG-INFO +173 -0
- csv_diff_tool-0.1.0/README.md +153 -0
- csv_diff_tool-0.1.0/pyproject.toml +34 -0
- csv_diff_tool-0.1.0/setup.cfg +4 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool/__init__.py +11 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool/comparer.py +213 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool/csv_compare_output.py +80 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool/csv_parser.py +430 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/PKG-INFO +173 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/SOURCES.txt +12 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/dependency_links.txt +1 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/requires.txt +4 -0
- csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/top_level.txt +1 -0

csv_diff_tool-0.1.0/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Ashish Narmen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

csv_diff_tool-0.1.0/PKG-INFO
@@ -0,0 +1,173 @@
Metadata-Version: 2.4
Name: csv-diff-tool
Version: 0.1.0
Summary: A package for comparing CSV files
Author-email: Ashish N <ashish012@e.ntu.edu.sg>
License-Expression: MIT
Project-URL: Homepage, https://github.com/ashishnarmen/csv-diff-tool
Project-URL: Issues, https://github.com/ashishnarmen/csv-diff-tool/issues
Classifier: Development Status :: 3 - Alpha
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: OS Independent
Classifier: Topic :: Utilities
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: chardet
Provides-Extra: test
Requires-Dist: coverage; extra == "test"
Dynamic: license-file

# csv-diff-tool

[](https://opensource.org/licenses/MIT)
[](https://www.python.org/downloads/)

A Python library for comparing CSV files with detailed output including extra columns, extra rows, and mismatched cell values. Supports automatic encoding detection, data transformations, and flexible input methods.

## Installation

```bash
pip install csv-diff-tool
```

## Quick Start

### Compare two CSV files

```python
from csv_diff_tool import CSVComparer

result = CSVComparer.from_files("report_v1.csv", "report_v2.csv").compare("id")

print(result.match_result) # True if files are identical
print(result.extra_cols_in_first_file) # Columns only in first file
print(result.extra_rows_in_second_file) # Rows only in second file
print(result.mismatched_rows) # Cell-level differences
```

### Compare from in-memory data

```python
from csv_diff_tool import CSVComparer

file_1 = ["id,name,score", "1,Alice,95", "2,Bob,87"]
file_2 = ["id,name,score", "1,Alice,95", "2,Bob,90", "3,Carol,88"]

result = CSVComparer.from_lines(file_1, file_2).compare("id")

print(result)
# First file: init from lines
# Second file: init from lines
# Match result: False
# Extra rows in second file
# 3
# Mismatched rows
# row: 2, column: score, first: 87, second: 90
```

### Transform data before comparing

```python
from csv_diff_tool import CSVComparer

comparer = CSVComparer.from_files("old.csv", "new.csv")

# Strip whitespace from all cells
comparer.strip_whitespace()

# Drop columns you don't care about
comparer.drop_columns(["timestamp", "metadata"])

# Drop specific rows
comparer.drop_rows("status", ["deleted", "archived"])

# Apply custom transforms
comparer.apply_transform("price", lambda row: str(round(float(row["price"]), 2)))

result = comparer.compare("id")
```

### Work with CSVParser directly

```python
from csv_diff_tool import CSVParser

parser = CSVParser.from_file("data.csv")

# Inspect the data
print(parser.column_names)
print(parser.get_row("id", "42"))
print(parser.row_values_in_column("name"))

# Modify and write back
parser.set_value("42", "status", "updated")
parser.write_to_file()
```

## API Overview

### CSVComparer

The main comparison class. Create instances using factory methods:

- `CSVComparer.from_files(first_file, second_file)` - Load from file paths
- `CSVComparer.from_lines(first_lines, second_lines)` - Load from lists of strings
- `CSVComparer.from_csv_parsers(parser1, parser2)` - Load from CSVParser objects

**Transform methods** (applied to both files):
- `strip_whitespace()` - Remove leading/trailing whitespace
- `drop_columns(column_names)` - Remove specified columns
- `drop_rows(column, values)` - Remove rows matching values
- `drop_rows_by(predicate)` - Remove rows matching a function
- `apply_transform(column, func)` - Apply a function to a column

**Compare:**
- `compare(index_column)` - Returns a `CSVCompareOutput` with the results
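
Neither `from_csv_parsers` nor `drop_rows_by` appears in the Quick Start, so here is a minimal sketch of both; the file contents, column names, and predicate are illustrative only:

```python
from csv_diff_tool import CSVComparer, CSVParser

# Build parsers first so each side could, in principle, be prepared separately.
old = CSVParser.from_lines(["id,score,status", "1,90,active", "2,55,archived"])
new = CSVParser.from_lines(["id,score,status", "1,90,active", "2,60,archived"])

comparer = CSVComparer.from_csv_parsers(old, new)

# drop_rows_by removes every row (in both files) for which the predicate is True,
# here the archived rows, before the comparison runs.
comparer.drop_rows_by(lambda row: row["status"] == "archived")

result = comparer.compare("id")
print(result.match_result)  # True: the only difference was in an archived row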

### CSVParser

Parses CSV data into a list of dictionaries with column-aware operations:

- `CSVParser.from_file(path)` - Load from file (auto-detects encoding)
- `CSVParser.from_lines(lines)` - Load from list of strings
- `CSVParser.from_csv_text(text)` - Load from a single string
- `get_row(column, value)` / `get_rows(column, value)` - Query rows
- `get_value(row_id, column)` / `set_value(row_id, column, value)` - Cell access
- `write_to_file()` - Write modifications back to the source file
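
A short sketch of the parser-level API; the inline CSV text below is made up, any header row works the same way:

```python
from csv_diff_tool import CSVParser

text = "id,name,team\n1,Alice,red\n2,Bob,blue\n3,Carol,red\n"
parser = CSVParser.from_csv_text(text)

print(parser.column_names)                  # ['id', 'name', 'team']
print(parser.row_values_in_column("name"))  # ['Alice', 'Bob', 'Carol']

# get_rows returns every matching row as a dict; get_row returns only the first.
for row in parser.get_rows("team", "red"):
    print(row["id"], row["name"])
```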

### CSVCompareOutput

A dataclass containing comparison results:

| Field | Type | Description |
|-------|------|-------------|
| `match_result` | `bool` | `True` if files are identical |
| `extra_cols_in_first_file` | `List[str]` | Columns only in the first file |
| `extra_cols_in_second_file` | `List[str]` | Columns only in the second file |
| `extra_rows_in_first_file` | `List[str]` | Rows only in the first file |
| `extra_rows_in_second_file` | `List[str]` | Rows only in the second file |
| `mismatched_rows` | `List[Dict]` | Cell-level mismatches with row, column, first, second |
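
As a sketch of how these fields might be consumed, reusing the in-memory data from the Quick Start above:

```python
from csv_diff_tool import CSVComparer

file_1 = ["id,name,score", "1,Alice,95", "2,Bob,87"]
file_2 = ["id,name,score", "1,Alice,95", "2,Bob,90", "3,Carol,88"]

result = CSVComparer.from_lines(file_1, file_2).compare("id")

if not result.match_result:
    # Each mismatch records the row id, the column, and both cell values.
    for diff in result.mismatched_rows:
        print(f"{diff['row']} / {diff['column']}: {diff['first']} -> {diff['second']}")
    print(result.extra_rows_in_second_file)  # ['3']
```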

## Contributing

Contributions are welcome! Please open an issue or submit a pull request on [GitHub](https://github.com/ashishnarmen/csv-diff-tool).

```bash
# Clone and install for development
git clone https://github.com/ashishnarmen/csv-diff-tool.git
cd csv-diff-tool
pip install -e ".[test]"

# Run tests with unittest and coverage
coverage run -m unittest discover tests/
coverage report -m

# Or, if you prefer pytest (install separately)
pip install pytest pytest-cov
pytest tests/ -v --cov=csv_diff_tool
```

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

csv_diff_tool-0.1.0/README.md
@@ -0,0 +1,153 @@
(Content identical to the project README embedded in the PKG-INFO above, from "# csv-diff-tool" through the License section.)

csv_diff_tool-0.1.0/pyproject.toml
@@ -0,0 +1,34 @@
[build-system]
requires = ["setuptools >= 77.0.3"]
build-backend = "setuptools.build_meta"

[project]
name = "csv-diff-tool"
version = "0.1.0"
authors = [
  { name="Ashish N", email="ashish012@e.ntu.edu.sg" },
]
description = "A package for comparing CSV files"
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
    "chardet",
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
    "Topic :: Utilities",
]
license = "MIT"
license-files = ["LICEN[CS]E*"]

[project.optional-dependencies]
test = ["coverage"]

[project.urls]
Homepage = "https://github.com/ashishnarmen/csv-diff-tool"
Issues = "https://github.com/ashishnarmen/csv-diff-tool/issues"

[tool.setuptools.packages.find]
where = ["src"]

csv_diff_tool-0.1.0/src/csv_diff_tool/__init__.py
@@ -0,0 +1,11 @@
"""csv_diff_tool - A library for comparing CSV files.

Provides tools for parsing, transforming, and comparing CSV files with
detailed output including extra columns, extra rows, and mismatched values.
"""

from .comparer import CSVComparer
from .csv_compare_output import CSVCompareOutput
from .csv_parser import CSVParser

__all__ = ["CSVComparer", "CSVCompareOutput", "CSVParser"]

csv_diff_tool-0.1.0/src/csv_diff_tool/comparer.py
@@ -0,0 +1,213 @@
"""CSV comparison module for comparing two parsed CSV files."""

from typing import Callable, Dict, List

from .csv_compare_output import CSVCompareOutput
from .csv_parser import CSVParser, NullCSVParser


class CSVComparer:
    """Compares two CSV files and reports differences.

    Supports loading from files, text lines, or pre-built CSVParser objects.
    Provides methods to transform data before comparison.
    """

    first_file: CSVParser
    second_file: CSVParser

    def __init__(self) -> None:
        self.first_file: CSVParser = NullCSVParser()
        self.second_file: CSVParser = NullCSVParser()

    @classmethod
    def from_files(cls, first_file: str, second_file: str) -> "CSVComparer":
        """Create an instance of CSVComparer from two CSV files.

        Args:
            first_file (str): File path of the first CSV file.
            second_file (str): File path of the second CSV file.

        Returns:
            CSVComparer: A CSVComparer object with the parsed file contents.

        Raises:
            FileNotFoundError: If either file_path does not exist.
            UnicodeDecodeError: If either file cannot be decoded with the detected encoding.
        """
        csv_comparer = cls()
        csv_comparer.first_file = CSVParser.from_file(first_file)
        csv_comparer.second_file = CSVParser.from_file(second_file)
        return csv_comparer

    @classmethod
    def from_lines(
        cls, first_file_lines: List[str], second_file_lines: List[str]
    ) -> "CSVComparer":
        """Create an instance of CSVComparer from two lists of CSV text lines.

        Args:
            first_file_lines (List[str]): List of text lines for the first CSV file.
            second_file_lines (List[str]): List of text lines for the second CSV file.

        Returns:
            CSVComparer: A CSVComparer object with the parsed file contents.
        """
        csv_comparer = cls()
        csv_comparer.first_file = CSVParser.from_lines(first_file_lines)
        csv_comparer.second_file = CSVParser.from_lines(second_file_lines)
        return csv_comparer

    @classmethod
    def from_csv_parsers(
        cls, first_file_parser: CSVParser, second_file_parser: CSVParser
    ) -> "CSVComparer":
        """Create an instance of CSVComparer from two CSVParser objects.

        Args:
            first_file_parser (CSVParser): CSVParser object for the first CSV file.
            second_file_parser (CSVParser): CSVParser object for the second CSV file.

        Returns:
            CSVComparer: A CSVComparer object with the parsed file contents.
        """
        csv_comparer = cls()
        csv_comparer.first_file = first_file_parser
        csv_comparer.second_file = second_file_parser
        return csv_comparer

    def strip_whitespace(self) -> None:
        """Strip whitespace from all keys and values in both CSV files.

        This method removes leading and trailing whitespace from column names
        and all cell values in both the first and second CSV files.
        """
        self.first_file.strip_whitespace()
        self.second_file.strip_whitespace()

    def drop_columns(self, column_names: List[str]) -> None:
        """Drop (remove) the specified columns from both CSV files.

        Args:
            column_names (List[str]): List of column names to drop (remove).
        """
        self.first_file.drop_columns(column_names)
        self.second_file.drop_columns(column_names)

    def drop_rows(self, index_column: str, row_values: List[str]) -> None:
        """Drop (remove) the rows with matching values in both CSV files.

        Args:
            index_column (str): Column name to search.
            row_values (List[str]): Values to match in the specified column.
        """
        self.first_file.drop_rows(index_column, row_values)
        self.second_file.drop_rows(index_column, row_values)

    def drop_rows_by(self, predicate: Callable) -> None:
        """Drop (remove) rows based on a predicate function in both CSV files.

        Args:
            predicate (Callable): Predicate function applied on each row to determine
                if it will be dropped.
        """
        self.first_file.drop_rows_by(predicate)
        self.second_file.drop_rows_by(predicate)

    def apply_transform(self, column_name: str, func: Callable) -> None:
        """Apply a transform on a column on each row of the CSV files.

        Args:
            column_name (str): Column name on which the transform is to be applied.
            func (Callable): Transformer function to execute on each row.
        """
        self.first_file.apply_transform(column_name, func)
        self.second_file.apply_transform(column_name, func)

    def compare(self, index_column: str) -> CSVCompareOutput:
        """Compare two CSV files and return detailed comparison results.

        Args:
            index_column (str): Column name to use as the unique row identifier.

        Returns:
            CSVCompareOutput: An object containing match result and detailed comparison
                information including extra/missing columns, extra/missing rows, and
                mismatched cell values.

        Raises:
            ValueError: If index_column does not exist in either CSV file.
        """
        # Validate that index_column exists in both files
        if index_column not in self.first_file.column_names:
            raise ValueError(
                f"Index column '{index_column}' not found in first file. Available columns: {self.first_file.column_names}"
            )
        if index_column not in self.second_file.column_names:
            raise ValueError(
                f"Index column '{index_column}' not found in second file. Available columns: {self.second_file.column_names}"
            )

        # Set the index column for both files
        self.first_file.index_column = index_column
        self.second_file.index_column = index_column

        # Convert to sets for efficient lookups (O(1) instead of O(n))
        first_cols = set(self.first_file.column_names)
        second_cols = set(self.second_file.column_names)
        first_rows = set(self.first_file.row_values_in_column(index_column))
        second_rows = set(self.second_file.row_values_in_column(index_column))

        # Find columns that exist in first file but not in second
        extra_cols_in_first_file = list(first_cols - second_cols)

        # Find columns that exist in second file but not in first
        extra_cols_in_second_file = list(second_cols - first_cols)

        # Find rows that exist in first file but not in second
        extra_rows_in_first_file = list(first_rows - second_rows)

        # Find rows that exist in second file but not in first
        extra_rows_in_second_file = list(second_rows - first_rows)

        # Find columns and rows that exist in both files
        common_columns = list(first_cols & second_cols)
        common_rows = list(first_rows & second_rows)

        # Compare values in common rows and columns to find mismatches
        mismatched_rows = []
        for row in common_rows:
            for column in common_columns:
                first_val = self.first_file.get_value(row, column)
                second_val = self.second_file.get_value(row, column)
                if first_val != second_val:
                    mismatched_rows.append(
                        {
                            "row": row,
                            "column": column,
                            "first": first_val,
                            "second": second_val,
                        }
                    )

        # Determine if files match: true only if there are no differences
        match_result = not any(
            [
                extra_cols_in_first_file,
                extra_cols_in_second_file,
                extra_rows_in_first_file,
                extra_rows_in_second_file,
                mismatched_rows,
            ]
        )

        return CSVCompareOutput(
            match_result=match_result,
            first_file=self.first_file.file_path,
            second_file=self.second_file.file_path,
            extra_cols_in_first_file=extra_cols_in_first_file,
            extra_cols_in_second_file=extra_cols_in_second_file,
            extra_rows_in_first_file=extra_rows_in_first_file,
            extra_rows_in_second_file=extra_rows_in_second_file,
            mismatched_rows=mismatched_rows,
        )
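
A minimal sketch, with made-up data, of how the index-column validation at the top of `compare()` surfaces to callers:

```python
from csv_diff_tool import CSVComparer

comparer = CSVComparer.from_lines(
    ["id,name", "1,Alice"],
    ["id,name", "1,Alice"],
)

try:
    comparer.compare("user_id")  # not a column in either file
except ValueError as exc:
    # "Index column 'user_id' not found in first file. Available columns: ['id', 'name']"
    print(exc)
```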

csv_diff_tool-0.1.0/src/csv_diff_tool/csv_compare_output.py
@@ -0,0 +1,80 @@
"""Output model for CSV comparison results."""

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class CSVCompareOutput:
    """Contains the results of comparing two CSV files.

    Attributes:
        match_result: True if the files are identical, False otherwise.
        first_file: Path or identifier of the first file.
        second_file: Path or identifier of the second file.
        extra_cols_in_first_file: Columns present in the first file but not the second.
        extra_cols_in_second_file: Columns present in the second file but not the first.
        extra_rows_in_first_file: Rows present in the first file but not the second.
        extra_rows_in_second_file: Rows present in the second file but not the first.
        mismatched_rows: List of dicts describing cell-level mismatches.
    """

    match_result: bool
    first_file: str
    second_file: str
    extra_cols_in_first_file: List[str]
    extra_cols_in_second_file: List[str]
    extra_rows_in_first_file: List[str]
    extra_rows_in_second_file: List[str]
    mismatched_rows: List[Dict[str, str]]

    def to_dict(self) -> Dict:
        """Convert the comparison result to a dictionary.

        Returns:
            Dict: Dictionary representation of the comparison result.
        """
        return {
            "first_file": self.first_file,
            "second_file": self.second_file,
            "match_result": self.match_result,
            "extra_cols_in_first_file": self.extra_cols_in_first_file,
            "extra_cols_in_second_file": self.extra_cols_in_second_file,
            "extra_rows_in_first_file": self.extra_rows_in_first_file,
            "extra_rows_in_second_file": self.extra_rows_in_second_file,
            "mismatched_rows": self.mismatched_rows,
        }

    def __str__(self) -> str:
        """Format the comparison result as a human-readable string.

        Returns:
            str: Multi-line summary of the comparison.
        """
        lines = []
        lines.append(f"First file: {self.first_file}")
        lines.append(f"Second file: {self.second_file}")
        lines.append(f"Match result: {self.match_result}")

        if self.extra_cols_in_first_file:
            lines.append("Extra columns in first file")
            lines.extend([f"\t{x}" for x in self.extra_cols_in_first_file])

        if self.extra_cols_in_second_file:
            lines.append("Extra columns in second file")
            lines.extend([f"\t{x}" for x in self.extra_cols_in_second_file])
        if self.extra_rows_in_first_file:
            lines.append("Extra rows in first file")
            lines.extend([f"\t{x}" for x in self.extra_rows_in_first_file])
        if self.extra_rows_in_second_file:
            lines.append("Extra rows in second file")
            lines.extend([f"\t{x}" for x in self.extra_rows_in_second_file])
        if self.mismatched_rows:
            lines.append("Mismatched rows")
            lines.extend(
                [
                    f"\trow: {x.get('row', '')}, column: {x.get('column', '')}, first: {x.get('first', '')}, second: {x.get('second', '')}"
                    for x in self.mismatched_rows
                ]
            )
        return "\n".join(lines)
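
A small sketch of how `to_dict()` can feed a JSON report; the data is illustrative, and the "init from lines" identifiers come from `CSVParser.from_lines`:

```python
import json

from csv_diff_tool import CSVComparer

result = CSVComparer.from_lines(
    ["id,qty", "1,5", "2,7"],
    ["id,qty", "1,5", "2,8"],
).compare("id")

# to_dict() returns plain built-in types, so the result serializes directly.
print(json.dumps(result.to_dict(), indent=2))
# {"first_file": "init from lines", "second_file": "init from lines", "match_result": false, ...}
```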

csv_diff_tool-0.1.0/src/csv_diff_tool/csv_parser.py
@@ -0,0 +1,430 @@
"""CSV parsing module for reading, transforming, and writing CSV data."""

import csv
import io
import logging
import os
from collections import defaultdict
from typing import Callable, Dict, List, Optional

import chardet


class CSVParser:
    """Parses CSV data from files, text, or lines into a list of dictionaries.

    Supports encoding detection, column/row manipulation, transforms,
    and writing modified data back to files.
    """

    def __init__(self) -> None:
        self.list_of_dicts: List[Dict[str, str]] = []
        self.column_names: List[str] = []
        self.file_text: str = ""
        self._index_column: str = ""
        self.file_path: str = ""
        self._encoding: str = "utf-8"

    @staticmethod
    def get_encoding(file_path: str) -> Optional[str]:
        """Detect the encoding of the file.

        Args:
            file_path (str): File path of the text file.

        Returns:
            Optional[str]: Detected encoding, or None if detection fails.
        """
        with open(file_path, "rb") as f:
            rawdata = b"".join(f.readlines())
            return chardet.detect(rawdata)["encoding"]

    @classmethod
    def from_file(cls, file_path: str, column_names: Optional[List[str]] = None) -> "CSVParser":
        """Create an instance of CSVParser from a file.

        Args:
            file_path (str): File Path (CSV file).
            column_names: Optional list of column names. If None or empty, uses the CSV header row.

        Returns:
            CSVParser: A CSVParser object with the parsed file content.

        Raises:
            FileNotFoundError: If the file_path does not exist.
            TypeError: If column_names contains non-string elements.
            UnicodeDecodeError: If the file cannot be decoded with the detected encoding.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            encoding = cls.get_encoding(file_path) or "utf-8"
        except (OSError, LookupError) as e:
            encoding = "utf-8"
            logging.warning(f"Warning: Encoding detection failed, using utf-8: {e}")

        file_text = ""
        try:
            with open(file_path, encoding=encoding) as file:
                file_text = file.read()
        except UnicodeDecodeError as e:
            raise UnicodeDecodeError(
                e.encoding,
                e.object,
                e.start,
                e.end,
                f"Failed to decode {file_path} with encoding {encoding}: {e.reason}",
            ) from e

        csv_parser = CSVParser.from_csv_text(file_text, column_names)
        csv_parser.file_path = file_path
        csv_parser._encoding = encoding
        return csv_parser

    @classmethod
    def from_lines(cls, csv_lines: List[str], column_names: Optional[List[str]] = None) -> "CSVParser":
        """Create an instance of CSVParser from a list of text lines.

        Args:
            csv_lines (List[str]): List of text lines.
            column_names: Optional list of column names. If None or empty, uses the CSV header row.

        Returns:
            CSVParser: A CSVParser object with the parsed file content.
        """
        file_text = "\n".join(csv_lines)
        csv_parser = CSVParser.from_csv_text(file_text, column_names)
        csv_parser.file_path = "init from lines"
        return csv_parser

    @classmethod
    def from_csv_text(cls, file_text: str, column_names: Optional[List[str]] = None) -> "CSVParser":
        """Create an instance of CSVParser from CSV text as a string.

        Args:
            file_text (str): CSV content as text.
            column_names: Optional list of column names. If None or empty, uses the CSV header row.

        Returns:
            CSVParser: A CSVParser object with the parsed file content.
        """
        if column_names is not None and column_names:
            if not all(isinstance(col, str) for col in column_names):
                raise TypeError("All column names must be strings")
        csv_parser = cls()
        csv_parser.file_path = "init from text"
        csv_parser.file_text = file_text
        csv_parser.column_names = (
            csv_parser._get_column_names_from_header_row(file_text)
            if not column_names
            else column_names
        )
        with io.StringIO(file_text) as file:
            dict_reader = csv.DictReader(file, column_names)
            csv_parser.list_of_dicts = list(dict_reader)
        return csv_parser

    def _get_column_names_from_header_row(self, file_text: str) -> List[str]:
        """Extract column names from the first row of CSV text.

        Args:
            file_text (str): CSV content as text.

        Returns:
            List[str]: List of column names, with duplicates made unique.
        """
        with io.StringIO(file_text) as file:
            reader = csv.reader(file)
            try:
                header_row = next(reader)
            except StopIteration:
                return []
            return self._unique_vals(header_row)

    def strip_whitespace(self) -> None:
        """Strip leading and trailing whitespace from all keys and values."""
        self.list_of_dicts = [
            {
                (key.strip() if isinstance(key, str) else key): (
                    value.strip() if isinstance(value, str) else value
                )
                for key, value in item.items()
            }
            for item in self.list_of_dicts
        ]
        self.column_names = [x.strip() for x in self.column_names if isinstance(x, str)]
        if isinstance(self._index_column, str):
            self._index_column = self._index_column.strip()

    def apply_transform(self, column_name: str, func: Callable) -> None:
        """Apply a transform on a column on each row of the CSV file.

        Args:
            column_name (str): Column name on which the transform is to be applied.
            func (Callable): Transformer function to execute on each row.
        """
        self.list_of_dicts = [
            {**item, column_name: func(item)} for item in self.list_of_dicts
        ]
        if column_name not in self.column_names:
            self.column_names.append(column_name)

    def get_row(self, column_name: str, row_value: str) -> Dict[str, str]:
        """Get the contents of the first matching row as a dictionary.

        Args:
            column_name (str): Column name to search.
            row_value (str): Value to match in the specified column.

        Returns:
            Dict[str, str]: Dictionary representation of the first matching row
                or an empty dictionary if a match is not found.
        """
        row: Dict[str, str] = {}
        rows = self.get_rows(column_name, row_value)
        if rows:
            row = rows[0]
        return row

    def get_rows(self, column_name: str, row_value: str) -> List[Dict[str, str]]:
        """Get the contents of matching rows as a list of dictionaries.

        Args:
            column_name (str): Column name to search.
            row_value (str): Value to match in the specified column.

        Returns:
            List[Dict[str, str]]: List of dictionaries of the matching rows
                or an empty list if a match is not found.
        """
        rows: List[Dict[str, str]] = []
        if column_name in self.column_names:
            matching_row = lambda row: row.get(column_name) == row_value
            rows = list(filter(matching_row, self.list_of_dicts))
        return rows

    @property
    def has_error(self) -> bool:
        """Check if there are any errors in the CSV file.

        Returns:
            bool: True if there are any errors (e.g., inconsistent column counts),
                otherwise False.
        """
        reader = csv.reader(io.StringIO(self.file_text))
        try:
            header = next(reader)
        except StopIteration:
            return False
        expected_cols = len(header)
        for _, row in enumerate(reader, start=2):
            if len(row) != expected_cols:
                return True
        return False

    def drop_columns(self, column_names: List[str]) -> None:
        """Drop (remove) the columns.

        Args:
            column_names (List[str]): List of column names to drop (remove).
        """
        list_of_dicts = []
        for dict_row in self.list_of_dicts:
            list_of_dicts.append(
                {
                    key: value
                    for key, value in dict_row.items()
                    if key not in column_names
                }
            )
        self.list_of_dicts = list_of_dicts
        self.column_names = [
            col_name for col_name in self.column_names if col_name not in column_names
        ]

    def drop_rows_by(self, predicate: Callable) -> None:
        """Drop (remove) rows based on a predicate function that is applied on the row.

        Args:
            predicate (Callable): Predicate function applied on each row to determine
                if it will be dropped.
        """
        self.list_of_dicts = [
            dict_row for dict_row in self.list_of_dicts if not predicate(dict_row)
        ]

    def drop_rows(self, column_name: str, row_values: List[str]) -> None:
        """Drop (remove) the rows with matching values in a specified column.

        Args:
            column_name (str): Column name to search.
            row_values (List[str]): Values to match in the specified column.
        """
        if column_name not in self.column_names:
            return
        row_val_check = lambda x: x[column_name] in row_values
        self.drop_rows_by(row_val_check)

    def row_values_in_column(self, column_name: str) -> List[str]:
        """Get a list of row values in the specified column.

        Args:
            column_name (str): Column name.

        Returns:
            List[str]: List of all the row values in that column.
        """
        row_values: List[str] = []
        if column_name in self.column_names:
            row_values = [x.get(column_name, "") for x in self.list_of_dicts]
        return row_values

    def _unique_vals(self, values: List[str]) -> List[str]:
        """Make duplicate values unique by appending numeric suffixes.

        Args:
            values (List[str]): List of values that may contain duplicates.

        Returns:
            List[str]: List with duplicates renamed (e.g., col, col.1, col.2).
        """
        result = list(values)
        unique_values = set()
        suffix_counts = defaultdict(int)
        for idx, val in enumerate(result):
            if val not in unique_values:
                unique_values.add(val)
            else:
                while True:
                    suffix_counts[val] += 1
                    new_val = f"{val}.{suffix_counts[val]}"
                    if new_val not in unique_values:
                        unique_values.add(new_val)
                        result[idx] = new_val
                        break
        return result

    @property
    def index_column(self) -> str:
        """Get the index column.

        Returns:
            str: Column name.
        """
        if not self.column_names:
            raise ValueError("No columns available to set or get the index column")
        return self._index_column or self.column_names[0]

    @index_column.setter
    def index_column(self, value: str) -> None:
        """Set the index column. If there are duplicate column names, append a numeric
        index to the column name (col, col.1, col.2).

        Args:
            value (str): Column name.
        """
        if not self.column_names:
            raise ValueError("No columns available to set the index column")
        if value not in self.column_names:
            raise ValueError(f"Column '{value}' not found in column names")
        self._index_column = value
        row_vals = self._unique_vals(self.row_values_in_column(self._index_column))
        for idx, row in enumerate(self.list_of_dicts):
            row[self.index_column] = row_vals[idx]

    def get_value(self, row_value_in_index_column: str, column_name: str) -> str:
        """Get value of the cell in the specified row and column.

        Args:
            row_value_in_index_column (str): Row identifier.
            column_name (str): Column name.

        Returns:
            str: Value contained in the cell.
        """
        row = self.get_row(self.index_column, row_value_in_index_column)
        return row.get(column_name, "")

    def set_value(self, row_value_in_index_column: str, column_name: str, new_value: str) -> None:
        """Set value of cell in the specified row and column.

        Args:
            row_value_in_index_column (str): Row identifier.
            column_name (str): Column name.
            new_value (str): New value to set for the cell.
        """
        row = self.get_row(self.index_column, row_value_in_index_column)
        if row and column_name in row:
            row[column_name] = new_value

    def write_to_file(self) -> None:
        """Write the CSV data back to the source file.

        Raises:
            ValueError: If the CSVParser was not initialized from a file.
        """
        if not os.path.isfile(self.file_path):
            raise ValueError(
                f"Cannot write: '{self.file_path}' is not a valid file path. "
                "write_to_file is only supported when initialized via from_file."
            )
        with open(self.file_path, "w", newline="", encoding=self._encoding) as f:
            writer = csv.DictWriter(f, fieldnames=self.column_names)
            writer.writeheader()
            normalized_rows = [
                {key: value for key, value in row.items() if key in self.column_names}
                for row in self.list_of_dicts
            ]
            writer.writerows(normalized_rows)


class NullCSVParser(CSVParser):
    """Null Object implementation of CSVParser.

    All mutation methods are no-ops. Used as a default placeholder
    when no CSV data has been loaded yet.
    """

    def __init__(self) -> None:
        self.list_of_dicts: List[Dict[str, str]] = []
        self.column_names: List[str] = []
        self.file_text: str = ""
        self._index_column: str = ""
        self.file_path: str = "null"
        self._encoding: str = "utf-8"

    def strip_whitespace(self) -> None:
        pass

    def apply_transform(self, column_name: str, func: Callable) -> None:
        pass

    def drop_columns(self, column_names: List[str]) -> None:
        pass

    def drop_rows_by(self, predicate: Callable) -> None:
        pass

    def drop_rows(self, column_name: str, row_values: List[str]) -> None:
        pass

    def row_values_in_column(self, column_name: str) -> List[str]:
        return []

    def get_value(self, row_value_in_index_column: str, column_name: str) -> str:
        return ""

    def set_value(self, row_value_in_index_column: str, column_name: str, new_value: str) -> None:
        pass

    @property
    def index_column(self) -> str:
        return self._index_column

    @index_column.setter
    def index_column(self, value: str) -> None:
        pass

    def write_to_file(self) -> None:
        pass
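
Two toy sketches of the de-duplication behaviour implemented by `_unique_vals` and the `index_column` setter above (the data is illustrative):

```python
from csv_diff_tool import CSVParser

# Duplicate header names are made unique (name, name.1) when the header row is read.
parser = CSVParser.from_lines(["id,name,name", "1,Alice,Ali"])
print(parser.column_names)  # ['id', 'name', 'name.1']

# The index_column setter likewise de-duplicates repeated row identifiers.
parser = CSVParser.from_lines(["id,name", "1,Alice", "1,Bob"])
parser.index_column = "id"
print(parser.row_values_in_column("id"))  # ['1', '1.1']
```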

csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/PKG-INFO
@@ -0,0 +1,173 @@
(Content identical to the top-level PKG-INFO shown above.)

csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/SOURCES.txt
@@ -0,0 +1,12 @@
LICENSE
README.md
pyproject.toml
src/csv_diff_tool/__init__.py
src/csv_diff_tool/comparer.py
src/csv_diff_tool/csv_compare_output.py
src/csv_diff_tool/csv_parser.py
src/csv_diff_tool.egg-info/PKG-INFO
src/csv_diff_tool.egg-info/SOURCES.txt
src/csv_diff_tool.egg-info/dependency_links.txt
src/csv_diff_tool.egg-info/requires.txt
src/csv_diff_tool.egg-info/top_level.txt

csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/dependency_links.txt
@@ -0,0 +1 @@
(single blank line)

csv_diff_tool-0.1.0/src/csv_diff_tool.egg-info/top_level.txt
@@ -0,0 +1 @@
csv_diff_tool