csvsmith 0.2.3__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {csvsmith-0.2.3/src/csvsmith.egg-info → csvsmith-0.4.0}/PKG-INFO +50 -22
- {csvsmith-0.2.3 → csvsmith-0.4.0}/README.rst +48 -19
- {csvsmith-0.2.3 → csvsmith-0.4.0}/pyproject.toml +1 -3
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/__init__.py +18 -1
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/cli.py +44 -10
- csvsmith-0.4.0/src/csvsmith/row_dedup.py +192 -0
- csvsmith-0.4.0/src/csvsmith/string_distance.py +190 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0/src/csvsmith.egg-info}/PKG-INFO +50 -22
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/SOURCES.txt +3 -1
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/requires.txt +0 -1
- csvsmith-0.4.0/tests/test_row_dedup.py +228 -0
- csvsmith-0.4.0/tests/test_string_distance.py +42 -0
- csvsmith-0.2.3/src/csvsmith/row_dedup.py +0 -128
- csvsmith-0.2.3/tests/test_row_dedup.py +0 -186
- {csvsmith-0.2.3 → csvsmith-0.4.0}/LICENSE +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/setup.cfg +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/classify.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/excel2csv.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/filter_rows.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/move_files.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/dependency_links.txt +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/entry_points.txt +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/top_level.txt +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_classify.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_cli.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_excel2csv.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_filter_rows.py +0 -0
- {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_move_files.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csvsmith
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Small CSV utilities: row deduplication, classification, row filtering, and CLI helpers.
|
|
5
5
|
Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
|
|
6
6
|
License: MIT License
|
|
@@ -27,7 +27,7 @@ License: MIT License
|
|
|
27
27
|
|
|
28
28
|
Project-URL: Homepage, https://github.com/yeiichi/csvsmith
|
|
29
29
|
Project-URL: Repository, https://github.com/yeiichi/csvsmith
|
|
30
|
-
Keywords: csv,
|
|
30
|
+
Keywords: csv,deduplication,data-filtering,file-organization,filtering
|
|
31
31
|
Classifier: Programming Language :: Python :: 3
|
|
32
32
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
33
33
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -37,7 +37,6 @@ Classifier: Topic :: Utilities
|
|
|
37
37
|
Requires-Python: >=3.10
|
|
38
38
|
Description-Content-Type: text/x-rst
|
|
39
39
|
License-File: LICENSE
|
|
40
|
-
Requires-Dist: pandas>=2.0
|
|
41
40
|
Requires-Dist: openpyxl>=3.1
|
|
42
41
|
Dynamic: license-file
|
|
43
42
|
|
|
@@ -51,32 +50,35 @@ csvsmith
|
|
|
51
50
|
:target: https://pypi.org/project/csvsmith/
|
|
52
51
|
|
|
53
52
|
.. image:: https://img.shields.io/pypi/l/csvsmith.svg
|
|
54
|
-
:target: https://pypi.org/project/
|
|
53
|
+
:target: https://pypi.org/project/csvsmith/
|
|
55
54
|
|
|
56
55
|
Introduction
|
|
57
56
|
------------
|
|
58
57
|
|
|
59
58
|
csvsmith is a lightweight collection of CSV utilities designed for data
|
|
60
|
-
integrity, deduplication, organization,
|
|
59
|
+
integrity, deduplication, organization, Excel-to-CSV conversion, and
|
|
60
|
+
string-similarity analysis.
|
|
61
61
|
|
|
62
62
|
It provides a small Python API for programmatic data filtering and a single
|
|
63
63
|
CLI entrypoint for quick operations.
|
|
64
64
|
|
|
65
65
|
Whether you need to organize CSV files by header signatures, find duplicate
|
|
66
|
-
rows in a dataset, convert an Excel worksheet into CSV,
|
|
67
|
-
substring rule,
|
|
66
|
+
rows in a dataset, convert an Excel worksheet into CSV, drop rows by a
|
|
67
|
+
substring rule, or compare two strings for similarity, csvsmith aims to keep
|
|
68
|
+
the process predictable and reversible.
|
|
68
69
|
|
|
69
70
|
Features
|
|
70
71
|
--------
|
|
71
72
|
|
|
72
73
|
- row duplicate counting and reporting
|
|
73
|
-
-
|
|
74
|
+
- CSV deduplication with reports
|
|
74
75
|
- CSV classification by header signature
|
|
75
76
|
- dry-run and report-only classification modes
|
|
76
77
|
- rollback support via manifest
|
|
77
78
|
- row filtering by substring
|
|
78
79
|
- Excel worksheet to CSV conversion
|
|
79
80
|
- file moving by suffix
|
|
81
|
+
- string distance and similarity analysis
|
|
80
82
|
- a single command-line entrypoint with subcommands
|
|
81
83
|
|
|
82
84
|
Installation
|
|
@@ -112,34 +114,46 @@ Count duplicate values
|
|
|
112
114
|
print(count_duplicates_sorted(items))
|
|
113
115
|
# [('a', 3), ('b', 2)]
|
|
114
116
|
|
|
115
|
-
Find duplicate rows in a
|
|
116
|
-
|
|
117
|
+
Find duplicate rows in a CSV
|
|
118
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
117
119
|
|
|
118
120
|
.. code-block:: python
|
|
119
121
|
|
|
120
|
-
import
|
|
121
|
-
from csvsmith import find_duplicate_rows
|
|
122
|
+
from csvsmith import find_duplicate_rows, read_csv_rows
|
|
122
123
|
|
|
123
|
-
|
|
124
|
-
dup_rows = find_duplicate_rows(
|
|
124
|
+
rows = read_csv_rows("input.csv")
|
|
125
|
+
dup_rows = find_duplicate_rows(rows)
|
|
125
126
|
|
|
126
127
|
Deduplicate with report
|
|
127
128
|
~~~~~~~~~~~~~~~~~~~~~~~
|
|
128
129
|
|
|
129
130
|
.. code-block:: python
|
|
130
131
|
|
|
131
|
-
import
|
|
132
|
-
from csvsmith import dedupe_with_report
|
|
132
|
+
from csvsmith import dedupe_with_report, read_csv_rows, write_csv_rows
|
|
133
133
|
|
|
134
|
-
|
|
134
|
+
rows = read_csv_rows("input.csv")
|
|
135
135
|
|
|
136
|
-
deduped, report = dedupe_with_report(
|
|
137
|
-
|
|
138
|
-
report.to_csv("duplicate_report.csv", index=False)
|
|
136
|
+
deduped, report = dedupe_with_report(rows)
|
|
137
|
+
write_csv_rows("deduped.csv", deduped, fieldnames=list(rows[0].keys()))
|
|
139
138
|
|
|
140
139
|
# Exclude columns (e.g. IDs or timestamps)
|
|
141
|
-
deduped2, report2 = dedupe_with_report(
|
|
140
|
+
deduped2, report2 = dedupe_with_report(rows, exclude=["id"])
|
|
141
|
+
|
|
142
|
+
Analyze string distance
|
|
143
|
+
~~~~~~~~~~~~~~~~~~~~~~~
|
|
144
|
+
|
|
145
|
+
.. code-block:: python
|
|
142
146
|
|
|
147
|
+
from csvsmith import analyze_pair
|
|
148
|
+
|
|
149
|
+
result = analyze_pair("kitten", "sitting")
|
|
150
|
+
|
|
151
|
+
print(result.get_relation_string())
|
|
152
|
+
print(result.damerau_levenshtein_distance)
|
|
153
|
+
print(result.jaro_winkler_score)
|
|
154
|
+
print(result.similarity_percentage)
|
|
155
|
+
|
|
156
|
+
CLI Usage
|
|
143
157
|
Drop rows in a CSV by column name
|
|
144
158
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
145
159
|
|
|
@@ -212,7 +226,8 @@ CLI Usage
|
|
|
212
226
|
---------
|
|
213
227
|
|
|
214
228
|
csvsmith provides a single CLI entrypoint with subcommands for duplicate
|
|
215
|
-
detection, CSV organization, Excel conversion, file moving,
|
|
229
|
+
detection, CSV organization, Excel conversion, file moving, row filtering,
|
|
230
|
+
and string comparison.
|
|
216
231
|
|
|
217
232
|
Show duplicate rows
|
|
218
233
|
~~~~~~~~~~~~~~~~~~~
|
|
@@ -227,6 +242,19 @@ Save duplicate rows only:
|
|
|
227
242
|
|
|
228
243
|
csvsmith row-duplicates input.csv -o duplicates_only.csv
|
|
229
244
|
|
|
245
|
+
Analyze string distance
|
|
246
|
+
~~~~~~~~~~~~~~~~~~~~~~~
|
|
247
|
+
|
|
248
|
+
.. code-block:: bash
|
|
249
|
+
|
|
250
|
+
csvsmith string-distance "kitten" "sitting"
|
|
251
|
+
|
|
252
|
+
Ignore case:
|
|
253
|
+
|
|
254
|
+
.. code-block:: bash
|
|
255
|
+
|
|
256
|
+
csvsmith string-distance "Hello" "hello" --ignore-case
|
|
257
|
+
|
|
230
258
|
Deduplicate and generate a report
|
|
231
259
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
232
260
|
|
|
@@ -8,32 +8,35 @@ csvsmith
|
|
|
8
8
|
:target: https://pypi.org/project/csvsmith/
|
|
9
9
|
|
|
10
10
|
.. image:: https://img.shields.io/pypi/l/csvsmith.svg
|
|
11
|
-
:target: https://pypi.org/project/
|
|
11
|
+
:target: https://pypi.org/project/csvsmith/
|
|
12
12
|
|
|
13
13
|
Introduction
|
|
14
14
|
------------
|
|
15
15
|
|
|
16
16
|
csvsmith is a lightweight collection of CSV utilities designed for data
|
|
17
|
-
integrity, deduplication, organization,
|
|
17
|
+
integrity, deduplication, organization, Excel-to-CSV conversion, and
|
|
18
|
+
string-similarity analysis.
|
|
18
19
|
|
|
19
20
|
It provides a small Python API for programmatic data filtering and a single
|
|
20
21
|
CLI entrypoint for quick operations.
|
|
21
22
|
|
|
22
23
|
Whether you need to organize CSV files by header signatures, find duplicate
|
|
23
|
-
rows in a dataset, convert an Excel worksheet into CSV,
|
|
24
|
-
substring rule,
|
|
24
|
+
rows in a dataset, convert an Excel worksheet into CSV, drop rows by a
|
|
25
|
+
substring rule, or compare two strings for similarity, csvsmith aims to keep
|
|
26
|
+
the process predictable and reversible.
|
|
25
27
|
|
|
26
28
|
Features
|
|
27
29
|
--------
|
|
28
30
|
|
|
29
31
|
- row duplicate counting and reporting
|
|
30
|
-
-
|
|
32
|
+
- CSV deduplication with reports
|
|
31
33
|
- CSV classification by header signature
|
|
32
34
|
- dry-run and report-only classification modes
|
|
33
35
|
- rollback support via manifest
|
|
34
36
|
- row filtering by substring
|
|
35
37
|
- Excel worksheet to CSV conversion
|
|
36
38
|
- file moving by suffix
|
|
39
|
+
- string distance and similarity analysis
|
|
37
40
|
- a single command-line entrypoint with subcommands
|
|
38
41
|
|
|
39
42
|
Installation
|
|
@@ -69,34 +72,46 @@ Count duplicate values
|
|
|
69
72
|
print(count_duplicates_sorted(items))
|
|
70
73
|
# [('a', 3), ('b', 2)]
|
|
71
74
|
|
|
72
|
-
Find duplicate rows in a
|
|
73
|
-
|
|
75
|
+
Find duplicate rows in a CSV
|
|
76
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
74
77
|
|
|
75
78
|
.. code-block:: python
|
|
76
79
|
|
|
77
|
-
import
|
|
78
|
-
from csvsmith import find_duplicate_rows
|
|
80
|
+
from csvsmith import find_duplicate_rows, read_csv_rows
|
|
79
81
|
|
|
80
|
-
|
|
81
|
-
dup_rows = find_duplicate_rows(
|
|
82
|
+
rows = read_csv_rows("input.csv")
|
|
83
|
+
dup_rows = find_duplicate_rows(rows)
|
|
82
84
|
|
|
83
85
|
Deduplicate with report
|
|
84
86
|
~~~~~~~~~~~~~~~~~~~~~~~
|
|
85
87
|
|
|
86
88
|
.. code-block:: python
|
|
87
89
|
|
|
88
|
-
import
|
|
89
|
-
from csvsmith import dedupe_with_report
|
|
90
|
+
from csvsmith import dedupe_with_report, read_csv_rows, write_csv_rows
|
|
90
91
|
|
|
91
|
-
|
|
92
|
+
rows = read_csv_rows("input.csv")
|
|
92
93
|
|
|
93
|
-
deduped, report = dedupe_with_report(
|
|
94
|
-
|
|
95
|
-
report.to_csv("duplicate_report.csv", index=False)
|
|
94
|
+
deduped, report = dedupe_with_report(rows)
|
|
95
|
+
write_csv_rows("deduped.csv", deduped, fieldnames=list(rows[0].keys()))
|
|
96
96
|
|
|
97
97
|
# Exclude columns (e.g. IDs or timestamps)
|
|
98
|
-
deduped2, report2 = dedupe_with_report(
|
|
98
|
+
deduped2, report2 = dedupe_with_report(rows, exclude=["id"])
|
|
99
|
+
|
|
100
|
+
Analyze string distance
|
|
101
|
+
~~~~~~~~~~~~~~~~~~~~~~~
|
|
102
|
+
|
|
103
|
+
.. code-block:: python
|
|
99
104
|
|
|
105
|
+
from csvsmith import analyze_pair
|
|
106
|
+
|
|
107
|
+
result = analyze_pair("kitten", "sitting")
|
|
108
|
+
|
|
109
|
+
print(result.get_relation_string())
|
|
110
|
+
print(result.damerau_levenshtein_distance)
|
|
111
|
+
print(result.jaro_winkler_score)
|
|
112
|
+
print(result.similarity_percentage)
|
|
113
|
+
|
|
114
|
+
CLI Usage
|
|
100
115
|
Drop rows in a CSV by column name
|
|
101
116
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
102
117
|
|
|
@@ -169,7 +184,8 @@ CLI Usage
|
|
|
169
184
|
---------
|
|
170
185
|
|
|
171
186
|
csvsmith provides a single CLI entrypoint with subcommands for duplicate
|
|
172
|
-
detection, CSV organization, Excel conversion, file moving,
|
|
187
|
+
detection, CSV organization, Excel conversion, file moving, row filtering,
|
|
188
|
+
and string comparison.
|
|
173
189
|
|
|
174
190
|
Show duplicate rows
|
|
175
191
|
~~~~~~~~~~~~~~~~~~~
|
|
@@ -184,6 +200,19 @@ Save duplicate rows only:
|
|
|
184
200
|
|
|
185
201
|
csvsmith row-duplicates input.csv -o duplicates_only.csv
|
|
186
202
|
|
|
203
|
+
Analyze string distance
|
|
204
|
+
~~~~~~~~~~~~~~~~~~~~~~~
|
|
205
|
+
|
|
206
|
+
.. code-block:: bash
|
|
207
|
+
|
|
208
|
+
csvsmith string-distance "kitten" "sitting"
|
|
209
|
+
|
|
210
|
+
Ignore case:
|
|
211
|
+
|
|
212
|
+
.. code-block:: bash
|
|
213
|
+
|
|
214
|
+
csvsmith string-distance "Hello" "hello" --ignore-case
|
|
215
|
+
|
|
187
216
|
Deduplicate and generate a report
|
|
188
217
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
189
218
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "csvsmith"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "Small CSV utilities: row deduplication, classification, row filtering, and CLI helpers."
|
|
9
9
|
readme = "README.rst"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -16,7 +16,6 @@ authors = [
|
|
|
16
16
|
|
|
17
17
|
keywords = [
|
|
18
18
|
"csv",
|
|
19
|
-
"pandas",
|
|
20
19
|
"deduplication",
|
|
21
20
|
"data-filtering",
|
|
22
21
|
"file-organization",
|
|
@@ -33,7 +32,6 @@ classifiers = [
|
|
|
33
32
|
]
|
|
34
33
|
|
|
35
34
|
dependencies = [
|
|
36
|
-
"pandas>=2.0",
|
|
37
35
|
"openpyxl>=3.1",
|
|
38
36
|
]
|
|
39
37
|
|
|
@@ -6,14 +6,22 @@ Public API:
|
|
|
6
6
|
- add_row_digest
|
|
7
7
|
- find_duplicate_rows
|
|
8
8
|
- dedupe_with_report
|
|
9
|
+
- read_csv_rows
|
|
10
|
+
- write_csv_rows
|
|
9
11
|
- CSVClassifier
|
|
10
12
|
- DropRowsBySubstring
|
|
11
13
|
- excel_to_csv
|
|
14
|
+
- move_by_suffix
|
|
15
|
+
- StringDistance
|
|
16
|
+
- Relation
|
|
17
|
+
- Result
|
|
18
|
+
- analyze_pair
|
|
12
19
|
|
|
13
20
|
Compatibility aliases:
|
|
14
21
|
- CSVCleaner
|
|
15
22
|
|
|
16
23
|
Submodules:
|
|
24
|
+
- csvsmith.string_distance
|
|
17
25
|
- csvsmith.row_dedup
|
|
18
26
|
- csvsmith.classify
|
|
19
27
|
- csvsmith.filter_rows
|
|
@@ -22,26 +30,35 @@ Submodules:
|
|
|
22
30
|
- csvsmith.cli (CLI entrypoint)
|
|
23
31
|
"""
|
|
24
32
|
|
|
25
|
-
__version__ = "0.
|
|
33
|
+
__version__ = "0.4.0"
|
|
26
34
|
|
|
27
35
|
from .row_dedup import (
|
|
28
36
|
count_duplicates_sorted,
|
|
29
37
|
add_row_digest,
|
|
30
38
|
find_duplicate_rows,
|
|
31
39
|
dedupe_with_report,
|
|
40
|
+
read_csv_rows,
|
|
41
|
+
write_csv_rows,
|
|
32
42
|
)
|
|
33
43
|
from .classify import CSVClassifier
|
|
34
44
|
from .filter_rows import DropRowsBySubstring, CSVCleaner
|
|
35
45
|
from .excel2csv import excel_to_csv
|
|
36
46
|
from .move_files import move_by_suffix
|
|
47
|
+
from .string_distance import StringDistance, Relation, Result, analyze_pair
|
|
37
48
|
|
|
38
49
|
__all__ = [
|
|
39
50
|
"count_duplicates_sorted",
|
|
40
51
|
"add_row_digest",
|
|
41
52
|
"find_duplicate_rows",
|
|
42
53
|
"dedupe_with_report",
|
|
54
|
+
"read_csv_rows",
|
|
55
|
+
"write_csv_rows",
|
|
43
56
|
"CSVClassifier",
|
|
44
57
|
"DropRowsBySubstring",
|
|
45
58
|
"excel_to_csv",
|
|
46
59
|
"move_by_suffix",
|
|
60
|
+
"StringDistance",
|
|
61
|
+
"Relation",
|
|
62
|
+
"Result",
|
|
63
|
+
"analyze_pair",
|
|
47
64
|
]
|
|
@@ -1,17 +1,23 @@
|
|
|
1
1
|
import argparse
|
|
2
|
+
import csv
|
|
2
3
|
import json
|
|
3
4
|
import sys
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Optional, Sequence
|
|
6
7
|
|
|
7
|
-
import pandas as pd
|
|
8
8
|
|
|
9
9
|
from . import __version__
|
|
10
10
|
from .classify import CSVClassifier
|
|
11
11
|
from .excel2csv import excel_to_csv
|
|
12
12
|
from .filter_rows import DropRowsBySubstring
|
|
13
13
|
from .move_files import move_by_suffix
|
|
14
|
-
from .row_dedup import
|
|
14
|
+
from .row_dedup import (
|
|
15
|
+
dedupe_with_report,
|
|
16
|
+
find_duplicate_rows,
|
|
17
|
+
read_csv_rows,
|
|
18
|
+
write_csv_rows,
|
|
19
|
+
)
|
|
20
|
+
from .string_distance import analyze_pair
|
|
15
21
|
|
|
16
22
|
|
|
17
23
|
def _parse_suffixes(value: str | None) -> set[str]:
|
|
@@ -30,28 +36,33 @@ def _parse_suffixes(value: str | None) -> set[str]:
|
|
|
30
36
|
|
|
31
37
|
|
|
32
38
|
def cmd_row_duplicates(args: argparse.Namespace) -> int:
|
|
33
|
-
|
|
39
|
+
rows = read_csv_rows(args.input)
|
|
34
40
|
subset = args.subset.split(",") if args.subset else None
|
|
35
|
-
dupes = find_duplicate_rows(
|
|
36
|
-
|
|
41
|
+
dupes = find_duplicate_rows(rows, subset=subset)
|
|
42
|
+
|
|
43
|
+
if not dupes:
|
|
37
44
|
print("No duplicate rows found.")
|
|
38
45
|
else:
|
|
39
46
|
print(f"Found {len(dupes)} duplicate rows:")
|
|
40
|
-
|
|
47
|
+
fieldnames = list(dupes[0].keys())
|
|
48
|
+
writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
|
|
49
|
+
writer.writeheader()
|
|
50
|
+
writer.writerows(dupes)
|
|
41
51
|
return 0
|
|
42
52
|
|
|
43
53
|
|
|
44
54
|
def cmd_dedupe(args: argparse.Namespace) -> int:
|
|
45
|
-
|
|
55
|
+
rows = read_csv_rows(args.input)
|
|
46
56
|
subset = args.subset.split(",") if args.subset else None
|
|
47
57
|
exclude = args.exclude.split(",") if args.exclude else None
|
|
48
58
|
|
|
49
|
-
|
|
50
|
-
|
|
59
|
+
deduped_rows, report = dedupe_with_report(
|
|
60
|
+
rows, subset=subset, exclude=exclude, keep=args.keep
|
|
51
61
|
)
|
|
52
62
|
|
|
53
63
|
output_path = Path(args.output) if args.output else Path(args.input).with_suffix(".deduped.csv")
|
|
54
|
-
|
|
64
|
+
fieldnames = list(rows[0].keys()) if rows else []
|
|
65
|
+
write_csv_rows(output_path, deduped_rows, fieldnames=fieldnames)
|
|
55
66
|
print(f"Wrote deduped CSV to: {output_path}")
|
|
56
67
|
|
|
57
68
|
if args.report:
|
|
@@ -127,6 +138,16 @@ def cmd_drop_rows(args: argparse.Namespace) -> int:
|
|
|
127
138
|
return 0
|
|
128
139
|
|
|
129
140
|
|
|
141
|
+
def cmd_string_distance(args: argparse.Namespace) -> int:
    """Handle the ``string-distance`` subcommand.

    Compares ``args.string_a`` with ``args.string_b`` via ``analyze_pair``
    (passing ``args.ignore_case`` through) and prints four aligned
    ``label: value`` lines to stdout.

    Returns:
        Always ``0``.
    """
    outcome = analyze_pair(args.string_a, args.string_b, args.ignore_case)

    # (label, already-formatted value) pairs, printed in this order.
    summary = (
        ("Classification", f"{outcome.get_relation_string()}"),
        ("D-Levenshtein Dist", f"{outcome.damerau_levenshtein_distance} changes"),
        ("Jaro-Winkler", f"{outcome.jaro_winkler_score:.4f}"),
        ("Similarity", f"{outcome.similarity_percentage:.2f}%"),
    )
    for label, rendered in summary:
        # Labels are left-aligned in an 18-character column.
        print(f"{label:<18}: {rendered}")
    return 0
|
|
149
|
+
|
|
150
|
+
|
|
130
151
|
def _add_row_duplicates_parser(subparsers) -> None:
|
|
131
152
|
parser = subparsers.add_parser("row-duplicates", help="Find duplicate rows in a CSV.")
|
|
132
153
|
parser.add_argument("input", help="Input CSV file.")
|
|
@@ -197,6 +218,18 @@ def _add_drop_rows_parser(subparsers) -> None:
|
|
|
197
218
|
parser.set_defaults(func=cmd_drop_rows)
|
|
198
219
|
|
|
199
220
|
|
|
221
|
+
def _add_string_distance_parser(subparsers) -> None:
    """Register the ``string-distance`` subcommand on *subparsers*.

    The subcommand takes two positional strings (``string_a``, ``string_b``)
    and an optional ``--ignore-case`` flag, and dispatches to
    ``cmd_string_distance``.
    """
    sub = subparsers.add_parser(
        "string-distance",
        help="Analyze distance between two strings.",
    )
    positionals = (
        ("string_a", "First string."),
        ("string_b", "Second string."),
    )
    for dest, description in positionals:
        sub.add_argument(dest, help=description)
    sub.add_argument(
        "--ignore-case",
        help="Ignore case for distance calculation.",
        action="store_true",
    )
    sub.set_defaults(func=cmd_string_distance)
|
|
231
|
+
|
|
232
|
+
|
|
200
233
|
def build_parser() -> argparse.ArgumentParser:
|
|
201
234
|
parser = argparse.ArgumentParser(
|
|
202
235
|
prog="csvsmith",
|
|
@@ -215,6 +248,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
215
248
|
_add_move_files_parser(subparsers)
|
|
216
249
|
_add_excel_to_csv_parser(subparsers)
|
|
217
250
|
_add_drop_rows_parser(subparsers)
|
|
251
|
+
_add_string_distance_parser(subparsers)
|
|
218
252
|
|
|
219
253
|
return parser
|
|
220
254
|
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
from collections import Counter, defaultdict
|
|
5
|
+
from hashlib import sha256
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Hashable, Iterable, Mapping, Optional, Sequence
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
ROW_SEP = "\x1f"
|
|
11
|
+
KEEP_OPTIONS = {"first", "last"}
|
|
12
|
+
|
|
13
|
+
Row = dict[str, str]
|
|
14
|
+
RowLike = Mapping[str, object]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def count_duplicates_sorted(
    items: Iterable[Hashable],
    threshold: int = 2,
    reverse: bool = True,
) -> list[tuple[Hashable, int]]:
    """Tally *items* and list the values seen at least ``threshold`` times.

    Args:
        items: Hashable values to count.
        threshold: Minimum occurrence count for a value to be reported.
        reverse: Sort by count descending when true, ascending otherwise.

    Returns:
        ``(value, count)`` pairs ordered by count. The sort is stable, so
        values with equal counts stay in first-seen order.
    """
    tallies = Counter(items)
    frequent = (
        (value, seen) for value, seen in tallies.items() if seen >= threshold
    )
    return sorted(frequent, key=lambda pair: pair[1], reverse=reverse)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def read_csv_rows(csv_path: Path | str, encoding: str = "utf-8") -> list[Row]:
    """Load every data row of a CSV file as a dictionary.

    The first line of the file supplies the keys (``csv.DictReader``
    semantics); all values are returned as strings.

    Args:
        csv_path: Location of the CSV file.
        encoding: Text encoding used to open the file.

    Returns:
        One dict per data row, in file order.
    """
    # newline="" lets the csv module handle embedded line breaks itself.
    with Path(csv_path).open(mode="r", encoding=encoding, newline="") as handle:
        return [record for record in csv.DictReader(handle)]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def write_csv_rows(
    csv_path: Path | str,
    rows: Sequence[Mapping[str, object]],
    *,
    fieldnames: Sequence[str],
    encoding: str = "utf-8",
) -> None:
    """Write a header line followed by *rows* to a CSV file.

    Args:
        csv_path: Destination file; it is opened in write mode.
        rows: Row mappings to serialise, in order.
        fieldnames: Column names, which also fix the column order.
        encoding: Text encoding used to open the file.
    """
    # newline="" leaves line-ending control to the csv writer.
    with Path(csv_path).open(mode="w", encoding=encoding, newline="") as handle:
        sink = csv.DictWriter(handle, fieldnames=fieldnames)
        sink.writeheader()
        sink.writerows(rows)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _normalize_cell(value: object) -> str:
    """Render a cell as text for hashing; ``None`` becomes the empty string."""
    return "" if value is None else str(value)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _resolve_columns(
    rows: Sequence[RowLike],
    *,
    subset: Optional[Sequence[Hashable]] = None,
    exclude: Optional[Sequence[Hashable]] = None,
) -> list[str]:
    """Work out which columns take part in row comparison.

    Args:
        rows: Input rows; the first row's keys are the default column list.
        subset: Explicit columns to use instead of the first row's keys.
        exclude: Columns to remove from the selected list.

    Returns:
        Column names in selection order. An empty list is returned when
        ``subset`` is ``None`` and ``rows`` is empty.
    """
    if subset is not None:
        # Names are stringified so they compare equal to CSV header keys.
        selected = [str(name) for name in subset]
    elif rows:
        selected = list(rows[0].keys())
    else:
        return []

    if not exclude:
        return selected

    dropped = {str(name) for name in exclude}
    return [name for name in selected if name not in dropped]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def make_row_digest(row: RowLike, *, columns: Sequence[str]) -> str:
    """Return the SHA-256 hex digest of *row* restricted to *columns*.

    Cell values are normalised to text, joined with ``ROW_SEP`` in the
    order given by *columns*, and hashed as UTF-8. A column missing from
    *row* contributes an empty string.
    """
    cells = [_normalize_cell(row.get(name, "")) for name in columns]
    payload = ROW_SEP.join(cells).encode("utf-8")
    return sha256(payload).hexdigest()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def add_row_digest(
    rows: Sequence[RowLike],
    *,
    subset: Optional[Sequence[Hashable]] = None,
    exclude: Optional[Sequence[Hashable]] = None,
    colname: str = "row_digest",
    inplace: bool = False,
) -> list[dict[str, object]]:
    """Add a row digest column and return the resulting rows.

    Args:
        rows: Input rows. They must be mutable mappings when ``inplace``
            is true.
        subset: Columns to hash; defaults to the keys of the first row.
        exclude: Columns to leave out of the hash.
        colname: Key under which the digest is stored.
        inplace: When true, the digest is also written into the input rows.

    Returns:
        A new list of plain dicts carrying the digest under ``colname``.
        The returned dicts are copies even when ``inplace`` is true.
    """
    columns = _resolve_columns(rows, subset=subset, exclude=exclude)
    if subset is None:
        # When columns are derived from the rows themselves, a digest left
        # by an earlier call must not feed the new digest; otherwise calling
        # this twice yields a different value each time. An explicit
        # ``subset`` is honoured as given.
        columns = [col for col in columns if col != colname]

    if inplace:
        for row in rows:
            row[colname] = make_row_digest(row, columns=columns)
        return [dict(row) for row in rows]

    # Build each output dict once; an existing ``colname`` key keeps its
    # position and only its value is replaced.
    return [
        {**row, colname: make_row_digest(row, columns=columns)}
        for row in rows
    ]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def find_duplicate_rows(
    rows: Sequence[RowLike],
    *,
    subset: Optional[Sequence[Hashable]] = None,
) -> list[dict[str, object]]:
    """Collect every row whose digest is shared with at least one other row.

    Args:
        rows: Input rows.
        subset: Columns to compare; defaults to the keys of the first row.

    Returns:
        Copies of all rows that belong to a duplicate group, in their
        original order. Every member of a group is included.
    """
    columns = _resolve_columns(rows, subset=subset)
    digests = [make_row_digest(row, columns=columns) for row in rows]
    occurrences = Counter(digests)
    return [
        dict(row)
        for row, digest in zip(rows, digests)
        if occurrences[digest] > 1
    ]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def dedupe_with_report(
    rows: Sequence[RowLike],
    *,
    subset: Optional[Sequence[Hashable]] = None,
    exclude: Optional[Sequence[Hashable]] = None,
    keep: str = "first",
    digest_col: str = "row_digest",
) -> tuple[list[dict[str, object]], list[dict[str, object]]]:
    """Remove duplicate rows and describe the duplicate groups found.

    Args:
        rows: Input rows.
        subset: Columns to compare; defaults to the keys of the first row.
        exclude: Columns to ignore when comparing.
        keep: Which member of each group survives: ``"first"`` or ``"last"``.
        digest_col: Key used for the digest in each report entry.

    Returns:
        ``(deduped_rows, report)``. ``deduped_rows`` holds copies of the
        surviving rows in input order. ``report`` has one entry per group
        with more than one member, giving the digest, ``"count"`` and
        ``"indices"``, sorted by count descending.

    Raises:
        ValueError: If ``keep`` is not one of ``KEEP_OPTIONS``.
    """
    if keep not in KEEP_OPTIONS:
        raise ValueError(f"keep must be one of {sorted(KEEP_OPTIONS)}")

    columns = _resolve_columns(rows, subset=subset, exclude=exclude)

    # digest -> positions of the rows that hash to it, in input order.
    positions_by_digest: dict[str, list[int]] = {}
    for position, row in enumerate(rows):
        key = make_row_digest(row, columns=columns)
        positions_by_digest.setdefault(key, []).append(position)

    report = sorted(
        (
            {digest_col: key, "count": len(positions), "indices": positions}
            for key, positions in positions_by_digest.items()
            if len(positions) > 1
        ),
        key=lambda entry: entry["count"],
        reverse=True,
    )

    survivor = 0 if keep == "first" else -1
    kept = {positions[survivor] for positions in positions_by_digest.values()}
    deduped_rows = [
        dict(row) for position, row in enumerate(rows) if position in kept
    ]
    return deduped_rows, report
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def dedupe_csv_file(
    src: Path | str,
    dst: Path | str,
    *,
    subset: Optional[Sequence[Hashable]] = None,
    exclude: Optional[Sequence[Hashable]] = None,
    keep: str = "first",
    encoding: str = "utf-8",
) -> list[dict[str, object]]:
    """Deduplicate a CSV file, write the result, and return the report.

    Args:
        src: CSV file to read.
        dst: CSV file to write the deduplicated rows to.
        subset: Columns to compare; defaults to all columns.
        exclude: Columns to ignore when comparing.
        keep: Which member of each duplicate group survives:
            ``"first"`` or ``"last"``.
        encoding: Text encoding used for both reading and writing.

    Returns:
        The duplicate-group report produced by ``dedupe_with_report``.

    Raises:
        ValueError: If ``keep`` is rejected by ``dedupe_with_report``.
    """
    rows = read_csv_rows(src, encoding=encoding)
    deduped_rows, report = dedupe_with_report(
        rows,
        subset=subset,
        exclude=exclude,
        keep=keep,
    )

    if rows:
        fieldnames = list(rows[0].keys())
    else:
        # A header-only (or empty) source yields no row dicts to take keys
        # from. Read the header line directly so the output keeps its
        # columns instead of being written with an empty header.
        with Path(src).open("r", encoding=encoding, newline="") as fp:
            fieldnames = next(csv.reader(fp), [])

    write_csv_rows(dst, deduped_rows, fieldnames=fieldnames, encoding=encoding)
    return report
|