csvsmith 0.2.3__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {csvsmith-0.2.3/src/csvsmith.egg-info → csvsmith-0.4.0}/PKG-INFO +50 -22
  2. {csvsmith-0.2.3 → csvsmith-0.4.0}/README.rst +48 -19
  3. {csvsmith-0.2.3 → csvsmith-0.4.0}/pyproject.toml +1 -3
  4. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/__init__.py +18 -1
  5. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/cli.py +44 -10
  6. csvsmith-0.4.0/src/csvsmith/row_dedup.py +192 -0
  7. csvsmith-0.4.0/src/csvsmith/string_distance.py +190 -0
  8. {csvsmith-0.2.3 → csvsmith-0.4.0/src/csvsmith.egg-info}/PKG-INFO +50 -22
  9. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/SOURCES.txt +3 -1
  10. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/requires.txt +0 -1
  11. csvsmith-0.4.0/tests/test_row_dedup.py +228 -0
  12. csvsmith-0.4.0/tests/test_string_distance.py +42 -0
  13. csvsmith-0.2.3/src/csvsmith/row_dedup.py +0 -128
  14. csvsmith-0.2.3/tests/test_row_dedup.py +0 -186
  15. {csvsmith-0.2.3 → csvsmith-0.4.0}/LICENSE +0 -0
  16. {csvsmith-0.2.3 → csvsmith-0.4.0}/setup.cfg +0 -0
  17. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/classify.py +0 -0
  18. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/excel2csv.py +0 -0
  19. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/filter_rows.py +0 -0
  20. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith/move_files.py +0 -0
  21. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/dependency_links.txt +0 -0
  22. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/entry_points.txt +0 -0
  23. {csvsmith-0.2.3 → csvsmith-0.4.0}/src/csvsmith.egg-info/top_level.txt +0 -0
  24. {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_classify.py +0 -0
  25. {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_cli.py +0 -0
  26. {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_excel2csv.py +0 -0
  27. {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_filter_rows.py +0 -0
  28. {csvsmith-0.2.3 → csvsmith-0.4.0}/tests/test_move_files.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csvsmith
3
- Version: 0.2.3
3
+ Version: 0.4.0
4
4
  Summary: Small CSV utilities: row deduplication, classification, row filtering, and CLI helpers.
5
5
  Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
6
6
  License: MIT License
@@ -27,7 +27,7 @@ License: MIT License
27
27
 
28
28
  Project-URL: Homepage, https://github.com/yeiichi/csvsmith
29
29
  Project-URL: Repository, https://github.com/yeiichi/csvsmith
30
- Keywords: csv,pandas,deduplication,data-filtering,file-organization,filtering
30
+ Keywords: csv,deduplication,data-filtering,file-organization,filtering
31
31
  Classifier: Programming Language :: Python :: 3
32
32
  Classifier: Programming Language :: Python :: 3 :: Only
33
33
  Classifier: License :: OSI Approved :: MIT License
@@ -37,7 +37,6 @@ Classifier: Topic :: Utilities
37
37
  Requires-Python: >=3.10
38
38
  Description-Content-Type: text/x-rst
39
39
  License-File: LICENSE
40
- Requires-Dist: pandas>=2.0
41
40
  Requires-Dist: openpyxl>=3.1
42
41
  Dynamic: license-file
43
42
 
@@ -51,32 +50,35 @@ csvsmith
51
50
  :target: https://pypi.org/project/csvsmith/
52
51
 
53
52
  .. image:: https://img.shields.io/pypi/l/csvsmith.svg
54
- :target: https://pypi.org/project/csvsmith/
53
+ :target: https://pypi.org/project/csvsmith/
55
54
 
56
55
  Introduction
57
56
  ------------
58
57
 
59
58
  csvsmith is a lightweight collection of CSV utilities designed for data
60
- integrity, deduplication, organization, and Excel-to-CSV conversion.
59
+ integrity, deduplication, organization, Excel-to-CSV conversion, and
60
+ string-similarity analysis.
61
61
 
62
62
  It provides a small Python API for programmatic data filtering and a single
63
63
  CLI entrypoint for quick operations.
64
64
 
65
65
  Whether you need to organize CSV files by header signatures, find duplicate
66
- rows in a dataset, convert an Excel worksheet into CSV, or drop rows by a
67
- substring rule, csvsmith aims to keep the process predictable and reversible.
66
+ rows in a dataset, convert an Excel worksheet into CSV, drop rows by a
67
+ substring rule, or compare two strings for similarity, csvsmith aims to keep
68
+ the process predictable and reversible.
68
69
 
69
70
  Features
70
71
  --------
71
72
 
72
73
  - row duplicate counting and reporting
73
- - DataFrame deduplication with reports
74
+ - CSV deduplication with reports
74
75
  - CSV classification by header signature
75
76
  - dry-run and report-only classification modes
76
77
  - rollback support via manifest
77
78
  - row filtering by substring
78
79
  - Excel worksheet to CSV conversion
79
80
  - file moving by suffix
81
+ - string distance and similarity analysis
80
82
  - a single command-line entrypoint with subcommands
81
83
 
82
84
  Installation
@@ -112,34 +114,46 @@ Count duplicate values
112
114
  print(count_duplicates_sorted(items))
113
115
  # [('a', 3), ('b', 2)]
114
116
 
115
- Find duplicate rows in a DataFrame
116
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
117
+ Find duplicate rows in a CSV
118
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
117
119
 
118
120
  .. code-block:: python
119
121
 
120
- import pandas as pd
121
- from csvsmith import find_duplicate_rows
122
+ from csvsmith import find_duplicate_rows, read_csv_rows
122
123
 
123
- df = pd.read_csv("input.csv")
124
- dup_rows = find_duplicate_rows(df)
124
+ rows = read_csv_rows("input.csv")
125
+ dup_rows = find_duplicate_rows(rows)
125
126
 
126
127
  Deduplicate with report
127
128
  ~~~~~~~~~~~~~~~~~~~~~~~
128
129
 
129
130
  .. code-block:: python
130
131
 
131
- import pandas as pd
132
- from csvsmith import dedupe_with_report
132
+ from csvsmith import dedupe_with_report, read_csv_rows, write_csv_rows
133
133
 
134
- df = pd.read_csv("input.csv")
134
+ rows = read_csv_rows("input.csv")
135
135
 
136
- deduped, report = dedupe_with_report(df)
137
- deduped.to_csv("deduped.csv", index=False)
138
- report.to_csv("duplicate_report.csv", index=False)
136
+ deduped, report = dedupe_with_report(rows)
137
+ write_csv_rows("deduped.csv", deduped, fieldnames=list(rows[0].keys()))
139
138
 
140
139
  # Exclude columns (e.g. IDs or timestamps)
141
- deduped2, report2 = dedupe_with_report(df, exclude=["id"])
140
+ deduped2, report2 = dedupe_with_report(rows, exclude=["id"])
141
+
142
+ Analyze string distance
143
+ ~~~~~~~~~~~~~~~~~~~~~~~
144
+
145
+ .. code-block:: python
142
146
 
147
+ from csvsmith import analyze_pair
148
+
149
+ result = analyze_pair("kitten", "sitting")
150
+
151
+ print(result.get_relation_string())
152
+ print(result.damerau_levenshtein_distance)
153
+ print(result.jaro_winkler_score)
154
+ print(result.similarity_percentage)
155
+
156
+ CLI Usage
143
157
  Drop rows in a CSV by column name
144
158
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145
159
 
@@ -212,7 +226,8 @@ CLI Usage
212
226
  ---------
213
227
 
214
228
  csvsmith provides a single CLI entrypoint with subcommands for duplicate
215
- detection, CSV organization, Excel conversion, file moving, and row filtering.
229
+ detection, CSV organization, Excel conversion, file moving, row filtering,
230
+ and string comparison.
216
231
 
217
232
  Show duplicate rows
218
233
  ~~~~~~~~~~~~~~~~~~~
@@ -227,6 +242,19 @@ Save duplicate rows only:
227
242
 
228
243
  csvsmith row-duplicates input.csv -o duplicates_only.csv
229
244
 
245
+ Analyze string distance
246
+ ~~~~~~~~~~~~~~~~~~~~~~~
247
+
248
+ .. code-block:: bash
249
+
250
+ csvsmith string-distance "kitten" "sitting"
251
+
252
+ Ignore case:
253
+
254
+ .. code-block:: bash
255
+
256
+ csvsmith string-distance "Hello" "hello" --ignore-case
257
+
230
258
  Deduplicate and generate a report
231
259
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
232
260
 
@@ -8,32 +8,35 @@ csvsmith
8
8
  :target: https://pypi.org/project/csvsmith/
9
9
 
10
10
  .. image:: https://img.shields.io/pypi/l/csvsmith.svg
11
- :target: https://pypi.org/project/csvsmith/
11
+ :target: https://pypi.org/project/csvsmith/
12
12
 
13
13
  Introduction
14
14
  ------------
15
15
 
16
16
  csvsmith is a lightweight collection of CSV utilities designed for data
17
- integrity, deduplication, organization, and Excel-to-CSV conversion.
17
+ integrity, deduplication, organization, Excel-to-CSV conversion, and
18
+ string-similarity analysis.
18
19
 
19
20
  It provides a small Python API for programmatic data filtering and a single
20
21
  CLI entrypoint for quick operations.
21
22
 
22
23
  Whether you need to organize CSV files by header signatures, find duplicate
23
- rows in a dataset, convert an Excel worksheet into CSV, or drop rows by a
24
- substring rule, csvsmith aims to keep the process predictable and reversible.
24
+ rows in a dataset, convert an Excel worksheet into CSV, drop rows by a
25
+ substring rule, or compare two strings for similarity, csvsmith aims to keep
26
+ the process predictable and reversible.
25
27
 
26
28
  Features
27
29
  --------
28
30
 
29
31
  - row duplicate counting and reporting
30
- - DataFrame deduplication with reports
32
+ - CSV deduplication with reports
31
33
  - CSV classification by header signature
32
34
  - dry-run and report-only classification modes
33
35
  - rollback support via manifest
34
36
  - row filtering by substring
35
37
  - Excel worksheet to CSV conversion
36
38
  - file moving by suffix
39
+ - string distance and similarity analysis
37
40
  - a single command-line entrypoint with subcommands
38
41
 
39
42
  Installation
@@ -69,34 +72,46 @@ Count duplicate values
69
72
  print(count_duplicates_sorted(items))
70
73
  # [('a', 3), ('b', 2)]
71
74
 
72
- Find duplicate rows in a DataFrame
73
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
75
+ Find duplicate rows in a CSV
76
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
74
77
 
75
78
  .. code-block:: python
76
79
 
77
- import pandas as pd
78
- from csvsmith import find_duplicate_rows
80
+ from csvsmith import find_duplicate_rows, read_csv_rows
79
81
 
80
- df = pd.read_csv("input.csv")
81
- dup_rows = find_duplicate_rows(df)
82
+ rows = read_csv_rows("input.csv")
83
+ dup_rows = find_duplicate_rows(rows)
82
84
 
83
85
  Deduplicate with report
84
86
  ~~~~~~~~~~~~~~~~~~~~~~~
85
87
 
86
88
  .. code-block:: python
87
89
 
88
- import pandas as pd
89
- from csvsmith import dedupe_with_report
90
+ from csvsmith import dedupe_with_report, read_csv_rows, write_csv_rows
90
91
 
91
- df = pd.read_csv("input.csv")
92
+ rows = read_csv_rows("input.csv")
92
93
 
93
- deduped, report = dedupe_with_report(df)
94
- deduped.to_csv("deduped.csv", index=False)
95
- report.to_csv("duplicate_report.csv", index=False)
94
+ deduped, report = dedupe_with_report(rows)
95
+ write_csv_rows("deduped.csv", deduped, fieldnames=list(rows[0].keys()))
96
96
 
97
97
  # Exclude columns (e.g. IDs or timestamps)
98
- deduped2, report2 = dedupe_with_report(df, exclude=["id"])
98
+ deduped2, report2 = dedupe_with_report(rows, exclude=["id"])
99
+
100
+ Analyze string distance
101
+ ~~~~~~~~~~~~~~~~~~~~~~~
102
+
103
+ .. code-block:: python
99
104
 
105
+ from csvsmith import analyze_pair
106
+
107
+ result = analyze_pair("kitten", "sitting")
108
+
109
+ print(result.get_relation_string())
110
+ print(result.damerau_levenshtein_distance)
111
+ print(result.jaro_winkler_score)
112
+ print(result.similarity_percentage)
113
+
114
+ CLI Usage
100
115
  Drop rows in a CSV by column name
101
116
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
102
117
 
@@ -169,7 +184,8 @@ CLI Usage
169
184
  ---------
170
185
 
171
186
  csvsmith provides a single CLI entrypoint with subcommands for duplicate
172
- detection, CSV organization, Excel conversion, file moving, and row filtering.
187
+ detection, CSV organization, Excel conversion, file moving, row filtering,
188
+ and string comparison.
173
189
 
174
190
  Show duplicate rows
175
191
  ~~~~~~~~~~~~~~~~~~~
@@ -184,6 +200,19 @@ Save duplicate rows only:
184
200
 
185
201
  csvsmith row-duplicates input.csv -o duplicates_only.csv
186
202
 
203
+ Analyze string distance
204
+ ~~~~~~~~~~~~~~~~~~~~~~~
205
+
206
+ .. code-block:: bash
207
+
208
+ csvsmith string-distance "kitten" "sitting"
209
+
210
+ Ignore case:
211
+
212
+ .. code-block:: bash
213
+
214
+ csvsmith string-distance "Hello" "hello" --ignore-case
215
+
187
216
  Deduplicate and generate a report
188
217
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
189
218
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "csvsmith"
7
- version = "0.2.3"
7
+ version = "0.4.0"
8
8
  description = "Small CSV utilities: row deduplication, classification, row filtering, and CLI helpers."
9
9
  readme = "README.rst"
10
10
  requires-python = ">=3.10"
@@ -16,7 +16,6 @@ authors = [
16
16
 
17
17
  keywords = [
18
18
  "csv",
19
- "pandas",
20
19
  "deduplication",
21
20
  "data-filtering",
22
21
  "file-organization",
@@ -33,7 +32,6 @@ classifiers = [
33
32
  ]
34
33
 
35
34
  dependencies = [
36
- "pandas>=2.0",
37
35
  "openpyxl>=3.1",
38
36
  ]
39
37
 
@@ -6,14 +6,22 @@ Public API:
6
6
  - add_row_digest
7
7
  - find_duplicate_rows
8
8
  - dedupe_with_report
9
+ - read_csv_rows
10
+ - write_csv_rows
9
11
  - CSVClassifier
10
12
  - DropRowsBySubstring
11
13
  - excel_to_csv
14
+ - move_by_suffix
15
+ - StringDistance
16
+ - Relation
17
+ - Result
18
+ - analyze_pair
12
19
 
13
20
  Compatibility aliases:
14
21
  - CSVCleaner
15
22
 
16
23
  Submodules:
24
+ - csvsmith.string_distance
17
25
  - csvsmith.row_dedup
18
26
  - csvsmith.classify
19
27
  - csvsmith.filter_rows
@@ -22,26 +30,35 @@ Submodules:
22
30
  - csvsmith.cli (CLI entrypoint)
23
31
  """
24
32
 
25
- __version__ = "0.2.3"
33
+ __version__ = "0.4.0"
26
34
 
27
35
  from .row_dedup import (
28
36
  count_duplicates_sorted,
29
37
  add_row_digest,
30
38
  find_duplicate_rows,
31
39
  dedupe_with_report,
40
+ read_csv_rows,
41
+ write_csv_rows,
32
42
  )
33
43
  from .classify import CSVClassifier
34
44
  from .filter_rows import DropRowsBySubstring, CSVCleaner
35
45
  from .excel2csv import excel_to_csv
36
46
  from .move_files import move_by_suffix
47
+ from .string_distance import StringDistance, Relation, Result, analyze_pair
37
48
 
38
49
  __all__ = [
39
50
  "count_duplicates_sorted",
40
51
  "add_row_digest",
41
52
  "find_duplicate_rows",
42
53
  "dedupe_with_report",
54
+ "read_csv_rows",
55
+ "write_csv_rows",
43
56
  "CSVClassifier",
44
57
  "DropRowsBySubstring",
45
58
  "excel_to_csv",
46
59
  "move_by_suffix",
60
+ "StringDistance",
61
+ "Relation",
62
+ "Result",
63
+ "analyze_pair",
47
64
  ]
@@ -1,17 +1,23 @@
1
1
  import argparse
2
+ import csv
2
3
  import json
3
4
  import sys
4
5
  from pathlib import Path
5
6
  from typing import Optional, Sequence
6
7
 
7
- import pandas as pd
8
8
 
9
9
  from . import __version__
10
10
  from .classify import CSVClassifier
11
11
  from .excel2csv import excel_to_csv
12
12
  from .filter_rows import DropRowsBySubstring
13
13
  from .move_files import move_by_suffix
14
- from .row_dedup import dedupe_with_report, find_duplicate_rows
14
+ from .row_dedup import (
15
+ dedupe_with_report,
16
+ find_duplicate_rows,
17
+ read_csv_rows,
18
+ write_csv_rows,
19
+ )
20
+ from .string_distance import analyze_pair
15
21
 
16
22
 
17
23
  def _parse_suffixes(value: str | None) -> set[str]:
@@ -30,28 +36,33 @@ def _parse_suffixes(value: str | None) -> set[str]:
30
36
 
31
37
 
32
38
  def cmd_row_duplicates(args: argparse.Namespace) -> int:
33
- df = pd.read_csv(args.input)
39
+ rows = read_csv_rows(args.input)
34
40
  subset = args.subset.split(",") if args.subset else None
35
- dupes = find_duplicate_rows(df, subset=subset)
36
- if dupes.empty:
41
+ dupes = find_duplicate_rows(rows, subset=subset)
42
+
43
+ if not dupes:
37
44
  print("No duplicate rows found.")
38
45
  else:
39
46
  print(f"Found {len(dupes)} duplicate rows:")
40
- print(dupes.to_csv(index=False))
47
+ fieldnames = list(dupes[0].keys())
48
+ writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
49
+ writer.writeheader()
50
+ writer.writerows(dupes)
41
51
  return 0
42
52
 
43
53
 
44
54
  def cmd_dedupe(args: argparse.Namespace) -> int:
45
- df = pd.read_csv(args.input)
55
+ rows = read_csv_rows(args.input)
46
56
  subset = args.subset.split(",") if args.subset else None
47
57
  exclude = args.exclude.split(",") if args.exclude else None
48
58
 
49
- deduped_df, report = dedupe_with_report(
50
- df, subset=subset, exclude=exclude, keep=args.keep
59
+ deduped_rows, report = dedupe_with_report(
60
+ rows, subset=subset, exclude=exclude, keep=args.keep
51
61
  )
52
62
 
53
63
  output_path = Path(args.output) if args.output else Path(args.input).with_suffix(".deduped.csv")
54
- deduped_df.to_csv(output_path, index=False)
64
+ fieldnames = list(rows[0].keys()) if rows else []
65
+ write_csv_rows(output_path, deduped_rows, fieldnames=fieldnames)
55
66
  print(f"Wrote deduped CSV to: {output_path}")
56
67
 
57
68
  if args.report:
@@ -127,6 +138,16 @@ def cmd_drop_rows(args: argparse.Namespace) -> int:
127
138
  return 0
128
139
 
129
140
 
141
+ def cmd_string_distance(args: argparse.Namespace) -> int:
142
+ res = analyze_pair(args.string_a, args.string_b, args.ignore_case)
143
+
144
+ print(f"{'Classification':<18}: {res.get_relation_string()}")
145
+ print(f"{'D-Levenshtein Dist':<18}: {res.damerau_levenshtein_distance} changes")
146
+ print(f"{'Jaro-Winkler':<18}: {res.jaro_winkler_score:.4f}")
147
+ print(f"{'Similarity':<18}: {res.similarity_percentage:.2f}%")
148
+ return 0
149
+
150
+
130
151
  def _add_row_duplicates_parser(subparsers) -> None:
131
152
  parser = subparsers.add_parser("row-duplicates", help="Find duplicate rows in a CSV.")
132
153
  parser.add_argument("input", help="Input CSV file.")
@@ -197,6 +218,18 @@ def _add_drop_rows_parser(subparsers) -> None:
197
218
  parser.set_defaults(func=cmd_drop_rows)
198
219
 
199
220
 
221
+ def _add_string_distance_parser(subparsers) -> None:
222
+ parser = subparsers.add_parser("string-distance", help="Analyze distance between two strings.")
223
+ parser.add_argument("string_a", help="First string.")
224
+ parser.add_argument("string_b", help="Second string.")
225
+ parser.add_argument(
226
+ "--ignore-case",
227
+ action="store_true",
228
+ help="Ignore case for distance calculation.",
229
+ )
230
+ parser.set_defaults(func=cmd_string_distance)
231
+
232
+
200
233
  def build_parser() -> argparse.ArgumentParser:
201
234
  parser = argparse.ArgumentParser(
202
235
  prog="csvsmith",
@@ -215,6 +248,7 @@ def build_parser() -> argparse.ArgumentParser:
215
248
  _add_move_files_parser(subparsers)
216
249
  _add_excel_to_csv_parser(subparsers)
217
250
  _add_drop_rows_parser(subparsers)
251
+ _add_string_distance_parser(subparsers)
218
252
 
219
253
  return parser
220
254
 
@@ -0,0 +1,192 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ from collections import Counter, defaultdict
5
+ from hashlib import sha256
6
+ from pathlib import Path
7
+ from typing import Hashable, Iterable, Mapping, Optional, Sequence
8
+
9
+
10
+ ROW_SEP = "\x1f"
11
+ KEEP_OPTIONS = {"first", "last"}
12
+
13
+ Row = dict[str, str]
14
+ RowLike = Mapping[str, object]
15
+
16
+
17
+ def count_duplicates_sorted(
18
+ items: Iterable[Hashable],
19
+ threshold: int = 2,
20
+ reverse: bool = True,
21
+ ) -> list[tuple[Hashable, int]]:
22
+ """Count items and return those occurring at least `threshold` times."""
23
+ counter = Counter(items)
24
+ duplicates = [(key, count) for key, count in counter.items() if count >= threshold]
25
+ duplicates.sort(key=lambda x: x[1], reverse=reverse)
26
+ return duplicates
27
+
28
+
29
+ def read_csv_rows(csv_path: Path | str, encoding: str = "utf-8") -> list[Row]:
30
+ """Read a CSV file into a list of row dictionaries."""
31
+ path = Path(csv_path)
32
+ with path.open("r", encoding=encoding, newline="") as fp:
33
+ reader = csv.DictReader(fp)
34
+ return list(reader)
35
+
36
+
37
+ def write_csv_rows(
38
+ csv_path: Path | str,
39
+ rows: Sequence[Mapping[str, object]],
40
+ *,
41
+ fieldnames: Sequence[str],
42
+ encoding: str = "utf-8",
43
+ ) -> None:
44
+ """Write row dictionaries to a CSV file."""
45
+ path = Path(csv_path)
46
+ with path.open("w", encoding=encoding, newline="") as fp:
47
+ writer = csv.DictWriter(fp, fieldnames=fieldnames)
48
+ writer.writeheader()
49
+ for row in rows:
50
+ writer.writerow(row)
51
+
52
+
53
+ def _normalize_cell(value: object) -> str:
54
+ """Convert a cell value to a stable string for hashing."""
55
+ if value is None:
56
+ return ""
57
+ return str(value)
58
+
59
+
60
+ def _resolve_columns(
61
+ rows: Sequence[RowLike],
62
+ *,
63
+ subset: Optional[Sequence[Hashable]] = None,
64
+ exclude: Optional[Sequence[Hashable]] = None,
65
+ ) -> list[str]:
66
+ """Resolve the effective column list used for comparison."""
67
+ if subset is None:
68
+ if not rows:
69
+ return []
70
+ cols = list(rows[0].keys())
71
+ else:
72
+ cols = [str(col) for col in subset]
73
+
74
+ if exclude:
75
+ exclude_set = {str(col) for col in exclude}
76
+ cols = [col for col in cols if col not in exclude_set]
77
+
78
+ return cols
79
+
80
+
81
+ def make_row_digest(row: RowLike, *, columns: Sequence[str]) -> str:
82
+ """Build a SHA-256 digest for a row using selected columns."""
83
+ joined = ROW_SEP.join(_normalize_cell(row.get(col, "")) for col in columns)
84
+ return sha256(joined.encode("utf-8")).hexdigest()
85
+
86
+
87
+ def add_row_digest(
88
+ rows: Sequence[RowLike],
89
+ *,
90
+ subset: Optional[Sequence[Hashable]] = None,
91
+ exclude: Optional[Sequence[Hashable]] = None,
92
+ colname: str = "row_digest",
93
+ inplace: bool = False,
94
+ ) -> list[dict[str, object]]:
95
+ """Add a row digest column and return the resulting rows."""
96
+ columns = _resolve_columns(rows, subset=subset, exclude=exclude)
97
+
98
+ out = rows if inplace else [dict(row) for row in rows]
99
+
100
+ for row in out:
101
+ row[colname] = make_row_digest(row, columns=columns)
102
+
103
+ return [dict(row) for row in out]
104
+
105
+
106
+ def find_duplicate_rows(
107
+ rows: Sequence[RowLike],
108
+ *,
109
+ subset: Optional[Sequence[Hashable]] = None,
110
+ ) -> list[dict[str, object]]:
111
+ """Return only rows that participate in duplicate groups."""
112
+ columns = _resolve_columns(rows, subset=subset)
113
+
114
+ grouped: dict[str, list[int]] = defaultdict(list)
115
+ for idx, row in enumerate(rows):
116
+ digest = make_row_digest(row, columns=columns)
117
+ grouped[digest].append(idx)
118
+
119
+ dup_indices = {
120
+ idx
121
+ for indices in grouped.values()
122
+ if len(indices) > 1
123
+ for idx in indices
124
+ }
125
+
126
+ return [dict(rows[idx]) for idx in sorted(dup_indices)]
127
+
128
+
129
+ def dedupe_with_report(
130
+ rows: Sequence[RowLike],
131
+ *,
132
+ subset: Optional[Sequence[Hashable]] = None,
133
+ exclude: Optional[Sequence[Hashable]] = None,
134
+ keep: str = "first",
135
+ digest_col: str = "row_digest",
136
+ ) -> tuple[list[dict[str, object]], list[dict[str, object]]]:
137
+ """Drop duplicates and return `(deduped_rows, report)`."""
138
+ if keep not in KEEP_OPTIONS:
139
+ raise ValueError(f"keep must be one of {sorted(KEEP_OPTIONS)}")
140
+
141
+ columns = _resolve_columns(rows, subset=subset, exclude=exclude)
142
+
143
+ grouped: dict[str, list[int]] = defaultdict(list)
144
+ for idx, row in enumerate(rows):
145
+ digest = make_row_digest(row, columns=columns)
146
+ grouped[digest].append(idx)
147
+
148
+ report = [
149
+ {
150
+ digest_col: digest,
151
+ "count": len(indices),
152
+ "indices": indices,
153
+ }
154
+ for digest, indices in grouped.items()
155
+ if len(indices) > 1
156
+ ]
157
+ report.sort(key=lambda x: x["count"], reverse=True)
158
+
159
+ kept_indices: set[int] = set()
160
+ for indices in grouped.values():
161
+ kept_indices.add(indices[0] if keep == "first" else indices[-1])
162
+
163
+ deduped_rows = [
164
+ dict(row)
165
+ for idx, row in enumerate(rows)
166
+ if idx in kept_indices
167
+ ]
168
+
169
+ return deduped_rows, report
170
+
171
+
172
+ def dedupe_csv_file(
173
+ src: Path | str,
174
+ dst: Path | str,
175
+ *,
176
+ subset: Optional[Sequence[Hashable]] = None,
177
+ exclude: Optional[Sequence[Hashable]] = None,
178
+ keep: str = "first",
179
+ encoding: str = "utf-8",
180
+ ) -> list[dict[str, object]]:
181
+ """Deduplicate a CSV file, write the result, and return the report."""
182
+ rows = read_csv_rows(src, encoding=encoding)
183
+ deduped_rows, report = dedupe_with_report(
184
+ rows,
185
+ subset=subset,
186
+ exclude=exclude,
187
+ keep=keep,
188
+ )
189
+
190
+ fieldnames = list(rows[0].keys()) if rows else []
191
+ write_csv_rows(dst, deduped_rows, fieldnames=fieldnames, encoding=encoding)
192
+ return report