csvsmith 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvsmith-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Eiichi YAMAMOTO
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the “Software”), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.4
2
+ Name: csvsmith
3
+ Version: 0.1.0
4
+ Summary: Small CSV utilities: duplicates, row digests, and CLI helpers.
5
+ Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Eiichi YAMAMOTO
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the “Software”), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/yeiichi/csvsmith
29
+ Project-URL: Repository, https://github.com/yeiichi/csvsmith
30
+ Keywords: csv,pandas,duplicates,data-cleaning
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: Programming Language :: Python :: 3 :: Only
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Topic :: Software Development :: Libraries
36
+ Classifier: Topic :: Utilities
37
+ Requires-Python: >=3.10
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Requires-Dist: pandas>=2.0
41
+ Dynamic: license-file
42
+
43
+ # csvsmith
44
+
45
+ [![PyPI version](https://img.shields.io/pypi/v/csvsmith.svg)](https://pypi.org/project/csvsmith/)
46
+ ![Python versions](https://img.shields.io/pypi/pyversions/csvsmith.svg)
47
+ [![License](https://img.shields.io/pypi/l/csvsmith.svg)](https://pypi.org/project/csvsmith/)
48
+
49
+ `csvsmith` is a small collection of CSV utilities.
50
+
51
+ ---
52
+
53
+ Current focus:
54
+
55
+ - Duplicate value counting (`count_duplicates_sorted`)
56
+ - Row-level digest creation (`add_row_digest`)
57
+ - Duplicate-row detection (`find_duplicate_rows`)
58
+ - Deduplication with full duplicate report (`dedupe_with_report`)
59
+ - Command-line interface (CLI) for quick operations
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ From PyPI (future):
66
+
67
+ ```bash
68
+ pip install csvsmith
69
+ ```
70
+
71
+ For local development:
72
+
73
+ ```bash
74
+ git clone https://github.com/yeiichi/csvsmith.git
75
+ cd csvsmith
76
+ python -m venv .venv
77
+ source .venv/bin/activate
78
+ pip install -e .[dev]
79
+ ```
80
+
81
+ ---
82
+
83
+ ## Python API Usage
84
+
85
+ ### Count duplicate values
86
+
87
+ ```python
88
+ from csvsmith import count_duplicates_sorted
89
+
90
+ items = ["a", "b", "a", "c", "a", "b"]
91
+ print(count_duplicates_sorted(items))
92
+ # [('a', 3), ('b', 2)]
93
+ ```
94
+
95
+ ### Find duplicate rows in a DataFrame
96
+
97
+ ```python
98
+ import pandas as pd
99
+ from csvsmith import find_duplicate_rows
100
+
101
+ df = pd.read_csv("input.csv")
102
+ dup_rows = find_duplicate_rows(df)
103
+ print(dup_rows)
104
+ ```
105
+
106
+ ### Deduplicate with report
107
+
108
+ ```python
109
+ import pandas as pd
110
+ from csvsmith import dedupe_with_report
111
+
112
+ df = pd.read_csv("input.csv")
113
+
114
+ # Use all columns
115
+ deduped, report = dedupe_with_report(df)
116
+ deduped.to_csv("deduped.csv", index=False)
117
+ report.to_csv("duplicate_report.csv", index=False)
118
+
119
+ # Use all columns except an ID column
120
+ deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
121
+ ```
122
+
123
+ ---
124
+
125
+ ## CLI Usage
126
+
127
+ `csvsmith` includes a small command-line interface for duplicate detection
128
+ and CSV deduplication.
129
+
130
+ ### Show duplicate rows
131
+
132
+ ```bash
133
+ csvsmith row-duplicates input.csv
134
+ ```
135
+
136
+ Save only duplicate rows to a file:
137
+
138
+ ```bash
139
+ csvsmith row-duplicates input.csv -o duplicates_only.csv
140
+ ```
141
+
142
+ Use only a subset of columns to determine duplicates:
143
+
144
+ ```bash
145
+ csvsmith row-duplicates input.csv --subset col1 col2 -o dup_rows_subset.csv
146
+ ```
147
+
148
+ Exclude ID column(s) when looking for duplicates:
149
+
150
+ ```bash
151
+ csvsmith row-duplicates input.csv --exclude id -o dup_rows_no_id.csv
152
+ ```
153
+
154
+ ### Deduplicate and generate a duplicate report
155
+
156
+ ```bash
157
+ csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
158
+ ```
159
+
160
+ ### Deduplicate using selected columns
161
+
162
+ ```bash
163
+ csvsmith dedupe input.csv --subset col1 col2 --deduped deduped_subset.csv --report duplicate_report_subset.csv
164
+ ```
165
+
166
+ ### Remove *all* occurrences of duplicated rows
167
+
168
+ ```bash
169
+ csvsmith dedupe input.csv --subset col1 --keep False --deduped deduped_no_dups.csv --report duplicate_report_col1.csv
170
+ ```
171
+
172
+ Exclude “id” from duplicate logic:
173
+
174
+ ```bash
175
+ csvsmith dedupe input.csv --exclude id --deduped deduped_no_id.csv --report duplicate_report_no_id.csv
176
+ ```
177
+
178
+ ---
179
+
180
+ ## Philosophy (“csvsmith Manifesto”)
181
+
182
+ 1. CSVs deserve tools that are simple, predictable, and transparent.
183
+ 2. A row has meaning only when its identity is stable and hashable.
184
+ 3. Collisions are sin; determinism is virtue.
185
+ 4. Let no delimiter sow ambiguity among fields.
186
+ 5. **Love thy `\x1f`.**
187
+ The unseen separator, the quiet guardian of clean hashes.
188
+ Chosen not for aesthetics, but for truth.
189
+ 6. The pipeline should be silent unless something is wrong.
190
+ 7. Your data deserves respect — and your tools should help you give it.
191
+
192
+ For more, see `MANIFESTO.md`.
193
+
194
+ ---
195
+
196
+ ## License
197
+
198
+ MIT License.
@@ -0,0 +1,156 @@
1
+ # csvsmith
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/csvsmith.svg)](https://pypi.org/project/csvsmith/)
4
+ ![Python versions](https://img.shields.io/pypi/pyversions/csvsmith.svg)
5
+ [![License](https://img.shields.io/pypi/l/csvsmith.svg)](https://pypi.org/project/csvsmith/)
6
+
7
+ `csvsmith` is a small collection of CSV utilities.
8
+
9
+ ---
10
+
11
+ Current focus:
12
+
13
+ - Duplicate value counting (`count_duplicates_sorted`)
14
+ - Row-level digest creation (`add_row_digest`)
15
+ - Duplicate-row detection (`find_duplicate_rows`)
16
+ - Deduplication with full duplicate report (`dedupe_with_report`)
17
+ - Command-line interface (CLI) for quick operations
18
+
19
+ ---
20
+
21
+ ## Installation
22
+
23
+ From PyPI (future):
24
+
25
+ ```bash
26
+ pip install csvsmith
27
+ ```
28
+
29
+ For local development:
30
+
31
+ ```bash
32
+ git clone https://github.com/yeiichi/csvsmith.git
33
+ cd csvsmith
34
+ python -m venv .venv
35
+ source .venv/bin/activate
36
+ pip install -e .[dev]
37
+ ```
38
+
39
+ ---
40
+
41
+ ## Python API Usage
42
+
43
+ ### Count duplicate values
44
+
45
+ ```python
46
+ from csvsmith import count_duplicates_sorted
47
+
48
+ items = ["a", "b", "a", "c", "a", "b"]
49
+ print(count_duplicates_sorted(items))
50
+ # [('a', 3), ('b', 2)]
51
+ ```
52
+
53
+ ### Find duplicate rows in a DataFrame
54
+
55
+ ```python
56
+ import pandas as pd
57
+ from csvsmith import find_duplicate_rows
58
+
59
+ df = pd.read_csv("input.csv")
60
+ dup_rows = find_duplicate_rows(df)
61
+ print(dup_rows)
62
+ ```
63
+
64
+ ### Deduplicate with report
65
+
66
+ ```python
67
+ import pandas as pd
68
+ from csvsmith import dedupe_with_report
69
+
70
+ df = pd.read_csv("input.csv")
71
+
72
+ # Use all columns
73
+ deduped, report = dedupe_with_report(df)
74
+ deduped.to_csv("deduped.csv", index=False)
75
+ report.to_csv("duplicate_report.csv", index=False)
76
+
77
+ # Use all columns except an ID column
78
+ deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
79
+ ```
80
+
81
+ ---
82
+
83
+ ## CLI Usage
84
+
85
+ `csvsmith` includes a small command-line interface for duplicate detection
86
+ and CSV deduplication.
87
+
88
+ ### Show duplicate rows
89
+
90
+ ```bash
91
+ csvsmith row-duplicates input.csv
92
+ ```
93
+
94
+ Save only duplicate rows to a file:
95
+
96
+ ```bash
97
+ csvsmith row-duplicates input.csv -o duplicates_only.csv
98
+ ```
99
+
100
+ Use only a subset of columns to determine duplicates:
101
+
102
+ ```bash
103
+ csvsmith row-duplicates input.csv --subset col1 col2 -o dup_rows_subset.csv
104
+ ```
105
+
106
+ Exclude ID column(s) when looking for duplicates:
107
+
108
+ ```bash
109
+ csvsmith row-duplicates input.csv --exclude id -o dup_rows_no_id.csv
110
+ ```
111
+
112
+ ### Deduplicate and generate a duplicate report
113
+
114
+ ```bash
115
+ csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
116
+ ```
117
+
118
+ ### Deduplicate using selected columns
119
+
120
+ ```bash
121
+ csvsmith dedupe input.csv --subset col1 col2 --deduped deduped_subset.csv --report duplicate_report_subset.csv
122
+ ```
123
+
124
+ ### Remove *all* occurrences of duplicated rows
125
+
126
+ ```bash
127
+ csvsmith dedupe input.csv --subset col1 --keep False --deduped deduped_no_dups.csv --report duplicate_report_col1.csv
128
+ ```
129
+
130
+ Exclude “id” from duplicate logic:
131
+
132
+ ```bash
133
+ csvsmith dedupe input.csv --exclude id --deduped deduped_no_id.csv --report duplicate_report_no_id.csv
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Philosophy (“csvsmith Manifesto”)
139
+
140
+ 1. CSVs deserve tools that are simple, predictable, and transparent.
141
+ 2. A row has meaning only when its identity is stable and hashable.
142
+ 3. Collisions are sin; determinism is virtue.
143
+ 4. Let no delimiter sow ambiguity among fields.
144
+ 5. **Love thy `\x1f`.**
145
+ The unseen separator, the quiet guardian of clean hashes.
146
+ Chosen not for aesthetics, but for truth.
147
+ 6. The pipeline should be silent unless something is wrong.
148
+ 7. Your data deserves respect — and your tools should help you give it.
149
+
150
+ For more, see `MANIFESTO.md`.
151
+
152
+ ---
153
+
154
+ ## License
155
+
156
+ MIT License.
@@ -0,0 +1,38 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "csvsmith"
7
+ version = "0.1.0"
8
+ description = "Small CSV utilities: duplicates, row digests, and CLI helpers."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { file = "LICENSE" }
12
+
13
+ authors = [
14
+ { name = "Eiichi YAMAMOTO", email = "info@yeiichi.com" }
15
+ ]
16
+ keywords = ["csv", "pandas", "duplicates", "data-cleaning"]
17
+ classifiers = [
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3 :: Only",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Intended Audience :: Developers",
22
+ "Topic :: Software Development :: Libraries",
23
+ "Topic :: Utilities",
24
+ ]
25
+
26
+ dependencies = [
27
+ "pandas>=2.0",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/yeiichi/csvsmith"
32
+ Repository = "https://github.com/yeiichi/csvsmith"
33
+
34
+ [project.scripts]
35
+ csvsmith = "csvsmith.cli:main"
36
+
37
+ [tool.setuptools.packages.find]
38
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,21 @@
1
+ """
2
+ csvsmith: small, focused CSV utilities.
3
+
4
+ Current submodules:
5
+ - csvsmith.duplicates
6
+ - csvsmith.cli (CLI entrypoint)
7
+ """
8
+
9
+ from .duplicates import (
10
+ count_duplicates_sorted,
11
+ add_row_digest,
12
+ find_duplicate_rows,
13
+ dedupe_with_report,
14
+ )
15
+
16
+ __all__ = [
17
+ "count_duplicates_sorted",
18
+ "add_row_digest",
19
+ "find_duplicate_rows",
20
+ "dedupe_with_report",
21
+ ]
@@ -0,0 +1,215 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ csvsmith CLI
4
+
5
+ Duplicate-related helpers on CSV files.
6
+
7
+ Subcommands:
8
+ - row-duplicates: show only rows that are duplicated
9
+ - dedupe: drop duplicates and write both deduped CSV and a report CSV
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import Sequence, Optional, List
18
+
19
+ import pandas as pd
20
+
21
+ from .duplicates import find_duplicate_rows, dedupe_with_report
22
+
23
+
24
+ def _parse_cols(cols: Optional[Sequence[str]]) -> Optional[List[str]]:
25
+ """
26
+ Normalize column list arguments from CLI.
27
+
28
+ We accept:
29
+ --subset col1 col2 col3
30
+ --exclude colA colB
31
+ or omit entirely.
32
+ """
33
+ if cols is None:
34
+ return None
35
+ if len(cols) == 0:
36
+ return None
37
+ return list(cols)
38
+
39
+
40
+ def _effective_subset(
41
+ df: pd.DataFrame,
42
+ subset: Optional[Sequence[str]],
43
+ exclude: Optional[Sequence[str]],
44
+ ) -> Optional[List[str]]:
45
+ """
46
+ Compute the effective subset of columns to use for duplicate detection,
47
+ given a requested subset and/or exclude list.
48
+
49
+ Logic:
50
+ - if subset is None: start from all columns
51
+ - else: start from subset
52
+ - then remove any columns in exclude
53
+ """
54
+ if subset is None:
55
+ cols = list(df.columns)
56
+ else:
57
+ cols = list(subset)
58
+
59
+ if exclude:
60
+ exclude_set = set(exclude)
61
+ cols = [c for c in cols if c not in exclude_set]
62
+
63
+ if not cols:
64
+ return None
65
+
66
+ return cols
67
+
68
+
69
def cmd_row_duplicates(args: argparse.Namespace) -> int:
    """Handle the `row-duplicates` subcommand.

    Reads the input CSV, keeps only rows that have at least one duplicate
    (over the effective column subset), and writes them to --output or to
    stdout.  Returns a process exit code (0 on success, 1 when the input
    file is missing).
    """
    src = Path(args.input)
    if not src.is_file():
        print(f"Error: input file not found: {src}", file=sys.stderr)
        return 1

    frame = pd.read_csv(src)
    eff_subset = _effective_subset(
        frame,
        subset=_parse_cols(args.subset),
        exclude=_parse_cols(args.exclude),
    )

    duplicates = find_duplicate_rows(frame, subset=eff_subset)

    if args.output:
        duplicates.to_csv(Path(args.output), index=False)
    else:
        duplicates.to_csv(sys.stdout, index=False)

    return 0
90
+
91
+
92
def cmd_dedupe(args: argparse.Namespace) -> int:
    """Handle the `dedupe` subcommand.

    Reads the input CSV, drops duplicate rows, and writes both the
    deduplicated CSV (--deduped) and a duplicate-group report (--report).

    Returns:
        0 on success, 1 when the input file does not exist.
    """
    input_path = Path(args.input)
    if not input_path.is_file():
        print(f"Error: input file not found: {input_path}", file=sys.stderr)
        return 1

    df = pd.read_csv(input_path)
    subset = _parse_cols(args.subset)
    exclude = _parse_cols(args.exclude)

    # BUG FIX: argparse yields the literal string "False" for `--keep False`,
    # but pandas' drop_duplicates() accepts only "first", "last", or the
    # *boolean* False.  Translate here so "drop all occurrences" works
    # instead of raising ValueError downstream.
    keep = False if args.keep == "False" else args.keep

    deduped, report = dedupe_with_report(
        df,
        subset=subset,
        exclude=exclude,
        keep=keep,
        digest_col=args.digest_col,
    )

    deduped_path = Path(args.deduped)
    report_path = Path(args.report)

    # Create parent directories so nested output paths don't fail.
    deduped_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.parent.mkdir(parents=True, exist_ok=True)

    deduped.to_csv(deduped_path, index=False)
    report.to_csv(report_path, index=False)

    print(f"Wrote deduped CSV to: {deduped_path}")
    print(f"Wrote duplicate report to: {report_path}")
    return 0
122
+
123
+
124
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level argument parser for the csvsmith CLI."""
    parser = argparse.ArgumentParser(
        prog="csvsmith",
        description="Small CSV utilities (duplicates-focused, first iteration).",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # Shared help text for options that both subcommands expose.
    subset_help = (
        "Column names to consider when detecting duplicates. "
        "If omitted, all columns are used."
    )
    exclude_help = (
        "Column names to exclude from duplicate detection. "
        "Useful for ID columns, timestamps, etc."
    )

    # --- row-duplicates -------------------------------------------------
    row = commands.add_parser(
        "row-duplicates",
        help="Print only rows that have duplicates.",
    )
    row.add_argument("input", help="Input CSV file.")
    row.add_argument("--subset", nargs="*", help=subset_help)
    row.add_argument("--exclude", nargs="*", help=exclude_help)
    row.add_argument(
        "-o",
        "--output",
        help="Output CSV file for duplicate rows. If omitted, writes to stdout.",
    )
    row.set_defaults(func=cmd_row_duplicates)

    # --- dedupe ---------------------------------------------------------
    dedupe = commands.add_parser(
        "dedupe",
        help="Drop duplicates and generate a duplicate-report CSV.",
    )
    dedupe.add_argument("input", help="Input CSV file.")
    dedupe.add_argument("--subset", nargs="*", help=subset_help)
    dedupe.add_argument("--exclude", nargs="*", help=exclude_help)
    dedupe.add_argument(
        "--keep",
        choices=["first", "last", "False"],
        default="first",
        help='Which duplicate to keep (same as pandas.drop_duplicates). '
        '"False" = drop all occurrences. Default: "first".',
    )
    dedupe.add_argument(
        "--digest-col",
        default="row_digest",
        help='Name of digest column used in the report. Default: "row_digest".',
    )
    dedupe.add_argument(
        "--deduped",
        required=True,
        help="Path to write the deduplicated CSV.",
    )
    dedupe.add_argument(
        "--report",
        required=True,
        help="Path to write the duplicate-report CSV.",
    )
    dedupe.set_defaults(func=cmd_dedupe)

    return parser
199
+
200
+
201
def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point; parses `argv` and returns a process exit code."""
    parser = build_parser()
    args = parser.parse_args(argv)

    handler = getattr(args, "func", None)
    if handler is None:
        # Defensive: subparsers are required, so this normally never fires.
        parser.print_help()
        return 1

    return handler(args)
212
+
213
+
214
if __name__ == "__main__":
    # Delegate to main() and propagate its exit code to the shell.
    sys.exit(main())
@@ -0,0 +1,221 @@
1
+ """
2
+ Duplicate-related helpers for csvsmith.
3
+
4
+ Includes:
5
+ - count_duplicates_sorted: generic iterable duplicate counter
6
+ - add_row_digest: add a SHA-256 digest per row to a DataFrame
7
+ - find_duplicate_rows: return only rows that have duplicates
8
+ - dedupe_with_report: drop duplicates and report duplicate groups
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections import Counter
14
+ from hashlib import sha256
15
+ from typing import Iterable, List, Tuple, Hashable, Sequence, Optional
16
+
17
+ import pandas as pd
18
+
19
+
20
+ def count_duplicates_sorted(
21
+ items: Iterable[Hashable],
22
+ threshold: int = 2,
23
+ reverse: bool = True,
24
+ ) -> List[Tuple[Hashable, int]]:
25
+ """
26
+ Count occurrences in an iterable and return items whose frequency
27
+ is at or above `threshold`, sorted by count.
28
+
29
+ Args:
30
+ items:
31
+ Any iterable of hashable items (str, int, tuple, etc.)
32
+ threshold:
33
+ Minimum count to include in output (default: 2).
34
+ reverse:
35
+ Whether to sort in descending order (default: True).
36
+
37
+ Returns:
38
+ A list of (item, count) tuples sorted by frequency.
39
+ """
40
+ counter = Counter(items)
41
+ duplicates = [(k, v) for k, v in counter.items() if v >= threshold]
42
+ duplicates.sort(key=lambda x: x[1], reverse=reverse)
43
+ return duplicates
44
+
45
+
46
+ def add_row_digest(
47
+ df: pd.DataFrame,
48
+ *,
49
+ subset: Optional[Sequence[Hashable]] = None,
50
+ exclude: Optional[Sequence[Hashable]] = None,
51
+ colname: str = "row_digest",
52
+ inplace: bool = False,
53
+ ) -> pd.DataFrame:
54
+ """
55
+ Add a SHA-256 digest for each row of a DataFrame.
56
+
57
+ Args:
58
+ df:
59
+ Input DataFrame.
60
+ subset:
61
+ Optional list/sequence of column labels to use for the digest.
62
+ If None, all columns are used.
63
+ exclude:
64
+ Optional list/sequence of column labels to exclude from the digest,
65
+ after `subset` is applied. This is useful for excluding ID columns,
66
+ timestamps, etc.
67
+ colname:
68
+ Name of the digest column to add (default: "row_digest").
69
+ inplace:
70
+ If True, modify the original DataFrame and return it.
71
+ If False (default), return a copy.
72
+
73
+ Returns:
74
+ DataFrame with an extra column containing hex digests.
75
+
76
+ Notes:
77
+ We use the ASCII "Unit Separator" (0x1F, "\\x1f") as the internal
78
+ delimiter when concatenating row values before hashing. It is a
79
+ non-printable control character that almost never appears in normal
80
+ CSV data, which helps avoid accidental collisions like:
81
+
82
+ ["ab", "c"] vs ["a", "bc"]
83
+
84
+ Credo #5 of csvsmith: "Love thy \\x1f."
85
+ """
86
+ # Determine columns to include
87
+ if subset is None:
88
+ cols = list(df.columns)
89
+ else:
90
+ cols = list(subset)
91
+
92
+ if exclude:
93
+ exclude_set = set(exclude)
94
+ cols = [c for c in cols if c not in exclude_set]
95
+
96
+ # Convert to string, fill NaNs, and join with a non-printable separator
97
+ concatted = df[cols].astype("string").fillna("").agg("\x1f".join, axis=1)
98
+ digests = concatted.map(lambda s: sha256(s.encode("utf-8")).hexdigest())
99
+
100
+ if inplace:
101
+ df[colname] = digests
102
+ return df
103
+ else:
104
+ df2 = df.copy()
105
+ df2[colname] = digests
106
+ return df2
107
+
108
+
109
+ def find_duplicate_rows(
110
+ df: pd.DataFrame,
111
+ *,
112
+ subset: Optional[Sequence[Hashable]] = None,
113
+ ) -> pd.DataFrame:
114
+ """
115
+ Return only rows that participate in duplicates.
116
+
117
+ This is a convenience wrapper around `df.duplicated(keep=False)`.
118
+
119
+ Args:
120
+ df:
121
+ Input DataFrame.
122
+ subset:
123
+ Columns to consider when identifying duplicates. If None,
124
+ all columns are used.
125
+
126
+ Returns:
127
+ A DataFrame containing only rows that have at least one duplicate,
128
+ preserving the original order and index.
129
+ """
130
+ mask = df.duplicated(subset=subset, keep=False)
131
+ return df[mask]
132
+
133
+
134
+ def dedupe_with_report(
135
+ df: pd.DataFrame,
136
+ *,
137
+ subset: Optional[Sequence[Hashable]] = None,
138
+ exclude: Optional[Sequence[Hashable]] = None,
139
+ keep: str = "first",
140
+ digest_col: str = "row_digest",
141
+ ) -> Tuple[pd.DataFrame, pd.DataFrame]:
142
+ """
143
+ Drop duplicate rows *and* return a report of what was duplicated.
144
+
145
+ Args:
146
+ df:
147
+ Input DataFrame.
148
+ subset:
149
+ Columns to consider when identifying duplicates. If None,
150
+ all columns are used.
151
+ exclude:
152
+ Columns to exclude from the duplicate check and digest,
153
+ after `subset` is applied. Useful for ID columns, timestamps, etc.
154
+ keep:
155
+ Which duplicate to keep. Same semantics as pandas:
156
+ "first", "last", or "False" (string). Default: "first".
157
+ digest_col:
158
+ Name of the temporary digest column used for grouping in the
159
+ report (default: "row_digest").
160
+
161
+ Returns:
162
+ (df_deduped, report)
163
+
164
+ df_deduped:
165
+ DataFrame with duplicates dropped according to the effective
166
+ subset (subset minus exclude) and `keep`.
167
+
168
+ report:
169
+ DataFrame with one row per duplicate group, columns:
170
+ - digest_col: the SHA-256 row digest
171
+ - count: number of rows in this group
172
+ - indices: list of original DataFrame indices in this group
173
+
174
+ Only groups with count > 1 are included, sorted by `count`
175
+ descending.
176
+ """
177
+ # Determine effective subset for both hashing and drop_duplicates
178
+ if subset is None:
179
+ cols = list(df.columns)
180
+ else:
181
+ cols = list(subset)
182
+
183
+ if exclude:
184
+ exclude_set = set(exclude)
185
+ cols = [c for c in cols if c not in exclude_set]
186
+
187
+ subset_for_dupes: Optional[Sequence[Hashable]]
188
+ if cols:
189
+ subset_for_dupes = cols
190
+ else:
191
+ subset_for_dupes = None
192
+
193
+ # Work on a copy with a digest column, using the effective subset
194
+ work = add_row_digest(
195
+ df,
196
+ subset=subset_for_dupes,
197
+ exclude=None,
198
+ colname=digest_col,
199
+ inplace=False,
200
+ )
201
+
202
+ grouped = work.groupby(digest_col, dropna=False)
203
+
204
+ sizes = grouped.size().rename("count")
205
+ indices_map = {k: list(v) for k, v in grouped.indices.items()}
206
+ indices = pd.Series(indices_map, name="indices")
207
+
208
+ report = (
209
+ pd.concat([sizes, indices], axis=1)
210
+ .reset_index()
211
+ .rename(columns={"index": digest_col})
212
+ )
213
+
214
+ report = (
215
+ report[report["count"] > 1]
216
+ .sort_values("count", ascending=False)
217
+ .reset_index(drop=True)
218
+ )
219
+
220
+ df_deduped = df.drop_duplicates(subset=subset_for_dupes, keep=keep)
221
+ return df_deduped, report
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.4
2
+ Name: csvsmith
3
+ Version: 0.1.0
4
+ Summary: Small CSV utilities: duplicates, row digests, and CLI helpers.
5
+ Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Eiichi YAMAMOTO
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the “Software”), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/yeiichi/csvsmith
29
+ Project-URL: Repository, https://github.com/yeiichi/csvsmith
30
+ Keywords: csv,pandas,duplicates,data-cleaning
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: Programming Language :: Python :: 3 :: Only
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Topic :: Software Development :: Libraries
36
+ Classifier: Topic :: Utilities
37
+ Requires-Python: >=3.10
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Requires-Dist: pandas>=2.0
41
+ Dynamic: license-file
42
+
43
+ # csvsmith
44
+
45
+ [![PyPI version](https://img.shields.io/pypi/v/csvsmith.svg)](https://pypi.org/project/csvsmith/)
46
+ ![Python versions](https://img.shields.io/pypi/pyversions/csvsmith.svg)
47
+ [![License](https://img.shields.io/pypi/l/csvsmith.svg)](https://pypi.org/project/csvsmith/)
48
+
49
+ `csvsmith` is a small collection of CSV utilities.
50
+
51
+ ---
52
+
53
+ Current focus:
54
+
55
+ - Duplicate value counting (`count_duplicates_sorted`)
56
+ - Row-level digest creation (`add_row_digest`)
57
+ - Duplicate-row detection (`find_duplicate_rows`)
58
+ - Deduplication with full duplicate report (`dedupe_with_report`)
59
+ - Command-line interface (CLI) for quick operations
60
+
61
+ ---
62
+
63
+ ## Installation
64
+
65
+ From PyPI (future):
66
+
67
+ ```bash
68
+ pip install csvsmith
69
+ ```
70
+
71
+ For local development:
72
+
73
+ ```bash
74
+ git clone https://github.com/yeiichi/csvsmith.git
75
+ cd csvsmith
76
+ python -m venv .venv
77
+ source .venv/bin/activate
78
+ pip install -e .[dev]
79
+ ```
80
+
81
+ ---
82
+
83
+ ## Python API Usage
84
+
85
+ ### Count duplicate values
86
+
87
+ ```python
88
+ from csvsmith import count_duplicates_sorted
89
+
90
+ items = ["a", "b", "a", "c", "a", "b"]
91
+ print(count_duplicates_sorted(items))
92
+ # [('a', 3), ('b', 2)]
93
+ ```
94
+
95
+ ### Find duplicate rows in a DataFrame
96
+
97
+ ```python
98
+ import pandas as pd
99
+ from csvsmith import find_duplicate_rows
100
+
101
+ df = pd.read_csv("input.csv")
102
+ dup_rows = find_duplicate_rows(df)
103
+ print(dup_rows)
104
+ ```
105
+
106
+ ### Deduplicate with report
107
+
108
+ ```python
109
+ import pandas as pd
110
+ from csvsmith import dedupe_with_report
111
+
112
+ df = pd.read_csv("input.csv")
113
+
114
+ # Use all columns
115
+ deduped, report = dedupe_with_report(df)
116
+ deduped.to_csv("deduped.csv", index=False)
117
+ report.to_csv("duplicate_report.csv", index=False)
118
+
119
+ # Use all columns except an ID column
120
+ deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
121
+ ```
122
+
123
+ ---
124
+
125
+ ## CLI Usage
126
+
127
+ `csvsmith` includes a small command-line interface for duplicate detection
128
+ and CSV deduplication.
129
+
130
+ ### Show duplicate rows
131
+
132
+ ```bash
133
+ csvsmith row-duplicates input.csv
134
+ ```
135
+
136
+ Save only duplicate rows to a file:
137
+
138
+ ```bash
139
+ csvsmith row-duplicates input.csv -o duplicates_only.csv
140
+ ```
141
+
142
+ Use only a subset of columns to determine duplicates:
143
+
144
+ ```bash
145
+ csvsmith row-duplicates input.csv --subset col1 col2 -o dup_rows_subset.csv
146
+ ```
147
+
148
+ Exclude ID column(s) when looking for duplicates:
149
+
150
+ ```bash
151
+ csvsmith row-duplicates input.csv --exclude id -o dup_rows_no_id.csv
152
+ ```
153
+
154
+ ### Deduplicate and generate a duplicate report
155
+
156
+ ```bash
157
+ csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
158
+ ```
159
+
160
+ ### Deduplicate using selected columns
161
+
162
+ ```bash
163
+ csvsmith dedupe input.csv --subset col1 col2 --deduped deduped_subset.csv --report duplicate_report_subset.csv
164
+ ```
165
+
166
+ ### Remove *all* occurrences of duplicated rows
167
+
168
+ ```bash
169
+ csvsmith dedupe input.csv --subset col1 --keep False --deduped deduped_no_dups.csv --report duplicate_report_col1.csv
170
+ ```
171
+
172
+ Exclude “id” from duplicate logic:
173
+
174
+ ```bash
175
+ csvsmith dedupe input.csv --exclude id --deduped deduped_no_id.csv --report duplicate_report_no_id.csv
176
+ ```
177
+
178
+ ---
179
+
180
+ ## Philosophy (“csvsmith Manifesto”)
181
+
182
+ 1. CSVs deserve tools that are simple, predictable, and transparent.
183
+ 2. A row has meaning only when its identity is stable and hashable.
184
+ 3. Collisions are sin; determinism is virtue.
185
+ 4. Let no delimiter sow ambiguity among fields.
186
+ 5. **Love thy `\x1f`.**
187
+ The unseen separator, the quiet guardian of clean hashes.
188
+ Chosen not for aesthetics, but for truth.
189
+ 6. The pipeline should be silent unless something is wrong.
190
+ 7. Your data deserves respect — and your tools should help you give it.
191
+
192
+ For more, see `MANIFESTO.md`.
193
+
194
+ ---
195
+
196
+ ## License
197
+
198
+ MIT License.
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/csvsmith/__init__.py
5
+ src/csvsmith/cli.py
6
+ src/csvsmith/duplicates.py
7
+ src/csvsmith.egg-info/PKG-INFO
8
+ src/csvsmith.egg-info/SOURCES.txt
9
+ src/csvsmith.egg-info/dependency_links.txt
10
+ src/csvsmith.egg-info/entry_points.txt
11
+ src/csvsmith.egg-info/requires.txt
12
+ src/csvsmith.egg-info/top_level.txt
13
+ tests/test_duplicates.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ csvsmith = csvsmith.cli:main
@@ -0,0 +1 @@
1
+ pandas>=2.0
@@ -0,0 +1 @@
1
+ csvsmith
@@ -0,0 +1,252 @@
1
+ import pandas as pd
2
+ import pandas.testing as pdt
3
+
4
+ from csvsmith.duplicates import (
5
+ count_duplicates_sorted,
6
+ add_row_digest,
7
+ find_duplicate_rows,
8
+ dedupe_with_report,
9
+ )
10
+
11
+
12
+ # -------------------------------------------------------------------
13
+ # count_duplicates_sorted
14
+ # -------------------------------------------------------------------
15
+
16
+
17
def test_count_duplicates_sorted_basic():
    """Items seen more than once are reported with counts, most frequent first."""
    sample = ["a", "b", "a", "c", "a", "b"]
    assert count_duplicates_sorted(sample) == [("a", 3), ("b", 2)]
21
+
22
+
23
def test_count_duplicates_sorted_threshold():
    """Only items whose count reaches the threshold survive the filter."""
    sample = ["x", "x", "y", "y", "y"]
    assert count_duplicates_sorted(sample, threshold=3) == [("y", 3)]
27
+
28
+
29
def test_count_duplicates_sorted_reverse_false():
    """With reverse=False the report comes back in ascending count order."""
    sample = ["a", "b", "a", "b", "b"]
    assert count_duplicates_sorted(sample, reverse=False) == [("a", 2), ("b", 3)]
33
+
34
+
35
def test_count_duplicates_sorted_empty_input():
    """An empty iterable produces an empty report."""
    nothing: list[str] = []
    assert count_duplicates_sorted(nothing) == []
39
+
40
+
41
def test_count_duplicates_sorted_numeric_items():
    """Non-string hashables (ints here) are counted just like strings."""
    assert count_duplicates_sorted([1, 2, 2, 3, 3, 3]) == [(3, 3), (2, 2)]
45
+
46
+
47
def test_count_duplicates_sorted_threshold_above_all():
    """A threshold larger than every observed count yields an empty report."""
    assert count_duplicates_sorted(["a", "a", "b"], threshold=5) == []
51
+
52
+
53
+ # -------------------------------------------------------------------
54
+ # add_row_digest
55
+ # -------------------------------------------------------------------
56
+
57
+
58
def test_add_row_digest_basic():
    """Identical rows share a digest; a distinct row gets a different one."""
    frame = pd.DataFrame({"A": [1, 1, 2], "B": ["x", "x", "y"]})
    result = add_row_digest(frame)
    assert "row_digest" in result.columns

    digests = result["row_digest"]
    # rows 0 and 1 are byte-identical -> same digest
    assert digests.iloc[0] == digests.iloc[1]
    # row 2 differs -> different digest (collision is astronomically unlikely)
    assert digests.iloc[0] != digests.iloc[2]
67
+
68
+
69
def test_add_row_digest_subset_columns():
    """Digesting only column A ignores B, and the output column is renamable."""
    frame = pd.DataFrame({"A": [1, 1, 1], "B": ["x", "y", "z"]})
    result = add_row_digest(frame, subset=["A"], colname="digest_a")
    assert "digest_a" in result.columns
    # A is constant, so every row must hash to the same value
    assert result["digest_a"].nunique() == 1
79
+
80
+
81
def test_add_row_digest_inplace_true_returns_same_object():
    """inplace=True must mutate and return the very same DataFrame object.

    The previous version compared ``id(df)`` before and after the call, which
    is vacuously true: ``df`` is never rebound, so its id cannot change.  The
    meaningful check is identity between the return value and the argument,
    spelled idiomatically as ``out is df`` rather than ``id(out) == id(df)``.
    """
    df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    out = add_row_digest(df, inplace=True)
    assert out is df  # same object returned, not a copy
    assert "row_digest" in df.columns  # and the mutation landed in place
89
+
90
+
91
def test_add_row_digest_handles_nans():
    """Rows containing NaN still receive a full 64-char hex digest."""
    frame = pd.DataFrame({"A": [1, None, 1], "B": ["x", "x", None]})
    result = add_row_digest(frame)
    assert "row_digest" in result.columns

    col = result["row_digest"]
    assert col.notna().all()
    # 64 hex characters == a complete sha256 digest for every row
    assert (col.str.len() == 64).all()
98
+
99
+
100
def test_add_row_digest_exclude_id_column():
    """Columns listed in ``exclude`` are dropped from the digest input."""
    frame = pd.DataFrame({"id": [1, 2, 3], "value": [10, 10, 20]})

    # With "id" included, every row is unique.
    assert add_row_digest(frame)["row_digest"].nunique() == 3

    # With "id" excluded only "value" remains, so rows 0 and 1 collide.
    hashed = add_row_digest(frame, exclude=["id"])["row_digest"]
    assert hashed.iloc[0] == hashed.iloc[1]
    assert hashed.iloc[0] != hashed.iloc[2]
117
+
118
+
119
+ # -------------------------------------------------------------------
120
+ # find_duplicate_rows
121
+ # -------------------------------------------------------------------
122
+
123
+
124
def test_find_duplicate_rows_all_columns():
    """Every member of each duplicate group is returned, first occurrences included."""
    frame = pd.DataFrame(
        {
            "A": [1, 1, 2, 2, 2, 3],
            "B": ["x", "x", "y", "y", "z", "z"],
        }
    )
    # (1,"x") appears twice and (2,"y") twice; (2,"z") and (3,"z") are unique rows.
    assert find_duplicate_rows(frame).index.tolist() == [0, 1, 2, 3]
133
+
134
+
135
def test_find_duplicate_rows_subset():
    """Restricting detection to column A ignores differences in B."""
    frame = pd.DataFrame(
        {
            "A": [1, 1, 2, 2, 2, 3],
            "B": [10, 11, 20, 21, 22, 30],
        }
    )
    # A=1 appears twice and A=2 three times; A=3 is unique.
    assert find_duplicate_rows(frame, subset=["A"]).index.tolist() == [0, 1, 2, 3, 4]
144
+
145
+
146
def test_find_duplicate_rows_no_duplicates():
    """A frame of all-unique rows yields an empty result."""
    frame = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]})
    assert find_duplicate_rows(frame).empty
150
+
151
+
152
def test_find_duplicate_rows_empty_df():
    """An empty frame stays empty and keeps its column layout intact."""
    frame = pd.DataFrame(columns=["A", "B"])
    result = find_duplicate_rows(frame)
    assert result.empty
    assert result.columns.tolist() == ["A", "B"]
157
+
158
+
159
+ # -------------------------------------------------------------------
160
+ # dedupe_with_report
161
+ # -------------------------------------------------------------------
162
+
163
+
164
def test_dedupe_with_report_all_columns():
    """Full-row dedupe matches drop_duplicates; both groups show up in the report."""
    frame = pd.DataFrame(
        {
            "A": [1, 1, 2, 2, 2, 3],
            "B": ["x", "x", "y", "y", "z", "z"],
        }
    )

    cleaned, report = dedupe_with_report(frame)

    pdt.assert_frame_equal(
        cleaned.reset_index(drop=True),
        frame.drop_duplicates().reset_index(drop=True),
    )

    # Report schema and contents: two duplicate groups of size 2 each.
    assert set(report.columns) == {"row_digest", "count", "indices"}
    assert sorted(report["count"], reverse=True) == [2, 2]
182
+
183
+
184
def test_dedupe_with_report_subset_column():
    """Subset dedupe on A mirrors drop_duplicates(subset=["A"])."""
    frame = pd.DataFrame(
        {
            "A": [1, 1, 2, 2, 2, 3],
            "B": [10, 11, 20, 21, 22, 30],
        }
    )

    cleaned, report = dedupe_with_report(frame, subset=["A"])

    pdt.assert_frame_equal(
        cleaned.reset_index(drop=True),
        frame.drop_duplicates(subset=["A"]).reset_index(drop=True),
    )

    # A=2 occurs three times, A=1 twice.
    assert sorted(report["count"], reverse=True) == [3, 2]
201
+
202
+
203
def test_dedupe_with_report_no_duplicates_gives_empty_report():
    """Unique input passes through unchanged and the report is empty."""
    frame = pd.DataFrame(
        {
            "A": [1, 2, 3],
            "B": ["x", "y", "z"],
        }
    )

    cleaned, report = dedupe_with_report(frame)

    pdt.assert_frame_equal(
        cleaned.reset_index(drop=True),
        frame.drop_duplicates().reset_index(drop=True),
    )
    assert report.empty
218
+
219
+
220
def test_dedupe_with_report_keep_last():
    """``keep`` chooses which member of a duplicate group survives."""
    frame = pd.DataFrame(
        {
            "A": [1, 1, 1],
            "B": ["x", "y", "z"],
        }
    )

    kept_first, _ = dedupe_with_report(frame, subset=["A"], keep="first")
    kept_last, _ = dedupe_with_report(frame, subset=["A"], keep="last")

    assert kept_first["B"].iloc[0] == "x"
    assert kept_last["B"].iloc[0] == "z"
233
+
234
+
235
def test_dedupe_with_report_custom_digest_col_name():
    """``digest_col`` renames the digest column in the report."""
    frame = pd.DataFrame(
        {
            "A": [1, 1, 2, 2],
            "B": ["x", "x", "y", "y"],
        }
    )

    cleaned, report = dedupe_with_report(frame, digest_col="my_digest")

    pdt.assert_frame_equal(
        cleaned.reset_index(drop=True),
        frame.drop_duplicates().reset_index(drop=True),
    )

    # The report schema uses the custom name (set equality implies membership).
    assert "my_digest" in report.columns
    assert set(report.columns) == {"my_digest", "count", "indices"}