csvsmith 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvsmith/__init__.py +21 -0
- csvsmith/cli.py +215 -0
- csvsmith/duplicates.py +221 -0
- csvsmith-0.1.0.dist-info/METADATA +198 -0
- csvsmith-0.1.0.dist-info/RECORD +9 -0
- csvsmith-0.1.0.dist-info/WHEEL +5 -0
- csvsmith-0.1.0.dist-info/entry_points.txt +2 -0
- csvsmith-0.1.0.dist-info/licenses/LICENSE +21 -0
- csvsmith-0.1.0.dist-info/top_level.txt +1 -0
csvsmith/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
csvsmith: small, focused CSV utilities.
|
|
3
|
+
|
|
4
|
+
Current submodules:
|
|
5
|
+
- csvsmith.duplicates
|
|
6
|
+
- csvsmith.cli (CLI entrypoint)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .duplicates import (
|
|
10
|
+
count_duplicates_sorted,
|
|
11
|
+
add_row_digest,
|
|
12
|
+
find_duplicate_rows,
|
|
13
|
+
dedupe_with_report,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"count_duplicates_sorted",
|
|
18
|
+
"add_row_digest",
|
|
19
|
+
"find_duplicate_rows",
|
|
20
|
+
"dedupe_with_report",
|
|
21
|
+
]
|
csvsmith/cli.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
csvsmith CLI
|
|
4
|
+
|
|
5
|
+
Duplicate-related helpers on CSV files.
|
|
6
|
+
|
|
7
|
+
Subcommands:
|
|
8
|
+
- row-duplicates: show only rows that are duplicated
|
|
9
|
+
- dedupe: drop duplicates and write both deduped CSV and a report CSV
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Sequence, Optional, List
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
from .duplicates import find_duplicate_rows, dedupe_with_report
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_cols(cols: Optional[Sequence[str]]) -> Optional[List[str]]:
|
|
25
|
+
"""
|
|
26
|
+
Normalize column list arguments from CLI.
|
|
27
|
+
|
|
28
|
+
We accept:
|
|
29
|
+
--subset col1 col2 col3
|
|
30
|
+
--exclude colA colB
|
|
31
|
+
or omit entirely.
|
|
32
|
+
"""
|
|
33
|
+
if cols is None:
|
|
34
|
+
return None
|
|
35
|
+
if len(cols) == 0:
|
|
36
|
+
return None
|
|
37
|
+
return list(cols)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _effective_subset(
|
|
41
|
+
df: pd.DataFrame,
|
|
42
|
+
subset: Optional[Sequence[str]],
|
|
43
|
+
exclude: Optional[Sequence[str]],
|
|
44
|
+
) -> Optional[List[str]]:
|
|
45
|
+
"""
|
|
46
|
+
Compute the effective subset of columns to use for duplicate detection,
|
|
47
|
+
given a requested subset and/or exclude list.
|
|
48
|
+
|
|
49
|
+
Logic:
|
|
50
|
+
- if subset is None: start from all columns
|
|
51
|
+
- else: start from subset
|
|
52
|
+
- then remove any columns in exclude
|
|
53
|
+
"""
|
|
54
|
+
if subset is None:
|
|
55
|
+
cols = list(df.columns)
|
|
56
|
+
else:
|
|
57
|
+
cols = list(subset)
|
|
58
|
+
|
|
59
|
+
if exclude:
|
|
60
|
+
exclude_set = set(exclude)
|
|
61
|
+
cols = [c for c in cols if c not in exclude_set]
|
|
62
|
+
|
|
63
|
+
if not cols:
|
|
64
|
+
return None
|
|
65
|
+
|
|
66
|
+
return cols
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def cmd_row_duplicates(args: argparse.Namespace) -> int:
    """
    Handle the ``row-duplicates`` subcommand.

    Reads the input CSV, keeps only rows that participate in a
    duplicate group (per the effective subset/exclude columns), and
    writes them to ``--output`` or stdout.

    Returns:
        Process exit code: 0 on success, 1 when the input is missing.
    """
    source = Path(args.input)
    if not source.is_file():
        print(f"Error: input file not found: {source}", file=sys.stderr)
        return 1

    frame = pd.read_csv(source)
    effective = _effective_subset(
        frame,
        subset=_parse_cols(args.subset),
        exclude=_parse_cols(args.exclude),
    )
    duplicates = find_duplicate_rows(frame, subset=effective)

    # -o/--output is optional; default destination is stdout.
    if args.output:
        duplicates.to_csv(Path(args.output), index=False)
    else:
        duplicates.to_csv(sys.stdout, index=False)

    return 0
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def cmd_dedupe(args: argparse.Namespace) -> int:
    """
    Handle the ``dedupe`` subcommand.

    Reads the input CSV, drops duplicate rows according to the
    subset/exclude/keep options, and writes both the deduplicated CSV
    and a duplicate-report CSV (creating parent directories as needed).

    Returns:
        Process exit code: 0 on success, 1 when the input is missing.
    """
    input_path = Path(args.input)
    if not input_path.is_file():
        print(f"Error: input file not found: {input_path}", file=sys.stderr)
        return 1

    df = pd.read_csv(input_path)
    subset = _parse_cols(args.subset)
    exclude = _parse_cols(args.exclude)

    # BUG FIX: argparse delivers --keep as a string (choices include
    # "False"), but pandas' drop_duplicates accepts only "first",
    # "last", or the *boolean* False — the string "False" raises
    # ValueError downstream. Translate it here.
    keep = False if args.keep == "False" else args.keep

    deduped, report = dedupe_with_report(
        df,
        subset=subset,
        exclude=exclude,
        keep=keep,
        digest_col=args.digest_col,
    )

    deduped_path = Path(args.deduped)
    report_path = Path(args.report)

    deduped_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.parent.mkdir(parents=True, exist_ok=True)

    deduped.to_csv(deduped_path, index=False)
    report.to_csv(report_path, index=False)

    print(f"Wrote deduped CSV to: {deduped_path}")
    print(f"Wrote duplicate report to: {report_path}")
    return 0
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def build_parser() -> argparse.ArgumentParser:
    """
    Construct the top-level argument parser for the ``csvsmith`` CLI.

    Registers two subcommands:
      - row-duplicates: print only rows that have duplicates
      - dedupe: drop duplicates and write a duplicate-report CSV
    """
    parser = argparse.ArgumentParser(
        prog="csvsmith",
        description="Small CSV utilities (duplicates-focused, first iteration).",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    def add_column_filters(sub: argparse.ArgumentParser) -> None:
        # --subset / --exclude are shared by both subcommands.
        sub.add_argument(
            "--subset",
            nargs="*",
            help="Column names to consider when detecting duplicates. "
            "If omitted, all columns are used.",
        )
        sub.add_argument(
            "--exclude",
            nargs="*",
            help="Column names to exclude from duplicate detection. "
            "Useful for ID columns, timestamps, etc.",
        )

    # row-duplicates
    row_parser = subparsers.add_parser(
        "row-duplicates",
        help="Print only rows that have duplicates.",
    )
    row_parser.add_argument("input", help="Input CSV file.")
    add_column_filters(row_parser)
    row_parser.add_argument(
        "-o",
        "--output",
        help="Output CSV file for duplicate rows. If omitted, writes to stdout.",
    )
    row_parser.set_defaults(func=cmd_row_duplicates)

    # dedupe
    dedupe_parser = subparsers.add_parser(
        "dedupe",
        help="Drop duplicates and generate a duplicate-report CSV.",
    )
    dedupe_parser.add_argument("input", help="Input CSV file.")
    add_column_filters(dedupe_parser)
    dedupe_parser.add_argument(
        "--keep",
        choices=["first", "last", "False"],
        default="first",
        help='Which duplicate to keep (same as pandas.drop_duplicates). '
        '"False" = drop all occurrences. Default: "first".',
    )
    dedupe_parser.add_argument(
        "--digest-col",
        default="row_digest",
        help='Name of digest column used in the report. Default: "row_digest".',
    )
    dedupe_parser.add_argument(
        "--deduped",
        required=True,
        help="Path to write the deduplicated CSV.",
    )
    dedupe_parser.add_argument(
        "--report",
        required=True,
        help="Path to write the duplicate-report CSV.",
    )
    dedupe_parser.set_defaults(func=cmd_dedupe)

    return parser
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def main(argv: Optional[Sequence[str]] = None) -> int:
    """
    CLI entry point: parse arguments and dispatch to the selected
    subcommand handler.

    Returns:
        The handler's exit code, or 1 when no handler was attached to
        the parsed arguments (help is printed in that case).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Subparsers attach their handler via set_defaults(func=...); a
    # missing attribute means no subcommand handler was resolved.
    handler = getattr(args, "func", None)
    if handler is None:
        parser.print_help()
        return 1

    return handler(args)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
if __name__ == "__main__":
    # Propagate the CLI's exit code to the shell.
    sys.exit(main())
|
csvsmith/duplicates.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Duplicate-related helpers for csvsmith.
|
|
3
|
+
|
|
4
|
+
Includes:
|
|
5
|
+
- count_duplicates_sorted: generic iterable duplicate counter
|
|
6
|
+
- add_row_digest: add a SHA-256 digest per row to a DataFrame
|
|
7
|
+
- find_duplicate_rows: return only rows that have duplicates
|
|
8
|
+
- dedupe_with_report: drop duplicates and report duplicate groups
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from collections import Counter
|
|
14
|
+
from hashlib import sha256
|
|
15
|
+
from typing import Iterable, List, Tuple, Hashable, Sequence, Optional
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def count_duplicates_sorted(
    items: Iterable[Hashable],
    threshold: int = 2,
    reverse: bool = True,
) -> List[Tuple[Hashable, int]]:
    """
    Tally an iterable and report the items occurring at least
    `threshold` times, ordered by frequency.

    Args:
        items:
            Any iterable of hashable items (str, int, tuple, etc.)
        threshold:
            Minimum count to include in output (default: 2).
        reverse:
            Whether to sort in descending order (default: True).

    Returns:
        A list of (item, count) tuples sorted by frequency.
    """
    frequent = [
        (item, count)
        for item, count in Counter(items).items()
        if count >= threshold
    ]
    # sorted() is stable, so equal-count items keep first-seen order.
    return sorted(frequent, key=lambda pair: pair[1], reverse=reverse)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def add_row_digest(
    df: pd.DataFrame,
    *,
    subset: Optional[Sequence[Hashable]] = None,
    exclude: Optional[Sequence[Hashable]] = None,
    colname: str = "row_digest",
    inplace: bool = False,
) -> pd.DataFrame:
    """
    Attach a per-row SHA-256 hex-digest column to a DataFrame.

    Args:
        df:
            Input DataFrame.
        subset:
            Optional sequence of column labels to hash. None means all
            columns.
        exclude:
            Optional sequence of column labels to drop from the digest
            after `subset` is applied (e.g. ID columns, timestamps).
        colname:
            Name of the digest column to add (default: "row_digest").
        inplace:
            If True, mutate and return `df` itself; if False (default),
            return a modified copy.

    Returns:
        DataFrame with an extra column of hex digests.

    Notes:
        Cell values are stringified (NaN -> "") and joined with the
        ASCII Unit Separator (0x1F, "\\x1f") before hashing. Being a
        non-printable control character that almost never occurs in CSV
        data, it prevents accidental collisions such as
        ["ab", "c"] vs ["a", "bc"].

        Credo #5 of csvsmith: "Love thy \\x1f."
    """
    # Start from the requested subset (or every column), then drop
    # anything listed in `exclude`.
    selected = list(df.columns) if subset is None else list(subset)
    if exclude:
        unwanted = set(exclude)
        selected = [label for label in selected if label not in unwanted]

    def _digest(joined: str) -> str:
        return sha256(joined.encode("utf-8")).hexdigest()

    # Stringify, blank out missing values, join with \x1f, then hash.
    joined_rows = df[selected].astype("string").fillna("").agg("\x1f".join, axis=1)
    digests = joined_rows.map(_digest)

    target = df if inplace else df.copy()
    target[colname] = digests
    return target
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def find_duplicate_rows(
    df: pd.DataFrame,
    *,
    subset: Optional[Sequence[Hashable]] = None,
) -> pd.DataFrame:
    """
    Select every row that belongs to a duplicate group.

    Thin wrapper around ``df.duplicated(keep=False)``: a row is kept
    when at least one other row matches it on the chosen columns.

    Args:
        df:
            Input DataFrame.
        subset:
            Columns to compare when identifying duplicates. None means
            all columns.

    Returns:
        A DataFrame containing only rows that have at least one
        duplicate, preserving the original order and index.
    """
    is_duplicated = df.duplicated(subset=subset, keep=False)
    return df.loc[is_duplicated]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def dedupe_with_report(
    df: pd.DataFrame,
    *,
    subset: Optional[Sequence[Hashable]] = None,
    exclude: Optional[Sequence[Hashable]] = None,
    keep: str = "first",
    digest_col: str = "row_digest",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Drop duplicate rows *and* return a report of what was duplicated.

    Args:
        df:
            Input DataFrame.
        subset:
            Columns to consider when identifying duplicates. If None,
            all columns are used.
        exclude:
            Columns to exclude from the duplicate check and digest,
            after `subset` is applied. Useful for ID columns, timestamps, etc.
        keep:
            Which duplicate to keep: "first", "last", the string
            "False", or the boolean False. "False"/False drops every
            occurrence. Default: "first".
        digest_col:
            Name of the temporary digest column used for grouping in the
            report (default: "row_digest").

    Returns:
        (df_deduped, report)

        df_deduped:
            DataFrame with duplicates dropped according to the effective
            subset (subset minus exclude) and `keep`.

        report:
            DataFrame with one row per duplicate group, columns:
            - digest_col: the SHA-256 row digest
            - count: number of rows in this group
            - indices: list of original DataFrame index labels in this group

            Only groups with count > 1 are included, sorted by `count`
            descending.
    """
    # Effective subset for both hashing and drop_duplicates:
    # the requested subset (or all columns) minus the excluded labels.
    if subset is None:
        cols = list(df.columns)
    else:
        cols = list(subset)

    if exclude:
        exclude_set = set(exclude)
        cols = [c for c in cols if c not in exclude_set]

    # An empty selection degenerates to None ("all columns" in pandas).
    subset_for_dupes: Optional[Sequence[Hashable]] = cols or None

    # BUG FIX: pandas' drop_duplicates accepts only "first", "last", or
    # the *boolean* False; the string "False" (as the CLI delivers it)
    # raises ValueError. Accept both spellings.
    keep_arg = False if keep == "False" else keep

    # Work on a copy carrying a per-row digest of the effective subset.
    work = add_row_digest(
        df,
        subset=subset_for_dupes,
        exclude=None,
        colname=digest_col,
        inplace=False,
    )

    grouped = work.groupby(digest_col, dropna=False)

    sizes = grouped.size().rename("count")
    # BUG FIX: use `groups` (digest -> original index *labels*), not
    # `indices` (digest -> positional locations), so the report matches
    # the documented "original DataFrame indices" even when df has a
    # non-default index.
    labels_map = {digest: list(labels) for digest, labels in grouped.groups.items()}
    indices = pd.Series(labels_map, name="indices")

    report = (
        pd.concat([sizes, indices], axis=1)
        .reset_index()
        .rename(columns={"index": digest_col})
    )

    report = (
        report[report["count"] > 1]
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )

    df_deduped = df.drop_duplicates(subset=subset_for_dupes, keep=keep_arg)
    return df_deduped, report
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: csvsmith
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Small CSV utilities: duplicates, row digests, and CLI helpers.
|
|
5
|
+
Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Eiichi YAMAMOTO
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the “Software”), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/yeiichi/csvsmith
|
|
29
|
+
Project-URL: Repository, https://github.com/yeiichi/csvsmith
|
|
30
|
+
Keywords: csv,pandas,duplicates,data-cleaning
|
|
31
|
+
Classifier: Programming Language :: Python :: 3
|
|
32
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
36
|
+
Classifier: Topic :: Utilities
|
|
37
|
+
Requires-Python: >=3.10
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
License-File: LICENSE
|
|
40
|
+
Requires-Dist: pandas>=2.0
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# csvsmith
|
|
44
|
+
|
|
45
|
+
[](https://pypi.org/project/csvsmith/)
|
|
46
|
+

|
|
47
|
+
[](https://pypi.org/project/csvsmith/)
|
|
48
|
+
|
|
49
|
+
`csvsmith` is a small collection of CSV utilities.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
Current focus:
|
|
54
|
+
|
|
55
|
+
- Duplicate value counting (`count_duplicates_sorted`)
|
|
56
|
+
- Row-level digest creation (`add_row_digest`)
|
|
57
|
+
- Duplicate-row detection (`find_duplicate_rows`)
|
|
58
|
+
- Deduplication with full duplicate report (`dedupe_with_report`)
|
|
59
|
+
- Command-line interface (CLI) for quick operations
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
From PyPI (future):
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install csvsmith
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
For local development:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
git clone https://github.com/yeiichi/csvsmith.git
|
|
75
|
+
cd csvsmith
|
|
76
|
+
python -m venv .venv
|
|
77
|
+
source .venv/bin/activate
|
|
78
|
+
pip install -e .[dev]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Python API Usage
|
|
84
|
+
|
|
85
|
+
### Count duplicate values
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from csvsmith import count_duplicates_sorted
|
|
89
|
+
|
|
90
|
+
items = ["a", "b", "a", "c", "a", "b"]
|
|
91
|
+
print(count_duplicates_sorted(items))
|
|
92
|
+
# [('a', 3), ('b', 2)]
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Find duplicate rows in a DataFrame
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import pandas as pd
|
|
99
|
+
from csvsmith import find_duplicate_rows
|
|
100
|
+
|
|
101
|
+
df = pd.read_csv("input.csv")
|
|
102
|
+
dup_rows = find_duplicate_rows(df)
|
|
103
|
+
print(dup_rows)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Deduplicate with report
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
import pandas as pd
|
|
110
|
+
from csvsmith import dedupe_with_report
|
|
111
|
+
|
|
112
|
+
df = pd.read_csv("input.csv")
|
|
113
|
+
|
|
114
|
+
# Use all columns
|
|
115
|
+
deduped, report = dedupe_with_report(df)
|
|
116
|
+
deduped.to_csv("deduped.csv", index=False)
|
|
117
|
+
report.to_csv("duplicate_report.csv", index=False)
|
|
118
|
+
|
|
119
|
+
# Use all columns except an ID column
|
|
120
|
+
deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## CLI Usage
|
|
126
|
+
|
|
127
|
+
`csvsmith` includes a small command-line interface for duplicate detection
|
|
128
|
+
and CSV deduplication.
|
|
129
|
+
|
|
130
|
+
### Show duplicate rows
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
csvsmith row-duplicates input.csv
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Save only duplicate rows to a file:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
csvsmith row-duplicates input.csv -o duplicates_only.csv
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Use only a subset of columns to determine duplicates:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
csvsmith row-duplicates input.csv --subset col1 col2 -o dup_rows_subset.csv
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Exclude ID column(s) when looking for duplicates:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
csvsmith row-duplicates input.csv --exclude id -o dup_rows_no_id.csv
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Deduplicate and generate a duplicate report
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Deduplicate using selected columns
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
csvsmith dedupe input.csv --subset col1 col2 --deduped deduped_subset.csv --report duplicate_report_subset.csv
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Remove *all* occurrences of duplicated rows
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
csvsmith dedupe input.csv --subset col1 --keep False --deduped deduped_no_dups.csv --report duplicate_report_col1.csv
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Exclude “id” from duplicate logic:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
csvsmith dedupe input.csv --exclude id --deduped deduped_no_id.csv --report duplicate_report_no_id.csv
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Philosophy (“csvsmith Manifesto”)
|
|
181
|
+
|
|
182
|
+
1. CSVs deserve tools that are simple, predictable, and transparent.
|
|
183
|
+
2. A row has meaning only when its identity is stable and hashable.
|
|
184
|
+
3. Collisions are sin; determinism is virtue.
|
|
185
|
+
4. Let no delimiter sow ambiguity among fields.
|
|
186
|
+
5. **Love thy `\x1f`.**
|
|
187
|
+
The unseen separator, the quiet guardian of clean hashes.
|
|
188
|
+
Chosen not for aesthetics, but for truth.
|
|
189
|
+
6. The pipeline should be silent unless something is wrong.
|
|
190
|
+
7. Your data deserves respect — and your tools should help you give it.
|
|
191
|
+
|
|
192
|
+
For more, see `MANIFESTO.md`.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
MIT License.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
csvsmith/__init__.py,sha256=t0vhxiJFalD-oLnNt4ud06Qcv4lBtubM8-kNKwhQT3c,371
|
|
2
|
+
csvsmith/cli.py,sha256=k46U-cGjydclQ4vZ1_tsfV6xRrVgWvh_KHPmNQCDi5Y,5692
|
|
3
|
+
csvsmith/duplicates.py,sha256=PzYzb6KnEf2E6HchXJO4b4PT8URV1FUPIhI0GbupeQ4,6655
|
|
4
|
+
csvsmith-0.1.0.dist-info/licenses/LICENSE,sha256=A8UVDOPo1679c7fl5k7o7t5m7g-5TqpHo8RPzDbAUHs,1074
|
|
5
|
+
csvsmith-0.1.0.dist-info/METADATA,sha256=pMNyw-NuYq-f18Beb_rYw7UV8_Ee-pih3clrWGSu44k,5546
|
|
6
|
+
csvsmith-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
+
csvsmith-0.1.0.dist-info/entry_points.txt,sha256=9Vmom48a5jBhWflrspYBPHD_HCYP5nzzxcHCPz7tagI,47
|
|
8
|
+
csvsmith-0.1.0.dist-info/top_level.txt,sha256=BF99nykjiyBPqYqvbTuS8WS-zaVvQW-uD7uVHNTw_3Y,9
|
|
9
|
+
csvsmith-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Eiichi YAMAMOTO
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the “Software”), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
csvsmith
|