csvsmith-0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvsmith-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Eiichi YAMAMOTO
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ IN THE SOFTWARE.
csvsmith-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,221 @@
+ Metadata-Version: 2.4
+ Name: csvsmith
+ Version: 0.2.0
+ Summary: Small CSV utilities: classification, duplicates, row digests, and CLI helpers.
+ Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
+ License: MIT License
+
+ Copyright (c) 2025 Eiichi YAMAMOTO
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ IN THE SOFTWARE.
+
+ Project-URL: Homepage, https://github.com/yeiichi/csvsmith
+ Project-URL: Repository, https://github.com/yeiichi/csvsmith
+ Keywords: csv,pandas,duplicates,data-cleaning,file-organization
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Utilities
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pandas>=2.0
+ Dynamic: license-file
+
+ # csvsmith
+
+ [![PyPI version](https://img.shields.io/pypi/v/csvsmith.svg)](https://pypi.org/project/csvsmith/)
+ ![Python versions](https://img.shields.io/pypi/pyversions/csvsmith.svg)
+ [![License](https://img.shields.io/pypi/l/csvsmith.svg)](https://pypi.org/project/csvsmith/)
+
+ ## Introduction
+
+ `csvsmith` is a lightweight collection of CSV utilities for data
+ integrity, deduplication, and file organization. It provides a Python
+ API for programmatic data cleaning and a CLI for quick one-off
+ operations. Whether you need to organize thousands of files by their
+ structural signatures or pinpoint duplicate rows in a complex dataset,
+ `csvsmith` keeps the process predictable, transparent, and reversible.
+
+ ## Table of Contents
+
+ - [Installation](#installation)
+ - [Python API Usage](#python-api-usage)
+   - [Count duplicate values](#count-duplicate-values)
+   - [Find duplicate rows in a DataFrame](#find-duplicate-rows-in-a-dataframe)
+   - [Deduplicate with report](#deduplicate-with-report)
+   - [CSV File Classification](#csv-file-classification)
+ - [CLI Usage](#cli-usage)
+   - [Show duplicate rows](#show-duplicate-rows)
+   - [Deduplicate and generate a duplicate report](#deduplicate-and-generate-a-duplicate-report)
+   - [Classify CSVs](#classify-csvs)
+ - [Philosophy](#philosophy)
+ - [License](#license)
+
+ ## Installation
+
+ From PyPI:
+
+ ``` bash
+ pip install csvsmith
+ ```
+
+ For local development:
+
+ ``` bash
+ git clone https://github.com/yeiichi/csvsmith.git
+ cd csvsmith
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -e '.[dev]'
+ ```
+
+ ## Python API Usage
+
+ ### Count duplicate values
+
+ Works on any iterable of hashable items. Only items that appear more
+ than once are returned, together with their counts, sorted by
+ frequency.
+
+ ``` python
+ from csvsmith import count_duplicates_sorted
+
+ items = ["a", "b", "a", "c", "a", "b"]
+ print(count_duplicates_sorted(items))
+ # [('a', 3), ('b', 2)]
+ ```
+
+ ### Find duplicate rows in a DataFrame
+
+ ``` python
+ import pandas as pd
+ from csvsmith import find_duplicate_rows
+
+ df = pd.read_csv("input.csv")
+ dup_rows = find_duplicate_rows(df)
+ print(dup_rows)
+ ```
+
+ ### Deduplicate with report
+
+ ``` python
+ import pandas as pd
+ from csvsmith import dedupe_with_report
+
+ df = pd.read_csv("input.csv")
+
+ # Use all columns
+ deduped, report = dedupe_with_report(df)
+ deduped.to_csv("deduped.csv", index=False)
+ report.to_csv("duplicate_report.csv", index=False)
+
+ # Use all columns except an ID column
+ deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
+ ```
+
+ ### CSV File Classification
+
+ Organize files into directories based on their headers.
+
+ ``` python
+ from csvsmith.classify import CSVClassifier
+
+ classifier = CSVClassifier(
+     source_dir="./raw_data",
+     dest_dir="./organized",
+     auto=True  # Automatically group files with identical headers
+ )
+
+ # Execute the classification
+ classifier.run()
+
+ # Or roll back a previous run using its manifest
+ classifier.rollback("./organized/manifest_20260121_120000.json")
+ ```
+
+ ## CLI Usage
+
+ `csvsmith` includes a command-line interface for duplicate detection and
+ file organization.
+
+ ### Show duplicate rows
+
+ ``` bash
+ csvsmith row-duplicates input.csv
+ ```
+
+ Save only duplicate rows to a file:
+
+ ``` bash
+ csvsmith row-duplicates input.csv -o duplicates_only.csv
+ ```
+
+ ### Deduplicate and generate a duplicate report
+
+ ``` bash
+ csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
+ ```
+
+ ### Classify CSVs
+
+ Organize a mess of CSV files into structured folders based on their
+ column headers.
+
+ ``` bash
+ # Preview what would happen (Dry Run)
+ csvsmith classify --src ./raw_data --dest ./organized --auto --dry-run
+
+ # Run classification with a signature config
+ csvsmith classify --src ./raw_data --dest ./organized --config signatures.json
+
+ # Undo a classification run
+ csvsmith classify --rollback ./organized/manifest_20260121_120000.json
+ ```
+
+ ## Philosophy
+
+ 1. CSVs deserve tools that are simple, predictable, and transparent.
+ 2. A row has meaning only when its identity is stable and hashable.
+ 3. Collisions are sin; determinism is virtue.
+ 4. Let no delimiter sow ambiguity among fields.
+ 5. **Love thy `\x1f`.** The unseen separator, the quiet guardian of
+    clean hashes.
+ 6. The pipeline should be silent unless something is wrong.
+ 7. Your data deserves respect, and your tools should help you give it.
+
+ For more, see `MANIFESTO.md`.
+
+ ## License
+
+ MIT License.
csvsmith-0.2.0/README.md ADDED
@@ -0,0 +1,179 @@
+ # csvsmith
+
+ [![PyPI version](https://img.shields.io/pypi/v/csvsmith.svg)](https://pypi.org/project/csvsmith/)
+ ![Python versions](https://img.shields.io/pypi/pyversions/csvsmith.svg)
+ [![License](https://img.shields.io/pypi/l/csvsmith.svg)](https://pypi.org/project/csvsmith/)
+
+ ## Introduction
+
+ `csvsmith` is a lightweight collection of CSV utilities for data
+ integrity, deduplication, and file organization. It provides a Python
+ API for programmatic data cleaning and a CLI for quick one-off
+ operations. Whether you need to organize thousands of files by their
+ structural signatures or pinpoint duplicate rows in a complex dataset,
+ `csvsmith` keeps the process predictable, transparent, and reversible.
+
+ ## Table of Contents
+
+ - [Installation](#installation)
+ - [Python API Usage](#python-api-usage)
+   - [Count duplicate values](#count-duplicate-values)
+   - [Find duplicate rows in a DataFrame](#find-duplicate-rows-in-a-dataframe)
+   - [Deduplicate with report](#deduplicate-with-report)
+   - [CSV File Classification](#csv-file-classification)
+ - [CLI Usage](#cli-usage)
+   - [Show duplicate rows](#show-duplicate-rows)
+   - [Deduplicate and generate a duplicate report](#deduplicate-and-generate-a-duplicate-report)
+   - [Classify CSVs](#classify-csvs)
+ - [Philosophy](#philosophy)
+ - [License](#license)
+
+ ## Installation
+
+ From PyPI:
+
+ ``` bash
+ pip install csvsmith
+ ```
+
+ For local development:
+
+ ``` bash
+ git clone https://github.com/yeiichi/csvsmith.git
+ cd csvsmith
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -e '.[dev]'
+ ```
+
+ ## Python API Usage
+
+ ### Count duplicate values
+
+ Works on any iterable of hashable items. Only items that appear more
+ than once are returned, together with their counts, sorted by
+ frequency.
+
+ ``` python
+ from csvsmith import count_duplicates_sorted
+
+ items = ["a", "b", "a", "c", "a", "b"]
+ print(count_duplicates_sorted(items))
+ # [('a', 3), ('b', 2)]
+ ```
+
+ ### Find duplicate rows in a DataFrame
+
+ ``` python
+ import pandas as pd
+ from csvsmith import find_duplicate_rows
+
+ df = pd.read_csv("input.csv")
+ dup_rows = find_duplicate_rows(df)
+ print(dup_rows)
+ ```
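+
+ For orientation, a plain-pandas sketch with similar intent (an
+ assumption that `find_duplicate_rows` keeps every member of each
+ duplicate group; the package's exact semantics may differ):
+
+ ``` python
+ import pandas as pd
+
+ df = pd.read_csv("input.csv")
+
+ # Hypothetical stand-in, not the package's implementation:
+ # keep=False marks every member of each duplicate group.
+ dup_rows = df[df.duplicated(keep=False)]
+ ```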
+
+ ### Deduplicate with report
+
+ ``` python
+ import pandas as pd
+ from csvsmith import dedupe_with_report
+
+ df = pd.read_csv("input.csv")
+
+ # Use all columns
+ deduped, report = dedupe_with_report(df)
+ deduped.to_csv("deduped.csv", index=False)
+ report.to_csv("duplicate_report.csv", index=False)
+
+ # Use all columns except an ID column
+ deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
+ ```
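+
+ The `exclude` idea maps onto plain pandas as follows (a concept
+ sketch, not the package's implementation, which lives in
+ `csvsmith.duplicates`):
+
+ ``` python
+ import pandas as pd
+
+ df = pd.read_csv("input.csv")
+
+ # Hypothetical equivalent of exclude=["id"]: compare on all other columns.
+ subset = [c for c in df.columns if c != "id"]
+ deduped = df.drop_duplicates(subset=subset)
+ report = df[df.duplicated(subset=subset, keep=False)]
+ ```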
+
+ ### CSV File Classification
+
+ Organize files into directories based on their headers.
+
+ ``` python
+ from csvsmith.classify import CSVClassifier
+
+ classifier = CSVClassifier(
+     source_dir="./raw_data",
+     dest_dir="./organized",
+     auto=True  # Automatically group files with identical headers
+ )
+
+ # Execute the classification
+ classifier.run()
+
+ # Or roll back a previous run using its manifest
+ classifier.rollback("./organized/manifest_20260121_120000.json")
+ ```
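+
+ Predefined categories are supported too: the `signatures` argument maps
+ a destination folder to the header columns a file must contain to match
+ (see `CSVClassifier.run` in `classify.py`). The folder names and columns
+ below are made-up placeholders:
+
+ ``` python
+ from csvsmith.classify import CSVClassifier
+
+ classifier = CSVClassifier(
+     source_dir="./raw_data",
+     dest_dir="./organized",
+     signatures={
+         "sales": ["order_id", "amount", "date"],
+         "contacts": ["name", "email"],
+     },
+ )
+ classifier.run()  # Files matching no signature land in "unclassified"
+ ```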
+
+ ## CLI Usage
+
+ `csvsmith` includes a command-line interface for duplicate detection and
+ file organization.
+
+ ### Show duplicate rows
+
+ ``` bash
+ csvsmith row-duplicates input.csv
+ ```
+
+ Save only duplicate rows to a file:
+
+ ``` bash
+ csvsmith row-duplicates input.csv -o duplicates_only.csv
+ ```
+
+ ### Deduplicate and generate a duplicate report
+
+ ``` bash
+ csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
+ ```
+
+ ### Classify CSVs
+
+ Organize a mess of CSV files into structured folders based on their
+ column headers.
+
+ ``` bash
+ # Preview what would happen (Dry Run)
+ csvsmith classify --src ./raw_data --dest ./organized --auto --dry-run
+
+ # Run classification with a signature config
+ csvsmith classify --src ./raw_data --dest ./organized --config signatures.json
+
+ # Undo a classification run
+ csvsmith classify --rollback ./organized/manifest_20260121_120000.json
+ ```
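+
+ Judging from `CSVClassifier`, the signature config is a JSON object
+ mapping a destination folder to the headers a file must contain to land
+ there (`cli.py` is not included in this diff, so treat the exact format
+ as an assumption). A hypothetical `signatures.json` with placeholder
+ names:
+
+ ``` json
+ {
+     "sales": ["order_id", "amount", "date"],
+     "contacts": ["name", "email"]
+ }
+ ```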
+
+ ## Philosophy
+
+ 1. CSVs deserve tools that are simple, predictable, and transparent.
+ 2. A row has meaning only when its identity is stable and hashable.
+ 3. Collisions are sin; determinism is virtue.
+ 4. Let no delimiter sow ambiguity among fields.
+ 5. **Love thy `\x1f`.** The unseen separator, the quiet guardian of
+    clean hashes.
+ 6. The pipeline should be silent unless something is wrong.
+ 7. Your data deserves respect, and your tools should help you give it.
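+
+ Point 5 above is concrete advice: joining fields with a common glue
+ character such as `,` before hashing lets distinct rows collide, while
+ the ASCII unit separator `\x1f` almost never occurs in real field
+ values. A minimal standard-library sketch of the idea (illustrative,
+ not the package's code):
+
+ ``` python
+ import hashlib
+
+ def row_digest(fields):
+     # "\x1f" keeps field boundaries unambiguous before hashing.
+     joined = "\x1f".join(str(f) for f in fields)
+     return hashlib.sha256(joined.encode("utf-8")).hexdigest()
+
+ # With "," as the glue, both rows would flatten to "a,b,c" and collide:
+ print(row_digest(["a,b", "c"]) == row_digest(["a", "b,c"]))  # False
+ ```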
+
+ For more, see `MANIFESTO.md`.
+
+ ## License
+
+ MIT License.
csvsmith-0.2.0/pyproject.toml ADDED
@@ -0,0 +1,38 @@
+ [build-system]
+ requires = ["setuptools>=68.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "csvsmith"
+ version = "0.2.0"
+ description = "Small CSV utilities: classification, duplicates, row digests, and CLI helpers."
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = { file = "LICENSE" }
+
+ authors = [
+     { name = "Eiichi YAMAMOTO", email = "info@yeiichi.com" }
+ ]
+ keywords = ["csv", "pandas", "duplicates", "data-cleaning", "file-organization"]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3 :: Only",
+     "License :: OSI Approved :: MIT License",
+     "Intended Audience :: Developers",
+     "Topic :: Software Development :: Libraries",
+     "Topic :: Utilities",
+ ]
+
+ dependencies = [
+     "pandas>=2.0",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/yeiichi/csvsmith"
+ Repository = "https://github.com/yeiichi/csvsmith"
+
+ [project.scripts]
+ csvsmith = "csvsmith.cli:main"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
csvsmith-0.2.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
csvsmith-0.2.0/src/csvsmith/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """
+ csvsmith: small, focused CSV utilities.
+
+ Current submodules:
+ - csvsmith.duplicates
+ - csvsmith.classify
+ - csvsmith.cli (CLI entrypoint)
+ """
+
+ __version__ = "0.2.0"
+
+ from .duplicates import (
+     count_duplicates_sorted,
+     add_row_digest,
+     find_duplicate_rows,
+     dedupe_with_report,
+ )
+ from .classify import CSVClassifier
+
+ __all__ = [
+     "count_duplicates_sorted",
+     "add_row_digest",
+     "find_duplicate_rows",
+     "dedupe_with_report",
+     "CSVClassifier",
+ ]
csvsmith-0.2.0/src/csvsmith/classify.py ADDED
@@ -0,0 +1,143 @@
+ import csv
+ import json
+ import shutil
+ from datetime import datetime
+ from pathlib import Path
+
+
+ class CSVClassifier:
+     """
+     Classifies CSV files into folders based on header signatures.
+     Supports predefined mapping, auto-discovery, dry-runs, and rollbacks.
+     """
+
+     def __init__(self, source_dir, dest_dir, signatures=None, auto=False, dry_run=False):
+         self.source = Path(source_dir)
+         self.dest = Path(dest_dir)
+         self.signatures = signatures or {}
+         self.auto = auto
+         self.dry_run = dry_run
+         self.manifest = {
+             "source_path": str(self.source.absolute()),
+             "timestamp": datetime.now().isoformat(),
+             "operations": []
+         }
+
+     def _get_headers(self, file_path):
+         """Validates that the file is a CSV and extracts the first row as a header."""
+         if file_path.suffix.lower() != '.csv':
+             return None
+         try:
+             with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
+                 reader = csv.reader(f)
+                 header = next(reader, None)
+
+                 if not header:
+                     return None
+
+                 # Rule: If the first row is purely numeric, it is data, not a header.
+                 if all(str(c).strip().replace('.', '', 1).isdigit() for c in header if c.strip()):
+                     return None
+
+                 return [h.strip() for h in header if h.strip()]
+         except (UnicodeDecodeError, csv.Error):
+             return None
+
+     def _move_file(self, file_path, category, headers):
+         """Executes move with duplicate protection and records in manifest."""
+         target_dir = self.dest / category
+         dest_file = target_dir / file_path.name
+
+         # Handle duplicate filenames in destination
+         if dest_file.exists() and not self.dry_run:
+             timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+             dest_file = target_dir / f"{file_path.stem}_{timestamp}{file_path.suffix}"
+
+         operation_log = {
+             "original_path": str(file_path.absolute()),
+             "moved_to": str(dest_file.absolute()) if not self.dry_run else "simulated",
+             "category": category,
+             "headers": headers,
+             "status": "pending"
+         }
+
+         if self.dry_run:
+             print(f"[DRY RUN] Would move: {file_path.name} -> {category}/")
+             operation_log["status"] = "simulated"
+         else:
+             try:
+                 target_dir.mkdir(parents=True, exist_ok=True)
+                 shutil.move(str(file_path), str(dest_file))
+                 print(f"Moved: {file_path.name} -> {category}/")
+                 operation_log["status"] = "success"
+             except Exception as e:
+                 print(f"Failed to move {file_path.name}: {e}")
+                 operation_log["status"] = "failed"
+
+         self.manifest["operations"].append(operation_log)
+
+     def _save_manifest(self):
+         """Saves the session manifest to the destination directory."""
+         if not self.manifest["operations"] or self.dry_run:
+             return
+
+         ts = datetime.now().strftime('%Y%m%d_%H%M%S')
+         manifest_path = self.dest / f"manifest_{ts}.json"
+
+         with open(manifest_path, 'w', encoding='utf-8') as f:
+             json.dump(self.manifest, f, indent=4)
+         print(f"\nManifest saved: {manifest_path}")
+
+     def rollback(self, manifest_path):
+         """Reverses operations defined in a manifest file."""
+         m_path = Path(manifest_path)
+         if not m_path.exists():
+             print(f"Error: Manifest {manifest_path} not found.")
+             return
+
+         # Read the manifest back with the same encoding it was written with.
+         with open(m_path, 'r', encoding='utf-8') as f:
+             data = json.load(f)
+
+         print(f"Starting rollback for session: {data.get('timestamp')}")
+         for op in data.get("operations", []):
+             if op["status"] != "success":
+                 continue
+
+             current_loc = Path(op["moved_to"])
+             original_loc = Path(op["original_path"])
+
+             if current_loc.exists():
+                 if self.dry_run:
+                     print(f"[DRY RUN] Would restore: {current_loc.name} -> {original_loc}")
+                 else:
+                     original_loc.parent.mkdir(parents=True, exist_ok=True)
+                     shutil.move(str(current_loc), str(original_loc))
+                     print(f"Restored: {current_loc.name}")
+             else:
+                 print(f"Warning: Could not find file to restore: {current_loc}")
+
+     def run(self):
+         """Standard classification run."""
+         if not self.source.is_dir():
+             print(f"Error: Source directory {self.source} does not exist.")
+             return
+
+         for file in self.source.glob("*.csv"):
+             headers = self._get_headers(file)
+             target_sub = "unclassified"
+
+             if headers:
+                 match_found = False
+                 for cat, reqs in self.signatures.items():
+                     if all(r in headers for r in reqs):
+                         target_sub = cat
+                         match_found = True
+                         break
+
+                 if not match_found and self.auto:
+                     slug = "_".join(sorted(headers))[:50]
+                     target_sub = f"cluster_{slug}"
+
+             self._move_file(file, target_sub, headers)
+
+         self._save_manifest()