csvsmith-0.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvsmith-0.2.0/LICENSE +21 -0
- csvsmith-0.2.0/PKG-INFO +221 -0
- csvsmith-0.2.0/README.md +179 -0
- csvsmith-0.2.0/pyproject.toml +38 -0
- csvsmith-0.2.0/setup.cfg +4 -0
- csvsmith-0.2.0/src/csvsmith/__init__.py +26 -0
- csvsmith-0.2.0/src/csvsmith/classify.py +143 -0
- csvsmith-0.2.0/src/csvsmith/cli.py +261 -0
- csvsmith-0.2.0/src/csvsmith/duplicates.py +221 -0
- csvsmith-0.2.0/src/csvsmith.egg-info/PKG-INFO +221 -0
- csvsmith-0.2.0/src/csvsmith.egg-info/SOURCES.txt +15 -0
- csvsmith-0.2.0/src/csvsmith.egg-info/dependency_links.txt +1 -0
- csvsmith-0.2.0/src/csvsmith.egg-info/entry_points.txt +2 -0
- csvsmith-0.2.0/src/csvsmith.egg-info/requires.txt +1 -0
- csvsmith-0.2.0/src/csvsmith.egg-info/top_level.txt +1 -0
- csvsmith-0.2.0/tests/test_classify.py +60 -0
- csvsmith-0.2.0/tests/test_duplicates.py +252 -0
csvsmith-0.2.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Eiichi YAMAMOTO
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
csvsmith-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,221 @@
+Metadata-Version: 2.4
+Name: csvsmith
+Version: 0.2.0
+Summary: Small CSV utilities: classification, duplicates, row digests, and CLI helpers.
+Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
+License: MIT License
+
+Copyright (c) 2025 Eiichi YAMAMOTO
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+
+Project-URL: Homepage, https://github.com/yeiichi/csvsmith
+Project-URL: Repository, https://github.com/yeiichi/csvsmith
+Keywords: csv,pandas,duplicates,data-cleaning,file-organization
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Utilities
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=2.0
+Dynamic: license-file
+
+# csvsmith
+
+[](https://pypi.org/project/csvsmith/)
+
+[](https://pypi.org/project/csvsmith/)
+
+## Introduction
+
+`csvsmith` is a lightweight collection of CSV utilities designed for
+data integrity, deduplication, and organization. It provides a robust
+Python API for programmatic data cleaning and a convenient CLI for quick
+operations. Whether you need to organize thousands of files based on
+their structural signatures or pinpoint duplicate rows in a complex
+dataset, `csvsmith` ensures the process is predictable, transparent, and
+reversible.
+
+## Table of Contents
+
+- [Installation](#installation)
+- [Python API Usage](#python-api-usage)
+  - [Count duplicate values](#count-duplicate-values)
+  - [Find duplicate rows in a DataFrame](#find-duplicate-rows-in-a-dataframe)
+  - [Deduplicate with report](#deduplicate-with-report)
+  - [CSV File Classification](#csv-file-classification)
+- [CLI Usage](#cli-usage)
+  - [Show duplicate rows](#show-duplicate-rows)
+  - [Deduplicate and generate a duplicate report](#deduplicate-and-generate-a-duplicate-report)
+  - [Classify CSVs](#classify-csvs)
+- [Philosophy](#philosophy)
+- [License](#license)
+
+## Installation
+
+From PyPI:
+
+``` bash
+pip install csvsmith
+```
+
+For local development:
+
+``` bash
+git clone https://github.com/yeiichi/csvsmith.git
+cd csvsmith
+python -m venv .venv
+source .venv/bin/activate
+pip install -e .[dev]
+```
+
+## Python API Usage
+
+### Count duplicate values
+
+Works on any iterable of hashable items.
+
+``` python
+from csvsmith import count_duplicates_sorted
+
+items = ["a", "b", "a", "c", "a", "b"]
+print(count_duplicates_sorted(items))
+# [('a', 3), ('b', 2)]
+```
+
+### Find duplicate rows in a DataFrame
+
+``` python
+import pandas as pd
+from csvsmith import find_duplicate_rows
+
+df = pd.read_csv("input.csv")
+dup_rows = find_duplicate_rows(df)
+print(dup_rows)
+```
+
+### Deduplicate with report
+
+``` python
+import pandas as pd
+from csvsmith import dedupe_with_report
+
+df = pd.read_csv("input.csv")
+
+# Use all columns
+deduped, report = dedupe_with_report(df)
+deduped.to_csv("deduped.csv", index=False)
+report.to_csv("duplicate_report.csv", index=False)
+
+# Use all columns except an ID column
+deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
+```
+
+### CSV File Classification
+
+Organize files into directories based on their headers.
+
+``` python
+from csvsmith.classify import CSVClassifier
+
+classifier = CSVClassifier(
+    source_dir="./raw_data",
+    dest_dir="./organized",
+    auto=True  # Automatically group files with identical headers
+)
+
+# Execute the classification
+classifier.run()
+
+# Or rollback a previous run using its manifest
+classifier.rollback("./organized/manifest_20260121_120000.json")
+```
+
+## CLI Usage
+
+`csvsmith` includes a command-line interface for duplicate detection and
+file organization.
+
+### Show duplicate rows
+
+``` bash
+csvsmith row-duplicates input.csv
+```
+
+Save only duplicate rows to a file:
+
+``` bash
+csvsmith row-duplicates input.csv -o duplicates_only.csv
+```
+
+### Deduplicate and generate a duplicate report
+
+``` bash
+csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
+```
+
+### Classify CSVs
+
+Organize a mess of CSV files into structured folders based on their
+column headers.
+
+``` bash
+# Preview what would happen (Dry Run)
+csvsmith classify --src ./raw_data --dest ./organized --auto --dry-run
+
+# Run classification with a signature config
+csvsmith classify --src ./raw_data --dest ./organized --config signatures.json
+
+# Undo a classification run
+csvsmith classify --rollback ./organized/manifest_20260121_120000.json
+```
+
+## Philosophy
+
+1. CSVs deserve tools that are simple, predictable, and transparent.
+2. A row has meaning only when its identity is stable and hashable.
+3. Collisions are sin; determinism is virtue.
+4. Let no delimiter sow ambiguity among fields.
+5. **Love thy \x1f.** The unseen separator, the quiet guardian of
+   clean hashes.
+6. The pipeline should be silent unless something is wrong.
+7. Your data deserves respect --- and your tools should help you give
+   it.
+
+For more, see `MANIFESTO.md`.
+
+## License
+
+MIT License.
csvsmith-0.2.0/README.md
ADDED
@@ -0,0 +1,179 @@
+# csvsmith
+
+[](https://pypi.org/project/csvsmith/)
+
+[](https://pypi.org/project/csvsmith/)
+
+## Introduction
+
+`csvsmith` is a lightweight collection of CSV utilities designed for
+data integrity, deduplication, and organization. It provides a robust
+Python API for programmatic data cleaning and a convenient CLI for quick
+operations. Whether you need to organize thousands of files based on
+their structural signatures or pinpoint duplicate rows in a complex
+dataset, `csvsmith` ensures the process is predictable, transparent, and
+reversible.
+
+## Table of Contents
+
+- [Installation](#installation)
+- [Python API Usage](#python-api-usage)
+  - [Count duplicate values](#count-duplicate-values)
+  - [Find duplicate rows in a DataFrame](#find-duplicate-rows-in-a-dataframe)
+  - [Deduplicate with report](#deduplicate-with-report)
+  - [CSV File Classification](#csv-file-classification)
+- [CLI Usage](#cli-usage)
+  - [Show duplicate rows](#show-duplicate-rows)
+  - [Deduplicate and generate a duplicate report](#deduplicate-and-generate-a-duplicate-report)
+  - [Classify CSVs](#classify-csvs)
+- [Philosophy](#philosophy)
+- [License](#license)
+
+## Installation
+
+From PyPI:
+
+``` bash
+pip install csvsmith
+```
+
+For local development:
+
+``` bash
+git clone https://github.com/yeiichi/csvsmith.git
+cd csvsmith
+python -m venv .venv
+source .venv/bin/activate
+pip install -e .[dev]
+```
+
+## Python API Usage
+
+### Count duplicate values
+
+Works on any iterable of hashable items.
+
+``` python
+from csvsmith import count_duplicates_sorted
+
+items = ["a", "b", "a", "c", "a", "b"]
+print(count_duplicates_sorted(items))
+# [('a', 3), ('b', 2)]
+```
+
+### Find duplicate rows in a DataFrame
+
+``` python
+import pandas as pd
+from csvsmith import find_duplicate_rows
+
+df = pd.read_csv("input.csv")
+dup_rows = find_duplicate_rows(df)
+print(dup_rows)
+```
+
+### Deduplicate with report
+
+``` python
+import pandas as pd
+from csvsmith import dedupe_with_report
+
+df = pd.read_csv("input.csv")
+
+# Use all columns
+deduped, report = dedupe_with_report(df)
+deduped.to_csv("deduped.csv", index=False)
+report.to_csv("duplicate_report.csv", index=False)
+
+# Use all columns except an ID column
+deduped_no_id, report_no_id = dedupe_with_report(df, exclude=["id"])
+```
+
+### CSV File Classification
+
+Organize files into directories based on their headers.
+
+``` python
+from csvsmith.classify import CSVClassifier
+
+classifier = CSVClassifier(
+    source_dir="./raw_data",
+    dest_dir="./organized",
+    auto=True  # Automatically group files with identical headers
+)
+
+# Execute the classification
+classifier.run()
+
+# Or rollback a previous run using its manifest
+classifier.rollback("./organized/manifest_20260121_120000.json")
+```
+
+## CLI Usage
+
+`csvsmith` includes a command-line interface for duplicate detection and
+file organization.
+
+### Show duplicate rows
+
+``` bash
+csvsmith row-duplicates input.csv
+```
+
+Save only duplicate rows to a file:
+
+``` bash
+csvsmith row-duplicates input.csv -o duplicates_only.csv
+```
+
+### Deduplicate and generate a duplicate report
+
+``` bash
+csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
+```
+
+### Classify CSVs
+
+Organize a mess of CSV files into structured folders based on their
+column headers.
+
+``` bash
+# Preview what would happen (Dry Run)
+csvsmith classify --src ./raw_data --dest ./organized --auto --dry-run
+
+# Run classification with a signature config
+csvsmith classify --src ./raw_data --dest ./organized --config signatures.json
+
+# Undo a classification run
+csvsmith classify --rollback ./organized/manifest_20260121_120000.json
+```
+
+## Philosophy
+
+1. CSVs deserve tools that are simple, predictable, and transparent.
+2. A row has meaning only when its identity is stable and hashable.
+3. Collisions are sin; determinism is virtue.
+4. Let no delimiter sow ambiguity among fields.
+5. **Love thy \x1f.** The unseen separator, the quiet guardian of
+   clean hashes.
+6. The pipeline should be silent unless something is wrong.
+7. Your data deserves respect --- and your tools should help you give
+   it.
+
+For more, see `MANIFESTO.md`.
+
+## License
+
+MIT License.
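Principles 4 and 5 of the README's Philosophy allude to hashing rows joined on the ASCII Unit Separator (`\x1f`). A minimal sketch of the idea (not csvsmith's actual `add_row_digest` implementation; `duplicates.py` is not expanded in this diff): joining fields on a character that should never appear in the data keeps distinct rows from hashing alike.

``` python
import hashlib

def row_digest(fields, sep="\x1f"):
    """Hash a row by joining its fields on the ASCII Unit Separator."""
    return hashlib.sha256(sep.join(fields).encode("utf-8")).hexdigest()

# A naive comma join collapses these two distinct rows into "a,b,c":
assert ",".join(["a,b", "c"]) == ",".join(["a", "b,c"])
# The \x1f join keeps their digests distinct:
assert row_digest(["a,b", "c"]) != row_digest(["a", "b,c"])
```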
csvsmith-0.2.0/pyproject.toml
ADDED
@@ -0,0 +1,38 @@
+[build-system]
+requires = ["setuptools>=68.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "csvsmith"
+version = "0.2.0"
+description = "Small CSV utilities: classification, duplicates, row digests, and CLI helpers."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+
+authors = [
+    { name = "Eiichi YAMAMOTO", email = "info@yeiichi.com" }
+]
+keywords = ["csv", "pandas", "duplicates", "data-cleaning", "file-organization"]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "License :: OSI Approved :: MIT License",
+    "Intended Audience :: Developers",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Utilities",
+]
+
+dependencies = [
+    "pandas>=2.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/yeiichi/csvsmith"
+Repository = "https://github.com/yeiichi/csvsmith"
+
+[project.scripts]
+csvsmith = "csvsmith.cli:main"
+
+[tool.setuptools.packages.find]
+where = ["src"]
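The `[project.scripts]` table above is what creates the `csvsmith` command on install: setuptools generates a launcher that imports `csvsmith.cli` and calls `main()` with no arguments. The actual `cli.py` (261 lines, per the file list) is not expanded in this diff; the sketch below only illustrates the zero-argument entry-point shape that the declaration requires, with subcommand names taken from the README.

``` python
# Hypothetical sketch of the entry-point shape required by
# `csvsmith = "csvsmith.cli:main"`; not the package's actual cli.py.
import argparse

def main() -> None:
    parser = argparse.ArgumentParser(prog="csvsmith")
    sub = parser.add_subparsers(dest="command", required=True)
    sub.add_parser("row-duplicates", help="show duplicate rows in a CSV")
    sub.add_parser("dedupe", help="deduplicate a CSV and write a report")
    sub.add_parser("classify", help="sort CSV files by header signature")
    args = parser.parse_args()
    print(f"(sketch) would dispatch: {args.command}")

if __name__ == "__main__":
    main()
```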
csvsmith-0.2.0/src/csvsmith/__init__.py
ADDED
@@ -0,0 +1,26 @@
+"""
+csvsmith: small, focused CSV utilities.
+
+Current submodules:
+- csvsmith.duplicates
+- csvsmith.classify
+- csvsmith.cli (CLI entrypoint)
+"""
+
+__version__ = "0.2.0"
+
+from .duplicates import (
+    count_duplicates_sorted,
+    add_row_digest,
+    find_duplicate_rows,
+    dedupe_with_report,
+)
+from .classify import CSVClassifier
+
+__all__ = [
+    "count_duplicates_sorted",
+    "add_row_digest",
+    "find_duplicate_rows",
+    "dedupe_with_report",
+    "CSVClassifier",
+]
csvsmith-0.2.0/src/csvsmith/classify.py
ADDED
@@ -0,0 +1,143 @@
+import csv
+import json
+import shutil
+from datetime import datetime
+from pathlib import Path
+
+
+class CSVClassifier:
+    """
+    Classifies CSV files into folders based on header signatures.
+    Supports predefined mapping, auto-discovery, dry-runs, and rollbacks.
+    """
+
+    def __init__(self, source_dir, dest_dir, signatures=None, auto=False, dry_run=False):
+        self.source = Path(source_dir)
+        self.dest = Path(dest_dir)
+        self.signatures = signatures or {}
+        self.auto = auto
+        self.dry_run = dry_run
+        self.manifest = {
+            "source_path": str(self.source.absolute()),
+            "timestamp": datetime.now().isoformat(),
+            "operations": []
+        }
+
+    def _get_headers(self, file_path):
+        """Validates if file is a CSV and extracts the first row as a header."""
+        if not file_path.suffix.lower() == '.csv':
+            return None
+        try:
+            with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
+                reader = csv.reader(f)
+                header = next(reader, None)
+
+                if not header:
+                    return None
+
+                # Rule: If the first row is purely numeric, it is data, not a header.
+                if all(str(c).strip().replace('.', '', 1).isdigit() for c in header if c.strip()):
+                    return None
+
+                return [h.strip() for h in header if h.strip()]
+        except (UnicodeDecodeError, csv.Error):
+            return None
+
+    def _move_file(self, file_path, category, headers):
+        """Executes move with duplicate protection and records in manifest."""
+        target_dir = self.dest / category
+        dest_file = target_dir / file_path.name
+
+        # Handle duplicate filenames in destination
+        if dest_file.exists() and not self.dry_run:
+            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+            dest_file = target_dir / f"{file_path.stem}_{timestamp}{file_path.suffix}"
+
+        operation_log = {
+            "original_path": str(file_path.absolute()),
+            "moved_to": str(dest_file.absolute()) if not self.dry_run else "simulated",
+            "category": category,
+            "headers": headers,
+            "status": "pending"
+        }
+
+        if self.dry_run:
+            print(f"[DRY RUN] Would move: {file_path.name} -> {category}/")
+            operation_log["status"] = "simulated"
+        else:
+            try:
+                target_dir.mkdir(parents=True, exist_ok=True)
+                shutil.move(str(file_path), str(dest_file))
+                print(f"Moved: {file_path.name} -> {category}/")
+                operation_log["status"] = "success"
+            except Exception as e:
+                print(f"Failed to move {file_path.name}: {e}")
+                operation_log["status"] = "failed"
+
+        self.manifest["operations"].append(operation_log)
+
+    def _save_manifest(self):
+        """Saves the session manifest to the destination directory."""
+        if not self.manifest["operations"] or self.dry_run:
+            return
+
+        ts = datetime.now().strftime('%Y%m%d_%H%M%S')
+        manifest_path = self.dest / f"manifest_{ts}.json"
+
+        with open(manifest_path, 'w', encoding='utf-8') as f:
+            json.dump(self.manifest, f, indent=4)
+        print(f"\nManifest saved: {manifest_path}")
+
+    def rollback(self, manifest_path):
+        """Reverses operations defined in a manifest file."""
+        m_path = Path(manifest_path)
+        if not m_path.exists():
+            print(f"Error: Manifest {manifest_path} not found.")
+            return
+
+        with open(m_path, 'r') as f:
+            data = json.load(f)
+
+        print(f"Starting rollback for session: {data.get('timestamp')}")
+        for op in data.get("operations", []):
+            if op["status"] != "success":
+                continue
+
+            current_loc = Path(op["moved_to"])
+            original_loc = Path(op["original_path"])
+
+            if current_loc.exists():
+                if self.dry_run:
+                    print(f"[DRY RUN] Would restore: {current_loc.name} -> {original_loc}")
+                else:
+                    original_loc.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.move(str(current_loc), str(original_loc))
+                    print(f"Restored: {current_loc.name}")
+            else:
+                print(f"Warning: Could not find file to restore: {current_loc}")
+
+    def run(self):
+        """Standard classification run."""
+        if not self.source.is_dir():
+            print(f"Error: Source directory {self.source} does not exist.")
+            return
+
+        for file in self.source.glob("*.csv"):
+            headers = self._get_headers(file)
+            target_sub = "unclassified"
+
+            if headers:
+                match_found = False
+                for cat, reqs in self.signatures.items():
+                    if all(r in headers for r in reqs):
+                        target_sub = cat
+                        match_found = True
+                        break
+
+                if not match_found and self.auto:
+                    slug = "_".join(sorted(headers))[:50]
+                    target_sub = f"cluster_{slug}"
+
+            self._move_file(file, target_sub, headers)
+
+        self._save_manifest()