csvsmith 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {csvsmith-0.2.0/src/csvsmith.egg-info → csvsmith-0.2.1}/PKG-INFO +70 -73
- csvsmith-0.2.1/README.md +176 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/pyproject.toml +1 -1
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith/__init__.py +10 -3
- csvsmith-0.2.1/src/csvsmith/classify.py +326 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith/cli.py +19 -3
- {csvsmith-0.2.0 → csvsmith-0.2.1/src/csvsmith.egg-info}/PKG-INFO +70 -73
- csvsmith-0.2.1/tests/test_classify.py +152 -0
- csvsmith-0.2.0/README.md +0 -179
- csvsmith-0.2.0/src/csvsmith/classify.py +0 -143
- csvsmith-0.2.0/tests/test_classify.py +0 -60
- {csvsmith-0.2.0 → csvsmith-0.2.1}/LICENSE +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/setup.cfg +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith/duplicates.py +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith.egg-info/SOURCES.txt +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith.egg-info/dependency_links.txt +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith.egg-info/entry_points.txt +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith.egg-info/requires.txt +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/src/csvsmith.egg-info/top_level.txt +0 -0
- {csvsmith-0.2.0 → csvsmith-0.2.1}/tests/test_duplicates.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csvsmith
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Small CSV utilities: classification, duplicates, row digests, and CLI helpers.
|
|
5
5
|
Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
|
|
6
6
|
License: MIT License
|
|
@@ -51,49 +51,33 @@ Dynamic: license-file
|
|
|
51
51
|
`csvsmith` is a lightweight collection of CSV utilities designed for
|
|
52
52
|
data integrity, deduplication, and organization. It provides a robust
|
|
53
53
|
Python API for programmatic data cleaning and a convenient CLI for quick
|
|
54
|
-
operations.
|
|
55
|
-
their structural signatures or pinpoint duplicate rows in a complex
|
|
56
|
-
dataset, `csvsmith` ensures the process is predictable, transparent, and
|
|
57
|
-
reversible.
|
|
54
|
+
operations.
|
|
58
55
|
|
|
59
|
-
|
|
56
|
+
Whether you need to organize thousands of files based on their structural
|
|
57
|
+
signatures or pinpoint duplicate rows in a complex dataset, `csvsmith`
|
|
58
|
+
ensures the process is predictable, transparent, and reversible.
|
|
60
59
|
|
|
61
|
-
|
|
60
|
+
As of recent versions, CSV classification supports:
|
|
62
61
|
|
|
63
|
-
-
|
|
62
|
+
- strict vs relaxed header matching
|
|
63
|
+
- exact vs subset (“contains”) matching
|
|
64
|
+
- auto clustering with collision‑resistant hashes
|
|
65
|
+
- dry‑run preview
|
|
66
|
+
- report‑only planning mode (scan without moving)
|
|
67
|
+
- full rollback via manifest
|
|
64
68
|
|
|
65
|
-
[Python API Usage](#python-api-usage)
|
|
66
|
-
|
|
67
|
-
: - [Count duplicate values](#count-duplicate-values)
|
|
68
|
-
- [Find duplicate rows in a
|
|
69
|
-
DataFrame](#find-duplicate-rows-in-a-dataframe)
|
|
70
|
-
- [Deduplicate with report](#deduplicate-with-report)
|
|
71
|
-
- [CSV File Classification](#csv-file-classification)
|
|
72
|
-
|
|
73
|
-
-
|
|
74
|
-
|
|
75
|
-
[CLI Usage](#cli-usage)
|
|
76
|
-
|
|
77
|
-
: - [Show duplicate rows](#show-duplicate-rows)
|
|
78
|
-
- [Deduplicate and generate a duplicate
|
|
79
|
-
report](#deduplicate-and-generate-a-duplicate-report)
|
|
80
|
-
- [Classify CSVs](#classify-csvs)
|
|
81
|
-
|
|
82
|
-
- [Philosophy](#philosophy)
|
|
83
|
-
|
|
84
|
-
- [License](#license)
|
|
85
69
|
|
|
86
70
|
## Installation
|
|
87
71
|
|
|
88
72
|
From PyPI:
|
|
89
73
|
|
|
90
|
-
```
|
|
74
|
+
```bash
|
|
91
75
|
pip install csvsmith
|
|
92
76
|
```
|
|
93
77
|
|
|
94
78
|
For local development:
|
|
95
79
|
|
|
96
|
-
```
|
|
80
|
+
```bash
|
|
97
81
|
git clone https://github.com/yeiichi/csvsmith.git
|
|
98
82
|
cd csvsmith
|
|
99
83
|
python -m venv .venv
|
|
@@ -101,13 +85,12 @@ source .venv/bin/activate
|
|
|
101
85
|
pip install -e .[dev]
|
|
102
86
|
```
|
|
103
87
|
|
|
88
|
+
|
|
104
89
|
## Python API Usage
|
|
105
90
|
|
|
106
91
|
### Count duplicate values
|
|
107
92
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
``` python
|
|
93
|
+
```python
|
|
111
94
|
from csvsmith import count_duplicates_sorted
|
|
112
95
|
|
|
113
96
|
items = ["a", "b", "a", "c", "a", "b"]
|
|
@@ -115,106 +98,120 @@ print(count_duplicates_sorted(items))
|
|
|
115
98
|
# [('a', 3), ('b', 2)]
|
|
116
99
|
```
|
|
117
100
|
|
|
101
|
+
|
|
118
102
|
### Find duplicate rows in a DataFrame
|
|
119
103
|
|
|
120
|
-
```
|
|
104
|
+
```python
|
|
121
105
|
import pandas as pd
|
|
122
106
|
from csvsmith import find_duplicate_rows
|
|
123
107
|
|
|
124
108
|
df = pd.read_csv("input.csv")
|
|
125
109
|
dup_rows = find_duplicate_rows(df)
|
|
126
|
-
print(dup_rows)
|
|
127
110
|
```
|
|
128
111
|
|
|
112
|
+
|
|
129
113
|
### Deduplicate with report
|
|
130
114
|
|
|
131
|
-
```
|
|
115
|
+
```python
|
|
132
116
|
import pandas as pd
|
|
133
117
|
from csvsmith import dedupe_with_report
|
|
134
118
|
|
|
135
119
|
df = pd.read_csv("input.csv")
|
|
136
120
|
|
|
137
|
-
# Use all columns
|
|
138
121
|
deduped, report = dedupe_with_report(df)
|
|
139
122
|
deduped.to_csv("deduped.csv", index=False)
|
|
140
123
|
report.to_csv("duplicate_report.csv", index=False)
|
|
141
124
|
|
|
142
|
-
#
|
|
143
|
-
|
|
125
|
+
# Exclude columns (e.g. IDs or timestamps)
|
|
126
|
+
deduped2, report2 = dedupe_with_report(df, exclude=["id"])
|
|
144
127
|
```
|
|
145
128
|
|
|
146
|
-
### CSV File Classification
|
|
147
129
|
|
|
148
|
-
|
|
130
|
+
### CSV File Classification (Python)
|
|
149
131
|
|
|
150
|
-
```
|
|
132
|
+
```python
|
|
151
133
|
from csvsmith.classify import CSVClassifier
|
|
152
134
|
|
|
153
135
|
classifier = CSVClassifier(
|
|
154
136
|
source_dir="./raw_data",
|
|
155
137
|
dest_dir="./organized",
|
|
156
|
-
auto=True
|
|
138
|
+
auto=True,
|
|
139
|
+
mode="relaxed", # or "strict"
|
|
140
|
+
match="exact", # or "contains"
|
|
157
141
|
)
|
|
158
142
|
|
|
159
|
-
# Execute the classification
|
|
160
143
|
classifier.run()
|
|
161
144
|
|
|
162
|
-
#
|
|
163
|
-
classifier.rollback("./organized/
|
|
145
|
+
# Roll back using the generated manifest
|
|
146
|
+
classifier.rollback("./organized/manifest_YYYYMMDD_HHMMSS.json")
|
|
164
147
|
```
|
|
165
148
|
|
|
149
|
+
|
|
166
150
|
## CLI Usage
|
|
167
151
|
|
|
168
|
-
|
|
169
|
-
|
|
152
|
+
csvsmith provides a CLI for duplicate detection and CSV organization.
|
|
153
|
+
|
|
170
154
|
|
|
171
155
|
### Show duplicate rows
|
|
172
156
|
|
|
173
|
-
```
|
|
157
|
+
```bash
|
|
174
158
|
csvsmith row-duplicates input.csv
|
|
175
159
|
```
|
|
176
160
|
|
|
177
|
-
Save
|
|
161
|
+
Save duplicate rows only:
|
|
178
162
|
|
|
179
|
-
```
|
|
163
|
+
```bash
|
|
180
164
|
csvsmith row-duplicates input.csv -o duplicates_only.csv
|
|
181
165
|
```
|
|
182
166
|
|
|
183
|
-
### Deduplicate and generate a duplicate report
|
|
184
167
|
|
|
185
|
-
|
|
168
|
+
### Deduplicate and generate a report
|
|
169
|
+
|
|
170
|
+
```bash
|
|
186
171
|
csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
|
|
187
172
|
```
|
|
188
173
|
|
|
174
|
+
|
|
189
175
|
### Classify CSVs
|
|
190
176
|
|
|
191
|
-
|
|
192
|
-
|
|
177
|
+
```bash
|
|
178
|
+
# Dry-run (preview only)
|
|
179
|
+
csvsmith classify --src ./raw --dest ./out --auto --dry-run
|
|
180
|
+
|
|
181
|
+
# Exact matching (default)
|
|
182
|
+
csvsmith classify --src ./raw --dest ./out --config signatures.json
|
|
183
|
+
|
|
184
|
+
# Relaxed matching (ignore column order)
|
|
185
|
+
csvsmith classify --src ./raw --dest ./out --config signatures.json --mode relaxed
|
|
193
186
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
csvsmith classify --src ./raw_data --dest ./organized --auto --dry-run
|
|
187
|
+
# Subset matching (signature columns must be present)
|
|
188
|
+
csvsmith classify --src ./raw --dest ./out --config signatures.json --match contains
|
|
197
189
|
|
|
198
|
-
#
|
|
199
|
-
csvsmith classify --src ./
|
|
190
|
+
# Report-only (plan without moving files)
|
|
191
|
+
csvsmith classify --src ./raw --dest ./out --auto --report-only
|
|
200
192
|
|
|
201
|
-
#
|
|
202
|
-
csvsmith classify --rollback ./
|
|
193
|
+
# Roll back using manifest
|
|
194
|
+
csvsmith classify --rollback ./out/manifest_YYYYMMDD_HHMMSS.json
|
|
203
195
|
```
|
|
204
196
|
|
|
197
|
+
|
|
198
|
+
### Report-only mode
|
|
199
|
+
|
|
200
|
+
`--report-only` scans all CSVs and writes a manifest describing what *would*
|
|
201
|
+
happen, without touching the filesystem. This enables downstream pipelines
|
|
202
|
+
to consume the classification plan for custom processing.
|
|
203
|
+
|
|
204
|
+
|
|
205
205
|
## Philosophy
|
|
206
206
|
|
|
207
|
-
1.
|
|
208
|
-
2.
|
|
209
|
-
3.
|
|
210
|
-
4.
|
|
211
|
-
5.
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
it.
|
|
216
|
-
|
|
217
|
-
For more, see `MANIFESTO.md`.
|
|
207
|
+
1. CSVs deserve tools that are simple, predictable, and transparent.
|
|
208
|
+
2. A row has meaning only when its identity is stable and hashable.
|
|
209
|
+
3. Collisions are sin; determinism is virtue.
|
|
210
|
+
4. Let no delimiter sow ambiguity among fields.
|
|
211
|
+
5. Love thy \x1f — the unseen separator, guardian of clean hashes.
|
|
212
|
+
6. The pipeline should be silent unless something is wrong.
|
|
213
|
+
7. Your data deserves respect — and your tools should help you give it.
|
|
214
|
+
|
|
218
215
|
|
|
219
216
|
## License
|
|
220
217
|
|
csvsmith-0.2.1/README.md
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# csvsmith
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/csvsmith/)
|
|
4
|
+

|
|
5
|
+
[](https://pypi.org/project/csvsmith/)
|
|
6
|
+
|
|
7
|
+
## Introduction
|
|
8
|
+
|
|
9
|
+
`csvsmith` is a lightweight collection of CSV utilities designed for
|
|
10
|
+
data integrity, deduplication, and organization. It provides a robust
|
|
11
|
+
Python API for programmatic data cleaning and a convenient CLI for quick
|
|
12
|
+
operations.
|
|
13
|
+
|
|
14
|
+
Whether you need to organize thousands of files based on their structural
|
|
15
|
+
signatures or pinpoint duplicate rows in a complex dataset, `csvsmith`
|
|
16
|
+
ensures the process is predictable, transparent, and reversible.
|
|
17
|
+
|
|
18
|
+
As of recent versions, CSV classification supports:
|
|
19
|
+
|
|
20
|
+
- strict vs relaxed header matching
|
|
21
|
+
- exact vs subset (“contains”) matching
|
|
22
|
+
- auto clustering with collision‑resistant hashes
|
|
23
|
+
- dry‑run preview
|
|
24
|
+
- report‑only planning mode (scan without moving)
|
|
25
|
+
- full rollback via manifest
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
From PyPI:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install csvsmith
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
For local development:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
git clone https://github.com/yeiichi/csvsmith.git
|
|
40
|
+
cd csvsmith
|
|
41
|
+
python -m venv .venv
|
|
42
|
+
source .venv/bin/activate
|
|
43
|
+
pip install -e .[dev]
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
## Python API Usage
|
|
48
|
+
|
|
49
|
+
### Count duplicate values
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from csvsmith import count_duplicates_sorted
|
|
53
|
+
|
|
54
|
+
items = ["a", "b", "a", "c", "a", "b"]
|
|
55
|
+
print(count_duplicates_sorted(items))
|
|
56
|
+
# [('a', 3), ('b', 2)]
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
### Find duplicate rows in a DataFrame
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import pandas as pd
|
|
64
|
+
from csvsmith import find_duplicate_rows
|
|
65
|
+
|
|
66
|
+
df = pd.read_csv("input.csv")
|
|
67
|
+
dup_rows = find_duplicate_rows(df)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
### Deduplicate with report
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import pandas as pd
|
|
75
|
+
from csvsmith import dedupe_with_report
|
|
76
|
+
|
|
77
|
+
df = pd.read_csv("input.csv")
|
|
78
|
+
|
|
79
|
+
deduped, report = dedupe_with_report(df)
|
|
80
|
+
deduped.to_csv("deduped.csv", index=False)
|
|
81
|
+
report.to_csv("duplicate_report.csv", index=False)
|
|
82
|
+
|
|
83
|
+
# Exclude columns (e.g. IDs or timestamps)
|
|
84
|
+
deduped2, report2 = dedupe_with_report(df, exclude=["id"])
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
### CSV File Classification (Python)
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from csvsmith.classify import CSVClassifier
|
|
92
|
+
|
|
93
|
+
classifier = CSVClassifier(
|
|
94
|
+
source_dir="./raw_data",
|
|
95
|
+
dest_dir="./organized",
|
|
96
|
+
auto=True,
|
|
97
|
+
mode="relaxed", # or "strict"
|
|
98
|
+
match="exact", # or "contains"
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
classifier.run()
|
|
102
|
+
|
|
103
|
+
# Roll back using the generated manifest
|
|
104
|
+
classifier.rollback("./organized/manifest_YYYYMMDD_HHMMSS.json")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
## CLI Usage
|
|
109
|
+
|
|
110
|
+
csvsmith provides a CLI for duplicate detection and CSV organization.
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
### Show duplicate rows
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
csvsmith row-duplicates input.csv
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Save duplicate rows only:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
csvsmith row-duplicates input.csv -o duplicates_only.csv
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
### Deduplicate and generate a report
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
csvsmith dedupe input.csv --deduped deduped.csv --report duplicate_report.csv
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
### Classify CSVs
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
# Dry-run (preview only)
|
|
137
|
+
csvsmith classify --src ./raw --dest ./out --auto --dry-run
|
|
138
|
+
|
|
139
|
+
# Exact matching (default)
|
|
140
|
+
csvsmith classify --src ./raw --dest ./out --config signatures.json
|
|
141
|
+
|
|
142
|
+
# Relaxed matching (ignore column order)
|
|
143
|
+
csvsmith classify --src ./raw --dest ./out --config signatures.json --mode relaxed
|
|
144
|
+
|
|
145
|
+
# Subset matching (signature columns must be present)
|
|
146
|
+
csvsmith classify --src ./raw --dest ./out --config signatures.json --match contains
|
|
147
|
+
|
|
148
|
+
# Report-only (plan without moving files)
|
|
149
|
+
csvsmith classify --src ./raw --dest ./out --auto --report-only
|
|
150
|
+
|
|
151
|
+
# Roll back using manifest
|
|
152
|
+
csvsmith classify --rollback ./out/manifest_YYYYMMDD_HHMMSS.json
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
### Report-only mode
|
|
157
|
+
|
|
158
|
+
`--report-only` scans all CSVs and writes a manifest describing what *would*
|
|
159
|
+
happen, without touching the filesystem. This enables downstream pipelines
|
|
160
|
+
to consume the classification plan for custom processing.
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
## Philosophy
|
|
164
|
+
|
|
165
|
+
1. CSVs deserve tools that are simple, predictable, and transparent.
|
|
166
|
+
2. A row has meaning only when its identity is stable and hashable.
|
|
167
|
+
3. Collisions are sin; determinism is virtue.
|
|
168
|
+
4. Let no delimiter sow ambiguity among fields.
|
|
169
|
+
5. Love thy \x1f — the unseen separator, guardian of clean hashes.
|
|
170
|
+
6. The pipeline should be silent unless something is wrong.
|
|
171
|
+
7. Your data deserves respect — and your tools should help you give it.
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
## License
|
|
175
|
+
|
|
176
|
+
MIT License.
|
|
@@ -1,13 +1,20 @@
|
|
|
1
1
|
"""
|
|
2
2
|
csvsmith: small, focused CSV utilities.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Public API:
|
|
5
|
+
- count_duplicates_sorted
|
|
6
|
+
- add_row_digest
|
|
7
|
+
- find_duplicate_rows
|
|
8
|
+
- dedupe_with_report
|
|
9
|
+
- CSVClassifier
|
|
10
|
+
|
|
11
|
+
Submodules:
|
|
5
12
|
- csvsmith.duplicates
|
|
6
13
|
- csvsmith.classify
|
|
7
14
|
- csvsmith.cli (CLI entrypoint)
|
|
8
15
|
"""
|
|
9
16
|
|
|
10
|
-
__version__ = "0.2.0"
|
|
17
|
+
__version__ = "0.2.1"
|
|
11
18
|
|
|
12
19
|
from .duplicates import (
|
|
13
20
|
count_duplicates_sorted,
|
|
@@ -23,4 +30,4 @@ __all__ = [
|
|
|
23
30
|
"find_duplicate_rows",
|
|
24
31
|
"dedupe_with_report",
|
|
25
32
|
"CSVClassifier",
|
|
26
|
-
]
|
|
33
|
+
]
|