csvmedic 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvmedic-0.1.0/.gitignore +52 -0
- csvmedic-0.1.0/LICENSE +21 -0
- csvmedic-0.1.0/PKG-INFO +146 -0
- csvmedic-0.1.0/README.md +107 -0
- csvmedic-0.1.0/pyproject.toml +74 -0
- csvmedic-0.1.0/src/csvmedic/__init__.py +27 -0
- csvmedic-0.1.0/src/csvmedic/_version.py +3 -0
- csvmedic-0.1.0/src/csvmedic/accessor.py +27 -0
- csvmedic-0.1.0/src/csvmedic/batch.py +102 -0
- csvmedic-0.1.0/src/csvmedic/confidence.py +8 -0
- csvmedic-0.1.0/src/csvmedic/detectors/__init__.py +1 -0
- csvmedic-0.1.0/src/csvmedic/detectors/booleans.py +61 -0
- csvmedic-0.1.0/src/csvmedic/detectors/dates.py +479 -0
- csvmedic-0.1.0/src/csvmedic/detectors/dialect.py +165 -0
- csvmedic-0.1.0/src/csvmedic/detectors/encoding.py +121 -0
- csvmedic-0.1.0/src/csvmedic/detectors/numbers.py +193 -0
- csvmedic-0.1.0/src/csvmedic/detectors/strings.py +30 -0
- csvmedic-0.1.0/src/csvmedic/diagnosis.py +67 -0
- csvmedic-0.1.0/src/csvmedic/diff.py +146 -0
- csvmedic-0.1.0/src/csvmedic/exceptions.py +19 -0
- csvmedic-0.1.0/src/csvmedic/models.py +126 -0
- csvmedic-0.1.0/src/csvmedic/reader.py +376 -0
- csvmedic-0.1.0/src/csvmedic/schema.py +35 -0
- csvmedic-0.1.0/src/csvmedic/transformers/__init__.py +1 -0
- csvmedic-0.1.0/src/csvmedic/transformers/boolean_transformer.py +57 -0
- csvmedic-0.1.0/src/csvmedic/transformers/date_transformer.py +71 -0
- csvmedic-0.1.0/src/csvmedic/transformers/number_transformer.py +61 -0
- csvmedic-0.1.0/src/csvmedic/transformers/string_transformer.py +32 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Byte-compiled
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
26
|
+
ENV/
|
|
27
|
+
env/
|
|
28
|
+
|
|
29
|
+
# IDE
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
|
|
35
|
+
# Testing
|
|
36
|
+
.coverage
|
|
37
|
+
.pytest_cache/
|
|
38
|
+
htmlcov/
|
|
39
|
+
.tox/
|
|
40
|
+
nox/
|
|
41
|
+
|
|
42
|
+
# mypy
|
|
43
|
+
.mypy_cache/
|
|
44
|
+
.dmypy.json
|
|
45
|
+
dmypy.json
|
|
46
|
+
|
|
47
|
+
# uv
|
|
48
|
+
.uv/
|
|
49
|
+
|
|
50
|
+
# OS
|
|
51
|
+
.DS_Store
|
|
52
|
+
Thumbs.db
|
csvmedic-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 csvmedic contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
csvmedic-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: csvmedic
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatic locale-aware CSV and Excel reader with encoding, delimiter, date format, and number locale detection.
|
|
5
|
+
Project-URL: Homepage, https://github.com/csvmedic/csvmedic
|
|
6
|
+
Project-URL: Documentation, https://csvmedic.readthedocs.io
|
|
7
|
+
Project-URL: Repository, https://github.com/csvmedic/csvmedic
|
|
8
|
+
Author: csvmedic contributors
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Requires-Dist: charset-normalizer>=3.0.0
|
|
22
|
+
Requires-Dist: pandas>=1.5.0
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: clevercsv>=0.8.0; extra == 'all'
|
|
25
|
+
Requires-Dist: openpyxl>=3.1.0; extra == 'all'
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: mkdocs-material>=9.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == 'dev'
|
|
29
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
30
|
+
Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: ruff>=0.4.0; extra == 'dev'
|
|
34
|
+
Provides-Extra: excel
|
|
35
|
+
Requires-Dist: openpyxl>=3.1.0; extra == 'excel'
|
|
36
|
+
Provides-Extra: fast
|
|
37
|
+
Requires-Dist: clevercsv>=0.8.0; extra == 'fast'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# csvmedic
|
|
41
|
+
|
|
42
|
+
Automatic locale-aware CSV and Excel reader. One line to clean messy data:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
import csvmedic
|
|
46
|
+
|
|
47
|
+
df = csvmedic.read("messy_file.csv")
|
|
48
|
+
print(df.diagnosis) # See what was detected and converted
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## What it does
|
|
52
|
+
|
|
53
|
+
| Detects | Examples |
|
|
54
|
+
|--------|----------|
|
|
55
|
+
| **Encoding** | UTF-8, Windows-1252, ISO-8859-1, Shift-JIS, BOM |
|
|
56
|
+
| **Delimiter** | Comma, semicolon, tab, pipe |
|
|
57
|
+
| **Dates** | DD-MM vs MM-DD resolved statistically; ISO, European, US formats |
|
|
58
|
+
| **Numbers** | European (1.234,56) vs US (1,234.56); locale hint |
|
|
59
|
+
| **Booleans** | Yes/No, Ja/Nein, Oui/Non, Sí/No, and more |
|
|
60
|
+
| **Strings** | Preserves leading zeros (IDs like 00742) |
|
|
61
|
+
|
|
62
|
+
Every transformation is recorded in the `.diagnosis` attribute so you can audit what was changed.
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install csvmedic
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Optional extras:
|
|
71
|
+
|
|
72
|
+
- `pip install csvmedic[fast]` — better dialect detection (clevercsv)
|
|
73
|
+
- `pip install csvmedic[excel]` — .xlsx support (openpyxl)
|
|
74
|
+
- `pip install csvmedic[all]` — both
|
|
75
|
+
|
|
76
|
+
## Configuration
|
|
77
|
+
|
|
78
|
+
Override auto-detection when you know better:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
df = csvmedic.read(
|
|
82
|
+
"file.csv",
|
|
83
|
+
encoding="utf-8",
|
|
84
|
+
delimiter=";",
|
|
85
|
+
dayfirst=True, # Force DD-MM dates
|
|
86
|
+
preserve_strings=["ID"], # Never convert these columns
|
|
87
|
+
sample_rows=2000, # Rows to use for detection
|
|
88
|
+
confidence_threshold=0.75, # Min confidence to convert (0–1)
|
|
89
|
+
)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Analyze without converting
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
profile = csvmedic.read_raw("file.csv")
|
|
96
|
+
print(profile.summary())
|
|
97
|
+
print(profile.columns["Date"].details)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Schema pinning (recurring files)
|
|
101
|
+
|
|
102
|
+
Save the detected schema after the first read and reuse it so the next read skips detection:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
df = csvmedic.read("monthly_export.csv")
|
|
106
|
+
csvmedic.save_schema(df.attrs["diagnosis"].file_profile, "monthly_export.csvmedic.json")
|
|
107
|
+
|
|
108
|
+
# Next time: same encoding, delimiter, and column types, no re-detection
|
|
109
|
+
df2 = csvmedic.read("monthly_export.csv", schema="monthly_export.csvmedic.json")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Batch read with consensus
|
|
113
|
+
|
|
114
|
+
When reading many similar CSVs (e.g. one per month), use consensus so every file gets the same encoding and delimiter:
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
dfs = csvmedic.read_batch(["jan.csv", "feb.csv", "mar.csv"], use_consensus=True)
|
|
118
|
+
# Encoding and delimiter are chosen by majority across the three files.
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Diff: pandas vs csvmedic
|
|
122
|
+
|
|
123
|
+
See exactly what pandas would have changed or corrupted vs what csvmedic preserves:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
result = csvmedic.diff("leading_zeros.csv")
|
|
127
|
+
print(result.summary()) # Columns/rows that differ
|
|
128
|
+
print(result.pandas_df) # Default pandas read
|
|
129
|
+
print(result.csvmedic_df) # csvmedic read (e.g. keeps "00742" as string)
|
|
130
|
+
print(result.sample_differences) # Example (row, column, pandas_val, csvmedic_val)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## How disambiguation works
|
|
134
|
+
|
|
135
|
+
For ambiguous dates like `03/04/2025` (March 4 or April 3?), csvmedic uses the data itself: if any value has a day > 12 (e.g. `25/03/2025`), the column is treated as day-first. It also uses cross-column inference, separator hints (e.g. period = European), and sequential order. If it still can’t decide, the column stays as string and is marked ambiguous in the diagnosis.
|
|
136
|
+
|
|
137
|
+
## Documentation
|
|
138
|
+
|
|
139
|
+
- [Quickstart](docs/quickstart.md)
|
|
140
|
+
- [How it works](docs/how-it-works.md)
|
|
141
|
+
- [API reference](docs/api-reference.md)
|
|
142
|
+
- [FAQ](docs/faq.md)
|
|
143
|
+
|
|
144
|
+
## License
|
|
145
|
+
|
|
146
|
+
MIT
|
csvmedic-0.1.0/README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# csvmedic
|
|
2
|
+
|
|
3
|
+
Automatic locale-aware CSV and Excel reader. One line to clean messy data:
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
import csvmedic
|
|
7
|
+
|
|
8
|
+
df = csvmedic.read("messy_file.csv")
|
|
9
|
+
print(df.diagnosis) # See what was detected and converted
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## What it does
|
|
13
|
+
|
|
14
|
+
| Detects | Examples |
|
|
15
|
+
|--------|----------|
|
|
16
|
+
| **Encoding** | UTF-8, Windows-1252, ISO-8859-1, Shift-JIS, BOM |
|
|
17
|
+
| **Delimiter** | Comma, semicolon, tab, pipe |
|
|
18
|
+
| **Dates** | DD-MM vs MM-DD resolved statistically; ISO, European, US formats |
|
|
19
|
+
| **Numbers** | European (1.234,56) vs US (1,234.56); locale hint |
|
|
20
|
+
| **Booleans** | Yes/No, Ja/Nein, Oui/Non, Sí/No, and more |
|
|
21
|
+
| **Strings** | Preserves leading zeros (IDs like 00742) |
|
|
22
|
+
|
|
23
|
+
Every transformation is recorded in the `.diagnosis` attribute so you can audit what was changed.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install csvmedic
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Optional extras:
|
|
32
|
+
|
|
33
|
+
- `pip install csvmedic[fast]` — better dialect detection (clevercsv)
|
|
34
|
+
- `pip install csvmedic[excel]` — .xlsx support (openpyxl)
|
|
35
|
+
- `pip install csvmedic[all]` — both
|
|
36
|
+
|
|
37
|
+
## Configuration
|
|
38
|
+
|
|
39
|
+
Override auto-detection when you know better:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
df = csvmedic.read(
|
|
43
|
+
"file.csv",
|
|
44
|
+
encoding="utf-8",
|
|
45
|
+
delimiter=";",
|
|
46
|
+
dayfirst=True, # Force DD-MM dates
|
|
47
|
+
preserve_strings=["ID"], # Never convert these columns
|
|
48
|
+
sample_rows=2000, # Rows to use for detection
|
|
49
|
+
confidence_threshold=0.75, # Min confidence to convert (0–1)
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Analyze without converting
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
profile = csvmedic.read_raw("file.csv")
|
|
57
|
+
print(profile.summary())
|
|
58
|
+
print(profile.columns["Date"].details)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Schema pinning (recurring files)
|
|
62
|
+
|
|
63
|
+
Save the detected schema after the first read and reuse it so the next read skips detection:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
df = csvmedic.read("monthly_export.csv")
|
|
67
|
+
csvmedic.save_schema(df.attrs["diagnosis"].file_profile, "monthly_export.csvmedic.json")
|
|
68
|
+
|
|
69
|
+
# Next time: same encoding, delimiter, and column types, no re-detection
|
|
70
|
+
df2 = csvmedic.read("monthly_export.csv", schema="monthly_export.csvmedic.json")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Batch read with consensus
|
|
74
|
+
|
|
75
|
+
When reading many similar CSVs (e.g. one per month), use consensus so every file gets the same encoding and delimiter:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
dfs = csvmedic.read_batch(["jan.csv", "feb.csv", "mar.csv"], use_consensus=True)
|
|
79
|
+
# Encoding and delimiter are chosen by majority across the three files.
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Diff: pandas vs csvmedic
|
|
83
|
+
|
|
84
|
+
See exactly what pandas would have changed or corrupted vs what csvmedic preserves:
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
result = csvmedic.diff("leading_zeros.csv")
|
|
88
|
+
print(result.summary()) # Columns/rows that differ
|
|
89
|
+
print(result.pandas_df) # Default pandas read
|
|
90
|
+
print(result.csvmedic_df) # csvmedic read (e.g. keeps "00742" as string)
|
|
91
|
+
print(result.sample_differences) # Example (row, column, pandas_val, csvmedic_val)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## How disambiguation works
|
|
95
|
+
|
|
96
|
+
For ambiguous dates like `03/04/2025` (March 4 or April 3?), csvmedic uses the data itself: if any value has a day > 12 (e.g. `25/03/2025`), the column is treated as day-first. It also uses cross-column inference, separator hints (e.g. period = European), and sequential order. If it still can’t decide, the column stays as string and is marked ambiguous in the diagnosis.
|
|
97
|
+
|
|
98
|
+
## Documentation
|
|
99
|
+
|
|
100
|
+
- [Quickstart](docs/quickstart.md)
|
|
101
|
+
- [How it works](docs/how-it-works.md)
|
|
102
|
+
- [API reference](docs/api-reference.md)
|
|
103
|
+
- [FAQ](docs/faq.md)
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
|
|
107
|
+
MIT
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "csvmedic"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Automatic locale-aware CSV and Excel reader with encoding, delimiter, date format, and number locale detection."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "csvmedic contributors" },
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"pandas>=1.5.0",
|
|
28
|
+
"charset-normalizer>=3.0.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
fast = ["clevercsv>=0.8.0"]
|
|
33
|
+
excel = ["openpyxl>=3.1.0"]
|
|
34
|
+
all = ["clevercsv>=0.8.0", "openpyxl>=3.1.0"]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=7.0",
|
|
37
|
+
"pytest-cov>=4.0",
|
|
38
|
+
"ruff>=0.4.0",
|
|
39
|
+
"mypy>=1.8",
|
|
40
|
+
"pandas-stubs>=2.0",
|
|
41
|
+
"mkdocs-material>=9.0",
|
|
42
|
+
"mkdocstrings[python]>=0.24",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/csvmedic/csvmedic"
|
|
47
|
+
Documentation = "https://csvmedic.readthedocs.io"
|
|
48
|
+
Repository = "https://github.com/csvmedic/csvmedic"
|
|
49
|
+
|
|
50
|
+
[tool.hatch.build.targets.wheel]
|
|
51
|
+
packages = ["src/csvmedic"]
|
|
52
|
+
|
|
53
|
+
[tool.hatch.build.targets.sdist]
|
|
54
|
+
include = ["src/csvmedic"]
|
|
55
|
+
|
|
56
|
+
[tool.ruff]
|
|
57
|
+
line-length = 99
|
|
58
|
+
target-version = "py39"
|
|
59
|
+
src = ["src", "tests"]
|
|
60
|
+
|
|
61
|
+
[tool.ruff.lint]
|
|
62
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
63
|
+
|
|
64
|
+
[tool.mypy]
|
|
65
|
+
python_version = "3.9"
|
|
66
|
+
strict = true
|
|
67
|
+
warn_return_any = true
|
|
68
|
+
warn_unused_ignores = true
|
|
69
|
+
disallow_untyped_defs = true
|
|
70
|
+
|
|
71
|
+
[tool.pytest.ini_options]
|
|
72
|
+
testpaths = ["tests"]
|
|
73
|
+
addopts = "-v"
|
|
74
|
+
filterwarnings = ["ignore::DeprecationWarning"]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""csvmedic — Automatic locale-aware CSV reading."""
|
|
2
|
+
|
|
3
|
+
from csvmedic import accessor # noqa: F401 — registers df.diagnosis accessor
|
|
4
|
+
from csvmedic._version import __version__
|
|
5
|
+
from csvmedic.batch import read_batch
|
|
6
|
+
from csvmedic.diagnosis import Diagnosis, TransformationRecord
|
|
7
|
+
from csvmedic.diff import DiffResult, diff
|
|
8
|
+
from csvmedic.models import ColumnProfile, FileProfile
|
|
9
|
+
from csvmedic.reader import MedicReader, read, read_raw
|
|
10
|
+
from csvmedic.schema import load_schema, save_schema, schema_path_for_csv
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"__version__",
|
|
14
|
+
"read",
|
|
15
|
+
"read_raw",
|
|
16
|
+
"read_batch",
|
|
17
|
+
"MedicReader",
|
|
18
|
+
"Diagnosis",
|
|
19
|
+
"TransformationRecord",
|
|
20
|
+
"ColumnProfile",
|
|
21
|
+
"FileProfile",
|
|
22
|
+
"save_schema",
|
|
23
|
+
"load_schema",
|
|
24
|
+
"schema_path_for_csv",
|
|
25
|
+
"diff",
|
|
26
|
+
"DiffResult",
|
|
27
|
+
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Pandas DataFrame accessor for .diagnosis attribute."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@pd.api.extensions.register_dataframe_accessor("diagnosis")
class DiagnosisAccessor:
    """Accessor for ``df.diagnosis`` — proxies the Diagnosis object kept in ``df.attrs``.

    The reader stores its Diagnosis under ``df.attrs["diagnosis"]``; this
    accessor forwards attribute lookups to that object so callers can write
    ``df.diagnosis.some_field`` directly.
    """

    def __init__(self, pandas_obj: pd.DataFrame) -> None:
        self._obj = pandas_obj

    def __repr__(self) -> str:
        diagnosis = self._obj.attrs.get("diagnosis")
        return "No diagnosis available" if diagnosis is None else repr(diagnosis)

    def __getattr__(self, name: str) -> Any:
        # Forward unknown attributes to the stored Diagnosis object.
        diagnosis = self._obj.attrs.get("diagnosis")
        if diagnosis is None:
            raise AttributeError("No diagnosis available")
        return getattr(diagnosis, name)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Multi-file batch read with consensus detection.
|
|
3
|
+
|
|
4
|
+
When reading multiple similar CSVs (e.g. monthly exports), run encoding and
|
|
5
|
+
delimiter detection on each file's sample and use the majority result for all,
|
|
6
|
+
so every file is read with the same settings.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections import Counter
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from csvmedic.detectors.dialect import detect_dialect
|
|
18
|
+
from csvmedic.detectors.encoding import detect_encoding
|
|
19
|
+
from csvmedic.reader import _read_byte_sample, read
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _consensus_encoding_dialect(
    paths: list[Path],
) -> tuple[str, str, bool]:
    """Detect encoding/dialect per path; return (encoding, delimiter, has_header).

    Each readable file casts one vote for its detected encoding, delimiter,
    and header presence; the majority wins. Files that fail any detection
    step are skipped entirely. If no file could be analyzed, fall back to
    UTF-8, comma delimiter, and header present.
    """
    encodings: list[str] = []
    delimiters: list[str] = []
    headers: list[bool] = []
    for path in paths:
        try:
            bytes_sample, _ = _read_byte_sample(path)
            enc = detect_encoding(bytes_sample)
            decoded = bytes_sample.decode(enc.encoding, errors="replace")
            dialect = detect_dialect(None, enc.encoding, sample_text=decoded)
        except Exception:
            # One unreadable/undetectable file must not abort the whole batch.
            continue
        # Record all three votes only after every detection step succeeded so
        # the vote lists stay in lockstep. (Previously the encoding was
        # appended before dialect detection ran: a dialect failure could leave
        # `encodings` non-empty while `delimiters` stayed empty, and
        # most_common(1)[0] below would raise IndexError.)
        encodings.append(enc.encoding)
        delimiters.append(dialect.delimiter)
        headers.append(dialect.has_header)
    if not encodings:
        # Nothing readable: assume the most common CSV shape.
        return ("utf-8", ",", True)
    encoding = Counter(encodings).most_common(1)[0][0]
    delimiter = Counter(delimiters).most_common(1)[0][0]
    has_header = sum(headers) > len(headers) / 2
    return (encoding, delimiter, has_header)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def read_batch(
    paths: str | Path | list[str] | list[Path],
    *,
    encoding: str | None = None,
    delimiter: str | None = None,
    use_consensus: bool = True,
    **read_kw: Any,
) -> list[pd.DataFrame]:
    """
    Read multiple CSV files with optional consensus detection.

    When use_consensus is True (default) and no encoding/delimiter override
    is given, encoding and delimiter are detected on a sample from each file
    and the majority choice is applied to every file, so all DataFrames are
    read with identical settings. Otherwise each file is read via read()
    with its own detection or the supplied overrides.

    Parameters
    ----------
    paths : path or list of paths
        One or more paths to CSV files.
    encoding : str, optional
        If set, overrides consensus and is used for all files.
    delimiter : str, optional
        If set, overrides consensus and is used for all files.
    use_consensus : bool
        If True, run detection on each file and use majority encoding/delimiter.
    **read_kw
        Passed through to read() (e.g. sample_rows, confidence_threshold).

    Returns
    -------
    list of DataFrame
        One DataFrame per path, in order.
    """
    # Normalize to a list of Path objects.
    if isinstance(paths, (str, Path)):
        path_list = [Path(paths)]
    else:
        path_list = [Path(p) for p in paths]
    if not path_list:
        return []

    if use_consensus and encoding is None and delimiter is None:
        enc, delim, has_header = _consensus_encoding_dialect(path_list)
        # Pass consensus to read(); has_header is not a read() kwarg so we rely on detection
        read_kw["encoding"] = enc
        read_kw["delimiter"] = delim
    elif encoding is not None:
        read_kw["encoding"] = encoding
    if delimiter is not None:
        read_kw["delimiter"] = delimiter

    return [read(p, **read_kw) for p in path_list]
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Confidence scoring for ambiguous detections."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def clamp_confidence(value: float, min_val: float = 0.0, max_val: float = 1.0) -> float:
    """Clamp a confidence score to the inclusive range [min_val, max_val]."""
    if value < min_val:
        return min_val
    if value > max_val:
        return max_val
    return value
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Detectors for encoding, dialect, dates, numbers, booleans, strings."""
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Boolean variant detection across locales."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
# (true_values, false_values) per locale
BOOLEAN_MAP: list[tuple[list[str], list[str]]] = [
    (["true", "yes", "y", "1", "on"], ["false", "no", "n", "0", "off"]),
    (["ja", "j", "jaa"], ["nein", "n"]),
    (["oui", "o", "vrai"], ["non", "faux"]),
    (["sí", "si", "s"], ["no"]),
    (["vero", "sì"], ["falso"]),
    (["waar"], ["onwaar"]),
]

# Flattened lookup sets pooling every locale's variants.
ALL_TRUE: set[str] = {v for trues, _ in BOOLEAN_MAP for v in trues}
ALL_FALSE: set[str] = {v for _, falses in BOOLEAN_MAP for v in falses}


@dataclass
class BooleanDetectionResult:
    """Result of boolean detection."""

    is_boolean: bool
    confidence: float
    true_variants: list[str]
    false_variants: list[str]


def detect_boolean_column(values: list[str]) -> BooleanDetectionResult:
    """Detect if column is boolean; require >=90% of non-null values to match."""
    cleaned = [str(v).strip().lower() for v in values if v is not None and str(v).strip()]
    if not cleaned:
        return BooleanDetectionResult(False, 0.0, [], [])

    distinct = set(cleaned)
    # A genuine boolean column has at most two distinct values.
    if len(distinct) > 2:
        return BooleanDetectionResult(False, 0.0, [], [])

    vocabulary = ALL_TRUE | ALL_FALSE
    ratio = sum(v in vocabulary for v in cleaned) / len(cleaned)
    if ratio < 0.9:
        return BooleanDetectionResult(False, ratio, [], [])

    true_found = [v for v in distinct if v in ALL_TRUE]
    false_found = [v for v in distinct if v in ALL_FALSE]
    # Require at least one true-like and one false-like variant to be present.
    if not (true_found and false_found):
        return BooleanDetectionResult(False, ratio, [], [])

    return BooleanDetectionResult(
        is_boolean=True,
        confidence=min(1.0, ratio + 0.05),
        true_variants=true_found,
        false_variants=false_found,
    )
|