py-devo 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_devo-0.2.0/LICENSE +6 -0
- py_devo-0.2.0/PKG-INFO +167 -0
- py_devo-0.2.0/README.md +153 -0
- py_devo-0.2.0/devo/__init__.py +11 -0
- py_devo-0.2.0/devo/_infer.py +112 -0
- py_devo-0.2.0/devo/_parser.py +91 -0
- py_devo-0.2.0/devo/_report.py +88 -0
- py_devo-0.2.0/devo/_schema.py +111 -0
- py_devo-0.2.0/devo/cli.py +104 -0
- py_devo-0.2.0/devo/enrich.py +234 -0
- py_devo-0.2.0/devo/exceptions.py +14 -0
- py_devo-0.2.0/devo/validate.py +219 -0
- py_devo-0.2.0/devo/webui.py +46 -0
- py_devo-0.2.0/py_devo.egg-info/PKG-INFO +167 -0
- py_devo-0.2.0/py_devo.egg-info/SOURCES.txt +25 -0
- py_devo-0.2.0/py_devo.egg-info/dependency_links.txt +1 -0
- py_devo-0.2.0/py_devo.egg-info/entry_points.txt +2 -0
- py_devo-0.2.0/py_devo.egg-info/requires.txt +4 -0
- py_devo-0.2.0/py_devo.egg-info/top_level.txt +1 -0
- py_devo-0.2.0/pyproject.toml +27 -0
- py_devo-0.2.0/setup.cfg +4 -0
- py_devo-0.2.0/tests/test_cli.py +64 -0
- py_devo-0.2.0/tests/test_enrich.py +173 -0
- py_devo-0.2.0/tests/test_infer.py +90 -0
- py_devo-0.2.0/tests/test_parser.py +74 -0
- py_devo-0.2.0/tests/test_syntax_only.py +30 -0
- py_devo-0.2.0/tests/test_validate.py +108 -0
py_devo-0.2.0/LICENSE
ADDED
py_devo-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py-devo
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: DEVO — CSV to iCSV enrichment and Frictionless validation
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Source, https://github.com/envidat/devo
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: frictionless>=4.0.0
|
|
11
|
+
Provides-Extra: webui
|
|
12
|
+
Requires-Dist: flask>=2.0.0; extra == "webui"
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# DEVO
|
|
16
|
+
<img title="whip it" alt="you know you should" height="50" src="/images/DEVO_Pixels_1.webp">
|
|
17
|
+
|
|
18
|
+
**Data Enrichment and Validation Operator.** Takes a plain CSV, infers types and constraints, writes a self-documenting [iCSV](https://envidat.github.io/iCSV/) file plus a Frictionless schema, and validates the data against it.
|
|
19
|
+
|
|
20
|
+
If you give it a `.csv`, it enriches the file, builds the schema, and then validates. If you give it an `.icsv`, it skips enrichment and goes straight to validation.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
For the Flask web demo:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install -e ".[webui]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Requires Python 3.9+ and `frictionless` (v4 or v5).
|
|
35
|
+
|
|
36
|
+
## Try it out
|
|
37
|
+
|
|
38
|
+
A small sample dataset lives at `examples/sample.csv` — three columns (`timestamp`, `PSUM`, `TA`) representing hourly weather observations. Use it to take DEVO for a spin without needing your own data.
|
|
39
|
+
|
|
40
|
+
### CLI
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Enrich, build schema, and validate in one command
|
|
44
|
+
devo run examples/sample.csv
|
|
45
|
+
|
|
46
|
+
# Results land in DEVO_output/ by default:
|
|
47
|
+
# sample.icsv — annotated iCSV
|
|
48
|
+
# sample_schema.json — Frictionless Table Schema
|
|
49
|
+
# sample_DEVO_report.txt — human-readable validation report
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Run `devo run examples/sample.csv --out my_output` to write to a different directory.
|
|
53
|
+
|
|
54
|
+
### Python
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from devo.enrich import ICSVEnricher
|
|
58
|
+
from devo.validate import validate_icsv
|
|
59
|
+
|
|
60
|
+
icsv, schema = ICSVEnricher().make_icsv("examples/sample.csv", "DEVO_output")
|
|
61
|
+
report_path, valid = validate_icsv(icsv, schema_path=schema)
|
|
62
|
+
|
|
63
|
+
print(f"Valid: {valid}")
|
|
64
|
+
print(f"Report written to: {report_path}")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Web demo
|
|
68
|
+
|
|
69
|
+
Install the optional Flask dependency first (if you haven't already):
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install -e ".[webui]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Start the local server:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
flask --app devo.webui run
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Then open `http://127.0.0.1:5000` in your browser. Click **Choose File**, select `examples/sample.csv`, and click **Upload**. The page will display the paths to the generated iCSV, schema, and report, along with the overall `Valid` result.
|
|
82
|
+
|
|
83
|
+
> The web UI is a local demo only — do not expose it to a network.
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## CLI
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
devo enrich data.csv # write data.icsv + data_schema.json
|
|
91
|
+
devo validate data.icsv # validate against neighbouring schema
|
|
92
|
+
devo run data.csv # do both in one go
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Common flags: `--out DIR` (default `DEVO_output/`), `--delimiter CHAR`, `--nodata VALUE`, `--app PROFILE`, `--schema PATH`.
|
|
96
|
+
|
|
97
|
+
Exit codes: `0` = success, `1` = validation failed, `2` = usage or runtime error.
|
|
98
|
+
|
|
99
|
+
## What lands on disk
|
|
100
|
+
|
|
101
|
+
For input `data.csv`, after `devo run`:
|
|
102
|
+
|
|
103
|
+
| File | What |
|
|
104
|
+
|---|---|
|
|
105
|
+
| `DEVO_output/data.icsv` | iCSV with `# [METADATA]`, `# [FIELDS]`, `# [DATA]` |
|
|
106
|
+
| `DEVO_output/data_schema.json` | Frictionless Table Schema JSON |
|
|
107
|
+
| `DEVO_output/data_DEVO_report.txt` | Validation report (read this) |
|
|
108
|
+
|
|
109
|
+
## Python API
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from devo.enrich import ICSVEnricher
|
|
113
|
+
from devo.validate import validate_icsv
|
|
114
|
+
|
|
115
|
+
icsv, schema = ICSVEnricher().make_icsv("data.csv", "DEVO_output")
|
|
116
|
+
report_path, valid = validate_icsv(icsv, schema_path=schema)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Files
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
devo/
|
|
123
|
+
├── cli.py # argparse front-end (enrich / validate / run)
|
|
124
|
+
├── enrich.py # CSV → iCSV + schema (ICSVEnricher class)
|
|
125
|
+
├── validate.py # iCSV + schema → Frictionless validation + report
|
|
126
|
+
├── _infer.py # pure type-inference functions (shared by enrich + validate)
|
|
127
|
+
├── _parser.py # iCSV header parser (shared by enrich + validate)
|
|
128
|
+
├── _schema.py # per-column statistics + Frictionless schema builder
|
|
129
|
+
├── _report.py # plain-text report writer
|
|
130
|
+
├── exceptions.py # DEVOError hierarchy
|
|
131
|
+
└── webui.py # Flask demo (optional; requires pip install -e ".[webui]")
|
|
132
|
+
tests/
|
|
133
|
+
├── conftest.py
|
|
134
|
+
├── fixtures/ # sample CSV and iCSV files
|
|
135
|
+
└── test_*.py
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## How it works
|
|
139
|
+
|
|
140
|
+
### Enrichment (`devo enrich`)
|
|
141
|
+
|
|
142
|
+
1. **Read** — the CSV is read in one pass. If no `--delimiter` is given, `csv.Sniffer` detects it from the first 10 lines.
|
|
143
|
+
2. **Delimiter mapping** — comma is remapped to pipe in the iCSV output (pipe is also the default fallback for non-spec delimiters). Column names that contain the output delimiter are rejected with a clear error.
|
|
144
|
+
3. **Normalisation** — every row is padded or clipped to header length and stripped of leading/trailing whitespace.
|
|
145
|
+
4. **Type inference** — each column is classified: `integer → number → datetime → string`. Scientific notation (`1.5e-3`, `2E10`) is recognised as `number`. Missing-value sentinels (and any custom `--nodata` value) are excluded before inference.
|
|
146
|
+
5. **Statistics** — per-column `min`, `max`, and `missing_count` are computed from the normalised data and written to the iCSV `# [FIELDS]` section. They do not appear in the Frictionless schema JSON.
|
|
147
|
+
6. **Geometry detection** — if the header contains `lat`/`latitude` + `lon`/`lng`/`longitude`, DEVO writes `geometry = column:lat,lon` and `srid = EPSG:4326` to metadata. A single column named `geometry` (WKT) gets `geometry = column:geometry` only — no `srid`, because WKT embeds its own CRS.
|
|
148
|
+
7. **Write** — the normalised rows are written to the iCSV `# [DATA]` section, and the Frictionless schema is written to `_schema.json`.
|
|
149
|
+
|
|
150
|
+
### Validation (`devo validate`)
|
|
151
|
+
|
|
152
|
+
1. **Parse header** — `_parser.py` reads the `# [METADATA]` and `# [FIELDS]` sections, using `field_delimiter` from metadata to split field values.
|
|
153
|
+
2. **Metadata check** — required keys are verified. `geometry` and `srid` are only checked when spatial column names are present; `srid` is only required for lat/lon columns (not WKT).
|
|
154
|
+
3. **Type cross-check (Option A)** — column types are re-inferred from up to 500 data rows and compared to the declared types. The iCSV's own `nodata` sentinel is merged with the standard missing-value set before re-inference so custom sentinels are not mistaken for real data. Inferred type narrower than or equal to declared → `[OK]`. Inferred wider → `[WARN]`.
|
|
155
|
+
4. **Frictionless validation** — data is written to a temporary comma-delimited CSV and validated against the schema using `frictionless.Resource`. The temp file is always deleted in a `finally` block.
|
|
156
|
+
5. **Report** — a plain-text `.txt` report is written with three sections: `METADATA`, `TYPE CONSISTENCY`, and `DATA VALIDATION`. `Valid: YES` only when metadata has no `[FAIL]` entries and Frictionless reports no data errors. Type warnings do not affect the valid flag.
|
|
157
|
+
|
|
158
|
+
## Limitations
|
|
159
|
+
|
|
160
|
+
- Type inference is conservative: `integer → number → datetime → string`. Mixed-format columns fall back to `string`.
|
|
161
|
+
- Datetime detection uses `datetime.fromisoformat()` and a fixed list of common strptime formats. Unusual formats need a custom schema.
|
|
162
|
+
- Column descriptions are left blank in the iCSV `# [FIELDS]` section; fill them in by hand.
|
|
163
|
+
- The web UI (`webui.py`) is a local demo only — do not expose it to a network.
|
|
164
|
+
|
|
165
|
+
## License
|
|
166
|
+
|
|
167
|
+
MIT. See `LICENSE`.
|
py_devo-0.2.0/README.md
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# DEVO
|
|
2
|
+
<img title="whip it" alt="you know you should" height="50" src="/images/DEVO_Pixels_1.webp">
|
|
3
|
+
|
|
4
|
+
**Data Enrichment and Validation Operator.** Takes a plain CSV, infers types and constraints, writes a self-documenting [iCSV](https://envidat.github.io/iCSV/) file plus a Frictionless schema, and validates the data against it.
|
|
5
|
+
|
|
6
|
+
If you give it a `.csv`, it enriches the file, builds the schema, and then validates. If you give it an `.icsv`, it skips enrichment and goes straight to validation.
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
pip install -e .
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
For the Flask web demo:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install -e ".[webui]"
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Requires Python 3.9+ and `frictionless` (v4 or v5).
|
|
21
|
+
|
|
22
|
+
## Try it out
|
|
23
|
+
|
|
24
|
+
A small sample dataset lives at `examples/sample.csv` — three columns (`timestamp`, `PSUM`, `TA`) representing hourly weather observations. Use it to take DEVO for a spin without needing your own data.
|
|
25
|
+
|
|
26
|
+
### CLI
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Enrich, build schema, and validate in one command
|
|
30
|
+
devo run examples/sample.csv
|
|
31
|
+
|
|
32
|
+
# Results land in DEVO_output/ by default:
|
|
33
|
+
# sample.icsv — annotated iCSV
|
|
34
|
+
# sample_schema.json — Frictionless Table Schema
|
|
35
|
+
# sample_DEVO_report.txt — human-readable validation report
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Run `devo run examples/sample.csv --out my_output` to write to a different directory.
|
|
39
|
+
|
|
40
|
+
### Python
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from devo.enrich import ICSVEnricher
|
|
44
|
+
from devo.validate import validate_icsv
|
|
45
|
+
|
|
46
|
+
icsv, schema = ICSVEnricher().make_icsv("examples/sample.csv", "DEVO_output")
|
|
47
|
+
report_path, valid = validate_icsv(icsv, schema_path=schema)
|
|
48
|
+
|
|
49
|
+
print(f"Valid: {valid}")
|
|
50
|
+
print(f"Report written to: {report_path}")
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Web demo
|
|
54
|
+
|
|
55
|
+
Install the optional Flask dependency first (if you haven't already):
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install -e ".[webui]"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Start the local server:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
flask --app devo.webui run
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Then open `http://127.0.0.1:5000` in your browser. Click **Choose File**, select `examples/sample.csv`, and click **Upload**. The page will display the paths to the generated iCSV, schema, and report, along with the overall `Valid` result.
|
|
68
|
+
|
|
69
|
+
> The web UI is a local demo only — do not expose it to a network.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## CLI
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
devo enrich data.csv # write data.icsv + data_schema.json
|
|
77
|
+
devo validate data.icsv # validate against neighbouring schema
|
|
78
|
+
devo run data.csv # do both in one go
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Common flags: `--out DIR` (default `DEVO_output/`), `--delimiter CHAR`, `--nodata VALUE`, `--app PROFILE`, `--schema PATH`.
|
|
82
|
+
|
|
83
|
+
Exit codes: `0` = success, `1` = validation failed, `2` = usage or runtime error.
|
|
84
|
+
|
|
85
|
+
## What lands on disk
|
|
86
|
+
|
|
87
|
+
For input `data.csv`, after `devo run`:
|
|
88
|
+
|
|
89
|
+
| File | What |
|
|
90
|
+
|---|---|
|
|
91
|
+
| `DEVO_output/data.icsv` | iCSV with `# [METADATA]`, `# [FIELDS]`, `# [DATA]` |
|
|
92
|
+
| `DEVO_output/data_schema.json` | Frictionless Table Schema JSON |
|
|
93
|
+
| `DEVO_output/data_DEVO_report.txt` | Validation report (read this) |
|
|
94
|
+
|
|
95
|
+
## Python API
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from devo.enrich import ICSVEnricher
|
|
99
|
+
from devo.validate import validate_icsv
|
|
100
|
+
|
|
101
|
+
icsv, schema = ICSVEnricher().make_icsv("data.csv", "DEVO_output")
|
|
102
|
+
report_path, valid = validate_icsv(icsv, schema_path=schema)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Files
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
devo/
|
|
109
|
+
├── cli.py # argparse front-end (enrich / validate / run)
|
|
110
|
+
├── enrich.py # CSV → iCSV + schema (ICSVEnricher class)
|
|
111
|
+
├── validate.py # iCSV + schema → Frictionless validation + report
|
|
112
|
+
├── _infer.py # pure type-inference functions (shared by enrich + validate)
|
|
113
|
+
├── _parser.py # iCSV header parser (shared by enrich + validate)
|
|
114
|
+
├── _schema.py # per-column statistics + Frictionless schema builder
|
|
115
|
+
├── _report.py # plain-text report writer
|
|
116
|
+
├── exceptions.py # DEVOError hierarchy
|
|
117
|
+
└── webui.py # Flask demo (optional; requires pip install -e ".[webui]")
|
|
118
|
+
tests/
|
|
119
|
+
├── conftest.py
|
|
120
|
+
├── fixtures/ # sample CSV and iCSV files
|
|
121
|
+
└── test_*.py
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## How it works
|
|
125
|
+
|
|
126
|
+
### Enrichment (`devo enrich`)
|
|
127
|
+
|
|
128
|
+
1. **Read** — the CSV is read in one pass. If no `--delimiter` is given, `csv.Sniffer` detects it from the first 10 lines.
|
|
129
|
+
2. **Delimiter mapping** — comma is remapped to pipe in the iCSV output (pipe is also the default fallback for non-spec delimiters). Column names that contain the output delimiter are rejected with a clear error.
|
|
130
|
+
3. **Normalisation** — every row is padded or clipped to header length and stripped of leading/trailing whitespace.
|
|
131
|
+
4. **Type inference** — each column is classified: `integer → number → datetime → string`. Scientific notation (`1.5e-3`, `2E10`) is recognised as `number`. Missing-value sentinels (and any custom `--nodata` value) are excluded before inference.
|
|
132
|
+
5. **Statistics** — per-column `min`, `max`, and `missing_count` are computed from the normalised data and written to the iCSV `# [FIELDS]` section. They do not appear in the Frictionless schema JSON.
|
|
133
|
+
6. **Geometry detection** — if the header contains `lat`/`latitude` + `lon`/`lng`/`longitude`, DEVO writes `geometry = column:lat,lon` and `srid = EPSG:4326` to metadata. A single column named `geometry` (WKT) gets `geometry = column:geometry` only — no `srid`, because WKT embeds its own CRS.
|
|
134
|
+
7. **Write** — the normalised rows are written to the iCSV `# [DATA]` section, and the Frictionless schema is written to `_schema.json`.
|
|
135
|
+
|
|
136
|
+
### Validation (`devo validate`)
|
|
137
|
+
|
|
138
|
+
1. **Parse header** — `_parser.py` reads the `# [METADATA]` and `# [FIELDS]` sections, using `field_delimiter` from metadata to split field values.
|
|
139
|
+
2. **Metadata check** — required keys are verified. `geometry` and `srid` are only checked when spatial column names are present; `srid` is only required for lat/lon columns (not WKT).
|
|
140
|
+
3. **Type cross-check (Option A)** — column types are re-inferred from up to 500 data rows and compared to the declared types. The iCSV's own `nodata` sentinel is merged with the standard missing-value set before re-inference so custom sentinels are not mistaken for real data. Inferred type narrower than or equal to declared → `[OK]`. Inferred wider → `[WARN]`.
|
|
141
|
+
4. **Frictionless validation** — data is written to a temporary comma-delimited CSV and validated against the schema using `frictionless.Resource`. The temp file is always deleted in a `finally` block.
|
|
142
|
+
5. **Report** — a plain-text `.txt` report is written with three sections: `METADATA`, `TYPE CONSISTENCY`, and `DATA VALIDATION`. `Valid: YES` only when metadata has no `[FAIL]` entries and Frictionless reports no data errors. Type warnings do not affect the valid flag.
|
|
143
|
+
|
|
144
|
+
## Limitations
|
|
145
|
+
|
|
146
|
+
- Type inference is conservative: `integer → number → datetime → string`. Mixed-format columns fall back to `string`.
|
|
147
|
+
- Datetime detection uses `datetime.fromisoformat()` and a fixed list of common strptime formats. Unusual formats need a custom schema.
|
|
148
|
+
- Column descriptions are left blank in the iCSV `# [FIELDS]` section; fill them in by hand.
|
|
149
|
+
- The web UI (`webui.py`) is a local demo only — do not expose it to a network.
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
MIT. See `LICENSE`.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""DEVO — CSV to iCSV enrichment and validation.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
from devo.enrich import ICSVEnricher
|
|
5
|
+
from devo.validate import validate_icsv
|
|
6
|
+
|
|
7
|
+
Intentionally imports nothing on package load to avoid side effects
|
|
8
|
+
(frictionless, flask) in environments where only one function is needed.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Pure type-inference functions — no I/O, no side effects.
|
|
2
|
+
|
|
3
|
+
All functions in this module are deterministic and dependency-free.
|
|
4
|
+
They are shared by the enricher (CSV → type) and the validator (data re-inference).
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
|
|
11
|
+
# --- Constants ---

# An optional leading minus followed by one or more digits, e.g. "42" or "-7".
INT_RE = re.compile(r"^-?\d+$")
# The fractional part is optional, so "5" and "5.0" both match; this lets a
# column mixing integers and floats resolve to 'number' instead of 'string'.
# An exponent suffix ("1.5e-3", "2E10") is accepted as well.
FLOAT_RE = re.compile(r"^-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?$")

# Fallback formats tried, in order, once datetime.fromisoformat() has rejected
# a value. (Restored from DEVO_enricher.py after being dropped in a refactor.)
STRPTIME_FORMATS: tuple[str, ...] = (
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d %H:%M",
    "%Y-%m-%d",
    "%d.%m.%Y",
    "%d/%m/%Y",
    "%m/%d/%Y",
    "%Y/%m/%d",
    "%d-%m-%Y",
    "%Y%m%dT%H%M%S",
    "%Y-%m-%dT%H:%M:%S%z",
    "%Y-%m-%dT%H:%M:%S",
)

# iCSV spec EBNF: field_delimiter ::= [,|\/:;] — tab is not in the allowed set.
# Iterating the string yields exactly those six single-character delimiters.
VALID_ICSV_DELIMITERS: frozenset[str] = frozenset(",|\\/:;")

# Common missing-value sentinels; the single source of truth shared by the
# enricher and the validator. EnviDat has no standardised sentinel, so the
# net is cast wide.
COMMON_MISSING: frozenset[str] = frozenset((
    "",
    "NA", "N/A", "na", "n/a",
    "NULL", "null",
    "nan", "NaN",
    "-999", "-999.0", "-999.000000",
))

# Subtype lattice: inferred type → the declared types it is valid under.
#   integer ⊂ number ⊂ string; datetime ⊂ string.
# The validator uses it for the Option-A cross-check, where the declared type
# is authoritative.
_SUBTYPES: dict[str, frozenset[str]] = {
    "integer": frozenset(("integer", "number", "string")),
    "number": frozenset(("number", "string")),
    "datetime": frozenset(("datetime", "string")),
    "string": frozenset(("string",)),
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# --- Type checkers ---
|
|
55
|
+
|
|
56
|
+
def _is_integer(s: str) -> bool:
    """Return True when *s* matches INT_RE (optional minus sign, then digits)."""
    return INT_RE.match(s) is not None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_number(s: str) -> bool:
    """Return True when *s* matches INT_RE or FLOAT_RE."""
    if INT_RE.match(s) is not None:
        return True
    return FLOAT_RE.match(s) is not None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _is_datetime(s: str) -> bool:
    """Return True when *s* parses as a datetime.

    Surrounding whitespace is ignored and a blank value is never a datetime.
    ``datetime.fromisoformat`` is attempted first; if it rejects the value,
    each entry of STRPTIME_FORMATS is tried in order.
    """
    text = s.strip()
    if not text:
        return False

    try:
        datetime.fromisoformat(text)
    except (ValueError, TypeError):
        pass  # not ISO 8601 — fall through to the explicit format list
    else:
        return True

    for pattern in STRPTIME_FORMATS:
        try:
            datetime.strptime(text, pattern)
        except (ValueError, TypeError):
            continue
        return True
    return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# --- Public API ---
|
|
84
|
+
|
|
85
|
+
def infer_type(values: list[str], missing: frozenset[str] = COMMON_MISSING) -> str:
    """
    Infer a Frictionless field type name for a column of string values.

    Each value is stripped; values whose stripped form is in *missing* are
    ignored. The remaining values are tested against the cascade
    integer → number → datetime, and the first type that every value
    satisfies is returned. Anything else — including an empty or all-missing
    column — yields 'string'.
    """
    candidates = []
    for raw in values:
        cleaned = raw.strip()
        if cleaned not in missing:
            candidates.append(cleaned)

    if not candidates:
        return "string"

    # Ordered from most to least specific; the first full match wins.
    cascade = (
        ("integer", _is_integer),
        ("number", _is_number),
        ("datetime", _is_datetime),
    )
    for type_name, accepts in cascade:
        if all(accepts(value) for value in candidates):
            return type_name
    return "string"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def is_subtype_or_equal(inferred: str, declared: str) -> bool:
    """
    Report whether data of the *inferred* type satisfies the *declared* type.

    The answer is looked up in the _SUBTYPES lattice:
      - inferred=integer, declared=number  → True  (integers pass number validation)
      - inferred=number,  declared=integer → False (floats fail integer validation)
    An *inferred* name that is not in the lattice yields False.
    The validator emits [WARN] when this returns False, i.e. when the inferred
    type is wider than the declared one.
    """
    accepted = _SUBTYPES.get(inferred)
    if accepted is None:
        return False
    return declared in accepted
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Canonical iCSV header parser — single implementation shared by enricher and validator.
|
|
2
|
+
|
|
3
|
+
Parses [METADATA] and [FIELDS] sections from iCSV files per the iCSV 1.0 spec.
|
|
4
|
+
Stops at # [DATA] and does not read data rows.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .exceptions import ParseError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class ICSVHeader:
    """Parsed [METADATA] and [FIELDS] header sections of an iCSV file."""

    # key → value pairs read from the "# key = value" lines under [METADATA]
    metadata: dict[str, str]
    # key → list of per-column values: each [FIELDS] value split on
    # field_delimiter, with every piece stripped
    fields_meta: dict[str, list[str]]
    # delimiter used to split the [FIELDS] values
    field_delimiter: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def is_icsv(path: Path) -> bool:
    """Tell whether *path* looks like an iCSV file.

    Only the first line is read. The file counts as iCSV when that line, with
    surrounding whitespace removed, starts with "# iCSV". A file that cannot
    be opened or decoded is reported as not iCSV rather than raising.
    """
    try:
        # utf-8-sig transparently drops a leading BOM, if there is one
        with open(path, encoding="utf-8-sig") as handle:
            first_line = handle.readline()
    except (OSError, UnicodeDecodeError):
        return False
    return first_line.strip().startswith("# iCSV")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_header(path: Path) -> ICSVHeader:
    """
    Parse the [METADATA] and [FIELDS] sections of an iCSV file.

    Only lines starting with '#' are examined, and reading stops at the
    '# [DATA]' marker, so data rows are never read. Inside a section, every
    '# key = value' line is recorded; lines without '=' and lines that appear
    before any section marker are ignored.

    [FIELDS] values are kept unsplit until the whole header has been read and
    are split afterwards using the metadata's field_delimiter (default ','),
    so the order of keys in the file does not matter.

    Args:
        path: location of the iCSV file.

    Returns:
        An ICSVHeader holding the metadata dict, the split per-field values,
        and the field delimiter that was used.

    Raises:
        ParseError: if the file cannot be read or decoded as UTF-8, if no
            [METADATA] entries are found, or if [FIELDS] entries exist but
            field_delimiter is empty.
    """
    metadata: dict[str, str] = {}
    raw_fields: dict[str, str] = {}  # key → unsplit value string; split after delimiter known
    section: str | None = None

    try:
        # utf-8-sig strips the BOM if present
        with open(path, "r", encoding="utf-8-sig") as fh:
            for line in fh:
                stripped = line.rstrip("\r\n")

                # Header content lives exclusively on comment lines.
                if not stripped.startswith("#"):
                    continue

                content = stripped.lstrip("#").strip()

                if content == "[METADATA]":
                    section = "metadata"
                    continue
                if content == "[FIELDS]":
                    section = "fields"
                    continue
                if content == "[DATA]":
                    break

                # Skip blank comment lines, lines that are not key=value
                # pairs, and anything that precedes the first section marker.
                if not content or "=" not in content or section is None:
                    continue

                key, _, val = content.partition("=")
                key = key.strip()
                val = val.strip()

                if section == "metadata":
                    metadata[key] = val
                else:
                    raw_fields[key] = val

    except (OSError, UnicodeDecodeError) as e:
        # A file that is not valid UTF-8 is just as unreadable as a missing
        # one; report both as ParseError, matching how is_icsv treats them.
        raise ParseError(f"Cannot read {path}: {e}") from e

    if not metadata:
        raise ParseError(f"{path.name}: no [METADATA] section found or file is empty")

    field_delimiter = metadata.get("field_delimiter", ",")
    if raw_fields and not field_delimiter:
        # str.split("") raises a bare ValueError ("empty separator"); surface
        # the malformed header as a ParseError instead.
        raise ParseError(
            f"{path.name}: field_delimiter is empty; cannot split [FIELDS] values"
        )

    fields_meta = {
        k: [v.strip() for v in raw.split(field_delimiter)]
        for k, raw in raw_fields.items()
    }

    return ICSVHeader(
        metadata=metadata,
        fields_meta=fields_meta,
        field_delimiter=field_delimiter,
    )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Plain-text validation report writer.
|
|
2
|
+
|
|
3
|
+
Produces a human-readable .txt file covering three checks:
|
|
4
|
+
1. Metadata completeness
|
|
5
|
+
2. Type consistency (declared vs re-inferred)
|
|
6
|
+
3. Frictionless data validation
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def write_report(
    path: Path,
    icsv_name: str,
    metadata_issues: list[str],
    type_issues: list[tuple[str, str, str, bool]],
    frictionless_report: Any,
    is_valid: bool,
) -> None:
    """
    Write a plain-text DEVO validation report to `path`.

    The report has a short preamble (file name, UTC timestamp, overall
    validity) followed by three sections: METADATA, TYPE CONSISTENCY and
    DATA VALIDATION.

    Args:
        path: destination file; it is created or overwritten.
        icsv_name: name of the validated file, printed in the preamble.
        metadata_issues: pre-formatted lines for the METADATA section; an
            empty list produces a single [OK] line.
        type_issues: list of (column_name, declared_type, inferred_type, is_ok).
            is_ok=True means inferred is a subtype of (or equal to) declared.
        frictionless_report: the object returned by frictionless
            Resource.validate(); only its flatten() method is used.
        is_valid: overall verdict, printed as YES or NO.

    If the error list cannot be extracted from `frictionless_report`, only a
    [WARN] line is written in DATA VALIDATION: a [PASS] line is never emitted
    for data that could not actually be inspected.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    _SEP = "-" * 40

    with open(path, "w", encoding="utf-8") as fh:

        fh.write("DEVO Validation Report\n")
        fh.write("=" * 22 + "\n")
        fh.write(f"File: {icsv_name}\n")
        fh.write(f"Date: {now}\n")
        fh.write(f"Valid: {'YES' if is_valid else 'NO'}\n\n")

        # --- Metadata ---
        fh.write("METADATA\n")
        fh.write(_SEP + "\n")
        if metadata_issues:
            for issue in metadata_issues:
                fh.write(f"{issue}\n")
        else:
            fh.write("[OK] All required metadata present.\n")
        fh.write("\n")

        # --- Type consistency (Option A cross-check) ---
        fh.write("TYPE CONSISTENCY\n")
        fh.write(_SEP + "\n")
        if not type_issues:
            fh.write("[OK] No declared types to cross-check.\n")
        else:
            for col, declared, inferred, ok in type_issues:
                if ok:
                    fh.write(f"[OK] {col}: declared={declared}, inferred={inferred}\n")
                else:
                    fh.write(f"[WARN] {col}: declared={declared}, inferred={inferred}\n")
                    fh.write(
                        f" Inferred type is wider than declared. "
                        f"Data may not satisfy '{declared}' constraints.\n"
                    )
        fh.write("\n")

        # --- Frictionless data validation ---
        fh.write("DATA VALIDATION\n")
        fh.write(_SEP + "\n")
        # NOTE(review): frictionless v5 appears to expose the error kind as
        # "type" rather than "code" — confirm "code" is populated there.
        try:
            errors = frictionless_report.flatten(
                ["rowNumber", "fieldNumber", "fieldName", "code", "message"]
            )
            extracted = True
        except (AttributeError, TypeError):
            errors = []
            extracted = False
            fh.write("[WARN] Could not extract error details from frictionless report.\n")

        if errors:
            shown = errors[:50]
            suffix = f" (showing first 50 of {len(errors)})" if len(errors) > 50 else ""
            fh.write(f"[FAIL] {len(errors)} error(s) found{suffix}:\n")
            for row, col_num, col_name, code, message in shown:
                row_str = str(row) if row is not None else "?"
                # Prefer the column name; fall back to its number, then "?".
                col_str = col_name or (str(col_num) if col_num is not None else "?")
                fh.write(f" Row {row_str}, Col {col_str} [{code}]: {message}\n")
        elif extracted:
            # Claim a pass only when the error list was really obtained;
            # otherwise the [WARN] above would be contradicted.
            fh.write("[PASS] No data errors found.\n")
|