py-devo 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devo/__init__.py +11 -0
- devo/_infer.py +112 -0
- devo/_parser.py +91 -0
- devo/_report.py +88 -0
- devo/_schema.py +111 -0
- devo/cli.py +104 -0
- devo/enrich.py +234 -0
- devo/exceptions.py +14 -0
- devo/validate.py +219 -0
- devo/webui.py +46 -0
- py_devo-0.2.0.dist-info/METADATA +167 -0
- py_devo-0.2.0.dist-info/RECORD +16 -0
- py_devo-0.2.0.dist-info/WHEEL +5 -0
- py_devo-0.2.0.dist-info/entry_points.txt +2 -0
- py_devo-0.2.0.dist-info/licenses/LICENSE +6 -0
- py_devo-0.2.0.dist-info/top_level.txt +1 -0
devo/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""DEVO — CSV to iCSV enrichment and validation.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
from devo.enrich import ICSVEnricher
|
|
5
|
+
from devo.validate import validate_icsv
|
|
6
|
+
|
|
7
|
+
Intentionally imports nothing on package load to avoid side effects
|
|
8
|
+
(frictionless, flask) in environments where only one function is needed.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.2.0"
|
devo/_infer.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Pure type-inference functions — no I/O, no side effects.
|
|
2
|
+
|
|
3
|
+
All functions in this module are deterministic and dependency-free.
|
|
4
|
+
They are shared by the enricher (CSV → type) and the validator (data re-inference).
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
|
|
11
|
+
# --- Constants ---
|
|
12
|
+
|
|
13
|
+
INT_RE = re.compile(r"^-?\d+$")
|
|
14
|
+
# Optional decimal: matches "5" and "5.0" — needed so mixed int/float columns
|
|
15
|
+
# resolve to 'number' rather than falling through to 'string'.
|
|
16
|
+
FLOAT_RE = re.compile(r"^-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?$")
|
|
17
|
+
|
|
18
|
+
# Tried after fromisoformat fails; restored from DEVO_enricher.py (was dropped in refactor).
|
|
19
|
+
STRPTIME_FORMATS: tuple[str, ...] = (
|
|
20
|
+
"%Y-%m-%d %H:%M:%S",
|
|
21
|
+
"%Y-%m-%d %H:%M",
|
|
22
|
+
"%Y-%m-%d",
|
|
23
|
+
"%d.%m.%Y",
|
|
24
|
+
"%d/%m/%Y",
|
|
25
|
+
"%m/%d/%Y",
|
|
26
|
+
"%Y/%m/%d",
|
|
27
|
+
"%d-%m-%Y",
|
|
28
|
+
"%Y%m%dT%H%M%S",
|
|
29
|
+
"%Y-%m-%dT%H:%M:%S%z",
|
|
30
|
+
"%Y-%m-%dT%H:%M:%S",
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# iCSV spec EBNF: field_delimiter ::= [,|\/:;] — tab is not in the allowed set.
|
|
34
|
+
VALID_ICSV_DELIMITERS: frozenset[str] = frozenset({",", "|", "\\", "/", ":", ";"})
|
|
35
|
+
|
|
36
|
+
# Common missing-value sentinels. Single source of truth shared by enricher and validator.
|
|
37
|
+
# EnviDat has no standardised sentinel, so we cast a wide net.
|
|
38
|
+
COMMON_MISSING: frozenset[str] = frozenset({
|
|
39
|
+
"", "NA", "N/A", "na", "n/a", "NULL", "null", "nan", "NaN",
|
|
40
|
+
"-999", "-999.0", "-999.000000",
|
|
41
|
+
})
|
|
42
|
+
|
|
43
|
+
# Type subtype lattice: inferred → set of declared types it is valid under.
|
|
44
|
+
# integer ⊂ number ⊂ string; datetime ⊂ string.
|
|
45
|
+
# Used by the validator for Option-A cross-check (declared type is authoritative).
|
|
46
|
+
_SUBTYPES: dict[str, frozenset[str]] = {
|
|
47
|
+
"integer": frozenset({"integer", "number", "string"}),
|
|
48
|
+
"number": frozenset({"number", "string"}),
|
|
49
|
+
"datetime": frozenset({"datetime", "string"}),
|
|
50
|
+
"string": frozenset({"string"}),
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# --- Type checkers ---
|
|
55
|
+
|
|
56
|
+
def _is_integer(s: str) -> bool:
    """Return True when *s* is an optionally negative run of digits (INT_RE)."""
    return INT_RE.match(s) is not None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_number(s: str) -> bool:
    """Return True when *s* matches the integer or the float pattern."""
    for pattern in (INT_RE, FLOAT_RE):
        if pattern.match(s):
            return True
    return False
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _is_datetime(s: str) -> bool:
|
|
65
|
+
"""Try fromisoformat first, then a fixed list of strptime formats."""
|
|
66
|
+
s = s.strip()
|
|
67
|
+
if not s:
|
|
68
|
+
return False
|
|
69
|
+
try:
|
|
70
|
+
datetime.fromisoformat(s)
|
|
71
|
+
return True
|
|
72
|
+
except (ValueError, TypeError):
|
|
73
|
+
pass
|
|
74
|
+
for fmt in STRPTIME_FORMATS:
|
|
75
|
+
try:
|
|
76
|
+
datetime.strptime(s, fmt)
|
|
77
|
+
return True
|
|
78
|
+
except (ValueError, TypeError):
|
|
79
|
+
continue
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# --- Public API ---
|
|
84
|
+
|
|
85
|
+
def infer_type(values: list[str], missing: frozenset[str] = COMMON_MISSING) -> str:
    """
    Infer a Frictionless field type from a column of raw string values.

    Each value is stripped; values found in *missing* are discarded.
    The remaining values are tested in the order integer → number → datetime,
    and the first type that every value satisfies is returned.
    A column with no remaining values, or one that fits no type, is 'string'.
    """
    cleaned: list[str] = []
    for raw in values:
        candidate = raw.strip()
        if candidate not in missing:
            cleaned.append(candidate)

    if not cleaned:
        return "string"

    cascade = (
        ("integer", _is_integer),
        ("number", _is_number),
        ("datetime", _is_datetime),
    )
    for type_name, accepts in cascade:
        if all(accepts(value) for value in cleaned):
            return type_name
    return "string"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def is_subtype_or_equal(inferred: str, declared: str) -> bool:
    """
    Tell whether data inferred as *inferred* satisfies a column declared as *declared*.

    The answer comes from the _SUBTYPES table, which lists for every inferred
    type the declared types it is valid under, e.g.
      - inferred=integer, declared=number → True
      - inferred=number, declared=integer → False
    An inferred type missing from the table is never accepted.
    """
    accepted_under = _SUBTYPES.get(inferred)
    if accepted_under is None:
        return False
    return declared in accepted_under
|
devo/_parser.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Canonical iCSV header parser — single implementation shared by enricher and validator.
|
|
2
|
+
|
|
3
|
+
Parses [METADATA] and [FIELDS] sections from iCSV files per the iCSV 1.0 spec.
|
|
4
|
+
Stops at # [DATA] and does not read data rows.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .exceptions import ParseError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class ICSVHeader:
    """Parsed header sections of an iCSV file, as produced by parse_header."""

    # Key/value pairs read from the [METADATA] section.
    metadata: dict[str, str]
    # [FIELDS] keys mapped to their values, already split on the field delimiter.
    fields_meta: dict[str, list[str]]
    # Delimiter used to split the [FIELDS] values.
    field_delimiter: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def is_icsv(path: Path) -> bool:
    """Return True when the first line of *path* begins with the ``# iCSV`` marker.

    The file is decoded as utf-8-sig so a leading BOM does not hide the marker.
    A file that cannot be opened or decoded is reported as not-iCSV.
    """
    try:
        with open(path, "r", encoding="utf-8-sig") as fh:
            first_line = fh.readline()
    except (OSError, UnicodeDecodeError):
        return False
    return first_line.strip().startswith("# iCSV")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_header(path: Path) -> ICSVHeader:
    """
    Parse the [METADATA] and [FIELDS] sections of an iCSV file.

    Only lines starting with '#' are examined; reading stops at ``# [DATA]``.
    [FIELDS] values are kept unsplit until the whole header has been read, so
    the field_delimiter from [METADATA] is applied regardless of key order.
    When no field_delimiter is declared, ',' is used.

    Raises ParseError if the file cannot be read or decoded, has no
    [METADATA] entries, or declares an empty field_delimiter.
    """
    metadata: dict[str, str] = {}
    raw_fields: dict[str, str] = {}  # key → unsplit value; split once the delimiter is known
    section: str | None = None

    try:
        with open(path, "r", encoding="utf-8-sig") as fh:
            for line in fh:
                stripped = line.rstrip("\r\n")

                if not stripped.startswith("#"):
                    continue

                content = stripped.lstrip("#").strip()

                if content == "[METADATA]":
                    section = "metadata"
                    continue
                if content == "[FIELDS]":
                    section = "fields"
                    continue
                if content == "[DATA]":
                    break

                # Skip blank comment lines, lines without '=', and anything
                # that appears before the first section marker.
                if not content or "=" not in content or section is None:
                    continue

                key, _, val = content.partition("=")
                key = key.strip()
                val = val.strip()

                if section == "metadata":
                    metadata[key] = val
                else:
                    raw_fields[key] = val

    # UnicodeDecodeError is not an OSError; without it a non-UTF-8 file would
    # escape as a raw decode error instead of the documented ParseError.
    except (OSError, UnicodeDecodeError) as e:
        raise ParseError(f"Cannot read {path}: {e}") from e

    if not metadata:
        raise ParseError(f"{path.name}: no [METADATA] section found or file is empty")

    field_delimiter = metadata.get("field_delimiter", ",")
    if not field_delimiter:
        # str.split("") raises ValueError; report the malformed header instead.
        raise ParseError(f"{path.name}: field_delimiter is declared but empty")

    fields_meta = {
        k: [v.strip() for v in raw.split(field_delimiter)]
        for k, raw in raw_fields.items()
    }

    return ICSVHeader(
        metadata=metadata,
        fields_meta=fields_meta,
        field_delimiter=field_delimiter,
    )
|
devo/_report.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Plain-text validation report writer.
|
|
2
|
+
|
|
3
|
+
Produces a human-readable .txt file covering three checks:
|
|
4
|
+
1. Metadata completeness
|
|
5
|
+
2. Type consistency (declared vs re-inferred)
|
|
6
|
+
3. Frictionless data validation
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def write_report(
    path: Path,
    icsv_name: str,
    metadata_issues: list[str],
    type_issues: list[tuple[str, str, str, bool]],
    frictionless_report: Any,
    is_valid: bool,
    *,
    max_errors: int = 50,
) -> None:
    """
    Write a plain-text DEVO validation report to `path`.

    metadata_issues: pre-formatted issue lines; an empty list prints an [OK] line.
    type_issues: list of (column_name, declared_type, inferred_type, is_ok).
                 is_ok=True means inferred is a subtype of (or equal to) declared.
    frictionless_report: object whose ``flatten(keys)`` yields
                 (rowNumber, fieldNumber, fieldName, code, message) tuples.
    is_valid: overall verdict printed in the report header.
    max_errors: maximum number of data errors listed individually (default 50).

    If the error list cannot be extracted from `frictionless_report`, a [WARN]
    line is written and no [PASS] line is emitted, because the absence of
    errors was not actually established.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    _SEP = "-" * 40

    with open(path, "w", encoding="utf-8") as fh:

        fh.write("DEVO Validation Report\n")
        fh.write("=" * 22 + "\n")
        fh.write(f"File: {icsv_name}\n")
        fh.write(f"Date: {now}\n")
        fh.write(f"Valid: {'YES' if is_valid else 'NO'}\n\n")

        # --- Metadata ---
        fh.write("METADATA\n")
        fh.write(_SEP + "\n")
        if metadata_issues:
            for issue in metadata_issues:
                fh.write(f"{issue}\n")
        else:
            fh.write("[OK] All required metadata present.\n")
        fh.write("\n")

        # --- Type consistency (Option A cross-check) ---
        fh.write("TYPE CONSISTENCY\n")
        fh.write(_SEP + "\n")
        if not type_issues:
            fh.write("[OK] No declared types to cross-check.\n")
        else:
            for col, declared, inferred, ok in type_issues:
                if ok:
                    fh.write(f"[OK] {col}: declared={declared}, inferred={inferred}\n")
                else:
                    fh.write(f"[WARN] {col}: declared={declared}, inferred={inferred}\n")
                    fh.write(
                        f" Inferred type is wider than declared. "
                        f"Data may not satisfy '{declared}' constraints.\n"
                    )
        fh.write("\n")

        # --- Frictionless data validation ---
        fh.write("DATA VALIDATION\n")
        fh.write(_SEP + "\n")
        extracted = True
        try:
            errors = frictionless_report.flatten(
                ["rowNumber", "fieldNumber", "fieldName", "code", "message"]
            )
        except (AttributeError, TypeError):
            errors = []
            extracted = False
            fh.write("[WARN] Could not extract error details from frictionless report.\n")

        if not errors:
            # Only claim a pass when the error list was really obtained.
            if extracted:
                fh.write("[PASS] No data errors found.\n")
        else:
            shown = errors[:max_errors]
            suffix = (
                f" (showing first {max_errors} of {len(errors)})"
                if len(errors) > max_errors
                else ""
            )
            fh.write(f"[FAIL] {len(errors)} error(s) found{suffix}:\n")
            for row, col_num, col_name, code, message in shown:
                row_str = str(row) if row is not None else "?"
                col_str = col_name or (str(col_num) if col_num is not None else "?")
                fh.write(f" Row {row_str}, Col {col_str} [{code}]: {message}\n")
|
devo/_schema.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Frictionless schema builder and per-column statistics.
|
|
2
|
+
|
|
3
|
+
Separates DEVO-specific stats (min, max, missing_count — written to the iCSV FIELDS section)
|
|
4
|
+
from the Frictionless schema JSON (which must only contain standard Frictionless keys).
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
from ._infer import STRPTIME_FORMATS, COMMON_MISSING
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _numeric_minmax(
|
|
14
|
+
pruned: list[str], as_type: str
|
|
15
|
+
) -> tuple[Optional[float | int], Optional[float | int]]:
|
|
16
|
+
"""Compute min/max for integer or number columns. Returns (None, None) on failure."""
|
|
17
|
+
if not pruned:
|
|
18
|
+
return None, None
|
|
19
|
+
try:
|
|
20
|
+
nums = [int(x) if as_type == "integer" else float(x) for x in pruned]
|
|
21
|
+
return min(nums), max(nums)
|
|
22
|
+
except (ValueError, TypeError):
|
|
23
|
+
return None, None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _datetime_minmax(pruned: list[str]) -> tuple[Optional[str], Optional[str]]:
|
|
27
|
+
"""
|
|
28
|
+
Compute min/max for datetime columns.
|
|
29
|
+
Returns ISO-format strings or (None, None) if nothing can be parsed.
|
|
30
|
+
Uses the same format list as _infer.py to stay consistent.
|
|
31
|
+
"""
|
|
32
|
+
from datetime import datetime
|
|
33
|
+
|
|
34
|
+
parsed = []
|
|
35
|
+
for v in pruned:
|
|
36
|
+
try:
|
|
37
|
+
parsed.append(datetime.fromisoformat(v))
|
|
38
|
+
continue
|
|
39
|
+
except (ValueError, TypeError):
|
|
40
|
+
pass
|
|
41
|
+
for fmt in STRPTIME_FORMATS:
|
|
42
|
+
try:
|
|
43
|
+
parsed.append(datetime.strptime(v, fmt))
|
|
44
|
+
break
|
|
45
|
+
except (ValueError, TypeError):
|
|
46
|
+
continue
|
|
47
|
+
if not parsed:
|
|
48
|
+
return None, None
|
|
49
|
+
return min(parsed).isoformat(), max(parsed).isoformat()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def compute_col_stats(
    vals: list[str],
    inferred_type: str,
    missing: frozenset[str] = COMMON_MISSING,
) -> dict[str, Any]:
    """
    Compute per-column statistics for the iCSV [FIELDS] section.

    Values are stripped before they are compared against *missing*, the same
    way type inference treats them, so a padded sentinel such as " NA " is
    counted as missing rather than as data.

    Returns a dict with keys:
      type          — *inferred_type*, passed through
      min, max      — bounds for integer/number/datetime columns, else None
      missing_count — number of blank or sentinel values
      required      — True only when the column is non-empty and has no missing values
    """
    stripped = [v.strip() for v in vals]
    pruned = [v for v in stripped if v != "" and v not in missing]
    missing_count = len(vals) - len(pruned)
    stats: dict[str, Any] = {
        "type": inferred_type,
        "min": None,
        "max": None,
        "missing_count": missing_count,
        # required only if no missing values were observed in the current data
        "required": missing_count == 0 and len(vals) > 0,
    }
    if inferred_type in ("integer", "number") and pruned:
        stats["min"], stats["max"] = _numeric_minmax(pruned, inferred_type)
    elif inferred_type == "datetime" and pruned:
        stats["min"], stats["max"] = _datetime_minmax(pruned)
    return stats
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def build_frictionless_schema(
    header: list[str],
    col_stats: list[dict[str, Any]],
    missing: frozenset[str] = COMMON_MISSING,
) -> dict[str, Any]:
    """
    Assemble a Frictionless Table Schema dict from column names and stats.

    Each field carries its name and type. Datetime fields also get
    ``format: any``. Non-None min/max stats become ``minimum``/``maximum``
    constraints and a truthy ``required`` stat becomes ``required: True``.
    The sorted *missing* sentinels are emitted as ``missingValues``.
    """

    def _describe(name: str, stats: dict[str, Any]) -> dict[str, Any]:
        descriptor: dict[str, Any] = {"name": name, "type": stats["type"]}
        # frictionless datetime/default rejects partial datetime strings
        # (e.g. date-only); format=any accepts any parseable representation.
        if stats["type"] == "datetime":
            descriptor["format"] = "any"

        bounds = (("minimum", stats["min"]), ("maximum", stats["max"]))
        constraints: dict[str, Any] = {
            key: value for key, value in bounds if value is not None
        }
        if stats.get("required"):
            constraints["required"] = True
        if constraints:
            descriptor["constraints"] = constraints
        return descriptor

    return {
        "fields": [_describe(name, stats) for name, stats in zip(header, col_stats)],
        "missingValues": sorted(missing),
    }
|
devo/cli.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Command-line front-end for DEVO.
|
|
2
|
+
|
|
3
|
+
Three subcommands:
|
|
4
|
+
enrich — CSV → iCSV + schema
|
|
5
|
+
validate — iCSV + schema → report
|
|
6
|
+
run — enrich then validate (or just validate if input is already .icsv)
|
|
7
|
+
|
|
8
|
+
Exit codes: 0 = success, 1 = validation failed (data errors), 2 = usage/runtime error.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import sys
|
|
14
|
+
|
|
15
|
+
from .enrich import ICSVEnricher
|
|
16
|
+
from .exceptions import DEVOError
|
|
17
|
+
from .validate import validate_icsv
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``devo`` argument parser with its enrich/validate/run subcommands."""
    out_option = {"default": "DEVO_output", "metavar": "DIR", "help": "Output directory"}

    # Subcommand name → (help text, ordered (flag, add_argument kwargs) pairs).
    commands = {
        "enrich": (
            "Convert a CSV to iCSV + Frictionless schema",
            [
                ("infile", {"help": "Input CSV file"}),
                ("--out", out_option),
                ("--delimiter", {"metavar": "CHAR", "help": "Force input delimiter"}),
                ("--nodata", {"metavar": "VALUE", "help": "Force nodata sentinel"}),
                ("--app", {"metavar": "PROFILE", "help": "iCSV application profile"}),
            ],
        ),
        "validate": (
            "Validate an iCSV against its schema",
            [
                ("infile", {"help": "Input .icsv file"}),
                (
                    "--schema",
                    {"metavar": "PATH", "help": "Schema JSON path (default: auto-discover)"},
                ),
                ("--out", out_option),
            ],
        ),
        "run": (
            "Enrich then validate. If input is already .icsv, skips enrichment.",
            [
                ("infile", {"help": "Input CSV or iCSV file"}),
                ("--out", out_option),
                (
                    "--delimiter",
                    {"metavar": "CHAR", "help": "Force input delimiter (CSV only)"},
                ),
                (
                    "--nodata",
                    {"metavar": "VALUE", "help": "Force nodata sentinel (CSV only)"},
                ),
                (
                    "--app",
                    {"metavar": "PROFILE", "help": "iCSV application profile (CSV only)"},
                ),
            ],
        ),
    }

    parser = argparse.ArgumentParser(
        prog="devo", description="DEVO — CSV to iCSV enrichment and validation"
    )
    subparsers = parser.add_subparsers(dest="cmd", required=True)
    for command, (help_text, arguments) in commands.items():
        command_parser = subparsers.add_parser(command, help=help_text)
        for flag, options in arguments:
            command_parser.add_argument(flag, **options)
    return parser
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def main(argv=None) -> None:
    """
    Entry point for the ``devo`` command.

    argv: argument list to parse; None lets argparse read sys.argv.

    Exits with status 1 when validation reports the file as invalid, and with
    status 2 (message on stderr) when a DEVOError or an OS-level I/O error
    (missing file, permission problem, ...) is raised. Returns normally on
    success.
    """
    p = build_parser()
    args = p.parse_args(argv)

    try:
        if args.cmd == "enrich":
            enr = ICSVEnricher()
            icsv, schema = enr.make_icsv(
                args.infile, args.out,
                user_delimiter=args.delimiter,
                nodata_override=args.nodata,
                application_profile=args.app,
            )
            print(f"[OK] {icsv}")
            print(f"[OK] {schema}")

        elif args.cmd == "validate":
            report, valid = validate_icsv(
                args.infile, schema_path=args.schema, outdir=args.out
            )
            print(f"[{'OK' if valid else 'FAIL'}] Report: {report}")
            if not valid:
                sys.exit(1)

        elif args.cmd == "run":
            from pathlib import Path
            from ._parser import is_icsv

            inpath = Path(args.infile)
            if is_icsv(inpath):
                # Already enriched — skip enrichment. Hand over the sibling
                # schema only if it is really there; otherwise pass None so
                # the validator falls back to its own schema lookup.
                icsv = str(inpath)
                sibling = inpath.with_name(inpath.stem + "_schema.json")
                schema = str(sibling) if sibling.is_file() else None
                print("[OK] Input is already an iCSV — skipping enrichment.")
            else:
                enr = ICSVEnricher()
                icsv, schema = enr.make_icsv(
                    args.infile, args.out,
                    user_delimiter=args.delimiter,
                    nodata_override=args.nodata,
                    application_profile=args.app,
                )
                print(f"[OK] Enriched: {icsv}")

            report, valid = validate_icsv(icsv, schema_path=schema, outdir=args.out)
            print(f"[{'OK' if valid else 'FAIL'}] Report: {report}")
            if not valid:
                sys.exit(1)

    # OSError covers FileNotFoundError plus permission/directory errors, which
    # are runtime failures and therefore belong to exit code 2 as well.
    except (DEVOError, OSError) as e:
        print(f"[ERROR] {e}", file=sys.stderr)
        sys.exit(2)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
if __name__ == "__main__":
|
|
104
|
+
main()
|
devo/enrich.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""CSV → iCSV enrichment.
|
|
2
|
+
|
|
3
|
+
Public API: ICSVEnricher().make_icsv(infile, outdir, ...)
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import csv
|
|
8
|
+
import json
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from ._infer import COMMON_MISSING, VALID_ICSV_DELIMITERS, infer_type
|
|
14
|
+
from ._parser import is_icsv
|
|
15
|
+
from ._schema import build_frictionless_schema, compute_col_stats
|
|
16
|
+
from .exceptions import EnrichError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _detect_delimiter(sample: str) -> str:
|
|
20
|
+
"""Sniff a delimiter from a text sample; fall back to comma on failure."""
|
|
21
|
+
try:
|
|
22
|
+
dialect = csv.Sniffer().sniff(sample, delimiters=",|;:\t/")
|
|
23
|
+
return dialect.delimiter
|
|
24
|
+
except csv.Error:
|
|
25
|
+
return ","
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _to_icsv_delimiter(detected: str) -> str:
    """
    Choose the delimiter to write into the iCSV for a given input delimiter.

    A delimiter listed in VALID_ICSV_DELIMITERS is kept, except for the comma,
    which is swapped for a pipe to avoid ambiguity with ',' inside metadata
    lines. Anything outside the valid set (tab included) also becomes a pipe.
    """
    keep_as_is = detected != "," and detected in VALID_ICSV_DELIMITERS
    return detected if keep_as_is else "|"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _detect_geometry(header: list[str]) -> tuple[Optional[str], Optional[str]]:
|
|
41
|
+
"""
|
|
42
|
+
Heuristic spatial-column detection.
|
|
43
|
+
Returns (geometry_value, srid_value) or (None, None) when no spatial columns are found.
|
|
44
|
+
Per Q1: geometry and srid are written only when spatial columns are detected.
|
|
45
|
+
"""
|
|
46
|
+
lower = [h.lower() for h in header]
|
|
47
|
+
if "geometry" in lower:
|
|
48
|
+
idx = lower.index("geometry")
|
|
49
|
+
return f"column:{header[idx]}", None
|
|
50
|
+
lat_idx = lon_idx = None
|
|
51
|
+
for i, h in enumerate(lower):
|
|
52
|
+
if h in ("lat", "latitude"):
|
|
53
|
+
lat_idx = i
|
|
54
|
+
if h in ("lon", "lng", "longitude"):
|
|
55
|
+
lon_idx = i
|
|
56
|
+
if lat_idx is not None and lon_idx is not None:
|
|
57
|
+
return f"column:{header[lat_idx]},{header[lon_idx]}", "EPSG:4326"
|
|
58
|
+
return None, None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ICSVEnricher:
    """Convert a plain CSV file into an iCSV file plus a Frictionless schema JSON."""

    def __init__(self, nodata_candidates: Optional[set[str]] = None):
        """Use *nodata_candidates* as the missing-value sentinels, or COMMON_MISSING
        when none (or an empty set) is given."""
        self.missing: frozenset[str] = (
            frozenset(nodata_candidates) if nodata_candidates else COMMON_MISSING
        )

    def _load_rows(
        self,
        path: Path,
        user_delimiter: Optional[str],
    ) -> tuple[list[str], list[list[str]], str]:
        """
        Read a CSV file and return (header, rows, delimiter).

        The delimiter is *user_delimiter* when given, otherwise it is sniffed
        from the first 10 lines. All lines are parsed by one csv.reader so
        that a quoted cell containing a line break stays a single cell.
        Blank lines are kept as empty rows. The header cells are stripped.
        utf-8-sig transparently strips a BOM if present.

        Raises EnrichError when the file is unreadable, empty, has no usable
        header, the delimiter is not a single character, or the csv module
        rejects the content.
        """
        lines: list[str] = []
        try:
            with open(path, "r", encoding="utf-8-sig", errors="replace") as fh:
                lines = fh.readlines()
        except OSError as e:
            raise EnrichError(f"Cannot read {path}: {e}") from e

        if not lines:
            raise EnrichError(f"{path.name}: file is empty")

        sample = "".join(lines[:10])
        delimiter = user_delimiter or _detect_delimiter(sample)
        if len(delimiter) != 1:
            # csv.reader would otherwise fail with an opaque TypeError.
            raise EnrichError(
                f"Delimiter must be a single character, got {delimiter!r}"
            )

        try:
            # One reader over the whole file: quoted fields may span lines.
            parsed = list(csv.reader(lines, delimiter=delimiter))
        except csv.Error as e:
            raise EnrichError(f"{path.name}: malformed CSV: {e}") from e

        header: list[str] = [c.strip() for c in parsed[0]] if parsed else []
        rows: list[list[str]] = parsed[1:]

        if not header or all(c == "" for c in header):
            raise EnrichError(f"{path.name}: no usable header row found")

        return header, rows, delimiter

    def _detect_nodata(self, rows: list[list[str]]) -> str:
        """Return the most common missing-value sentinel seen in the data, or ''.

        Cells are stripped before the lookup because the cells written to the
        iCSV are stripped as well.
        """
        counts: dict[str, int] = {}
        for row in rows:
            for cell in row:
                token = cell.strip()
                if token in self.missing:
                    counts[token] = counts.get(token, 0) + 1
        return max(counts, key=lambda k: counts[k]) if counts else ""

    def make_icsv(
        self,
        infile: str,
        outdir: str,
        user_delimiter: Optional[str] = None,
        nodata_override: Optional[str] = None,
        application_profile: Optional[str] = None,
    ) -> tuple[str, str]:
        """
        Convert a CSV to an iCSV + Frictionless schema JSON.

        infile: path of the CSV to convert.
        outdir: directory for the outputs; created if missing.
        user_delimiter: input delimiter to use instead of sniffing.
        nodata_override: nodata value to record instead of the detected one.
        application_profile: written to the metadata when given.

        Returns (icsv_path, schema_path) as strings; both files are named
        after the input file's stem.
        Raises EnrichError if the input is already an iCSV, cannot be read,
        or has a column name containing the output delimiter.
        """
        path = Path(infile)
        if is_icsv(path):
            raise EnrichError(
                f"{path.name} is already an iCSV file. "
                "Use 'devo validate' to validate it, or 'devo run' which handles both."
            )

        header, rows, detected_delim = self._load_rows(path, user_delimiter)
        icsv_delim = _to_icsv_delimiter(detected_delim)
        bad_names = [c for c in header if icsv_delim in c]
        if bad_names:
            raise EnrichError(
                f"Column name(s) contain the iCSV delimiter '{icsv_delim}': {bad_names}. "
                "Rename the columns or force a different delimiter with --delimiter."
            )
        nodata = nodata_override if nodata_override is not None else self._detect_nodata(rows)

        # Normalise every row to header length (pad short rows with "",
        # truncate long ones) and strip each cell.
        width = len(header)
        padded = [
            [c.strip() for c in (row + [""] * (width - len(row)))[:width]]
            for row in rows
        ]

        # Transpose rows into per-column value lists.
        col_values: list[list[str]] = (
            [[row[i] for row in padded] for i in range(width)]
            if padded else [[] for _ in header]
        )

        types = [infer_type(col, self.missing) for col in col_values]
        col_stats = [
            compute_col_stats(col, col_type, self.missing)
            for col, col_type in zip(col_values, types)
        ]
        schema = build_frictionless_schema(header, col_stats, self.missing)

        geometry, srid = _detect_geometry(header)

        metadata: dict[str, str] = {
            "iCSV_version": "1.0",
        }
        if application_profile:
            metadata["application_profile"] = application_profile
        metadata["field_delimiter"] = icsv_delim
        metadata["rows"] = str(len(rows))
        metadata["columns"] = str(len(header))
        metadata["creation_date"] = (
            datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
        )
        if nodata:
            metadata["nodata"] = nodata
        if geometry:
            metadata["geometry"] = geometry
        if srid:
            metadata["srid"] = srid
        metadata["generator"] = "DEVO"

        def _join(vals) -> str:
            # None (no stat available) is written as an empty slot.
            return icsv_delim.join("" if v is None else str(v) for v in vals)

        fields_lines = [
            f"fields = {_join(header)}",
            f"types = {_join(types)}",
            f"min = {_join(s['min'] for s in col_stats)}",
            f"max = {_join(s['max'] for s in col_stats)}",
            f"missing_count = {_join(s['missing_count'] for s in col_stats)}",
            f"description = {_join('' for _ in header)}",
        ]

        out = Path(outdir)
        out.mkdir(parents=True, exist_ok=True)
        base = path.stem
        icsv_path = out / f"{base}.icsv"
        schema_path = out / f"{base}_schema.json"

        self._write_icsv(icsv_path, metadata, fields_lines, header, padded, icsv_delim)

        with open(schema_path, "w", encoding="utf-8") as fh:
            json.dump(schema, fh, indent=2, ensure_ascii=False)

        return str(icsv_path), str(schema_path)

    def _write_icsv(
        self,
        path: Path,
        metadata: dict[str, str],
        fields_lines: list[str],
        header: list[str],
        rows: list[list[str]],
        field_delimiter: str,
    ) -> None:
        """
        Write the iCSV file: first-line marker, [METADATA], [FIELDS], then
        [DATA] with the header row and all data rows.

        Every line ends with "\\n", data rows included, so the file does not
        mix line endings. The parent directory is created if needed.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8", newline="") as fh:
            fh.write("# iCSV 1.0 UTF-8\n")
            fh.write("# [METADATA]\n")
            for k, v in metadata.items():
                fh.write(f"# {k} = {v}\n")
            fh.write("\n")
            fh.write("# [FIELDS]\n")
            for line in fields_lines:
                fh.write(f"# {line}\n")
            fh.write("\n")
            fh.write("# [DATA]\n")
            # csv.writer defaults to "\r\n"; match the "\n" used above.
            writer = csv.writer(fh, delimiter=field_delimiter, lineterminator="\n")
            writer.writerow(header)
            writer.writerows(rows)
|
devo/exceptions.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class DEVOError(Exception):
    """Root of the DEVO exception hierarchy; catching it covers every DEVO failure."""
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EnrichError(DEVOError):
    """Signals a failed CSV → iCSV conversion, e.g. bad input or an unreadable file."""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ParseError(DEVOError):
    """Signals an iCSV file that cannot be parsed, e.g. missing sections or malformed lines."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ValidationError(DEVOError):
    """Raised when validation infrastructure fails — not for data errors themselves.

    Subclass of DEVOError.  For example, devo.validate raises it when the
    'frictionless' package cannot be imported.
    """
|
devo/validate.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""iCSV validation.
|
|
2
|
+
|
|
3
|
+
Public API: validate_icsv(icsv_path, schema_path=None, outdir="DEVO_output")
|
|
4
|
+
|
|
5
|
+
Three-stage check:
|
|
6
|
+
1. Metadata completeness (field_delimiter required; geometry/srid conditional on Q1).
|
|
7
|
+
2. Type consistency: re-infer column types from data and compare to declared types
|
|
8
|
+
(Option A: declared type is authoritative; inferred wider than declared → [WARN]).
|
|
9
|
+
3. Frictionless data validation against the schema JSON.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import csv
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import tempfile
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
from ._infer import COMMON_MISSING, infer_type, is_subtype_or_equal
|
|
21
|
+
from ._parser import ICSVHeader, parse_header
|
|
22
|
+
from ._report import write_report
|
|
23
|
+
from .exceptions import ParseError, ValidationError
|
|
24
|
+
|
|
25
|
+
# How many data rows (excluding header) to sample for type re-inference.
|
|
26
|
+
_INFER_SAMPLE = 500
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _check_metadata(header: ICSVHeader) -> list[str]:
|
|
30
|
+
"""
|
|
31
|
+
Return a list of issue strings (empty = clean).
|
|
32
|
+
geometry/srid are only flagged when spatial column names are present (Q1).
|
|
33
|
+
srid is only required for lat/lon columns — WKT geometry embeds its own CRS.
|
|
34
|
+
"""
|
|
35
|
+
issues = []
|
|
36
|
+
|
|
37
|
+
if "field_delimiter" not in header.metadata:
|
|
38
|
+
issues.append("[FAIL] Missing required metadata key: field_delimiter")
|
|
39
|
+
|
|
40
|
+
fields = header.fields_meta.get("fields", [])
|
|
41
|
+
lat_lon_names = {"lat", "latitude", "lon", "lng", "longitude"}
|
|
42
|
+
wkt_names = {"geometry"}
|
|
43
|
+
has_lat_lon = any(f.lower() in lat_lon_names for f in fields)
|
|
44
|
+
has_wkt = any(f.lower() in wkt_names for f in fields)
|
|
45
|
+
|
|
46
|
+
if has_lat_lon or has_wkt:
|
|
47
|
+
if "geometry" not in header.metadata:
|
|
48
|
+
issues.append(
|
|
49
|
+
"[WARN] Spatial columns detected but 'geometry' metadata key is missing"
|
|
50
|
+
)
|
|
51
|
+
if has_lat_lon:
|
|
52
|
+
if "srid" not in header.metadata:
|
|
53
|
+
issues.append(
|
|
54
|
+
"[WARN] Spatial columns detected but 'srid' metadata key is missing"
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
return issues
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _extract_data(
    icsv_path: Path,
    tmp_csv: Path,
    field_delimiter: str,
) -> list[list[str]]:
    """Copy the ``# [DATA]`` section of an iCSV into a comma-delimited CSV.

    Every non-blank, non-comment line after the ``# [DATA]`` marker is
    parsed with *field_delimiter* and re-written to *tmp_csv* with the
    default ``csv.writer`` dialect.  The first such line is the column
    header; up to ``_INFER_SAMPLE`` of the following rows are collected and
    returned for type inference (the header row is never part of the
    sample).

    Writing with a comma delimiter avoids depending on the Frictionless
    dialect API; ``csv.writer`` quotes values that contain a comma.

    Note: each physical line is parsed on its own, so a quoted field that
    spans several lines is not reassembled into a single record.
    """
    collected: list[list[str]] = []

    with open(icsv_path, "r", encoding="utf-8-sig") as src, \
            open(tmp_csv, "w", encoding="utf-8", newline="") as tgt:
        emit = csv.writer(tgt).writerow

        # Skip everything up to and including the section marker.
        for raw in src:
            if raw.strip() == "# [DATA]":
                break

        header_seen = False
        for raw in src:
            text = raw.strip()
            if text == "" or text.startswith("#"):
                # Blank lines and comments (including a repeated marker)
                # are not data.
                continue
            record = list(
                csv.reader([raw.rstrip("\r\n")], delimiter=field_delimiter)
            )[0]
            emit(record)
            if not header_seen:
                header_seen = True
                continue
            if len(collected) < _INFER_SAMPLE:
                collected.append(record)

    return collected
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _cross_check_types(
    declared_types: list[str],
    data_rows: list[list[str]],
    field_names: list[str],
    missing: frozenset[str] = COMMON_MISSING,
) -> list[tuple[str, str, str, bool]]:
    """Compare declared column types with types re-inferred from the data.

    Returns one ``(col_name, declared, inferred, is_ok)`` tuple per declared
    type, in column order; ``is_ok`` is the result of
    ``is_subtype_or_equal(inferred, declared)``.  A column without a
    matching entry in *field_names* is named by its index.  Returns an empty
    list when either *declared_types* or *data_rows* is empty.

    *missing* is the set of sentinel strings handed to ``infer_type``;
    callers merge the iCSV's own nodata value into it so custom sentinels
    are not treated as real values.
    """
    if not (declared_types and data_rows):
        return []

    # Transpose rows into per-column value lists.  zip() stops at the
    # shorter side, so short rows simply contribute nothing to the trailing
    # columns and extra cells beyond the declared columns are ignored.
    columns: list[list[str]] = [[] for _ in declared_types]
    for record in data_rows:
        for bucket, cell in zip(columns, record):
            bucket.append(cell)

    known_names = len(field_names)
    outcome = []
    for idx, (declared, values) in enumerate(zip(declared_types, columns)):
        label = field_names[idx] if idx < known_names else str(idx)
        inferred = infer_type(values, missing)
        outcome.append(
            (label, declared, inferred, is_subtype_or_equal(inferred, declared))
        )
    return outcome
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _import_frictionless_schema():
    """Return ``frictionless.Schema``, importing it only when called.

    Keeping the import out of module scope means importing this module does
    not require frictionless.  Raises ValidationError (chained to the
    original ImportError) when the package is not available.
    """
    try:
        from frictionless import Schema
    except ImportError as exc:
        raise ValidationError(
            "The 'frictionless' package is required. Install it: pip install frictionless"
        ) from exc
    else:
        return Schema
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def validate_icsv(
    icsv_path: str,
    schema_path: Optional[str] = None,
    outdir: str = "DEVO_output",
) -> tuple[str, bool]:
    """Validate an iCSV file and write a plain-text report.

    Steps, in order: check header metadata, re-infer column types from a
    sample of the data and compare them with the declared types, then run
    Frictionless validation of the data against the schema JSON.

    Args:
        icsv_path: Path of the iCSV file to validate.
        schema_path: Path of the schema JSON.  When omitted, a file named
            ``<stem>_schema.json`` next to the iCSV is used.
        outdir: Directory for the report (created if missing).  The
            temporary CSV handed to Frictionless is also created here and
            removed afterwards.

    Returns:
        ``(report_path, valid)``.  ``valid`` is true only when no metadata
        issue starts with ``[FAIL]`` and Frictionless reports the data as
        valid; type-consistency results do not influence it.

    Raises:
        ValidationError: if frictionless cannot be imported.
        FileNotFoundError: if no schema was given and none exists alongside
            the iCSV.
    """
    try:
        from frictionless import Resource
    except ImportError as exc:
        raise ValidationError(
            "The 'frictionless' package is required. Install it: pip install frictionless"
        ) from exc

    source = Path(icsv_path)
    report_dir = Path(outdir)
    report_dir.mkdir(parents=True, exist_ok=True)

    parsed = parse_header(source)
    metadata_issues = _check_metadata(parsed)
    has_metadata_failure = any(
        entry.startswith("[FAIL]") for entry in metadata_issues
    )

    declared_types = parsed.fields_meta.get("types", [])
    field_names = parsed.fields_meta.get("fields", [])

    if not schema_path:
        # Fall back to the schema the enricher writes next to the iCSV.
        sibling = source.with_name(source.stem + "_schema.json")
        if not sibling.exists():
            raise FileNotFoundError(
                f"No schema provided and none found alongside {source.name}. "
                "Run 'devo enrich' first or pass --schema."
            )
        schema_path = str(sibling)

    # A uniquely named temp file keeps concurrent calls from colliding.
    fd, scratch_name = tempfile.mkstemp(suffix=".csv", dir=report_dir)
    os.close(fd)
    scratch = Path(scratch_name)

    try:
        sample_rows = _extract_data(source, scratch, parsed.field_delimiter)

        # Treat the file's own nodata sentinel (if any) as a missing value.
        sentinel = parsed.metadata.get("nodata", "")
        if sentinel:
            missing_tokens = COMMON_MISSING | {sentinel}
        else:
            missing_tokens = COMMON_MISSING
        type_issues = _cross_check_types(
            declared_types, sample_rows, field_names, missing_tokens
        )

        # The schema is loaded into a dict here so its path is resolved by
        # this function; the data file is passed as a bare name plus
        # basepath rather than as an absolute path.
        with open(schema_path, encoding="utf-8") as schema_file:
            descriptor = json.load(schema_file)
        schema_cls = _import_frictionless_schema()
        schema_obj = schema_cls.from_descriptor(descriptor)
        resource = Resource(
            path=scratch.name,
            basepath=str(scratch.parent),
            schema=schema_obj,
        )
        report = resource.validate()
        data_valid = report.valid
    finally:
        # Always remove the temporary CSV, even when validation raised.
        if scratch.exists():
            scratch.unlink()

    is_valid = (not has_metadata_failure) and data_valid
    report_path = report_dir / f"{source.stem}_DEVO_report.txt"
    write_report(
        path=report_path,
        icsv_name=source.name,
        metadata_issues=metadata_issues,
        type_issues=type_issues,
        frictionless_report=report,
        is_valid=is_valid,
    )

    return str(report_path), is_valid
|
devo/webui.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""A tiny Flask web UI which allows uploading a CSV, creating an iCSV, and running validation.
|
|
2
|
+
|
|
3
|
+
This is intentionally minimal — suitable for local testing and demonstration.
|
|
4
|
+
"""
|
|
5
|
+
from flask import Flask, request, render_template_string, send_file
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from .enrich import ICSVEnricher
|
|
8
|
+
from .validate import validate_icsv
|
|
9
|
+
|
|
10
|
+
# Module-level Flask application object; the route in this module is
# registered on it.
app = Flask(__name__)

# Inline Jinja template: a file-upload form, followed by a "Result" section
# that is rendered only when `message` is truthy.
TEMPLATE = """
<!doctype html>
<title>DEVO demo</title>
<h1>DEVO — upload CSV</h1>
<form method=post enctype=multipart/form-data>
<input type=file name=file>
<input type=submit value=Upload>
</form>
{% if message %}
<hr>
<h2>Result</h2>
<pre>{{ message }}</pre>
{% endif %}
"""
|
|
26
|
+
|
|
27
|
+
@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the upload form and, on POST, enrich and validate the upload.

    GET renders the empty form.  POST saves the uploaded file into
    ``DEVO_output``, converts it with ``ICSVEnricher.make_icsv`` and
    validates the result with ``validate_icsv``; the generated paths and the
    validity flag (or an error text) are shown on the page.
    """
    message = None
    if request.method == "POST":
        f = request.files.get("file")
        if not f:
            message = "No file uploaded"
        else:
            # The filename is client-controlled.  Keep only its final path
            # component (treating backslashes as separators too) so a name
            # such as "../../x" cannot be written outside the output
            # directory.
            safe_name = Path((f.filename or "").replace("\\", "/")).name
            if safe_name in ("", ".", ".."):
                message = "Error: invalid file name"
            else:
                outdir = Path("DEVO_output")
                outdir.mkdir(exist_ok=True)
                infile = outdir / safe_name
                enr = ICSVEnricher()
                try:
                    # Saving is inside the try so that an I/O failure is
                    # rendered on the page like any other error.
                    f.save(infile)
                    icsv, schema = enr.make_icsv(str(infile), str(outdir))
                    report, valid = validate_icsv(icsv, schema_path=schema, outdir=str(outdir))
                    message = f"iCSV: {icsv}\nSchema: {schema}\nReport: {report}\nValid: {valid}"
                except Exception as e:  # top-level demo catch-all; render any error to the UI rather than 500
                    message = f"Error: {e}"
    return render_template_string(TEMPLATE, message=message)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py-devo
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: DEVO — CSV to iCSV enrichment and Frictionless validation
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Source, https://github.com/envidat/devo
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: frictionless>=4.0.0
|
|
11
|
+
Provides-Extra: webui
|
|
12
|
+
Requires-Dist: flask>=2.0.0; extra == "webui"
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# DEVO
|
|
16
|
+
<img title="whip it" alt="you know you should" height="50" src="/images/DEVO_Pixels_1.webp">
|
|
17
|
+
|
|
18
|
+
**Data Enrichment and Validation Operator.** Takes a plain CSV, infers types and constraints, writes a self-documenting [iCSV](https://envidat.github.io/iCSV/) file plus a Frictionless schema, and validates the data against it.
|
|
19
|
+
|
|
20
|
+
If you give it a `.csv`, it enriches → schema → validates. If you give it an `.icsv`, it skips enrichment.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install py-devo   # or, from a source checkout: pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
For the Flask web demo:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install -e ".[webui]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Requires Python 3.9+ and `frictionless` (v4 or v5).
|
|
35
|
+
|
|
36
|
+
## Try it out
|
|
37
|
+
|
|
38
|
+
A small sample dataset lives at `examples/sample.csv` — three columns (`timestamp`, `PSUM`, `TA`) representing hourly weather observations. Use it to take DEVO for a spin without needing your own data.
|
|
39
|
+
|
|
40
|
+
### CLI
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Enrich, build schema, and validate in one command
|
|
44
|
+
devo run examples/sample.csv
|
|
45
|
+
|
|
46
|
+
# Results land in DEVO_output/ by default:
|
|
47
|
+
# sample.icsv — annotated iCSV
|
|
48
|
+
# sample_schema.json — Frictionless Table Schema
|
|
49
|
+
# sample_DEVO_report.txt — human-readable validation report
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Run `devo run examples/sample.csv --out my_output` to write to a different directory.
|
|
53
|
+
|
|
54
|
+
### Python
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from devo.enrich import ICSVEnricher
|
|
58
|
+
from devo.validate import validate_icsv
|
|
59
|
+
|
|
60
|
+
icsv, schema = ICSVEnricher().make_icsv("examples/sample.csv", "DEVO_output")
|
|
61
|
+
report_path, valid = validate_icsv(icsv, schema_path=schema)
|
|
62
|
+
|
|
63
|
+
print(f"Valid: {valid}")
|
|
64
|
+
print(f"Report written to: {report_path}")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Web demo
|
|
68
|
+
|
|
69
|
+
Install the optional Flask dependency first (if you haven't already):
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install -e ".[webui]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Start the local server:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
flask --app devo.webui run
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Then open `http://127.0.0.1:5000` in your browser. Click **Choose File**, select `examples/sample.csv`, and click **Upload**. The page will display the paths to the generated iCSV, schema, and report, along with the overall `Valid` result.
|
|
82
|
+
|
|
83
|
+
> The web UI is a local demo only — do not expose it to a network.
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## CLI
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
devo enrich data.csv # write data.icsv + data_schema.json
|
|
91
|
+
devo validate data.icsv # validate against neighbouring schema
|
|
92
|
+
devo run data.csv # do both in one go
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Common flags: `--out DIR` (default `DEVO_output/`), `--delimiter CHAR`, `--nodata VALUE`, `--app PROFILE`, `--schema PATH`.
|
|
96
|
+
|
|
97
|
+
Exit codes: `0` = success, `1` = validation failed, `2` = usage or runtime error.
|
|
98
|
+
|
|
99
|
+
## What lands on disk
|
|
100
|
+
|
|
101
|
+
For input `data.csv`, after `devo run`:
|
|
102
|
+
|
|
103
|
+
| File | What |
|
|
104
|
+
|---|---|
|
|
105
|
+
| `DEVO_output/data.icsv` | iCSV with `# [METADATA]`, `# [FIELDS]`, `# [DATA]` |
|
|
106
|
+
| `DEVO_output/data_schema.json` | Frictionless Table Schema JSON |
|
|
107
|
+
| `DEVO_output/data_DEVO_report.txt` | Validation report (read this) |
|
|
108
|
+
|
|
109
|
+
## Python API
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from devo.enrich import ICSVEnricher
|
|
113
|
+
from devo.validate import validate_icsv
|
|
114
|
+
|
|
115
|
+
icsv, schema = ICSVEnricher().make_icsv("data.csv", "DEVO_output")
|
|
116
|
+
report_path, valid = validate_icsv(icsv, schema_path=schema)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Files
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
devo/
|
|
123
|
+
├── cli.py # argparse front-end (enrich / validate / run)
|
|
124
|
+
├── enrich.py # CSV → iCSV + schema (ICSVEnricher class)
|
|
125
|
+
├── validate.py # iCSV + schema → Frictionless validation + report
|
|
126
|
+
├── _infer.py # pure type-inference functions (shared by enrich + validate)
|
|
127
|
+
├── _parser.py # iCSV header parser (shared by enrich + validate)
|
|
128
|
+
├── _schema.py # per-column statistics + Frictionless schema builder
|
|
129
|
+
├── _report.py # plain-text report writer
|
|
130
|
+
├── exceptions.py # DEVOError hierarchy
|
|
131
|
+
└── webui.py # Flask demo (optional; requires pip install -e ".[webui]")
|
|
132
|
+
tests/
|
|
133
|
+
├── conftest.py
|
|
134
|
+
├── fixtures/ # sample CSV and iCSV files
|
|
135
|
+
└── test_*.py
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## How it works
|
|
139
|
+
|
|
140
|
+
### Enrichment (`devo enrich`)
|
|
141
|
+
|
|
142
|
+
1. **Read** — the CSV is read in one pass. If no `--delimiter` is given, `csv.Sniffer` detects it from the first 10 lines.
|
|
143
|
+
2. **Delimiter mapping** — comma is remapped to pipe in the iCSV output (pipe is also the default fallback for non-spec delimiters). Column names that contain the output delimiter are rejected with a clear error.
|
|
144
|
+
3. **Normalisation** — every row is padded or clipped to header length and stripped of leading/trailing whitespace.
|
|
145
|
+
4. **Type inference** — each column is classified: `integer → number → datetime → string`. Scientific notation (`1.5e-3`, `2E10`) is recognised as `number`. Missing-value sentinels (and any custom `--nodata` value) are excluded before inference.
|
|
146
|
+
5. **Statistics** — per-column `min`, `max`, and `missing_count` are computed from the normalised data and written to the iCSV `# [FIELDS]` section. They do not appear in the Frictionless schema JSON.
|
|
147
|
+
6. **Geometry detection** — if the header contains `lat`/`latitude` + `lon`/`lng`/`longitude`, DEVO writes `geometry = column:lat,lon` and `srid = EPSG:4326` to metadata. A single column named `geometry` (WKT) gets `geometry = column:geometry` only — no `srid`, because WKT embeds its own CRS.
|
|
148
|
+
7. **Write** — the normalised rows are written to the iCSV `# [DATA]` section, and the Frictionless schema is written to `_schema.json`.
|
|
149
|
+
|
|
150
|
+
### Validation (`devo validate`)
|
|
151
|
+
|
|
152
|
+
1. **Parse header** — `_parser.py` reads the `# [METADATA]` and `# [FIELDS]` sections, using `field_delimiter` from metadata to split field values.
|
|
153
|
+
2. **Metadata check** — required keys are verified. `geometry` and `srid` are only checked when spatial column names are present; `srid` is only required for lat/lon columns (not WKT).
|
|
154
|
+
3. **Type cross-check (Option A)** — column types are re-inferred from up to 500 data rows and compared to the declared types. The iCSV's own `nodata` sentinel is merged with the standard missing-value set before re-inference so custom sentinels are not mistaken for real data. Inferred type narrower than or equal to declared → `[OK]`. Inferred wider → `[WARN]`.
|
|
155
|
+
4. **Frictionless validation** — data is written to a temporary comma-delimited CSV and validated against the schema using `frictionless.Resource`. The temp file is always deleted in a `finally` block.
|
|
156
|
+
5. **Report** — a plain-text `.txt` report is written with three sections: `METADATA`, `TYPE CONSISTENCY`, and `DATA VALIDATION`. `Valid: YES` only when metadata has no `[FAIL]` entries and Frictionless reports no data errors. Type warnings do not affect the valid flag.
|
|
157
|
+
|
|
158
|
+
## Limitations
|
|
159
|
+
|
|
160
|
+
- Type inference is conservative: `integer → number → datetime → string`. Mixed-format columns fall back to `string`.
|
|
161
|
+
- Datetime detection uses `datetime.fromisoformat()` and a fixed list of common strptime formats. Unusual formats need a custom schema.
|
|
162
|
+
- Column descriptions are left blank in the iCSV `# [FIELDS]` section; fill them in by hand.
|
|
163
|
+
- The web UI (`webui.py`) is a local demo only — do not expose it to a network.
|
|
164
|
+
|
|
165
|
+
## License
|
|
166
|
+
|
|
167
|
+
MIT. See `LICENSE`.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
devo/__init__.py,sha256=bB7C_WzwA9iA3r0u17gOINgpbNFQanfii1opOtnvNkY,318
|
|
2
|
+
devo/_infer.py,sha256=_hUPO_4VVMmvuLYksHqbRxMb05iaW1YKNOrqSgghIVY,3719
|
|
3
|
+
devo/_parser.py,sha256=TzQOIeIscQIh-gGzqfyiaE_cq-9cT2dxbLqNYKNyzNY,2867
|
|
4
|
+
devo/_report.py,sha256=DHd58YYoM5l0SmalbVx3gKTk7ClNALP5G_34OXvkdlo,3251
|
|
5
|
+
devo/_schema.py,sha256=c1HDTHqbk8QHukVruAHsyHQhCoe27wrIcbqnQ45H0IQ,3904
|
|
6
|
+
devo/cli.py,sha256=Tb715NlmBMewejt9rDdohZKKxYYBtaw8Yiau7US1iZY,4059
|
|
7
|
+
devo/enrich.py,sha256=3zNa7LIeusw6jfqK1WozZJYlbawIUuSHdSVKRY4jkZc,8491
|
|
8
|
+
devo/exceptions.py,sha256=AtE1OEeK3CIgwWugiaWDu7jO9YpegBpWvuLu8TBH7tA,468
|
|
9
|
+
devo/validate.py,sha256=oN4AN8xIIys9nS1tpG7dgRPQCPHV6jgWehL93X1KgwM,7861
|
|
10
|
+
devo/webui.py,sha256=vemf9W-aMa0Nq69TXIfsUx349s60X5CU3dhJ7B5JqOs,1532
|
|
11
|
+
py_devo-0.2.0.dist-info/licenses/LICENSE,sha256=EG2ApufGa51t5fPfMM9V79YYJ3QVUDqTOR251pNEQ3s,157
|
|
12
|
+
py_devo-0.2.0.dist-info/METADATA,sha256=MOyDwbMwvKortJTeaBMRR8RzvCoaYfb7V4Wf4slJVLA,7343
|
|
13
|
+
py_devo-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
py_devo-0.2.0.dist-info/entry_points.txt,sha256=HRSFN7b01VzDUIpM9sqW-kked7vUOBkXSSAwBSFVfqQ,39
|
|
15
|
+
py_devo-0.2.0.dist-info/top_level.txt,sha256=_jGERvww3l96x1xDBpeKD7weHkxKWVR3UVMs7ty3Qqw,5
|
|
16
|
+
py_devo-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
devo
|