csvmedic 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ # Byte-compiled
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .venv/
25
+ venv/
26
+ ENV/
27
+ env/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+
35
+ # Testing
36
+ .coverage
37
+ .pytest_cache/
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+
42
+ # mypy
43
+ .mypy_cache/
44
+ .dmypy.json
45
+ dmypy.json
46
+
47
+ # uv
48
+ .uv/
49
+
50
+ # OS
51
+ .DS_Store
52
+ Thumbs.db
csvmedic-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 csvmedic contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: csvmedic
3
+ Version: 0.1.0
4
+ Summary: Automatic locale-aware CSV and Excel reader with encoding, delimiter, date format, and number locale detection.
5
+ Project-URL: Homepage, https://github.com/csvmedic/csvmedic
6
+ Project-URL: Documentation, https://csvmedic.readthedocs.io
7
+ Project-URL: Repository, https://github.com/csvmedic/csvmedic
8
+ Author: csvmedic contributors
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
20
+ Requires-Python: >=3.9
21
+ Requires-Dist: charset-normalizer>=3.0.0
22
+ Requires-Dist: pandas>=1.5.0
23
+ Provides-Extra: all
24
+ Requires-Dist: clevercsv>=0.8.0; extra == 'all'
25
+ Requires-Dist: openpyxl>=3.1.0; extra == 'all'
26
+ Provides-Extra: dev
27
+ Requires-Dist: mkdocs-material>=9.0; extra == 'dev'
28
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == 'dev'
29
+ Requires-Dist: mypy>=1.8; extra == 'dev'
30
+ Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
31
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
32
+ Requires-Dist: pytest>=7.0; extra == 'dev'
33
+ Requires-Dist: ruff>=0.4.0; extra == 'dev'
34
+ Provides-Extra: excel
35
+ Requires-Dist: openpyxl>=3.1.0; extra == 'excel'
36
+ Provides-Extra: fast
37
+ Requires-Dist: clevercsv>=0.8.0; extra == 'fast'
38
+ Description-Content-Type: text/markdown
39
+
40
+ # csvmedic
41
+
42
+ Automatic locale-aware CSV and Excel reader. One line to clean messy data:
43
+
44
+ ```python
45
+ import csvmedic
46
+
47
+ df = csvmedic.read("messy_file.csv")
48
+ print(df.diagnosis) # See what was detected and converted
49
+ ```
50
+
51
+ ## What it does
52
+
53
+ | Detects | Examples |
54
+ |--------|----------|
55
+ | **Encoding** | UTF-8, Windows-1252, ISO-8859-1, Shift-JIS, BOM |
56
+ | **Delimiter** | Comma, semicolon, tab, pipe |
57
+ | **Dates** | DD-MM vs MM-DD resolved statistically; ISO, European, US formats |
58
+ | **Numbers** | European (1.234,56) vs US (1,234.56); locale hint |
59
+ | **Booleans** | Yes/No, Ja/Nein, Oui/Non, Sí/No, and more |
60
+ | **Strings** | Preserves leading zeros (IDs like 00742) |
61
+
62
+ Every transformation is recorded in the `.diagnosis` attribute so you can audit what was changed.
63
+
64
+ ## Installation
65
+
66
+ ```bash
67
+ pip install csvmedic
68
+ ```
69
+
70
+ Optional extras:
71
+
72
+ - `pip install csvmedic[fast]` — better dialect detection (clevercsv)
73
+ - `pip install csvmedic[excel]` — .xlsx support (openpyxl)
74
+ - `pip install csvmedic[all]` — both
75
+
76
+ ## Configuration
77
+
78
+ Override auto-detection when you know better:
79
+
80
+ ```python
81
+ df = csvmedic.read(
82
+ "file.csv",
83
+ encoding="utf-8",
84
+ delimiter=";",
85
+ dayfirst=True, # Force DD-MM dates
86
+ preserve_strings=["ID"], # Never convert these columns
87
+ sample_rows=2000, # Rows to use for detection
88
+ confidence_threshold=0.75, # Min confidence to convert (0–1)
89
+ )
90
+ ```
91
+
92
+ ## Analyze without converting
93
+
94
+ ```python
95
+ profile = csvmedic.read_raw("file.csv")
96
+ print(profile.summary())
97
+ print(profile.columns["Date"].details)
98
+ ```
99
+
100
+ ## Schema pinning (recurring files)
101
+
102
+ Save the detected schema after the first read and reuse it so the next read skips detection:
103
+
104
+ ```python
105
+ df = csvmedic.read("monthly_export.csv")
106
+ csvmedic.save_schema(df.attrs["diagnosis"].file_profile, "monthly_export.csvmedic.json")
107
+
108
+ # Next time: same encoding, delimiter, and column types, no re-detection
109
+ df2 = csvmedic.read("monthly_export.csv", schema="monthly_export.csvmedic.json")
110
+ ```
111
+
112
+ ## Batch read with consensus
113
+
114
+ When reading many similar CSVs (e.g. one per month), use consensus so every file gets the same encoding and delimiter:
115
+
116
+ ```python
117
+ dfs = csvmedic.read_batch(["jan.csv", "feb.csv", "mar.csv"], use_consensus=True)
118
+ # Encoding and delimiter are chosen by majority across the three files.
119
+ ```
120
+
121
+ ## Diff: pandas vs csvmedic
122
+
123
+ See exactly what pandas would have changed or corrupted vs what csvmedic preserves:
124
+
125
+ ```python
126
+ result = csvmedic.diff("leading_zeros.csv")
127
+ print(result.summary()) # Columns/rows that differ
128
+ print(result.pandas_df) # Default pandas read
129
+ print(result.csvmedic_df) # csvmedic read (e.g. keeps "00742" as string)
130
+ print(result.sample_differences) # Example (row, column, pandas_val, csvmedic_val)
131
+ ```
132
+
133
+ ## How disambiguation works
134
+
135
+ For ambiguous dates like `03/04/2025` (March 4 or April 3?), csvmedic uses the data itself: if any value has a day > 12 (e.g. `25/03/2025`), the column is treated as day-first. It also uses cross-column inference, separator hints (e.g. period = European), and sequential order. If it still can’t decide, the column stays as string and is marked ambiguous in the diagnosis.
136
+
137
+ ## Documentation
138
+
139
+ - [Quickstart](docs/quickstart.md)
140
+ - [How it works](docs/how-it-works.md)
141
+ - [API reference](docs/api-reference.md)
142
+ - [FAQ](docs/faq.md)
143
+
144
+ ## License
145
+
146
+ MIT
@@ -0,0 +1,107 @@
1
+ # csvmedic
2
+
3
+ Automatic locale-aware CSV and Excel reader. One line to clean messy data:
4
+
5
+ ```python
6
+ import csvmedic
7
+
8
+ df = csvmedic.read("messy_file.csv")
9
+ print(df.diagnosis) # See what was detected and converted
10
+ ```
11
+
12
+ ## What it does
13
+
14
+ | Detects | Examples |
15
+ |--------|----------|
16
+ | **Encoding** | UTF-8, Windows-1252, ISO-8859-1, Shift-JIS, BOM |
17
+ | **Delimiter** | Comma, semicolon, tab, pipe |
18
+ | **Dates** | DD-MM vs MM-DD resolved statistically; ISO, European, US formats |
19
+ | **Numbers** | European (1.234,56) vs US (1,234.56); locale hint |
20
+ | **Booleans** | Yes/No, Ja/Nein, Oui/Non, Sí/No, and more |
21
+ | **Strings** | Preserves leading zeros (IDs like 00742) |
22
+
23
+ Every transformation is recorded in the `.diagnosis` attribute so you can audit what was changed.
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ pip install csvmedic
29
+ ```
30
+
31
+ Optional extras:
32
+
33
+ - `pip install csvmedic[fast]` — better dialect detection (clevercsv)
34
+ - `pip install csvmedic[excel]` — .xlsx support (openpyxl)
35
+ - `pip install csvmedic[all]` — both
36
+
37
+ ## Configuration
38
+
39
+ Override auto-detection when you know better:
40
+
41
+ ```python
42
+ df = csvmedic.read(
43
+ "file.csv",
44
+ encoding="utf-8",
45
+ delimiter=";",
46
+ dayfirst=True, # Force DD-MM dates
47
+ preserve_strings=["ID"], # Never convert these columns
48
+ sample_rows=2000, # Rows to use for detection
49
+ confidence_threshold=0.75, # Min confidence to convert (0–1)
50
+ )
51
+ ```
52
+
53
+ ## Analyze without converting
54
+
55
+ ```python
56
+ profile = csvmedic.read_raw("file.csv")
57
+ print(profile.summary())
58
+ print(profile.columns["Date"].details)
59
+ ```
60
+
61
+ ## Schema pinning (recurring files)
62
+
63
+ Save the detected schema after the first read and reuse it so the next read skips detection:
64
+
65
+ ```python
66
+ df = csvmedic.read("monthly_export.csv")
67
+ csvmedic.save_schema(df.attrs["diagnosis"].file_profile, "monthly_export.csvmedic.json")
68
+
69
+ # Next time: same encoding, delimiter, and column types, no re-detection
70
+ df2 = csvmedic.read("monthly_export.csv", schema="monthly_export.csvmedic.json")
71
+ ```
72
+
73
+ ## Batch read with consensus
74
+
75
+ When reading many similar CSVs (e.g. one per month), use consensus so every file gets the same encoding and delimiter:
76
+
77
+ ```python
78
+ dfs = csvmedic.read_batch(["jan.csv", "feb.csv", "mar.csv"], use_consensus=True)
79
+ # Encoding and delimiter are chosen by majority across the three files.
80
+ ```
81
+
82
+ ## Diff: pandas vs csvmedic
83
+
84
+ See exactly what pandas would have changed or corrupted vs what csvmedic preserves:
85
+
86
+ ```python
87
+ result = csvmedic.diff("leading_zeros.csv")
88
+ print(result.summary()) # Columns/rows that differ
89
+ print(result.pandas_df) # Default pandas read
90
+ print(result.csvmedic_df) # csvmedic read (e.g. keeps "00742" as string)
91
+ print(result.sample_differences) # Example (row, column, pandas_val, csvmedic_val)
92
+ ```
93
+
94
+ ## How disambiguation works
95
+
96
+ For ambiguous dates like `03/04/2025` (March 4 or April 3?), csvmedic uses the data itself: if any value has a day > 12 (e.g. `25/03/2025`), the column is treated as day-first. It also uses cross-column inference, separator hints (e.g. period = European), and sequential order. If it still can’t decide, the column stays as string and is marked ambiguous in the diagnosis.
97
+
98
+ ## Documentation
99
+
100
+ - [Quickstart](docs/quickstart.md)
101
+ - [How it works](docs/how-it-works.md)
102
+ - [API reference](docs/api-reference.md)
103
+ - [FAQ](docs/faq.md)
104
+
105
+ ## License
106
+
107
+ MIT
@@ -0,0 +1,74 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "csvmedic"
7
+ version = "0.1.0"
8
+ description = "Automatic locale-aware CSV and Excel reader with encoding, delimiter, date format, and number locale detection."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ { name = "csvmedic contributors" },
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Scientific/Engineering :: Information Analysis",
25
+ ]
26
+ dependencies = [
27
+ "pandas>=1.5.0",
28
+ "charset-normalizer>=3.0.0",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ fast = ["clevercsv>=0.8.0"]
33
+ excel = ["openpyxl>=3.1.0"]
34
+ all = ["clevercsv>=0.8.0", "openpyxl>=3.1.0"]
35
+ dev = [
36
+ "pytest>=7.0",
37
+ "pytest-cov>=4.0",
38
+ "ruff>=0.4.0",
39
+ "mypy>=1.8",
40
+ "pandas-stubs>=2.0",
41
+ "mkdocs-material>=9.0",
42
+ "mkdocstrings[python]>=0.24",
43
+ ]
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/csvmedic/csvmedic"
47
+ Documentation = "https://csvmedic.readthedocs.io"
48
+ Repository = "https://github.com/csvmedic/csvmedic"
49
+
50
+ [tool.hatch.build.targets.wheel]
51
+ packages = ["src/csvmedic"]
52
+
53
+ [tool.hatch.build.targets.sdist]
54
+ include = ["src/csvmedic"]
55
+
56
+ [tool.ruff]
57
+ line-length = 99
58
+ target-version = "py39"
59
+ src = ["src", "tests"]
60
+
61
+ [tool.ruff.lint]
62
+ select = ["E", "F", "I", "N", "W", "UP"]
63
+
64
+ [tool.mypy]
65
+ python_version = "3.9"
66
+ strict = true
67
+ warn_return_any = true
68
+ warn_unused_ignores = true
69
+ disallow_untyped_defs = true
70
+
71
+ [tool.pytest.ini_options]
72
+ testpaths = ["tests"]
73
+ addopts = "-v"
74
+ filterwarnings = ["ignore::DeprecationWarning"]
@@ -0,0 +1,27 @@
1
+ """csvmedic — Automatic locale-aware CSV reading."""
2
+
3
+ from csvmedic import accessor # noqa: F401 — registers df.diagnosis accessor
4
+ from csvmedic._version import __version__
5
+ from csvmedic.batch import read_batch
6
+ from csvmedic.diagnosis import Diagnosis, TransformationRecord
7
+ from csvmedic.diff import DiffResult, diff
8
+ from csvmedic.models import ColumnProfile, FileProfile
9
+ from csvmedic.reader import MedicReader, read, read_raw
10
+ from csvmedic.schema import load_schema, save_schema, schema_path_for_csv
11
+
12
# Public API surface; mirrors the imports above.
__all__ = [
    "__version__",
    # reading entry points
    "read",
    "read_raw",
    "read_batch",
    "MedicReader",
    # diagnosis objects
    "Diagnosis",
    "TransformationRecord",
    "ColumnProfile",
    "FileProfile",
    # schema persistence
    "save_schema",
    "load_schema",
    "schema_path_for_csv",
    # pandas-vs-csvmedic comparison
    "diff",
    "DiffResult",
]
@@ -0,0 +1,3 @@
1
"""Single source of version string for csvmedic."""

# Keep in sync with the `version` field in pyproject.toml.
__version__ = "0.1.0"
@@ -0,0 +1,27 @@
1
+ """Pandas DataFrame accessor for .diagnosis attribute."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+
10
@pd.api.extensions.register_dataframe_accessor("diagnosis")
class DiagnosisAccessor:
    """Accessor for ``df.diagnosis``.

    Proxies to whatever object is stored under ``df.attrs["diagnosis"]``:
    ``repr`` and attribute access are forwarded to it, with a friendly
    fallback when no diagnosis has been attached.
    """

    def __init__(self, pandas_obj: pd.DataFrame) -> None:
        # pandas instantiates the accessor with the owning DataFrame.
        self._obj = pandas_obj

    def _current(self) -> Any:
        # Looked up fresh on every access so a diagnosis attached to attrs
        # after the accessor was first created is still visible.
        return self._obj.attrs.get("diagnosis")

    def __repr__(self) -> str:
        diag = self._current()
        return "No diagnosis available" if diag is None else repr(diag)

    def __getattr__(self, name: str) -> Any:
        diag = self._current()
        if diag is None:
            raise AttributeError("No diagnosis available")
        return getattr(diag, name)
@@ -0,0 +1,102 @@
1
+ """
2
+ Multi-file batch read with consensus detection.
3
+
4
+ When reading multiple similar CSVs (e.g. monthly exports), run encoding and
5
+ delimiter detection on each file's sample and use the majority result for all,
6
+ so every file is read with the same settings.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections import Counter
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import pandas as pd
16
+
17
+ from csvmedic.detectors.dialect import detect_dialect
18
+ from csvmedic.detectors.encoding import detect_encoding
19
+ from csvmedic.reader import _read_byte_sample, read
20
+
21
+
22
def _consensus_encoding_dialect(
    paths: list[Path],
) -> tuple[str, str, bool]:
    """Detect encoding/dialect per path; return (encoding, delimiter, has_header).

    Each file contributes one vote for encoding, delimiter, and header
    presence; the majority wins. Files whose sampling or detection raises are
    skipped entirely. If no file could be analysed, common CSV defaults
    (utf-8, comma, header present) are returned.

    Fix vs. the original: results are appended all-or-nothing per file. The
    original appended the encoding before dialect detection inside the same
    ``try``, so a ``detect_dialect`` failure left the lists inconsistent —
    and if dialect detection failed for every file, ``most_common`` on the
    empty delimiter counter raised IndexError.
    """
    encodings: list[str] = []
    delimiters: list[str] = []
    headers: list[bool] = []
    for path in paths:
        try:
            bytes_sample, _ = _read_byte_sample(path)
            enc = detect_encoding(bytes_sample)
            decoded = bytes_sample.decode(enc.encoding, errors="replace")
            dialect = detect_dialect(None, enc.encoding, sample_text=decoded)
        except Exception:
            # Best-effort: one unreadable file must not break the batch.
            continue
        # Only record a file once everything about it was detected, so the
        # three vote lists always stay the same length.
        encodings.append(enc.encoding)
        delimiters.append(dialect.delimiter)
        headers.append(dialect.has_header)
    if not encodings:
        return ("utf-8", ",", True)
    encoding = Counter(encodings).most_common(1)[0][0]
    delimiter = Counter(delimiters).most_common(1)[0][0]
    has_header = sum(headers) > len(headers) / 2
    return (encoding, delimiter, has_header)
48
+
49
+
50
def read_batch(
    paths: str | Path | list[str] | list[Path],
    *,
    encoding: str | None = None,
    delimiter: str | None = None,
    use_consensus: bool = True,
    **read_kw: Any,
) -> list[pd.DataFrame]:
    """
    Read multiple CSV files with optional consensus detection.

    With use_consensus=True (the default) and no explicit encoding or
    delimiter, a sample of every file is analysed and the majority encoding
    and delimiter are applied to all of them, so the whole batch is read
    with identical settings. Otherwise each file goes through read() with
    its own detection, honouring any encoding/delimiter overrides given here.

    Parameters
    ----------
    paths : path or list of paths
        One or more paths to CSV files.
    encoding : str, optional
        If set, overrides consensus and is used for all files.
    delimiter : str, optional
        If set, overrides consensus and is used for all files.
    use_consensus : bool
        If True, run detection on each file and use majority encoding/delimiter.
    **read_kw
        Passed through to read() (e.g. sample_rows, confidence_threshold).

    Returns
    -------
    list of DataFrame
        One DataFrame per path, in order.
    """
    if isinstance(paths, (str, Path)):
        targets = [Path(paths)]
    else:
        targets = [Path(p) for p in paths]
    if not targets:
        return []

    if use_consensus and encoding is None and delimiter is None:
        majority_enc, majority_delim, _has_header = _consensus_encoding_dialect(targets)
        # has_header is not a read() kwarg, so header handling is left to
        # read()'s own detection.
        read_kw["encoding"] = majority_enc
        read_kw["delimiter"] = majority_delim
    else:
        if encoding is not None:
            read_kw["encoding"] = encoding
        if delimiter is not None:
            read_kw["delimiter"] = delimiter

    return [read(target, **read_kw) for target in targets]
@@ -0,0 +1,8 @@
1
+ """Confidence scoring for ambiguous detections."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
def clamp_confidence(value: float, min_val: float = 0.0, max_val: float = 1.0) -> float:
    """Clamp a confidence score to [min_val, max_val]."""
    # Apply the upper bound first, then the lower bound — same order as the
    # nested max(min(...)) form.
    capped = min(max_val, value)
    return max(min_val, capped)
@@ -0,0 +1 @@
1
+ """Detectors for encoding, dialect, dates, numbers, booleans, strings."""
@@ -0,0 +1,61 @@
1
+ """Boolean variant detection across locales."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
# (true_values, false_values) per locale
BOOLEAN_MAP: list[tuple[list[str], list[str]]] = [
    (["true", "yes", "y", "1", "on"], ["false", "no", "n", "0", "off"]),
    (["ja", "j", "jaa"], ["nein", "n"]),
    (["oui", "o", "vrai"], ["non", "faux"]),
    (["sí", "si", "s"], ["no"]),
    (["vero", "sì"], ["falso"]),
    (["waar"], ["onwaar"]),
]

# Flattened lookup sets across all locales. Built with comprehensions so the
# loop variables do not leak into the module namespace (the original
# module-level for-loop left `trues`/`falses` importable from this module).
ALL_TRUE: set[str] = {v for trues, _ in BOOLEAN_MAP for v in trues}
ALL_FALSE: set[str] = {v for _, falses in BOOLEAN_MAP for v in falses}
22
+
23
+
24
@dataclass
class BooleanDetectionResult:
    """Result of boolean detection."""

    # True when the column is judged boolean (>=90% of non-null values match
    # a known true/false variant and both polarities are present).
    is_boolean: bool
    # Match ratio (0-1); slightly boosted on success, 0.0 when no values.
    confidence: float
    # Distinct true-side spellings actually seen in the column (lowercased).
    true_variants: list[str]
    # Distinct false-side spellings actually seen in the column (lowercased).
    false_variants: list[str]
32
+
33
+
34
def detect_boolean_column(values: list[str]) -> BooleanDetectionResult:
    """Detect if column is boolean; require >=90% of non-null values to match."""
    cleaned = [str(raw).strip().lower() for raw in values if raw is not None and str(raw).strip()]
    if not cleaned:
        return BooleanDetectionResult(False, 0.0, [], [])

    distinct = set(cleaned)
    if len(distinct) > 2:
        # More than two distinct values cannot be a plain true/false column.
        return BooleanDetectionResult(False, 0.0, [], [])

    known = ALL_TRUE | ALL_FALSE
    ratio = sum(1 for item in cleaned if item in known) / len(cleaned)
    if ratio < 0.9:
        return BooleanDetectionResult(False, ratio, [], [])

    trues_present = [item for item in distinct if item in ALL_TRUE]
    falses_present = [item for item in distinct if item in ALL_FALSE]
    # Both polarities must actually occur; a column of only "yes" is treated
    # as non-boolean here.
    if not trues_present or not falses_present:
        return BooleanDetectionResult(False, ratio, [], [])

    return BooleanDetectionResult(
        is_boolean=True,
        confidence=min(1.0, ratio + 0.05),
        true_variants=trues_present,
        false_variants=falses_present,
    )