datemonkey 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datemonkey/__init__.py +57 -0
- datemonkey/cli.py +213 -0
- datemonkey/detector.py +306 -0
- datemonkey/excel.py +120 -0
- datemonkey/formats.py +133 -0
- datemonkey/models.py +196 -0
- datemonkey/parser.py +174 -0
- datemonkey-0.1.0.dist-info/METADATA +198 -0
- datemonkey-0.1.0.dist-info/RECORD +13 -0
- datemonkey-0.1.0.dist-info/WHEEL +5 -0
- datemonkey-0.1.0.dist-info/entry_points.txt +2 -0
- datemonkey-0.1.0.dist-info/licenses/LICENSE +21 -0
- datemonkey-0.1.0.dist-info/top_level.txt +1 -0
datemonkey/formats.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Known date format patterns and detection logic."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from .models import DateFormat
|
|
9
|
+
|
|
10
|
+
# ── Well-known formats ───────────────────────────────────────────────────
# Each constant bundles a strptime pattern, a human-readable label and a
# sample value. The order of definition here is for readability only; the
# priority used during detection is set by the group lists further down.

# ISO 8601 family (year first, dash separated)
ISO_8601 = DateFormat(
    pattern="%Y-%m-%d",
    label="ISO 8601",
    example="2024-03-15",
)
ISO_8601_T = DateFormat(
    pattern="%Y-%m-%dT%H:%M:%S",
    label="ISO 8601 datetime",
    example="2024-03-15T14:30:00",
)
ISO_8601_TZ = DateFormat(
    pattern="%Y-%m-%dT%H:%M:%S%z",
    label="ISO 8601 with timezone",
    example="2024-03-15T14:30:00+00:00",
)

# US style: month before day
US_SLASH = DateFormat(
    pattern="%m/%d/%Y",
    label="US date (MM/DD/YYYY)",
    example="03/15/2024",
)
US_SLASH_SHORT = DateFormat(
    pattern="%m/%d/%y",
    label="US date short year (MM/DD/YY)",
    example="03/15/24",
)
US_DASH = DateFormat(
    pattern="%m-%d-%Y",
    label="US date dash (MM-DD-YYYY)",
    example="03-15-2024",
)

# European style: day before month
EU_SLASH = DateFormat(
    pattern="%d/%m/%Y",
    label="European date (DD/MM/YYYY)",
    example="15/03/2024",
)
EU_SLASH_SHORT = DateFormat(
    pattern="%d/%m/%y",
    label="European date short year (DD/MM/YY)",
    example="15/03/24",
)
EU_DASH = DateFormat(
    pattern="%d-%m-%Y",
    label="European date dash (DD-MM-YYYY)",
    example="15-03-2024",
)
EU_DOT = DateFormat(
    pattern="%d.%m.%Y",
    label="European date dot (DD.MM.YYYY)",
    example="15.03.2024",
)
EU_DOT_SHORT = DateFormat(
    pattern="%d.%m.%y",
    label="European date dot short (DD.MM.YY)",
    example="15.03.24",
)

# Other year-first layouts
YYYYMMDD = DateFormat(
    pattern="%Y%m%d",
    label="Compact (YYYYMMDD)",
    example="20240315",
)

YYYY_MM_DD_SLASH = DateFormat(
    pattern="%Y/%m/%d",
    label="Year-first slash (YYYY/MM/DD)",
    example="2024/03/15",
)

# Layouts that spell out the month (full or abbreviated name)
MONTH_NAME_DMY = DateFormat(
    pattern="%d %B %Y",
    label="Day Month Year (15 March 2024)",
    example="15 March 2024",
)
MONTH_NAME_MDY = DateFormat(
    pattern="%B %d, %Y",
    label="Month Day, Year (March 15, 2024)",
    example="March 15, 2024",
)
MONTH_ABBR_DMY = DateFormat(
    pattern="%d %b %Y",
    label="Day Mon Year (15 Mar 2024)",
    example="15 Mar 2024",
)
MONTH_ABBR_MDY = DateFormat(
    pattern="%b %d, %Y",
    label="Mon Day, Year (Mar 15, 2024)",
    example="Mar 15, 2024",
)
MONTH_ABBR_MDY_NO_COMMA = DateFormat(
    pattern="%b %d %Y",
    label="Mon Day Year (Mar 15 2024)",
    example="Mar 15 2024",
)
MONTH_NAME_MDY_NO_COMMA = DateFormat(
    pattern="%B %d %Y",
    label="Month Day Year (March 15 2024)",
    example="March 15 2024",
)

# Date plus time-of-day
US_SLASH_TIME = DateFormat(
    pattern="%m/%d/%Y %H:%M:%S",
    label="US datetime",
    example="03/15/2024 14:30:00",
)
US_SLASH_TIME_12 = DateFormat(
    pattern="%m/%d/%Y %I:%M:%S %p",
    label="US datetime 12h",
    example="03/15/2024 02:30:00 PM",
)
EU_SLASH_TIME = DateFormat(
    pattern="%d/%m/%Y %H:%M:%S",
    label="European datetime",
    example="15/03/2024 14:30:00",
)
ISO_SPACE = DateFormat(
    pattern="%Y-%m-%d %H:%M:%S",
    label="ISO datetime (space)",
    example="2024-03-15 14:30:00",
)
|
|
42
|
+
|
|
43
|
+
# ── Format groups ────────────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
# Formats whose day and month cannot be confused with each other:
# year-first layouts and layouts that spell out the month name.
UNAMBIGUOUS_FORMATS: list[DateFormat] = [
    ISO_8601_TZ, ISO_8601_T, ISO_8601, ISO_SPACE,
    YYYY_MM_DD_SLASH, YYYYMMDD,
    MONTH_NAME_MDY, MONTH_NAME_DMY,
    MONTH_ABBR_MDY, MONTH_ABBR_DMY,
    MONTH_ABBR_MDY_NO_COMMA, MONTH_NAME_MDY_NO_COMMA,
]

# Month-first (MM/DD) numeric formats.
US_FORMATS: list[DateFormat] = [
    US_SLASH, US_SLASH_SHORT, US_DASH,
    US_SLASH_TIME, US_SLASH_TIME_12,
]

# Day-first (DD/MM) numeric formats.
EU_FORMATS: list[DateFormat] = [
    EU_SLASH, EU_SLASH_SHORT, EU_DASH,
    EU_DOT, EU_DOT_SHORT,
    EU_SLASH_TIME,
]

# Every known format in detection priority order:
# unambiguous first, then US, then European.
ALL_FORMATS: list[DateFormat] = [*UNAMBIGUOUS_FORMATS, *US_FORMATS, *EU_FORMATS]

# ── Ambiguous pairs: formats that differ only in day/month position ──────
# Each tuple is (month-first format, day-first format).
AMBIGUOUS_PAIRS: list[tuple[DateFormat, DateFormat]] = [
    (US_SLASH, EU_SLASH),
    (US_SLASH_SHORT, EU_SLASH_SHORT),
    (US_DASH, EU_DASH),
    (US_SLASH_TIME, EU_SLASH_TIME),
]
|
|
91
|
+
|
|
92
|
+
# ── Regex for quick pre-screening ────────────────────────────────────────
|
|
93
|
+
|
|
94
|
+
# Shape test for Excel serial numbers: one to seven digits, optionally
# followed by a fractional part. The numeric range is checked separately.
EXCEL_SERIAL_RE = re.compile(r"^\d{1,7}(\.\d+)?$")

# Loose, unanchored screen: does the text contain anything date-shaped?
# The alternatives are joined with "|" inside a single non-capturing group.
DATE_LIKE_RE = re.compile(
    "(?:"
    + "|".join(
        (
            r"\d{1,4}[\-/\.]\d{1,2}[\-/\.]\d{1,4}",  # numeric with separators
            r"\d{8}",  # YYYYMMDD
            r"\d{1,2}\s+\w+\s+\d{2,4}",  # 15 March 2024
            r"\w+\s+\d{1,2},?\s+\d{2,4}",  # March 15, 2024
            r"\d{4}-\d{2}-\d{2}T",  # ISO with T
        )
    )
    + ")",
    re.IGNORECASE,
)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def is_date_like(value: str) -> bool:
    """Cheap pre-screen: report whether *value* contains date-shaped text.

    Surrounding whitespace is ignored. The check is a regex search with
    DATE_LIKE_RE, so a match anywhere in the string counts.
    """
    candidate = value.strip()
    return DATE_LIKE_RE.search(candidate) is not None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def could_be_excel_serial(value: str) -> bool:
    """Report whether *value* is plausibly an Excel serial date number.

    The stripped text must be a plain non-negative decimal (per
    EXCEL_SERIAL_RE) and its numeric value must lie between 1 and 2958465
    inclusive.
    """
    text = value.strip()
    if EXCEL_SERIAL_RE.match(text) is None:
        return False
    # 1 is the first serial day (1900-01-01); 2958465 is 9999-12-31.
    # Real-world dates mostly fall well below ~55000 (around 2050).
    serial = float(text)
    return 1 <= serial <= 2958465
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_ambiguous_partner(fmt: DateFormat) -> Optional[DateFormat]:
    """Return the format that mirrors *fmt* with day and month swapped.

    Looks *fmt* up in AMBIGUOUS_PAIRS and returns the other member of the
    first pair that contains it, or None when *fmt* is in no pair.
    """
    for pair in AMBIGUOUS_PAIRS:
        for position, member in enumerate(pair):
            if fmt == member:
                # position is 0 or 1, so 1 - position selects the other slot.
                return pair[1 - position]
    return None
|
datemonkey/models.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Result models for datemonkey."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import datetime
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AmbiguityType(Enum):
    """Kinds of ambiguity that can be reported for date values."""

    # Nothing ambiguous.
    NONE = "none"
    # DD/MM vs MM/DD: day and month positions cannot be told apart.
    DAY_MONTH_SWAP = "day_month_swap"
    # Two-digit year: 03 could mean 2003 or 1903.
    TWO_DIGIT_YEAR = "two_digit_year"
    # More than one format appears in the same batch.
    MIXED_FORMATS = "mixed_formats"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Confidence(Enum):
    """How much trust to place in a parsed date or a detected format."""

    # Unambiguous parse.
    HIGH = "high"
    # Likely correct, but some ambiguity remains.
    MEDIUM = "medium"
    # Ambiguous; the result could be wrong.
    LOW = "low"
    # The value could not be parsed.
    FAILED = "failed"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class DateFormat:
    """Immutable description of one date format.

    Instances are frozen, so they compare by value and can be used as
    dictionary keys or set members.

    Attributes:
        pattern: Format string using strptime/strftime directives
            (e.g. "%Y-%m-%d").
        label: Name shown to humans (e.g. "ISO 8601", "US date").
        example: A sample value written in this format; empty by default.
    """

    pattern: str
    label: str
    example: str = ""

    def __str__(self) -> str:
        """Return the raw pattern, so ``str(fmt)`` yields the format string."""
        return self.pattern
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class DateResult:
    """Outcome of parsing one value.

    Instances are returned inside a BatchResult. Consumers should inspect
    ``confidence`` and ``warnings`` before relying on ``parsed``.

    Attributes:
        original: The input value exactly as received.
        parsed: The resulting datetime, or None when parsing failed.
        format_used: The DateFormat that was applied.
        confidence: Confidence in this parse; FAILED by default.
        warnings: Problems noticed for this value (ambiguity, coercion, etc.).
        row_index: Position of the value in its source batch, if known.
    """

    original: Any
    parsed: Optional[datetime.datetime] = None
    format_used: Optional[DateFormat] = None
    confidence: Confidence = Confidence.FAILED
    warnings: list[str] = field(default_factory=list)
    row_index: Optional[int] = None

    @property
    def ok(self) -> bool:
        """Whether a datetime was produced and the parse is not marked FAILED."""
        if self.parsed is None:
            return False
        return self.confidence != Confidence.FAILED

    @property
    def date(self) -> Optional[datetime.date]:
        """The calendar date of ``parsed``, or None when nothing was parsed."""
        moment = self.parsed
        if not moment:
            return None
        return moment.date()

    @property
    def iso(self) -> Optional[str]:
        """``parsed`` rendered via ``isoformat()``, or None when nothing was parsed."""
        moment = self.parsed
        if not moment:
            return None
        return moment.isoformat()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class FormatDetectionResult:
    """Outcome of detecting the date format shared by a batch of values.

    Attributes:
        format: Best-guess format, or None when detection failed.
        confidence: Overall confidence in ``format``; FAILED by default.
        ambiguities: The kinds of ambiguity that were found.
        candidates: Every format that was considered, with match counts.
        sample_size: How many values were analyzed.
        match_count: How many of them match ``format``.
        warnings: Problems noticed during detection.
    """

    format: Optional[DateFormat] = None
    confidence: Confidence = Confidence.FAILED
    ambiguities: list[AmbiguityType] = field(default_factory=list)
    candidates: list[FormatCandidate] = field(default_factory=list)
    sample_size: int = 0
    match_count: int = 0
    warnings: list[str] = field(default_factory=list)

    @property
    def is_ambiguous(self) -> bool:
        """Whether at least one ambiguity was recorded."""
        return bool(self.ambiguities)

    @property
    def match_ratio(self) -> float:
        """Share of analyzed values that match ``format`` (0.0 for an empty sample)."""
        return 0.0 if self.sample_size == 0 else self.match_count / self.sample_size
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class FormatCandidate:
    """One format under consideration, together with how well it matched.

    Attributes:
        format: The date format being evaluated.
        match_count: Number of values that match it.
        sample_size: Number of values tested against it.
        confidence: Confidence that would apply if this format were chosen.
    """

    format: DateFormat
    match_count: int = 0
    sample_size: int = 0
    confidence: Confidence = Confidence.FAILED

    @property
    def match_ratio(self) -> float:
        """Share of tested values that match (0.0 when nothing was tested)."""
        return 0.0 if self.sample_size == 0 else self.match_count / self.sample_size
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@dataclass
class BatchResult:
    """Result of parsing a batch of date values.

    Attributes:
        results: Per-value parse results.
        detected_format: The format used (detected or specified).
        total: Total number of values in the batch.
        parsed_count: Number successfully parsed.
        failed_count: Number that failed to parse.
        warnings: Batch-level warnings.
        format_detection: Full detection result, if detection was performed.
    """

    results: list[DateResult] = field(default_factory=list)
    detected_format: Optional[DateFormat] = None
    total: int = 0
    parsed_count: int = 0
    failed_count: int = 0
    warnings: list[str] = field(default_factory=list)
    format_detection: Optional[FormatDetectionResult] = None

    @property
    def ok(self) -> bool:
        """True only if the batch is non-empty and every value was parsed.

        Both counters are checked: a batch that was never parsed (for
        example one built with only ``total`` set) has ``failed_count == 0``
        but also ``parsed_count == 0`` and must not be reported as a success.
        """
        return (
            self.total > 0
            and self.failed_count == 0
            and self.parsed_count == self.total
        )

    @property
    def success_ratio(self) -> float:
        """Fraction of values successfully parsed (0.0 for an empty batch)."""
        if self.total == 0:
            return 0.0
        return self.parsed_count / self.total

    @property
    def dates(self) -> list[Optional[datetime.datetime]]:
        """Extract just the parsed datetimes (None for failures)."""
        return [r.parsed for r in self.results]

    @property
    def iso_strings(self) -> list[Optional[str]]:
        """Extract ISO 8601 strings (None for failures)."""
        return [r.iso for r in self.results]

    @property
    def failed(self) -> list[DateResult]:
        """Return only the results whose ``ok`` is false."""
        return [r for r in self.results if not r.ok]

    @property
    def succeeded(self) -> list[DateResult]:
        """Return only the results whose ``ok`` is true."""
        return [r for r in self.results if r.ok]
|
datemonkey/parser.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Batch date parser with format lock-in.
|
|
2
|
+
|
|
3
|
+
Once a format is detected (or specified), it is enforced across all values
|
|
4
|
+
in the batch. Values that don't match are flagged, not silently re-guessed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import datetime
|
|
10
|
+
from typing import Any, Optional, Sequence, Union
|
|
11
|
+
|
|
12
|
+
from .detector import detect_format
|
|
13
|
+
from .excel import EXCEL_SERIAL_FORMAT, parse_excel_serial
|
|
14
|
+
from .formats import could_be_excel_serial
|
|
15
|
+
from .models import (
|
|
16
|
+
AmbiguityType,
|
|
17
|
+
BatchResult,
|
|
18
|
+
Confidence,
|
|
19
|
+
DateFormat,
|
|
20
|
+
DateResult,
|
|
21
|
+
FormatDetectionResult,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _parse_single(
    value: Any,
    fmt: DateFormat,
    row_index: Optional[int] = None,
) -> DateResult:
    """Parse one value against a fixed format; no other format is tried.

    Args:
        value: The raw input; None and blank strings are reported as failures.
        fmt: The format to enforce. EXCEL_SERIAL_FORMAT routes the value to
            the Excel serial converter instead of strptime.
        row_index: Position of the value in its batch, copied to the result.

    Returns:
        A DateResult with confidence HIGH on a strptime match, or FAILED with
        an explanatory warning otherwise. For EXCEL_SERIAL_FORMAT the result
        is whatever parse_excel_serial returns.
    """
    is_blank = value is None or (isinstance(value, str) and not value.strip())
    if is_blank:
        return DateResult(
            original=value,
            confidence=Confidence.FAILED,
            format_used=fmt,
            warnings=["Blank or None value."],
            row_index=row_index,
        )

    text = str(value).strip()

    # Excel serials are handed over as the raw value, not the stripped text.
    if fmt == EXCEL_SERIAL_FORMAT:
        return parse_excel_serial(value, row_index=row_index)

    # Everything else goes through strptime with the locked-in pattern.
    try:
        moment = datetime.datetime.strptime(text, fmt.pattern)
    except (ValueError, OverflowError):
        notes = [f"Value does not match format {fmt.label} ({fmt.pattern})."]
        # Point out a likely cause when the text is a bare serial number.
        if could_be_excel_serial(text):
            notes.append("This looks like an Excel serial date number.")
        return DateResult(
            original=value,
            confidence=Confidence.FAILED,
            format_used=fmt,
            warnings=notes,
            row_index=row_index,
        )

    return DateResult(
        original=value,
        parsed=moment,
        format_used=fmt,
        confidence=Confidence.HIGH,
        row_index=row_index,
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _fail_all(
    values: Sequence[Any],
    fmt: Optional[DateFormat],
    reason: str,
) -> list[DateResult]:
    """Build one FAILED DateResult per input value, each carrying *reason*."""
    return [
        DateResult(
            original=v,
            confidence=Confidence.FAILED,
            format_used=fmt,
            warnings=[reason],
            row_index=i,
        )
        for i, v in enumerate(values)
    ]


def parse_dates(
    values: Sequence[Any],
    *,
    format: Optional[Union[DateFormat, str]] = None,
    locale_preference: Optional[str] = None,
    strict: bool = False,
) -> BatchResult:
    """Parse a batch of date values with a single, locked-in format.

    If ``format`` is provided, it is used directly (format lock-in).
    If not, the format is auto-detected from the batch.

    Args:
        values: Sequence of date-like values (strings, ints, floats, None).
        format: A DateFormat, a strptime pattern string, or the literal
            string "EXCEL_SERIAL" to enforce. If None, auto-detect from
            the values.
        locale_preference: Hint for resolving DD/MM vs MM/DD ambiguity
            during auto-detection. "us" for MM/DD, "eu" for DD/MM.
        strict: If True and auto-detection reports DD/MM vs MM/DD
            ambiguity, refuse to parse. Has no effect when ``format`` is
            given, because no detection is performed.

    Returns:
        BatchResult containing per-value results, the format used, and
        batch-level statistics. When the format cannot be determined, or
        strict mode refuses the batch, every value is reported as FAILED
        (``failed_count == total``), so ``results`` stays aligned with the
        input and the batch is never reported as ``ok``. Empty input
        yields an empty BatchResult with a warning.
    """
    # len() rather than truthiness, so array-like sequences that refuse
    # bool() are handled too; None keeps being treated as empty input.
    if values is None or len(values) == 0:
        return BatchResult(warnings=["Empty input: no values to parse."])

    total = len(values)
    detection_result: Optional[FormatDetectionResult] = None
    resolved_format: Optional[DateFormat] = None

    # Resolve the format to use
    if format is None:
        detection_result = detect_format(
            values,
            locale_preference=locale_preference,
        )
        resolved_format = detection_result.format
    elif not isinstance(format, str):
        resolved_format = format
    elif format == "EXCEL_SERIAL":
        resolved_format = EXCEL_SERIAL_FORMAT
    else:
        resolved_format = DateFormat(
            pattern=format,
            label=f"Custom ({format})",
        )

    if resolved_format is None:
        reason = "Could not determine date format."
        return BatchResult(
            results=_fail_all(values, None, reason),
            total=total,
            failed_count=total,
            warnings=[reason],
            format_detection=detection_result,
        )

    # In strict mode, refuse to parse if day/month order is ambiguous
    if (
        strict
        and detection_result is not None
        and AmbiguityType.DAY_MONTH_SWAP in detection_result.ambiguities
    ):
        reason = (
            "Strict mode: refusing to parse due to DD/MM vs MM/DD "
            "ambiguity. Provide a format or locale_preference to resolve."
        )
        return BatchResult(
            results=_fail_all(values, resolved_format, reason),
            detected_format=resolved_format,
            total=total,
            failed_count=total,
            warnings=[reason, *detection_result.warnings],
            format_detection=detection_result,
        )

    # Parse each value with the locked-in format
    results = [
        _parse_single(v, resolved_format, row_index=i)
        for i, v in enumerate(values)
    ]
    parsed_count = sum(1 for r in results if r.ok)
    failed_count = total - parsed_count

    batch_warnings: list[str] = []
    if detection_result is not None:
        batch_warnings.extend(detection_result.warnings)

    if failed_count > 0:
        batch_warnings.append(
            f"{failed_count}/{total} values failed to parse "
            f"with format {resolved_format.label}."
        )

    return BatchResult(
        results=results,
        detected_format=resolved_format,
        total=total,
        parsed_count=parsed_count,
        failed_count=failed_count,
        warnings=batch_warnings,
        format_detection=detection_result,
    )
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datemonkey
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Batch date parsing with ambiguity detection, confidence scores, and format lock-in.
|
|
5
|
+
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/RexBytes/datemonkey
|
|
8
|
+
Project-URL: Repository, https://github.com/RexBytes/datemonkey
|
|
9
|
+
Project-URL: Issues, https://github.com/RexBytes/datemonkey/issues
|
|
10
|
+
Keywords: date,parsing,ambiguity,detection,batch,excel
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Classifier: Topic :: Text Processing
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# datemonkey
|
|
27
|
+
|
|
28
|
+
Batch date parsing with ambiguity detection, confidence scores, and format lock-in.
|
|
29
|
+
|
|
30
|
+
**The problem:** `dateutil.parser.parse("01/02/03")` silently guesses and is often wrong. DD/MM vs MM/DD ambiguity corrupts joins, aggregations, and reports. datemonkey detects ambiguity and tells you about it instead of guessing.
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install datemonkey
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
### Detect format from a column of values
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from datemonkey import detect_format
|
|
44
|
+
|
|
45
|
+
result = detect_format(["15/03/2024", "20/04/2024", "25/12/2024"])
|
|
46
|
+
print(result.format.label) # "European date (DD/MM/YYYY)"
|
|
47
|
+
print(result.confidence) # Confidence.HIGH
|
|
48
|
+
print(result.is_ambiguous) # False — day > 12 resolves it
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Ambiguity detection
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
result = detect_format(["01/02/2024", "03/04/2024", "05/06/2024"])
|
|
55
|
+
print(result.is_ambiguous) # True
|
|
56
|
+
print(result.ambiguities) # [AmbiguityType.DAY_MONTH_SWAP]
|
|
57
|
+
print(result.warnings)
|
|
58
|
+
# ["Ambiguous: cannot distinguish US date (MM/DD/YYYY) from European date (DD/MM/YYYY) ..."]
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Resolve ambiguity with locale preference
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
result = detect_format(["01/02/2024", "03/04/2024"], locale_preference="eu")
|
|
65
|
+
print(result.format.label) # "European date (DD/MM/YYYY)"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Parse a batch of dates
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from datemonkey import parse_dates
|
|
72
|
+
|
|
73
|
+
batch = parse_dates(["2024-03-15", "2024-04-20", "2024-12-25"])
|
|
74
|
+
print(batch.ok) # True
|
|
75
|
+
print(batch.dates) # [datetime(2024,3,15), datetime(2024,4,20), datetime(2024,12,25)]
|
|
76
|
+
print(batch.iso_strings) # ["2024-03-15T00:00:00", ...]
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Format lock-in
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from datemonkey import parse_dates, ISO_8601
|
|
83
|
+
|
|
84
|
+
batch = parse_dates(["2024-03-15", "03/15/2024"], format=ISO_8601)
|
|
85
|
+
print(batch.results[0].ok) # True — matches ISO
|
|
86
|
+
print(batch.results[1].ok) # False — doesn't match, flagged not re-guessed
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Strict mode
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
batch = parse_dates(["01/02/2024", "03/04/2024"], strict=True)
|
|
93
|
+
print(batch.parsed_count) # 0 — refuses to parse ambiguous data
|
|
94
|
+
print(batch.warnings) # ["Strict mode: refusing to parse due to DD/MM vs MM/DD ambiguity..."]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Excel serial dates
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from datemonkey import parse_dates, excel_serial_to_datetime
|
|
101
|
+
|
|
102
|
+
# Single value
|
|
103
|
+
dt = excel_serial_to_datetime(45292) # datetime(2024, 1, 1)
|
|
104
|
+
|
|
105
|
+
# Batch — auto-detected
|
|
106
|
+
batch = parse_dates(["45292", "45293", "45294"])
|
|
107
|
+
print(batch.detected_format.label) # "Excel serial date number"
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Per-value results
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
batch = parse_dates(["2024-03-15", "garbage", "2024-12-25"], format="%Y-%m-%d")
|
|
114
|
+
for r in batch.results:
|
|
115
|
+
print(f"{r.original:20s} ok={r.ok} parsed={r.iso} warnings={r.warnings}")
|
|
116
|
+
# 2024-03-15 ok=True parsed=2024-03-15T00:00:00 warnings=[]
|
|
117
|
+
# garbage ok=False parsed=None warnings=[...]
|
|
118
|
+
# 2024-12-25 ok=True parsed=2024-12-25T00:00:00 warnings=[]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## CLI
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
# Detect format
|
|
125
|
+
datemonkey detect "15/03/2024" "20/04/2024" "25/12/2024"
|
|
126
|
+
|
|
127
|
+
# Detect with JSON output
|
|
128
|
+
datemonkey detect --json "01/02/2024" "03/04/2024"
|
|
129
|
+
|
|
130
|
+
# Parse dates
|
|
131
|
+
datemonkey parse "2024-03-15" "2024-04-20"
|
|
132
|
+
|
|
133
|
+
# Parse from CSV file (column 2, skip header)
|
|
134
|
+
datemonkey parse --file data.csv --column 2 --skip-header
|
|
135
|
+
|
|
136
|
+
# Parse with explicit format
|
|
137
|
+
datemonkey parse --format "%d-%m-%Y" "15-03-2024"
|
|
138
|
+
|
|
139
|
+
# Parse in strict mode
|
|
140
|
+
datemonkey parse --strict "01/02/2024" "03/04/2024"
|
|
141
|
+
|
|
142
|
+
# List known formats
|
|
143
|
+
datemonkey formats
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## API Reference
|
|
147
|
+
|
|
148
|
+
### `detect_format(values, *, locale_preference=None, formats=None) -> FormatDetectionResult`
|
|
149
|
+
|
|
150
|
+
Analyze a batch and determine the most likely format, reporting ambiguity.
|
|
151
|
+
|
|
152
|
+
- **values**: List of date-like values (strings, ints, floats, None)
|
|
153
|
+
- **locale_preference**: `"us"` for MM/DD, `"eu"` for DD/MM (only used when data alone can't resolve)
|
|
154
|
+
- **formats**: Custom list of `DateFormat` objects to test
|
|
155
|
+
|
|
156
|
+
### `parse_dates(values, *, format=None, locale_preference=None, strict=False) -> BatchResult`
|
|
157
|
+
|
|
158
|
+
Parse a batch with format lock-in.
|
|
159
|
+
|
|
160
|
+
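- **format**: A `DateFormat` object, a `strptime`-style pattern string (e.g. `"%Y-%m-%d"`), or the literal string `"EXCEL_SERIAL"` for Excel serial numbers. If None, the format is auto-detected.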
|
|
161
|
+
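- **strict**: If True, refuse to parse when auto-detection finds unresolved DD/MM vs MM/DD ambiguity. Has no effect when `format` is given, because no detection is performed.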
|
|
162
|
+
|
|
163
|
+
### `excel_serial_to_datetime(serial) -> datetime | None`
|
|
164
|
+
|
|
165
|
+
Convert an Excel serial date number to a Python datetime.
|
|
166
|
+
|
|
167
|
+
### Result Objects
|
|
168
|
+
|
|
169
|
+
| Object | Key Properties |
|
|
170
|
+
|---|---|
|
|
171
|
+
| `FormatDetectionResult` | `.format`, `.confidence`, `.is_ambiguous`, `.ambiguities`, `.candidates`, `.warnings` |
|
|
172
|
+
| `BatchResult` | `.ok`, `.results`, `.detected_format`, `.dates`, `.iso_strings`, `.failed`, `.succeeded`, `.success_ratio` |
|
|
173
|
+
| `DateResult` | `.ok`, `.original`, `.parsed`, `.date`, `.iso`, `.confidence`, `.warnings`, `.row_index` |
|
|
174
|
+
|
|
175
|
+
### Confidence Levels
|
|
176
|
+
|
|
177
|
+
| Level | Meaning |
|
|
178
|
+
|---|---|
|
|
179
|
+
| `HIGH` | Unambiguous parse, format is certain |
|
|
180
|
+
| `MEDIUM` | Likely correct, minor ambiguity (e.g. two-digit year) |
|
|
181
|
+
| `LOW` | Ambiguous — DD/MM vs MM/DD unresolved, or poor match ratio |
|
|
182
|
+
| `FAILED` | Could not parse or detect |
|
|
183
|
+
|
|
184
|
+
## Design
|
|
185
|
+
|
|
186
|
+
- **Batch-first**: Designed for columns of data, not single strings
|
|
187
|
+
- **No silent guessing**: Ambiguity is reported, not hidden
|
|
188
|
+
- **Format lock-in**: Once detected, the format is enforced — violations are flagged
|
|
189
|
+
- **Structured results**: Every parse returns confidence scores and warnings
|
|
190
|
+
- **Zero dependencies**: Pure Python, stdlib only
|
|
191
|
+
|
|
192
|
+
## Built for LLMs
|
|
193
|
+
|
|
194
|
+
datemonkey is designed to work well as a tool for large language models. Date parsing is a common source of silent errors in LLM-driven data pipelines — ambiguous formats lead to wrong guesses, wasted tokens on retries, and broken downstream logic. datemonkey reduces that complexity: a single call returns a structured result with the detected format, confidence level, and any ambiguities — no multi-step prompting or validation loops required. Fewer tokens in, reliable answers out.
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
MIT
|