PyPI - datemonkey - Versions diffs - 0.1.0__py3-none-any.whl - Mend

datemonkey 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

datemonkey/__init__.py +57 -0
datemonkey/cli.py +213 -0
datemonkey/detector.py +306 -0
datemonkey/excel.py +120 -0
datemonkey/formats.py +133 -0
datemonkey/models.py +196 -0
datemonkey/parser.py +174 -0
datemonkey-0.1.0.dist-info/METADATA +198 -0
datemonkey-0.1.0.dist-info/RECORD +13 -0
datemonkey-0.1.0.dist-info/WHEEL +5 -0
datemonkey-0.1.0.dist-info/entry_points.txt +2 -0
datemonkey-0.1.0.dist-info/licenses/LICENSE +21 -0
datemonkey-0.1.0.dist-info/top_level.txt +1 -0

datemonkey/__init__.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""datemonkey — Batch date parsing with ambiguity detection.
+>>> from datemonkey import detect_format, parse_dates
+>>> result = detect_format(["15/03/2024", "20/04/2024", "25/12/2024"])
+>>> result.format.label
+'European date (DD/MM/YYYY)'
+>>> batch = parse_dates(["2024-03-15", "2024-04-20", "2024-12-25"])
+>>> batch.ok
+True
+"""
+from .detector import detect_format
+from .excel import excel_serial_to_datetime
+from .formats import (
+    EU_DASH,
+    EU_DOT,
+    EU_SLASH,
+    ISO_8601,
+    ISO_8601_T,
+    US_DASH,
+    US_SLASH,
+    DateFormat,
+)
+from .models import (
+    AmbiguityType,
+    BatchResult,
+    Confidence,
+    DateResult,
+    FormatCandidate,
+    FormatDetectionResult,
+)
+from .parser import parse_dates
+# Package release string; the CLI's --version output reports the same release.
+__version__ = "0.1.0"
+# Public API exported by `from datemonkey import *`. Every name listed here is
+# imported at the top of this module.
+__all__ = [
+    # Core API
+    "detect_format",
+    "parse_dates",
+    "excel_serial_to_datetime",
+    # Models
+    "DateResult",
+    "BatchResult",
+    "FormatDetectionResult",
+    "FormatCandidate",
+    "DateFormat",
+    "AmbiguityType",
+    "Confidence",
+    # Common formats
+    "ISO_8601",
+    "ISO_8601_T",
+    "US_SLASH",
+    "US_DASH",
+    "EU_SLASH",
+    "EU_DASH",
+    "EU_DOT",
+]

datemonkey/cli.py ADDED Viewed

@@ -0,0 +1,213 @@
+"""datemonkey CLI — detect and parse dates from the command line."""
+from __future__ import annotations
+import argparse
+import csv
+import json
+import sys
+from typing import Optional
+from .detector import detect_format
+from .formats import ALL_FORMATS
+from .models import Confidence
+from .parser import parse_dates
+def _read_values(args: argparse.Namespace) -> list[str]:
+    """Collect the raw date strings to process.
+    Sources are tried in priority order: positional ``values``, then
+    ``--file`` (one CSV column when ``--column`` is given, otherwise one
+    value per non-blank line), then piped stdin.
+    Args:
+        args: Parsed CLI namespace providing ``values``, ``file``, ``column``
+            and ``skip_header``.
+    Returns:
+        The values in input order. In CSV mode, rows too short to contain the
+        requested column are skipped; in line mode, values are stripped and
+        blank lines dropped.
+    Exits:
+        With status 1 (message on stderr) when ``--column`` is negative or
+        when no input source is available.
+    """
+    if args.values:
+        return args.values
+    if args.file:
+        if args.column is None:
+            with open(args.file) as f:
+                return [line.strip() for line in f if line.strip()]
+        # Validate before touching the file so a bad index fails fast.
+        if args.column < 0:
+            print("Error: --column must be a non-negative integer", file=sys.stderr)
+            sys.exit(1)
+        # The csv module requires newline="": without it, universal-newline
+        # translation rewrites line endings inside quoted fields.
+        with open(args.file, newline="") as f:
+            rows = csv.reader(f)
+            if args.skip_header:
+                next(rows, None)  # Tolerates an empty file.
+            return [row[args.column] for row in rows if args.column < len(row)]
+    if not sys.stdin.isatty():
+        return [line.strip() for line in sys.stdin if line.strip()]
+    print("No input provided. Pass values as arguments, via --file, or pipe to stdin.", file=sys.stderr)
+    sys.exit(1)
+def cmd_detect(args: argparse.Namespace) -> None:
+    """Run the 'detect' subcommand and print the detection report.
+    Values come from ``_read_values``; ``args.locale`` is forwarded as the
+    locale preference. With ``args.json`` set, the report is a single JSON
+    document listing at most five candidates; otherwise it is an indented
+    plain-text summary.
+    """
+    detection = detect_format(
+        _read_values(args),
+        locale_preference=args.locale,
+    )
+    if args.json:
+        # Keys are inserted in the order they must appear in the output.
+        payload = {
+            "format": detection.format.pattern if detection.format else None,
+            "label": detection.format.label if detection.format else None,
+            "confidence": detection.confidence.value,
+            "is_ambiguous": detection.is_ambiguous,
+            "ambiguities": [kind.value for kind in detection.ambiguities],
+            "sample_size": detection.sample_size,
+            "match_count": detection.match_count,
+            "match_ratio": round(detection.match_ratio, 4),
+        }
+        payload["candidates"] = [
+            {
+                "format": cand.format.pattern,
+                "label": cand.format.label,
+                "match_count": cand.match_count,
+                "match_ratio": round(cand.match_ratio, 4),
+            }
+            for cand in detection.candidates[:5]
+        ]
+        payload["warnings"] = detection.warnings
+        print(json.dumps(payload, indent=2))
+        return
+    # Plain-text report.
+    if not detection.format:
+        print("Could not detect date format.")
+    else:
+        print(f"Detected format: {detection.format.label}")
+        print(f"  Pattern:    {detection.format.pattern}")
+        print(f"  Confidence: {detection.confidence.value}")
+        print(f"  Matched:    {detection.match_count}/{detection.sample_size}")
+    if detection.is_ambiguous:
+        kinds = ", ".join(kind.value for kind in detection.ambiguities)
+        print(f"\n  AMBIGUOUS: {kinds}")
+    if detection.candidates:
+        print("\n  Top candidates:")
+        for cand in detection.candidates[:5]:
+            print(f"    {cand.format.label:40s} {cand.match_count}/{cand.sample_size} matches")
+    if detection.warnings:
+        print("\n  Warnings:")
+        for note in detection.warnings:
+            print(f"    - {note}")
+def cmd_parse(args: argparse.Namespace) -> None:
+    """Run the 'parse' subcommand and print one result per input value.
+    ``args.format``, ``args.locale`` and ``args.strict`` are forwarded to
+    ``parse_dates``. With ``args.json`` set, the batch is printed as a single
+    JSON document; otherwise as a header followed by one status line per
+    value, each followed by its warnings.
+    """
+    batch = parse_dates(
+        _read_values(args),
+        format=args.format,
+        locale_preference=args.locale,
+        strict=args.strict,
+    )
+    if args.json:
+        # Keys are inserted in the order they must appear in the output.
+        payload = {
+            "detected_format": batch.detected_format.pattern if batch.detected_format else None,
+            "total": batch.total,
+            "parsed_count": batch.parsed_count,
+            "failed_count": batch.failed_count,
+            "success_ratio": round(batch.success_ratio, 4),
+            "warnings": batch.warnings,
+        }
+        payload["results"] = [
+            {
+                "original": str(item.original),
+                "parsed": item.iso,
+                "confidence": item.confidence.value,
+                "warnings": item.warnings,
+            }
+            for item in batch.results
+        ]
+        print(json.dumps(payload, indent=2))
+        return
+    # Plain-text report.
+    if batch.detected_format:
+        print(f"Format: {batch.detected_format.label} ({batch.detected_format.pattern})")
+    print(f"Parsed: {batch.parsed_count}/{batch.total}")
+    if batch.warnings:
+        for note in batch.warnings:
+            print(f"  Warning: {note}")
+    print()
+    for item in batch.results:
+        if item.ok:
+            status = "OK"
+        else:
+            status = "FAIL"
+        shown = item.iso or "---"
+        print(f"  [{status:4s}] {str(item.original):30s} -> {shown}")
+        for note in item.warnings:
+            print(f"         {note}")
+def cmd_formats(args: argparse.Namespace) -> None:
+    """Run the 'formats' subcommand: list every entry of ``ALL_FORMATS``.
+    With ``args.json`` set, prints a JSON array of pattern/label/example
+    objects; otherwise prints an aligned text listing in which the example
+    line appears only for formats that define one.
+    """
+    if not args.json:
+        print("Known date formats:\n")
+        for fmt in ALL_FORMATS:
+            print(f"  {fmt.pattern:30s} {fmt.label}")
+            if fmt.example:
+                print(f"  {'':30s} Example: {fmt.example}")
+            print()
+        return
+    listing = []
+    for fmt in ALL_FORMATS:
+        listing.append(
+            {
+                "pattern": fmt.pattern,
+                "label": fmt.label,
+                "example": fmt.example,
+            }
+        )
+    print(json.dumps(listing, indent=2))
+def main(argv: Optional[list[str]] = None) -> None:
+    """CLI entry point.
+    Builds the ``datemonkey`` parser with its ``detect``, ``parse`` and
+    ``formats`` subcommands, parses ``argv`` and dispatches to the handler
+    each subparser registered through ``set_defaults(func=...)``.
+    Args:
+        argv: Argument list to parse; None makes argparse read
+            ``sys.argv[1:]``.
+    Exits:
+        With status 1, after printing help, when no subcommand is given.
+    """
+    # Imported at call time so --version reports the package's own
+    # __version__ rather than a second hard-coded copy that can drift.
+    from . import __version__
+    ap = argparse.ArgumentParser(
+        prog="datemonkey",
+        description="Batch date parsing with ambiguity detection.",
+    )
+    ap.add_argument("--version", action="version", version=f"datemonkey {__version__}")
+    sub = ap.add_subparsers(dest="command", help="Available commands")
+    # ── detect ───────────────────────────────────────────────────────
+    p_detect = sub.add_parser("detect", help="Detect date format from values")
+    p_detect.add_argument("values", nargs="*", help="Date values to analyze")
+    p_detect.add_argument("--file", "-f", help="Read values from a file (one per line, or CSV)")
+    p_detect.add_argument("--column", "-c", type=int, help="CSV column index (0-based)")
+    p_detect.add_argument("--skip-header", action="store_true", help="Skip first row of CSV")
+    p_detect.add_argument("--locale", "-l", help="Locale preference: 'us' or 'eu'")
+    p_detect.add_argument("--json", "-j", action="store_true", help="Output as JSON")
+    p_detect.set_defaults(func=cmd_detect)
+    # ── parse ────────────────────────────────────────────────────────
+    p_parse = sub.add_parser("parse", help="Parse date values")
+    p_parse.add_argument("values", nargs="*", help="Date values to parse")
+    p_parse.add_argument("--file", "-f", help="Read values from a file")
+    p_parse.add_argument("--column", "-c", type=int, help="CSV column index (0-based)")
+    p_parse.add_argument("--skip-header", action="store_true", help="Skip first row of CSV")
+    p_parse.add_argument("--format", help="Enforce a specific strftime format")
+    p_parse.add_argument("--locale", "-l", help="Locale preference: 'us' or 'eu'")
+    p_parse.add_argument("--strict", "-s", action="store_true", help="Fail on ambiguity")
+    p_parse.add_argument("--json", "-j", action="store_true", help="Output as JSON")
+    p_parse.set_defaults(func=cmd_parse)
+    # ── formats ──────────────────────────────────────────────────────
+    p_formats = sub.add_parser("formats", help="List known date formats")
+    p_formats.add_argument("--json", "-j", action="store_true", help="Output as JSON")
+    p_formats.set_defaults(func=cmd_formats)
+    args = ap.parse_args(argv)
+    # Subcommands are optional to argparse, so a bare invocation lands here.
+    if not args.command:
+        ap.print_help()
+        sys.exit(1)
+    args.func(args)
+if __name__ == "__main__":
+    main()

datemonkey/detector.py ADDED Viewed

@@ -0,0 +1,306 @@
+"""Format detection engine — the core of datemonkey.
+Analyzes a batch of values and determines the most likely date format,
+reporting ambiguity instead of silently guessing.
+"""
+from __future__ import annotations
+import datetime
+from typing import Any, Optional, Sequence
+from .formats import (
+    ALL_FORMATS,
+    AMBIGUOUS_PAIRS,
+    could_be_excel_serial,
+    get_ambiguous_partner,
+    is_date_like,
+)
+from .models import (
+    AmbiguityType,
+    Confidence,
+    DateFormat,
+    FormatCandidate,
+    FormatDetectionResult,
+)
+def _try_parse(value: str, fmt: DateFormat) -> Optional[datetime.datetime]:
+    """Parse ``value`` (whitespace-trimmed) with ``fmt.pattern``.
+    Returns the parsed datetime, or None when strptime rejects the text with
+    ValueError or OverflowError.
+    """
+    text = value.strip()
+    pattern = fmt.pattern
+    try:
+        parsed = datetime.datetime.strptime(text, pattern)
+    except (ValueError, OverflowError):
+        parsed = None
+    return parsed
+def _has_two_digit_year(fmt: DateFormat) -> bool:
+    """Return True when the pattern contains ``%y`` and no ``%Y``."""
+    pattern = fmt.pattern
+    if "%Y" in pattern:
+        return False
+    return "%y" in pattern
+def _score_candidates(
+    values: list[str], formats: list[DateFormat]
+) -> list[FormatCandidate]:
+    """Count, for every format, how many values it parses.
+    Formats that parse nothing are left out. The result is ordered by match
+    count, highest first; ties go to the longer (more specific) pattern.
+    """
+    total = len(values)
+    scored = []
+    for fmt in formats:
+        hits = sum(1 for raw in values if _try_parse(raw, fmt) is not None)
+        if hits == 0:
+            continue
+        scored.append(
+            FormatCandidate(format=fmt, match_count=hits, sample_size=total)
+        )
+    # Descending on (matches, pattern length); the sort is stable, so formats
+    # that tie on both keep their order from ``formats``.
+    scored.sort(
+        key=lambda cand: (cand.match_count, len(cand.format.pattern)),
+        reverse=True,
+    )
+    return scored
+def _check_day_month_ambiguity(
+    values: list[str],
+    best: FormatCandidate,
+    candidates: list[FormatCandidate],
+) -> Optional[FormatCandidate]:
+    """Find a DD/MM vs MM/DD partner that ties with the best candidate.
+    Looks up the ambiguous partner of ``best.format`` and searches
+    ``candidates`` for it. Only a partner with exactly the same match count
+    as ``best`` counts as ambiguous. ``values`` is accepted but not consulted.
+    Returns the tying partner candidate, or None when there is no partner
+    format or it does not tie.
+    """
+    partner_fmt = get_ambiguous_partner(best.format)
+    if partner_fmt is None:
+        return None
+    ties = (
+        cand
+        for cand in candidates
+        if cand.format == partner_fmt and cand.match_count == best.match_count
+    )
+    return next(ties, None)
+def _resolve_day_month(
+    values: list[str],
+    fmt_a: DateFormat,
+    fmt_b: DateFormat,
+) -> Optional[DateFormat]:
+    """Pick between two day/month-swapped formats using the data itself.
+    A value that parses under exactly one of the two formats (for example a
+    day field above 12, which is invalid as a month) is evidence for that
+    format. Returns the format that has such evidence while the other has
+    none; returns None when neither or both have exclusive matches.
+    """
+    only_a = 0
+    only_b = 0
+    for raw in values:
+        hit_a = _try_parse(raw, fmt_a) is not None
+        hit_b = _try_parse(raw, fmt_b) is not None
+        if hit_a and not hit_b:
+            only_a += 1
+        elif hit_b and not hit_a:
+            only_b += 1
+    if only_a and not only_b:
+        return fmt_a
+    if only_b and not only_a:
+        return fmt_b
+    return None
+def detect_format(
+    values: Sequence[Any],
+    *,
+    locale_preference: Optional[str] = None,
+    formats: Optional[list[DateFormat]] = None,
+) -> FormatDetectionResult:
+    """Detect the date format of a batch of values.
+    Examines the values and determines the most likely date format,
+    explicitly reporting ambiguity rather than silently guessing.
+    Processing order: drop blanks, short-circuit when the batch looks like
+    Excel serial numbers, score every format against the date-like values,
+    then check for day/month, two-digit-year and mixed-format ambiguity
+    before assigning a confidence level.
+    Args:
+        values: A sequence of date-like values (strings, ints, floats).
+            None and blank strings are skipped and reported in a warning.
+        locale_preference: Hint for resolving DD/MM vs MM/DD ambiguity.
+            Use "us" for MM/DD preference, "eu" for DD/MM preference.
+            Only used when data alone cannot resolve the ambiguity; the
+            ambiguity is still reported when the hint is applied.
+        formats: Custom list of DateFormat objects to test against.
+            If None (or empty), uses all built-in formats.
+    Returns:
+        FormatDetectionResult with detected format, confidence, and
+        ambiguity information. ``format`` is left unset when nothing could
+        be detected; the reason is given in ``warnings``.
+    """
+    if not values:
+        return FormatDetectionResult(
+            warnings=["Empty input: no values to analyze."]
+        )
+    # Coerce to strings, track excel serial candidates
+    str_values: list[str] = []
+    excel_count = 0
+    blank_count = 0
+    for v in values:
+        if v is None or (isinstance(v, str) and v.strip() == ""):
+            blank_count += 1
+            continue
+        s = str(v).strip()
+        if could_be_excel_serial(s):
+            excel_count += 1
+        str_values.append(s)
+    if not str_values:
+        return FormatDetectionResult(
+            sample_size=len(values),
+            warnings=["All values are blank or None."],
+        )
+    # Filter to date-like values for format detection
+    date_like = [v for v in str_values if is_date_like(v)]
+    # If most values look like Excel serials and few look like dates
+    if excel_count > len(date_like) and excel_count >= len(str_values) * 0.5:
+        # Imported lazily, only on the path that needs the sentinel format.
+        from .excel import EXCEL_SERIAL_FORMAT
+        return FormatDetectionResult(
+            format=EXCEL_SERIAL_FORMAT,
+            confidence=Confidence.HIGH if excel_count == len(str_values) else Confidence.MEDIUM,
+            sample_size=len(str_values),
+            match_count=excel_count,
+            warnings=(
+                [f"{blank_count} blank values skipped."] if blank_count > 0 else []
+            ),
+        )
+    if not date_like:
+        return FormatDetectionResult(
+            sample_size=len(str_values),
+            warnings=[
+                "No values matched any known date pattern.",
+                f"{excel_count} values look like possible Excel serial numbers."
+                if excel_count > 0
+                else "No date-like values found.",
+            ],
+        )
+    # Score all candidate formats
+    test_formats = formats or ALL_FORMATS
+    candidates = _score_candidates(date_like, test_formats)
+    if not candidates:
+        return FormatDetectionResult(
+            sample_size=len(date_like),
+            warnings=["No formats matched any values."],
+        )
+    best = candidates[0]
+    ambiguities: list[AmbiguityType] = []
+    warnings: list[str] = []
+    if blank_count > 0:
+        warnings.append(f"{blank_count} blank values skipped.")
+    non_matching = len(date_like) - best.match_count
+    if non_matching > 0:
+        warnings.append(
+            f"{non_matching}/{len(date_like)} date-like values did not match "
+            f"the detected format ({best.format.label})."
+        )
+    # Check DD/MM vs MM/DD ambiguity
+    ambig_partner = _check_day_month_ambiguity(date_like, best, candidates)
+    if ambig_partner is not None:
+        # Try to resolve from data
+        resolved = _resolve_day_month(date_like, best.format, ambig_partner.format)
+        if resolved is not None:
+            # Data resolves it — pick the right one
+            if resolved != best.format:
+                # Swap: the partner is actually the correct one
+                for c in candidates:
+                    if c.format == resolved:
+                        best = c
+                        break
+        else:
+            # Truly ambiguous
+            ambiguities.append(AmbiguityType.DAY_MONTH_SWAP)
+            warnings.append(
+                f"Ambiguous: cannot distinguish {best.format.label} from "
+                f"{ambig_partner.format.label} using data alone."
+            )
+            # Apply locale preference if provided
+            if locale_preference:
+                lp = locale_preference.lower()
+                if lp in ("us", "en_us", "en-us", "american"):
+                    # Prefer MM/DD
+                    for c in candidates:
+                        if c.format.pattern.startswith("%m"):
+                            best = c
+                            break
+                    warnings.append(
+                        f"Locale preference '{locale_preference}' applied: "
+                        f"using {best.format.label}."
+                    )
+                elif lp in ("eu", "european", "en_gb", "en-gb", "british", "de", "fr", "es", "it"):
+                    # Prefer DD/MM
+                    for c in candidates:
+                        if c.format.pattern.startswith("%d"):
+                            best = c
+                            break
+                    warnings.append(
+                        f"Locale preference '{locale_preference}' applied: "
+                        f"using {best.format.label}."
+                    )
+    # Check two-digit year
+    if _has_two_digit_year(best.format):
+        ambiguities.append(AmbiguityType.TWO_DIGIT_YEAR)
+        warnings.append(
+            "Two-digit year detected. Python's strptime interprets "
+            "00-68 as 2000-2068 and 69-99 as 1969-1999."
+        )
+    # Check mixed formats (multiple candidates with high match counts).
+    # The runner-up is the highest-ranked candidate other than ``best``.
+    # ``best`` is not always candidates[0]: the day/month handling above may
+    # have switched it to a lower-ranked entry, and comparing candidates[1]
+    # unconditionally would then compare ``best`` with itself and report a
+    # bogus mixed-format ambiguity.
+    runner_up = next((c for c in candidates if c is not best), None)
+    if runner_up is not None:
+        # If the runner-up matches a significant portion and is a different "family"
+        if (
+            runner_up.match_count >= len(date_like) * 0.2
+            and get_ambiguous_partner(best.format) != runner_up.format
+        ):
+            ambiguities.append(AmbiguityType.MIXED_FORMATS)
+            warnings.append(
+                f"Possible mixed formats: {best.format.label} "
+                f"({best.match_count} matches) and {runner_up.format.label} "
+                f"({runner_up.match_count} matches)."
+            )
+    # Determine confidence: HIGH needs a full match with no ambiguity at all;
+    # MEDIUM tolerates up to 20% misses but never a day/month swap.
+    if best.match_count == len(date_like) and not ambiguities:
+        confidence = Confidence.HIGH
+    elif best.match_count >= len(date_like) * 0.8 and AmbiguityType.DAY_MONTH_SWAP not in ambiguities:
+        confidence = Confidence.MEDIUM
+    else:
+        confidence = Confidence.LOW
+    # Assign confidence to candidates
+    for c in candidates:
+        if c.match_count == len(date_like) and get_ambiguous_partner(c.format) is None:
+            c.confidence = Confidence.HIGH
+        elif c.match_count >= len(date_like) * 0.8:
+            c.confidence = Confidence.MEDIUM
+        else:
+            c.confidence = Confidence.LOW
+    return FormatDetectionResult(
+        format=best.format,
+        confidence=confidence,
+        ambiguities=ambiguities,
+        candidates=candidates,
+        sample_size=len(date_like),
+        match_count=best.match_count,
+        warnings=warnings,
+    )

datemonkey/excel.py ADDED Viewed

@@ -0,0 +1,120 @@
+"""Excel serial date number support.
+Excel stores dates as serial numbers:
+- Integer part = days since epoch (1900-01-01 = 1 in the 1900 system)
+- Fractional part = time of day (0.5 = noon)
+Note: Excel has a known bug where it treats 1900 as a leap year.
+Serial number 60 = 1900-02-29 (which doesn't exist). We handle this
+by matching Excel's behavior for compatibility.
+"""
+from __future__ import annotations
+import datetime
+from typing import Optional, Union
+from .models import Confidence, DateFormat, DateResult
+# Sentinel format object for Excel serial dates.
+# Its pattern is the marker string "EXCEL_SERIAL", not a strptime directive.
+# parse_excel_serial() in this module records it as ``format_used`` on every
+# result it returns.
+EXCEL_SERIAL_FORMAT = DateFormat(
+    pattern="EXCEL_SERIAL",
+    label="Excel serial date number",
+    example="45678",
+)
+# Excel epoch: 1899-12-31 (serial 1 = epoch + 1 day = 1900-01-01)
+_EXCEL_EPOCH = datetime.datetime(1899, 12, 31)
+# Reasonable range for Excel serial dates
+_MIN_SERIAL = 1  # 1900-01-01
+_MAX_SERIAL = 2958465  # 9999-12-31
+def excel_serial_to_datetime(
+    serial: Union[int, float, str],
+) -> Optional[datetime.datetime]:
+    """Convert an Excel serial date number to a Python datetime.
+    The integer part is the day count in Excel's 1900 date system and the
+    fractional part is the time of day, rounded to the nearest second.
+    Args:
+        serial: The Excel serial number (int, float, or numeric string).
+    Returns:
+        A datetime, or None if the value is not numeric, is NaN, lies outside
+        1..2958465, or rounds past the largest datetime Python can represent.
+    """
+    try:
+        num = float(serial)
+    except (ValueError, TypeError):
+        return None
+    # Written as one chained comparison so NaN, for which every comparison is
+    # False, is rejected here instead of crashing in int() below.
+    if not (_MIN_SERIAL <= num <= _MAX_SERIAL):
+        return None
+    days = int(num)
+    # Handle the Lotus 1-2-3 leap year bug:
+    # Excel thinks 1900-02-29 exists (serial 60).
+    if days == 60:
+        # Serial 60 is the non-existent Feb 29, 1900.
+        # Return March 1, 1900 to match common convention.
+        return datetime.datetime(1900, 3, 1)
+    fraction = num - days
+    # For serials > 60, subtract 1 to correct the off-by-one.
+    if days > 60:
+        days -= 1
+    try:
+        dt = _EXCEL_EPOCH + datetime.timedelta(days=days)
+        # Add time component from fractional part
+        if fraction > 0:
+            dt += datetime.timedelta(seconds=round(fraction * 86400))
+    except OverflowError:
+        # A fraction that rounds up to a full day on the last valid serial
+        # would land in year 10000, which datetime cannot represent.
+        return None
+    return dt
+def parse_excel_serial(
+    value: Union[int, float, str],
+    row_index: Optional[int] = None,
+) -> DateResult:
+    """Parse a single Excel serial date number into a DateResult.
+    Args:
+        value: The serial number to parse.
+        row_index: Optional row index, copied onto the result for tracking.
+    Returns:
+        A DateResult tagged with EXCEL_SERIAL_FORMAT. Confidence is FAILED,
+        with an explanatory warning, when the value cannot be converted, and
+        HIGH otherwise. Serial 60 additionally carries a warning about the
+        non-existent 1900-02-29.
+    """
+    moment = excel_serial_to_datetime(value)
+    if moment is None:
+        return DateResult(
+            original=value,
+            confidence=Confidence.FAILED,
+            format_used=EXCEL_SERIAL_FORMAT,
+            warnings=["Not a valid Excel serial date number."],
+            row_index=row_index,
+        )
+    # Flag the Feb 29 1900 bug
+    try:
+        is_phantom_day = int(float(value)) == 60
+    except (ValueError, TypeError):
+        is_phantom_day = False
+    notes: list[str] = []
+    if is_phantom_day:
+        notes.append(
+            "Excel serial 60 maps to non-existent 1900-02-29. "
+            "Mapped to 1900-03-01."
+        )
+    return DateResult(
+        original=value,
+        parsed=moment,
+        format_used=EXCEL_SERIAL_FORMAT,
+        confidence=Confidence.HIGH,
+        warnings=notes,
+        row_index=row_index,
+    )