PyPI - stencilpy - Versions diffs - 0.1.0__tar.gz - Mend

stencilpy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

stencilpy-0.1.0/.gitignore +15 -0
stencilpy-0.1.0/PKG-INFO +77 -0
stencilpy-0.1.0/README.md +47 -0
stencilpy-0.1.0/pyproject.toml +47 -0
stencilpy-0.1.0/src/stencilpy/__init__.py +122 -0
stencilpy-0.1.0/src/stencilpy/addressing.py +98 -0
stencilpy-0.1.0/src/stencilpy/computed.py +101 -0
stencilpy-0.1.0/src/stencilpy/errors.py +10 -0
stencilpy-0.1.0/src/stencilpy/extractor.py +206 -0
stencilpy-0.1.0/src/stencilpy/models.py +68 -0
stencilpy-0.1.0/src/stencilpy/schema.py +194 -0
stencilpy-0.1.0/tests/__init__.py +0 -0
stencilpy-0.1.0/tests/conftest.py +176 -0
stencilpy-0.1.0/tests/test_addressing.py +90 -0
stencilpy-0.1.0/tests/test_computed.py +66 -0
stencilpy-0.1.0/tests/test_extractor.py +89 -0
stencilpy-0.1.0/tests/test_models.py +68 -0
stencilpy-0.1.0/tests/test_schema.py +83 -0
stencilpy-0.1.0/tests/test_stencil.py +103 -0

stencilpy-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,15 @@
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.venv/
+*.lock
+# Node
+node_modules/
+editor/dist/
+# OS
+.DS_Store

stencilpy-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,77 @@
+Metadata-Version: 2.4
+Name: stencilpy
+Version: 0.1.0
+Summary: Extract structured data from Excel files using YAML schema definitions
+Project-URL: Homepage, https://github.com/phlohouse/stencil
+Project-URL: Repository, https://github.com/phlohouse/stencil
+Project-URL: Issues, https://github.com/phlohouse/stencil/issues
+Author: Phlo House
+License-Expression: MIT
+Keywords: excel,extraction,pydantic,spreadsheet,yaml
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: File Formats
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Typing :: Typed
+Requires-Python: >=3.10
+Requires-Dist: openpyxl>=3.1
+Requires-Dist: pydantic>=2.0
+Requires-Dist: pyyaml>=6.0
+Provides-Extra: dev
+Requires-Dist: pytest; extra == 'dev'
+Requires-Dist: pytest-cov; extra == 'dev'
+Description-Content-Type: text/markdown
+# stencilpy
+Extract structured data from Excel files into dynamically generated Pydantic models, driven by YAML schema definitions.
+## Installation
+```bash
+pip install stencilpy
+```
+## Quick Start
+```python
+from stencilpy import Stencil
+# Load a schema
+lab = Stencil("lab_report.stencil.yaml")
+# Extract data — version auto-detected via discriminator
+report = lab.extract("january_lab.xlsx")
+print(report.patient_name)
+print(report.model_dump())
+```
+## Schema Format
+Create a `.stencil.yaml` file:
+```yaml
+name: lab_report
+description: Monthly lab report
+discriminator:
+  cell: A1
+versions:
+  "v2.0":
+    fields:
+      patient_name:
+        cell: B3
+      sample_date:
+        cell: B4
+        type: datetime
+      readings:
+        range: D5:D
+        type: list[float]
+```

stencilpy-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,47 @@
+# stencilpy
+Extract structured data from Excel files into dynamically generated Pydantic models, driven by YAML schema definitions.
+## Installation
+```bash
+pip install stencilpy
+```
+## Quick Start
+```python
+from stencilpy import Stencil
+# Load a schema
+lab = Stencil("lab_report.stencil.yaml")
+# Extract data — version auto-detected via discriminator
+report = lab.extract("january_lab.xlsx")
+print(report.patient_name)
+print(report.model_dump())
+```
+## Schema Format
+Create a `.stencil.yaml` file:
+```yaml
+name: lab_report
+description: Monthly lab report
+discriminator:
+  cell: A1
+versions:
+  "v2.0":
+    fields:
+      patient_name:
+        cell: B3
+      sample_date:
+        cell: B4
+        type: datetime
+      readings:
+        range: D5:D
+        type: list[float]
+```

stencilpy-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,47 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "stencilpy"
+version = "0.1.0"
+description = "Extract structured data from Excel files using YAML schema definitions"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [
+    { name = "Phlo House" },
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: File Formats",
+    "Topic :: Software Development :: Libraries",
+    "Typing :: Typed",
+]
+keywords = ["excel", "pydantic", "yaml", "extraction", "spreadsheet"]
+dependencies = [
+    "pydantic>=2.0",
+    "openpyxl>=3.1",
+    "pyyaml>=6.0",
+]
+[project.urls]
+Homepage = "https://github.com/phlohouse/stencil"
+Repository = "https://github.com/phlohouse/stencil"
+Issues = "https://github.com/phlohouse/stencil/issues"
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "pytest-cov",
+]
+[tool.hatch.build.targets.wheel]
+packages = ["src/stencilpy"]

stencilpy-0.1.0/src/stencilpy/__init__.py ADDED Viewed

@@ -0,0 +1,122 @@
+from __future__ import annotations
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any
+from pydantic import BaseModel
+from pydantic import ValidationError as PydanticValidationError
+from .computed import get_computed_fields, resolve_computed
+from .errors import StencilError, ValidationError, VersionError
+from .extractor import extract_fields, read_cell
+from .models import build_all_models, get_or_create_model
+from .schema import StencilSchema
+__all__ = ["Stencil", "StencilError", "VersionError", "ValidationError"]
+class Stencil:
+    """Entry point of stencilpy: holds one or more schemas and extracts Excel workbooks against them."""
+    def __init__(self, path: str | Path) -> None:
+        """Load schemas from *path*.
+        A directory is scanned for ``*.stencil.yaml`` files; any other path is read as a single schema file.
+        """
+        self._schemas: list[StencilSchema] = []
+        source = Path(path)
+        if not source.is_dir():
+            self._schemas.append(StencilSchema.from_file(source))
+        else:
+            self._load_dir(source)
+        self._model_cache: dict[str, dict[str, type[BaseModel]]] = {}
+    @classmethod
+    def from_dir(cls, path: str | Path) -> Stencil:
+        """Build a Stencil from every ``*.stencil.yaml`` file directly inside *path*."""
+        # __init__ is bypassed so that the path is always handled as a directory.
+        stencil = cls.__new__(cls)
+        stencil._schemas = []
+        stencil._model_cache = {}
+        stencil._load_dir(Path(path))
+        return stencil
+    def _load_dir(self, path: Path) -> None:
+        """Append a schema for each ``*.stencil.yaml`` file in *path*, in sorted path order.
+        Raises StencilError when the directory holds no such file.
+        """
+        schema_files = sorted(path.glob("*.stencil.yaml"))
+        if len(schema_files) == 0:
+            raise StencilError(f"No .stencil.yaml files found in {path}")
+        self._schemas.extend(StencilSchema.from_file(item) for item in schema_files)
+    def extract(self, path: str | Path) -> BaseModel:
+        """Extract one workbook using the first loaded schema whose discriminator matches.
+        Raises VersionError when no loaded schema accepts the workbook.
+        """
+        workbook = Path(path)
+        for candidate in self._schemas:
+            try:
+                extracted = self._extract_with_schema(candidate, workbook)
+            except VersionError:
+                # This schema does not recognise the workbook; try the next one.
+                continue
+            return extracted
+        raise VersionError(
+            f"No schema version matched the discriminator in '{workbook}'"
+        )
+    def extract_batch(
+        self, paths: Iterable[Path]
+    ) -> list[tuple[Path, BaseModel | StencilError]]:
+        """Extract several workbooks, pairing each path with its model or with the StencilError it raised."""
+        outcomes: list[tuple[Path, BaseModel | StencilError]] = []
+        for workbook in paths:
+            try:
+                outcome: BaseModel | StencilError = self.extract(workbook)
+            except StencilError as failure:
+                # A failing workbook is recorded instead of aborting the whole batch.
+                outcome = failure
+            outcomes.append((workbook, outcome))
+        return outcomes
+    @property
+    def models(self) -> dict[str, type[BaseModel]]:
+        """Return a version -> model class mapping merged over all loaded schemas.
+        When two schemas share a version key, the schema loaded later wins.
+        """
+        merged: dict[str, type[BaseModel]] = {}
+        for schema in self._schemas:
+            merged |= self._get_models(schema)
+        return merged
+    def _get_models(self, schema: StencilSchema) -> dict[str, type[BaseModel]]:
+        """Return the models of *schema*, building them on first use and caching them by schema name."""
+        cache = self._model_cache
+        key = schema.name
+        if key not in cache:
+            cache[key] = build_all_models(schema)
+        return cache[key]
+    def _extract_with_schema(
+        self, schema: StencilSchema, excel_path: Path
+    ) -> BaseModel:
+        """Extract *excel_path* with *schema* and return the validated model.
+        Raises VersionError when the discriminator cell matches no version key,
+        and ValidationError when the model rejects the extracted values.
+        """
+        cell_value = read_cell(excel_path, schema.discriminator_cell)
+        # An empty discriminator cell is compared as the empty string.
+        disc_str = "" if cell_value is None else str(cell_value).strip()
+        matched_version = next(
+            (key for key in schema.versions if disc_str == key), None
+        )
+        if matched_version is None:
+            raise VersionError(
+                f"Discriminator '{disc_str}' doesn't match any version "
+                f"in schema '{schema.name}'"
+            )
+        version_def = schema.versions[matched_version]
+        model_cls = get_or_create_model(schema, matched_version)
+        # Non-computed fields are read first; computed fields are derived from them.
+        values = extract_fields(excel_path, version_def.fields)
+        derived = get_computed_fields(version_def.fields)
+        if derived:
+            values.update(resolve_computed(derived, values))
+        try:
+            return model_cls.model_validate(values)
+        except PydanticValidationError as exc:
+            # Re-raise as the package's own error type, keeping the pydantic error as the cause.
+            raise ValidationError(str(exc)) from exc

stencilpy-0.1.0/src/stencilpy/addressing.py ADDED Viewed

@@ -0,0 +1,98 @@
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+# Bare cell reference: one or more uppercase column letters (group 1) followed by the row digits (group 2).
+_CELL_RE = re.compile(r"^([A-Z]+)(\d+)$")
+def _col_to_index(col: str) -> int:
+    """Translate column letters into their 1-based column number.
+    The letters are read as a bijective base-26 numeral: 'A' is 1, 'Z' is 26, 'AA' is 27.
+    """
+    base = ord("A") - 1
+    # The rightmost letter has place value 26**0, the next one 26**1, and so on.
+    return sum(
+        (ord(letter) - base) * 26**place
+        for place, letter in enumerate(reversed(col))
+    )
+def _index_to_col(index: int) -> str:
+    """Translate a 1-based column number into column letters (1 -> 'A', 27 -> 'AA').
+    A number below 1 yields the empty string.
+    """
+    letters = ""
+    remaining = index
+    while remaining > 0:
+        # Subtracting 1 maps the 1..26 letter range onto the 0..25 remainders.
+        remaining, digit = divmod(remaining - 1, 26)
+        letters = chr(ord("A") + digit) + letters
+    return letters
+@dataclass(frozen=True)
+class CellAddress:
+    """A single cell reference, optionally sheet-qualified.
+    Instances are immutable (frozen dataclass).
+    """
+    # Sheet name, or None when the reference carried no sheet qualifier.
+    sheet: str | None
+    col: int  # 1-based
+    row: int  # 1-based
+    @property
+    def col_letter(self) -> str:
+        """Return the column as letters, converted from the 1-based ``col`` number."""
+        return _index_to_col(self.col)
+@dataclass(frozen=True)
+class RangeAddress:
+    """A range reference, optionally sheet-qualified. end_row=None means open-ended.
+    An open-ended range (written like 'D5:D') names an end column but no end row.
+    Instances are immutable (frozen dataclass).
+    """
+    # Sheet name, or None when the reference carried no sheet qualifier.
+    sheet: str | None
+    start_col: int  # 1-based
+    start_row: int  # 1-based
+    end_col: int  # 1-based
+    end_row: int | None  # None = open-ended
+def parse_cell(ref: str) -> CellAddress:
+    """Build a CellAddress from text such as 'A1' or 'Sheet2!B3'.
+    Raises ValueError when the cell part is not a valid reference.
+    """
+    sheet_name, coordinate = _split_sheet(ref)
+    column, row_number = _parse_cell_part(coordinate)
+    return CellAddress(sheet_name, column, row_number)
+def parse_range(ref: str) -> RangeAddress:
+    """Parse a range reference like 'A1:D50', 'D5:D', or 'Sheet2!A1:D50'.
+    The part after ':' may be a full cell ('D50') or a column only ('D'); the
+    column-only form produces an open-ended range whose end_row is None.
+    Column letters are accepted in either case for both endpoints.
+    Raises:
+        ValueError: If the reference has no ':' or an endpoint is malformed.
+    """
+    sheet, range_part = _split_sheet(ref)
+    start_str, separator, end_str = range_part.partition(":")
+    if not separator:
+        raise ValueError(f"Invalid range reference (no ':'): {ref}")
+    start_col, start_row = _parse_cell_part(start_str)
+    # Open-ended range: the end is letters only (e.g. "D"). The test is
+    # case-insensitive so that 'd5:d' is treated like 'd5:d10', and it is limited
+    # to ASCII letters so nothing but A-Z reaches the column conversion.
+    if end_str.isascii() and end_str.isalpha():
+        end_col = _col_to_index(end_str.upper())
+        end_row = None
+    else:
+        end_col, end_row = _parse_cell_part(end_str)
+    return RangeAddress(
+        sheet=sheet,
+        start_col=start_col,
+        start_row=start_row,
+        end_col=end_col,
+        end_row=end_row,
+    )
+def _split_sheet(ref: str) -> tuple[str | None, str]:
+    """Separate an optional sheet qualifier from a cell or range reference.
+    'Sheet2!A1' gives ('Sheet2', 'A1'); a bare 'A1' gives (None, 'A1').
+    The split is made at the last '!': the cell/range part can never contain
+    one, so any earlier '!' must belong to the sheet name.
+    """
+    sheet, separator, rest = ref.rpartition("!")
+    if not separator:
+        # No '!' at all: the whole text is the cell/range part.
+        return None, ref
+    return sheet, rest
+def _parse_cell_part(cell: str) -> tuple[int, int]:
+    """Parse 'A1' into (col_index, row_index), both 1-based.
+    Column letters are accepted in either case.
+    Raises:
+        ValueError: If *cell* is not column letters followed by a row number,
+            or if the row number is 0 (rows are 1-based).
+    """
+    # fullmatch rejects a trailing newline, which '$' with match() would tolerate.
+    m = _CELL_RE.fullmatch(cell.upper())
+    if not m:
+        raise ValueError(f"Invalid cell reference: {cell}")
+    col_str, row_str = m.groups()
+    row = int(row_str)
+    if row < 1:
+        # The pattern admits "A0", but there is no row 0 in a 1-based scheme.
+        raise ValueError(f"Invalid cell reference: {cell}")
+    return _col_to_index(col_str), row

stencilpy-0.1.0/src/stencilpy/computed.py ADDED Viewed

@@ -0,0 +1,101 @@
+from __future__ import annotations
+import re
+from typing import Any
+from .schema import FieldDef
+# A {field_name} placeholder inside a computed expression; group 1 captures the name (word characters only).
+_FIELD_REF_RE = re.compile(r"\{(\w+)\}")
+def get_computed_fields(fields: dict[str, FieldDef]) -> dict[str, FieldDef]:
+    """Select the entries of *fields* whose definition is flagged ``is_computed``, keeping their order."""
+    selected: dict[str, FieldDef] = {}
+    for field_name, definition in fields.items():
+        if definition.is_computed:
+            selected[field_name] = definition
+    return selected
+def resolve_computed(
+    computed_fields: dict[str, FieldDef],
+    extracted_values: dict[str, Any],
+) -> dict[str, Any]:
+    """Evaluate every computed field, dependencies first, and return name -> value.
+    *extracted_values* is not modified; each computed result is added to a
+    private copy so that later computed fields can reference it.
+    """
+    evaluation_order = _topological_sort(computed_fields)
+    scope = dict(extracted_values)
+    resolved: dict[str, Any] = {}
+    for field_name in evaluation_order:
+        expression = computed_fields[field_name].computed
+        scope[field_name] = resolved[field_name] = _evaluate(expression, scope)
+    return resolved
+def get_field_references(expression: str) -> list[str]:
+    """List the names used in {field_name} placeholders, in order of appearance, repeats included."""
+    return [found.group(1) for found in _FIELD_REF_RE.finditer(expression)]
+def _topological_sort(computed_fields: dict[str, FieldDef]) -> list[str]:
+    """Sort computed fields so each one comes after the computed fields it references.
+    Only references to other computed fields count as dependencies; references
+    to plain extracted fields are ignored. Dependencies are visited in the order
+    they first appear in the expression, so the result is the same on every run.
+    Raises:
+        ValueError: If the computed fields reference each other in a cycle
+            (a field referencing itself counts as a cycle).
+    """
+    deps: dict[str, list[str]] = {}
+    for name, field_def in computed_fields.items():
+        refs = get_field_references(field_def.computed)
+        # dict.fromkeys drops repeated references while keeping first-appearance
+        # order; a set here would make the visit order depend on string hashing.
+        deps[name] = [ref for ref in dict.fromkeys(refs) if ref in computed_fields]
+    visited: set[str] = set()
+    order: list[str] = []
+    visiting: set[str] = set()  # fields on the current depth-first path
+    def visit(name: str) -> None:
+        if name in visited:
+            return
+        if name in visiting:
+            raise ValueError(f"Circular dependency detected involving '{name}'")
+        visiting.add(name)
+        for dep in deps[name]:
+            visit(dep)
+        visiting.discard(name)
+        visited.add(name)
+        # Appended only after all dependencies, which yields dependency order.
+        order.append(name)
+    for name in computed_fields:
+        visit(name)
+    return order
+def _is_interpolation(expression: str) -> bool:
+    """Tell whether *expression* is plain text interpolation and not a formula.
+    True when removing every {field} reference leaves whitespace and nothing else,
+    as in "{first_name} {last_name}". False for "{weight} / ({height} ** 2)",
+    and also False when nothing at all is left, e.g. a lone "{field}".
+    """
+    leftover = _FIELD_REF_RE.sub("", expression)
+    # str.isspace() is False for the empty string, so one call covers both
+    # conditions: some literal text remains, and all of it is whitespace.
+    return leftover.isspace()
+def _evaluate(expression: str, values: dict[str, Any]) -> Any:
+    """Evaluate a computed expression with {field_name} substitutions.
+    Pure interpolation (references separated only by whitespace) is rendered as
+    text; a missing or None value becomes an empty string.
+    Any other expression has each reference replaced by the repr() of its value
+    (a missing value becomes None) and is evaluated as a Python expression.
+    If evaluation fails for any reason, the substituted source text is returned
+    in place of a result; this is a best-effort fallback, not an error.
+    """
+    if _is_interpolation(expression):
+        def str_replacer(match: re.Match) -> str:
+            val = values.get(match.group(1))
+            return "" if val is None else str(val)
+        return _FIELD_REF_RE.sub(str_replacer, expression)
+    def replacer(match: re.Match) -> str:
+        val = values.get(match.group(1))
+        literal = repr(val)  # repr(None) is "None"
+        if isinstance(val, (int, float)) and val < 0:
+            # Parenthesise negative numbers: "-3 ** 2" parses as -(3 ** 2),
+            # whereas "{x} ** 2" with x = -3 is meant to be (-3) ** 2.
+            return f"({literal})"
+        return literal
+    code = _FIELD_REF_RE.sub(replacer, expression)
+    try:
+        # SECURITY: eval() runs the schema's expression with full builtins.
+        # Only load schema files from a trusted author.
+        return eval(code)  # noqa: S307 — trusted YAML author
+    except Exception:
+        # Deliberate best-effort: hand back the substituted text when the
+        # expression cannot be evaluated (e.g. a None operand or a syntax error).
+        return code

stencilpy-0.1.0/src/stencilpy/errors.py ADDED Viewed

@@ -0,0 +1,10 @@
+class StencilError(Exception):
+    """Base exception for stencilpy.
+    VersionError and ValidationError derive from it, so catching StencilError covers both.
+    """
+class VersionError(StencilError):
+    """Discriminator cell value didn't match any known version.
+    Raised when the value read from a workbook's discriminator cell equals none
+    of the version keys of the schema(s) it was checked against.
+    """
+class ValidationError(StencilError):
+    """Pydantic validation failed during extraction.
+    This is the package's own error type, distinct from pydantic's class of the
+    same name; stencilpy raises it with the pydantic error text as its message
+    and the pydantic error chained as the cause.
+    """