stencilpy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .venv/
8
+ *.lock
9
+
10
+ # Node
11
+ node_modules/
12
+ editor/dist/
13
+
14
+ # OS
15
+ .DS_Store
@@ -0,0 +1,77 @@
1
+ Metadata-Version: 2.4
2
+ Name: stencilpy
3
+ Version: 0.1.0
4
+ Summary: Extract structured data from Excel files using YAML schema definitions
5
+ Project-URL: Homepage, https://github.com/phlohouse/stencil
6
+ Project-URL: Repository, https://github.com/phlohouse/stencil
7
+ Project-URL: Issues, https://github.com/phlohouse/stencil/issues
8
+ Author: Phlo House
9
+ License-Expression: MIT
10
+ Keywords: excel,extraction,pydantic,spreadsheet,yaml
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: File Formats
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: openpyxl>=3.1
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: pyyaml>=6.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest; extra == 'dev'
28
+ Requires-Dist: pytest-cov; extra == 'dev'
29
+ Description-Content-Type: text/markdown
30
+
31
+ # stencilpy
32
+
33
+ Extract structured data from Excel files using YAML schema definitions into dynamically-generated Pydantic models.
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ pip install stencilpy
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```python
44
+ from stencilpy import Stencil
45
+
46
+ # Load a schema
47
+ lab = Stencil("lab_report.stencil.yaml")
48
+
49
+ # Extract data — version auto-detected via discriminator
50
+ report = lab.extract("january_lab.xlsx")
51
+ print(report.patient_name)
52
+ print(report.model_dump())
53
+ ```
54
+
55
+ ## Schema Format
56
+
57
+ Create a `.stencil.yaml` file:
58
+
59
+ ```yaml
60
+ name: lab_report
61
+ description: Monthly lab report
62
+
63
+ discriminator:
64
+ cell: A1
65
+
66
+ versions:
67
+ "v2.0":
68
+ fields:
69
+ patient_name:
70
+ cell: B3
71
+ sample_date:
72
+ cell: B4
73
+ type: datetime
74
+ readings:
75
+ range: D5:D
76
+ type: list[float]
77
+ ```
@@ -0,0 +1,47 @@
1
+ # stencilpy
2
+
3
+ Extract structured data from Excel files using YAML schema definitions into dynamically-generated Pydantic models.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install stencilpy
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from stencilpy import Stencil
15
+
16
+ # Load a schema
17
+ lab = Stencil("lab_report.stencil.yaml")
18
+
19
+ # Extract data — version auto-detected via discriminator
20
+ report = lab.extract("january_lab.xlsx")
21
+ print(report.patient_name)
22
+ print(report.model_dump())
23
+ ```
24
+
25
+ ## Schema Format
26
+
27
+ Create a `.stencil.yaml` file:
28
+
29
+ ```yaml
30
+ name: lab_report
31
+ description: Monthly lab report
32
+
33
+ discriminator:
34
+ cell: A1
35
+
36
+ versions:
37
+ "v2.0":
38
+ fields:
39
+ patient_name:
40
+ cell: B3
41
+ sample_date:
42
+ cell: B4
43
+ type: datetime
44
+ readings:
45
+ range: D5:D
46
+ type: list[float]
47
+ ```
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "stencilpy"
7
+ version = "0.1.0"
8
+ description = "Extract structured data from Excel files using YAML schema definitions"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "Phlo House" },
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Topic :: File Formats",
25
+ "Topic :: Software Development :: Libraries",
26
+ "Typing :: Typed",
27
+ ]
28
+ keywords = ["excel", "pydantic", "yaml", "extraction", "spreadsheet"]
29
+ dependencies = [
30
+ "pydantic>=2.0",
31
+ "openpyxl>=3.1",
32
+ "pyyaml>=6.0",
33
+ ]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/phlohouse/stencil"
37
+ Repository = "https://github.com/phlohouse/stencil"
38
+ Issues = "https://github.com/phlohouse/stencil/issues"
39
+
40
+ [project.optional-dependencies]
41
+ dev = [
42
+ "pytest",
43
+ "pytest-cov",
44
+ ]
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["src/stencilpy"]
@@ -0,0 +1,122 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from pydantic import BaseModel
8
+ from pydantic import ValidationError as PydanticValidationError
9
+
10
+ from .computed import get_computed_fields, resolve_computed
11
+ from .errors import StencilError, ValidationError, VersionError
12
+ from .extractor import extract_fields, read_cell
13
+ from .models import build_all_models, get_or_create_model
14
+ from .schema import StencilSchema
15
+
16
+ __all__ = ["Stencil", "StencilError", "VersionError", "ValidationError"]
17
+
18
+
19
class Stencil:
    """Load stencil schemas and use them to pull data out of Excel workbooks.

    A ``Stencil`` holds one schema (when built from a single file) or several
    (when built from a directory of ``*.stencil.yaml`` files).
    """

    def __init__(self, path: str | Path) -> None:
        """Load the schema at *path*, or every schema inside it if a directory.

        Raises:
            StencilError: *path* is a directory with no ``*.stencil.yaml`` files.
        """
        self._schemas: list[StencilSchema] = []
        path = Path(path)
        if not path.is_dir():
            self._schemas.append(StencilSchema.from_file(path))
        else:
            self._load_dir(path)
        # Per-schema model classes, keyed by schema name then version.
        self._model_cache: dict[str, dict[str, type[BaseModel]]] = {}

    @classmethod
    def from_dir(cls, path: str | Path) -> Stencil:
        """Build a ``Stencil`` from all ``.stencil.yaml`` files in a directory."""
        # __init__ is bypassed so *path* is always handled as a directory.
        stencil = cls.__new__(cls)
        stencil._schemas = []
        stencil._model_cache = {}
        stencil._load_dir(Path(path))
        return stencil

    def _load_dir(self, path: Path) -> None:
        """Append every ``*.stencil.yaml`` schema in *path*, in sorted order."""
        schema_files = sorted(path.glob("*.stencil.yaml"))
        if not schema_files:
            raise StencilError(f"No .stencil.yaml files found in {path}")
        self._schemas.extend(
            StencilSchema.from_file(schema_file) for schema_file in schema_files
        )

    def extract(self, path: str | Path) -> BaseModel:
        """Extract one workbook using the first schema whose version matches.

        Schemas are tried in load order; a schema that raises ``VersionError``
        is skipped.

        Raises:
            VersionError: no loaded schema matched the workbook's discriminator.
        """
        path = Path(path)

        for candidate in self._schemas:
            try:
                extracted = self._extract_with_schema(candidate, path)
            except VersionError:
                continue
            else:
                return extracted

        raise VersionError(
            f"No schema version matched the discriminator in '{path}'"
        )

    def extract_batch(
        self, paths: Iterable[Path]
    ) -> list[tuple[Path, BaseModel | StencilError]]:
        """Extract several workbooks, pairing each path with its outcome.

        The outcome is the extracted model, or the ``StencilError`` that
        extraction raised. Exceptions of other types are not caught here.
        """
        outcomes = []
        for workbook in paths:
            try:
                outcome = self.extract(workbook)
            except StencilError as exc:
                outcome = exc
            outcomes.append((workbook, outcome))
        return outcomes

    @property
    def models(self) -> dict[str, type[BaseModel]]:
        """Map version -> model class, merged across all loaded schemas.

        Later schemas overwrite earlier ones when version keys coincide.
        """
        merged: dict[str, type[BaseModel]] = {}
        for schema in self._schemas:
            merged.update(self._get_models(schema))
        return merged

    def _get_models(self, schema: StencilSchema) -> dict[str, type[BaseModel]]:
        """Return the models for *schema*, building them on first request."""
        try:
            return self._model_cache[schema.name]
        except KeyError:
            built = build_all_models(schema)
            self._model_cache[schema.name] = built
            return built

    def _extract_with_schema(
        self, schema: StencilSchema, excel_path: Path
    ) -> BaseModel:
        """Extract *excel_path* with *schema* and validate the result.

        Raises:
            VersionError: the discriminator cell matches no version of *schema*.
            ValidationError: the extracted values fail model validation.
        """
        raw_discriminator = read_cell(excel_path, schema.discriminator_cell)
        disc_str = (
            "" if raw_discriminator is None else str(raw_discriminator).strip()
        )

        # First version key equal to the discriminator text, if any.
        matched_version = next(
            (ver_key for ver_key in schema.versions if disc_str == ver_key),
            None,
        )
        if matched_version is None:
            raise VersionError(
                f"Discriminator '{disc_str}' doesn't match any version "
                f"in schema '{schema.name}'"
            )

        version_def = schema.versions[matched_version]
        model_cls = get_or_create_model(schema, matched_version)

        # Values read from the workbook come first; computed fields are
        # derived from them and merged on top.
        field_values = extract_fields(excel_path, version_def.fields)
        derived = get_computed_fields(version_def.fields)
        if derived:
            field_values.update(resolve_computed(derived, field_values))

        try:
            return model_cls.model_validate(field_values)
        except PydanticValidationError as e:
            # Re-raise as the package's own error type, keeping the cause.
            raise ValidationError(str(e)) from e
@@ -0,0 +1,98 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+
6
+
7
+ _CELL_RE = re.compile(r"^([A-Z]+)(\d+)$")
8
+
9
+
10
+ def _col_to_index(col: str) -> int:
11
+ """Convert a column letter string (A, B, ..., Z, AA, ...) to a 1-based index."""
12
+ result = 0
13
+ for ch in col:
14
+ result = result * 26 + (ord(ch) - ord("A") + 1)
15
+ return result
16
+
17
+
18
+ def _index_to_col(index: int) -> str:
19
+ """Convert a 1-based column index to a column letter string."""
20
+ result = []
21
+ while index > 0:
22
+ index, remainder = divmod(index - 1, 26)
23
+ result.append(chr(ord("A") + remainder))
24
+ return "".join(reversed(result))
25
+
26
+
27
@dataclass(frozen=True)
class CellAddress:
    """Immutable location of one cell; ``sheet`` is None when unqualified."""

    sheet: str | None
    col: int  # column number, counted from 1
    row: int  # row number, counted from 1

    @property
    def col_letter(self) -> str:
        """The column rendered as spreadsheet letters."""
        return _index_to_col(self.col)
38
+
39
+
40
@dataclass(frozen=True)
class RangeAddress:
    """Immutable rectangular range, optionally tied to a sheet.

    ``end_row`` is None for an open-ended range, i.e. one with no fixed
    last row.
    """

    sheet: str | None
    start_col: int  # first column, counted from 1
    start_row: int  # first row, counted from 1
    end_col: int  # last column, counted from 1
    end_row: int | None  # last row, or None when open-ended
49
+
50
+
51
def parse_cell(ref: str) -> CellAddress:
    """Turn a reference such as 'A1' or 'Sheet2!B3' into a ``CellAddress``.

    Raises:
        ValueError: the cell portion is not a valid reference.
    """
    sheet_name, bare_ref = _split_sheet(ref)
    column, row_number = _parse_cell_part(bare_ref)
    return CellAddress(sheet=sheet_name, col=column, row=row_number)
56
+
57
+
58
def parse_range(ref: str) -> RangeAddress:
    """Parse a range reference like 'A1:D50', 'D5:D', or 'Sheet2!A1:D50'.

    An end that consists only of column letters (``'D5:D'``) denotes an
    open-ended range and produces ``end_row=None``. References are
    case-insensitive, for both closed and open-ended forms.

    Raises:
        ValueError: the reference has no ``:`` or either endpoint is invalid.
    """
    sheet, range_part = _split_sheet(ref)

    start_str, separator, end_str = range_part.partition(":")
    if not separator:
        raise ValueError(f"Invalid range reference (no ':'): {ref}")

    start_col, start_row = _parse_cell_part(start_str)

    # Column-only end means open-ended. Match case-insensitively so 'd5:d'
    # behaves like 'd5:d10' (_parse_cell_part upper-cases its input), and use
    # fullmatch so a trailing newline cannot slip past the check and reach
    # _col_to_index, which does not validate its input.
    if re.fullmatch(r"[A-Za-z]+", end_str):
        end_col = _col_to_index(end_str.upper())
        end_row = None
    else:
        end_col, end_row = _parse_cell_part(end_str)

    return RangeAddress(
        sheet=sheet,
        start_col=start_col,
        start_row=start_row,
        end_col=end_col,
        end_row=end_row,
    )
82
+
83
+
84
+ def _split_sheet(ref: str) -> tuple[str | None, str]:
85
+ """Split 'Sheet2!A1' into ('Sheet2', 'A1') or ('A1',) -> (None, 'A1')."""
86
+ if "!" in ref:
87
+ sheet, rest = ref.split("!", 1)
88
+ return sheet, rest
89
+ return None, ref
90
+
91
+
92
def _parse_cell_part(cell: str) -> tuple[int, int]:
    """Parse 'A1' into (col_index, row_index), both 1-based.

    Matching is case-insensitive: the input is upper-cased first.

    Raises:
        ValueError: *cell* is not column letters followed by a row number,
            or the row number is 0 (rows are 1-based).
    """
    m = _CELL_RE.match(cell.upper())
    if not m:
        raise ValueError(f"Invalid cell reference: {cell}")
    col_str, row_str = m.groups()
    row = int(row_str)
    # The pattern accepts any digit run, including "0"; reject it here so the
    # returned row really is 1-based as documented.
    if row < 1:
        raise ValueError(f"Invalid cell reference: {cell}")
    return _col_to_index(col_str), row
@@ -0,0 +1,101 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ from .schema import FieldDef
7
+
8
+
9
+ _FIELD_REF_RE = re.compile(r"\{(\w+)\}")
10
+
11
+
12
def get_computed_fields(fields: dict[str, FieldDef]) -> dict[str, FieldDef]:
    """Pick out the entries of *fields* whose definition is marked computed.

    The relative order of the input mapping is preserved.
    """
    selected = {}
    for field_name, definition in fields.items():
        if definition.is_computed:
            selected[field_name] = definition
    return selected
15
+
16
+
17
def resolve_computed(
    computed_fields: dict[str, FieldDef],
    extracted_values: dict[str, Any],
) -> dict[str, Any]:
    """Evaluate every computed field and return only the computed values.

    Fields are processed in dependency order so that a computed field can
    refer to another computed field. *extracted_values* is not modified.
    """
    evaluation_order = _topological_sort(computed_fields)

    # Working copy: grows with each computed result so that later fields can
    # reference earlier ones.
    context = dict(extracted_values)
    resolved: dict[str, Any] = {}

    for field_name in evaluation_order:
        outcome = _evaluate(computed_fields[field_name].computed, context)
        context[field_name] = outcome
        resolved[field_name] = outcome

    return resolved
33
+
34
+
35
def get_field_references(expression: str) -> list[str]:
    """List the names used in ``{field_name}`` placeholders of *expression*.

    Names appear in order of occurrence; repeated references are repeated.
    """
    return [hit.group(1) for hit in _FIELD_REF_RE.finditer(expression)]
38
+
39
+
40
def _topological_sort(computed_fields: dict[str, FieldDef]) -> list[str]:
    """Sort computed fields so each one comes after the fields it references.

    Only references to other computed fields count as dependencies. The
    result is deterministic: fields are visited in the order of
    *computed_fields*, and each field's dependencies in the order they are
    first referenced in its expression.

    Raises:
        ValueError: the computed fields reference each other in a cycle.
    """
    deps: dict[str, list[str]] = {}
    for name, field_def in computed_fields.items():
        # dict.fromkeys de-duplicates while keeping first-reference order.
        # A plain set would make the traversal order (and the name reported
        # for a cycle) depend on string hash randomization.
        refs = dict.fromkeys(get_field_references(field_def.computed))
        deps[name] = [ref for ref in refs if ref in computed_fields]

    visited: set[str] = set()
    order: list[str] = []
    visiting: set[str] = set()  # fields on the current depth-first path

    def visit(name: str) -> None:
        if name in visited:
            return
        if name in visiting:
            # Reached a field that is still being expanded: a cycle.
            raise ValueError(f"Circular dependency detected involving '{name}'")
        visiting.add(name)
        for dep in deps.get(name, ()):
            visit(dep)
        visiting.discard(name)
        visited.add(name)
        order.append(name)

    for name in computed_fields:
        visit(name)

    return order
67
+
68
+
69
def _is_interpolation(expression: str) -> bool:
    """Tell whether *expression* is plain string interpolation.

    That is the case when removing every {field} reference leaves at least
    one character and nothing but whitespace, e.g.
    "{first_name} {last_name}" — but NOT "{weight} / ({height} ** 2)".
    An expression with no text at all between or around its references
    is not treated as interpolation.
    """
    leftover = _FIELD_REF_RE.sub("", expression)
    # str.isspace() is False for the empty string, which covers the
    # "must contain some literal text" requirement as well.
    return leftover.isspace()
78
+
79
+
80
def _evaluate(expression: str, values: dict[str, Any]) -> Any:
    """Evaluate a computed expression with {field_name} substitutions.

    Pure interpolations (see ``_is_interpolation``) are rendered as text, with
    missing or None fields rendered as an empty string.

    Any other expression is evaluated as Python. Each reference is bound to
    the field's actual value instead of being pasted in as ``repr`` text, so:

    * negative numbers keep the right precedence: ``{x} ** 2`` with x = -3
      gives 9, whereas the pasted text ``-3 ** 2`` gives -9;
    * values whose ``repr`` cannot be evaluated in this module (for example
      ``datetime`` objects) can be used in expressions.

    If that evaluation raises, the previous textual substitution is tried
    so expressions that only worked in that form keep working. If that
    raises too, the substituted source text is returned as-is.

    Args:
        expression: The expression from the schema.
        values: Field values available to the expression; unknown names
            evaluate as None.

    Returns:
        The interpolated string, the evaluation result, or the substituted
        source text when evaluation fails.
    """
    if _is_interpolation(expression):
        def str_replacer(match: re.Match) -> str:
            val = values.get(match.group(1))
            return str(val) if val is not None else ""
        return _FIELD_REF_RE.sub(str_replacer, expression)

    # SECURITY: eval() executes arbitrary Python. The expression comes from
    # the schema file, which is treated as trusted; never load schemas from
    # an untrusted source.

    # Preferred path: bind each referenced field to a private name and let
    # the expression refer to the value itself.
    namespace: dict[str, Any] = {}
    slots: dict[str, str] = {}

    def bind(match: re.Match) -> str:
        field_name = match.group(1)
        slot = slots.get(field_name)
        if slot is None:
            # Generated names cannot shadow builtins the expression may call.
            slot = f"_stencil_ref_{len(slots)}"
            slots[field_name] = slot
            namespace[slot] = values.get(field_name)
        # Parentheses keep the reference a single atom next to adjacent text.
        return f"({slot})"

    bound_code = _FIELD_REF_RE.sub(bind, expression)
    try:
        # The names are passed as globals so that comprehensions and
        # generator expressions inside the expression can see them too.
        return eval(bound_code, namespace)  # noqa: S307 — trusted YAML author
    except Exception:
        pass  # deliberate: fall back to the legacy textual substitution

    def replacer(match: re.Match) -> str:
        val = values.get(match.group(1))
        if val is None:
            return "None"
        return repr(val)

    code = _FIELD_REF_RE.sub(replacer, expression)

    try:
        return eval(code)  # noqa: S307 — trusted YAML author
    except Exception:
        # Best effort, as before: hand back the substituted text.
        return code
@@ -0,0 +1,10 @@
1
class StencilError(Exception):
    """Root of the stencilpy exception hierarchy."""
3
+
4
+
5
class VersionError(StencilError):
    """Raised when the discriminator cell value matches no known version."""
7
+
8
+
9
class ValidationError(StencilError):
    """Raised when extracted values fail Pydantic validation."""