spextract 0.5.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ Copyright © 2026 Vincent Lonij
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.3
2
+ Name: spextract
3
+ Version: 0.5.16
4
+ Summary: A declarative html scraper for python
5
+ License: MIT
6
+ Author: Vincent Lonij
7
+ Author-email: 29819815+vincentropy@users.noreply.github.com
8
+ Requires-Python: >=3.10
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Dist: beautifulsoup4 (>=4.14.3,<5.0.0)
16
+ Requires-Dist: click (>=8.3.2,<9.0.0)
17
+ Requires-Dist: lxml (>=6.0.4,<7.0.0)
18
+ Requires-Dist: pydantic (>=2.12.5,<3.0.0)
19
+ Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
20
+ Description-Content-Type: text/markdown
21
+
22
+ # A Declarative HTML Scraper for Python
23
+
24
+ This package provides a simple way to declare what data should be extracted from an HTML document in a configuration file.
25
+
26
+ This enables sharing of scraping logic across projects and teams without the risk of executing untrusted code. It also allows for easier maintenance and updates to scraping logic without needing to modify the underlying codebase.
27
+
28
+ ## CLI
29
+
30
+ The package includes a Click-based CLI with two commands:
31
+
32
+ ```bash
33
+ decs parse spec.yaml <path to html file or directory>
34
+ decs validate spec.yaml expected-results.yaml
35
+ ```
36
+
37
+ `parse` emits YAML in the same expected-results format used by `validate`, so you can capture known-good output and re-run validation later.
38
+
39
+ ## How to use
40
+
41
+ ### Build a configuration file
42
+
43
+ You can write a configuration file with the provided ParseSpec class.
44
+
45
+ ```python
46
+ import py_decs
47
+
48
+ spec = py_decs.ParseSpec(
49
+ name="example_parser",
50
+ description="An example parser for demonstration purposes.",
51
+ fields=[
52
+ py_decs.FieldSpec(
53
+ name="title",
54
+ selector="h1.title::text",
55
+ type=py_decs.FieldType.TEXT,
56
+ ),
57
+ py_decs.FieldSpec(
58
+ name="links",
59
+ selector="a.link::attr(href)",
60
+ type=py_decs.FieldType.LINK,
61
+ multiple=True,
62
+ ),
63
+ py_decs.FieldSpec(
64
+ name="author",
65
+ selector="div.author",
66
+ type=py_decs.FieldType.OBJECT,
67
+ fields=[
68
+ py_decs.FieldSpec(
69
+ name="name",
70
+ selector="span.name::text",
71
+ type=py_decs.FieldType.TEXT,
72
+ ),
73
+ py_decs.FieldSpec(
74
+ name="profile_url",
75
+ selector="a.profile::attr(href)",
76
+ type=py_decs.FieldType.LINK,
77
+ ),
78
+ ]
79
+ ),
80
+ ]
81
+ )
82
+ ```
83
+
@@ -0,0 +1,61 @@
1
+ # A Declarative HTML Scraper for Python
2
+
3
+ This package provides a simple way to declare what data should be extracted from an HTML document in a configuration file.
4
+
5
+ This enables sharing of scraping logic across projects and teams without the risk of executing untrusted code. It also allows for easier maintenance and updates to scraping logic without needing to modify the underlying codebase.
6
+
7
+ ## CLI
8
+
9
+ The package includes a Click-based CLI with two commands:
10
+
11
+ ```bash
12
+ decs parse spec.yaml <path to html file or directory>
13
+ decs validate spec.yaml expected-results.yaml
14
+ ```
15
+
16
+ `parse` emits YAML in the same expected-results format used by `validate`, so you can capture known-good output and re-run validation later.
17
+
18
+ ## How to use
19
+
20
+ ### Build a configuration file
21
+
22
+ You can write a configuration file with the provided ParseSpec class.
23
+
24
+ ```python
25
+ import py_decs
26
+
27
+ spec = py_decs.ParseSpec(
28
+ name="example_parser",
29
+ description="An example parser for demonstration purposes.",
30
+ fields=[
31
+ py_decs.FieldSpec(
32
+ name="title",
33
+ selector="h1.title::text",
34
+ type=py_decs.FieldType.TEXT,
35
+ ),
36
+ py_decs.FieldSpec(
37
+ name="links",
38
+ selector="a.link::attr(href)",
39
+ type=py_decs.FieldType.LINK,
40
+ multiple=True,
41
+ )
42
+ py_decs.FieldSpec(
43
+ name="author",
44
+ selector="div.author",
45
+ type=py_decs.FieldType.OBJECT,
46
+ fields=[
47
+ py_decs.FieldSpec(
48
+ name="name",
49
+ selector="span.name::text",
50
+ type=py_decs.FieldType.TEXT,
51
+ ),
52
+ py_decs.FieldSpec(
53
+ name="profile_url",
54
+ selector="a.profile::attr(href)",
55
+ type=py_decs.FieldType.LINK,
56
+ ),
57
+ ]
58
+ ),
59
+ ]
60
+ )
61
+ ```
@@ -0,0 +1,48 @@
1
+ [project]
2
+ name = "spextract"
3
+ version = "0.5.16"
4
+ description = "A declarative html scraper for python"
5
+ authors = [
6
+ { name = "Vincent Lonij", email = "29819815+vincentropy@users.noreply.github.com" },
7
+ ]
8
+ license = "MIT"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "beautifulsoup4 (>=4.14.3,<5.0.0)",
13
+ "pydantic (>=2.12.5,<3.0.0)",
14
+ "pyyaml (>=6.0.3,<7.0.0)",
15
+ "lxml (>=6.0.4,<7.0.0)",
16
+ "click (>=8.3.2,<9.0.0)",
17
+ ]
18
+
19
+ [project.scripts]
20
+ decs = "py_decs.cli:cli"
21
+
22
+ [tool.poetry]
23
+
24
+ [tool.poetry.group.dev.dependencies]
25
+ mypy = "^1.20.0"
26
+ types-pyyaml = "^6.0.12.20260408"
27
+ pylint = "^4.0.5"
28
+ pytest = "^9.0.3"
29
+ types-lxml = "^2026.2.16"
30
+ pylint-pydantic = "^0.4.1"
31
+ twine = "^6.2.0"
32
+
33
+ [build-system]
34
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
35
+ build-backend = "poetry.core.masonry.api"
36
+
37
+ [tool.pylint]
38
+ load-plugins = ["pylint_pydantic"]
39
+ disable = [
40
+ "missing-module-docstring",
41
+ "missing-class-docstring",
42
+ "missing-function-docstring",
43
+ "line-too-long",
44
+ ]
45
+ max-line-length = 120
46
+
47
+ [tool.black]
48
+ line-length = 120
@@ -0,0 +1,27 @@
1
+ from .models import (
2
+ ParseSpec,
3
+ FieldSpec,
4
+ FieldType,
5
+ ProcessorSpec,
6
+ EngineOutput,
7
+ SpecValidationResult,
8
+ ValidationMismatch,
9
+ ExpectedResults,
10
+ FileExpectedItems,
11
+ )
12
+ from .engine import ParseEngine
13
+ from .validation.spec_validate import validate_spec_output
14
+
15
+ __all__ = [
16
+ "FieldSpec",
17
+ "FieldType",
18
+ "ParseEngine",
19
+ "ParseSpec",
20
+ "ProcessorSpec",
21
+ "EngineOutput",
22
+ "SpecValidationResult",
23
+ "ValidationMismatch",
24
+ "ExpectedResults",
25
+ "FileExpectedItems",
26
+ "validate_spec_output",
27
+ ]
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import click
6
+ import yaml
7
+
8
+ from .engine import ParseEngine
9
+ from .models.output import EngineOutput
10
+ from .models.parser_spec import ParseSpec
11
+ from .models.validation import ExpectedResults, FileExpectedItems
12
+ from .validation.true_validate import validate_files
13
+
14
+
15
# Root command group: the ``parse`` and ``validate`` functions below register on it
# through ``@cli.command()``. The docstring doubles as the help text Click displays,
# so treat it as user-facing output rather than as internal documentation.
@click.group()
def cli() -> None:
    """Declarative scraper utilities."""
18
+
19
+
20
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("html_path", type=click.Path(exists=True, dir_okay=True, path_type=Path))
@click.option("--output", "output_path", type=click.Path(dir_okay=False, path_type=Path))
def parse(spec_path: Path, html_path: Path, output_path: Path | None) -> None:
    """Apply a spec to one or more HTML files."""

    engine = ParseEngine(ParseSpec.from_yaml_file(spec_path))

    # A file argument is parsed on its own; anything else is treated as a directory
    # whose top-level ``*.html`` entries are processed in sorted order.
    sources = [html_path] if html_path.is_file() else sorted(html_path.glob("*.html"))

    # Parse everything first (keyed by bare file name) before building the result model.
    outputs: dict[str, EngineOutput] = {
        source.name: engine.parse(source.read_text(encoding="utf-8")) for source in sources
    }

    base_dir = html_path if html_path.is_dir() else html_path.parent
    expected = ExpectedResults(
        data_path=base_dir,
        files=[FileExpectedItems.from_engine_output(name, parsed) for name, parsed in outputs.items()],
    )

    if output_path is None:
        # No --output given: stream the YAML document to stdout without an extra newline.
        rendered = yaml.safe_dump(expected.model_dump(), sort_keys=False, allow_unicode=True)
        click.echo(rendered, nl=False)
    else:
        expected.to_yaml_file(output_path)
        click.echo(f"Wrote parsed results to {output_path}")
52
+
53
+
54
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("expected_results_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--field-path", "-f", type=str, help="Optional dot path to a specific field to validate (e.g. 'items.name')."
)
def validate(spec_path: Path, expected_results_path: Path, field_path: str | None) -> None:
    """Validate a spec against YAML expected extraction results."""

    outcome = validate_files(
        expected_values_path=expected_results_path,
        spec_file_path=spec_path,
        field_path=field_path,
    )

    if not outcome.passed:
        # On failure, list every file (passing ones included) before aborting, so the
        # user sees the whole picture; ClickException then produces the error exit.
        for per_file in outcome.file_results:
            click.echo(f"{per_file.file_name}:")
            if not per_file.passed:
                for error in per_file.errors:
                    click.echo(f" - {error}")
            else:
                click.echo(f" Passed. Values checked: {per_file.item_count}")

        raise click.ClickException(f"Validation failed for {outcome.failures} of {outcome.total_files} file(s).")

    click.echo(
        f"Validation passed for {outcome.total_files} file(s) and {outcome.total_items} extracted field(s)."
    )
@@ -0,0 +1,152 @@
1
+ """Core extraction engine that applies a ParseSpec to HTML content."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import cast
6
+
7
+ from bs4 import BeautifulSoup, Tag
8
+
9
+ from spextract.models.validation import SpecValidationResult
10
+
11
+ from .models import EngineOutput, FieldSpec, ParseSpec, ProcessorSpec, DataValue
12
+ from .processors import apply_processor
13
+ from .uni_selector import select
14
+
15
+
16
class ParseEngine:
    """Applies a ParseSpec to HTML content and returns extracted items.

    The engine only stores the spec; each ``parse`` call builds a fresh
    BeautifulSoup tree from the HTML string it receives.
    """

    def __init__(self, spec: ParseSpec) -> None:
        """Remember the spec that every later call extracts against."""
        self.spec = spec

    def parse(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse an HTML string and return the extracted data.

        Args:
            html: Raw HTML content to parse.
            field_path: Optional dot-separated path (e.g. ``"person.name"``) to
                extract only a single field instead of the full spec. Each
                segment must match a key in the ``fields`` dict at that level of
                nesting.

        Returns:
            An ``EngineOutput`` holding this engine's spec and the extracted data.

        Raises:
            KeyError: If a path segment is not found, or a non-leaf segment has
                no child fields.
        """
        root = BeautifulSoup(html, "html.parser")
        if field_path is None:
            data = self._extract_fields(root, self.spec.fields)
        else:
            data = self._extract_field_path(root, self.spec.fields, field_path.split("."))
        # Both extraction helpers always return a dict, so no None fallback is needed.
        return EngineOutput(spec=self.spec, data=data)

    def validate(self, html: str, field_path: str | None = None) -> SpecValidationResult:
        """Parse ``html`` and check the extracted data against the spec.

        Returns the ``SpecValidationResult`` instead of raising on mismatches.
        """
        return self._validate_output(self.parse(html, field_path=field_path))

    def parse_and_validate(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse HTML, validate the extracted data, and return the output.

        The HTML is parsed exactly once; the same output is both validated and
        returned.

        Raises:
            ValueError: If validation reports any mismatch. The message carries
                the first mismatch only.
        """
        output = self.parse(html, field_path=field_path)
        validation_result = self._validate_output(output)
        if not validation_result.is_valid:
            raise ValueError(f"Validation failed: {validation_result.mismatches[0]}")
        return output

    def _validate_output(self, output: EngineOutput) -> SpecValidationResult:
        """Run spec validation over already-extracted data without raising."""
        # Imported lazily, as in the original code — presumably to avoid a circular
        # import between the engine and the validation package (confirm before moving).
        from .validation.spec_validate import validate_spec_output  # pylint: disable=import-outside-toplevel

        return validate_spec_output(self.spec, output.data, raise_=False)

    @staticmethod
    def _warn_multiple_matches(selector: str) -> None:
        """Report that a single-valued field matched several elements.

        Goes through ``logging`` rather than ``print`` so the diagnostic cannot end
        up on stdout, where the CLI emits its YAML document.
        """
        import logging  # pylint: disable=import-outside-toplevel

        logging.getLogger(__name__).warning(
            "Warning: Multiple elements matched for single field: %s. Using first match.", selector
        )

    @staticmethod
    def _extract_field_path(
        node: Tag | BeautifulSoup,
        fields: dict[str, FieldSpec],
        path: list[str],
    ) -> dict[str, DataValue]:
        """Navigate ``fields`` and the HTML tree simultaneously following ``path``.

        Returns a dict nested along ``path`` that contains only the targeted
        field. For an intermediate field marked ``multiple`` the nested value is a
        list with one dict per matched element. An empty dict is returned when the
        leaf yields ``None`` or an intermediate selector matches nothing.

        Raises:
            KeyError: If any path segment is missing or a non-leaf segment has no
                child ``fields``.
        """
        name, *rest = path
        if name not in fields:
            raise KeyError(f"Field {name!r} not found. Available: {list(fields.keys())}")
        field_spec = fields[name]

        if not rest:
            # Leaf — extract just this field
            value = ParseEngine._extract_field(node, field_spec)
            return {name: value} if value is not None else {}

        # Intermediate — must have child fields and a navigable selector
        if field_spec.fields is None:
            raise KeyError(f"Field {name!r} has no child fields; cannot navigate to {'.'.join(rest)!r}")
        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return {}

        nested: object
        if field_spec.multiple:
            nested = [ParseEngine._extract_field_path(sub, field_spec.fields, rest) for sub in sub_nodes]
        else:
            # Single-valued intermediate: descend into the first match only.
            nested = ParseEngine._extract_field_path(sub_nodes[0], field_spec.fields, rest)
        return cast(dict[str, DataValue], {name: cast(DataValue, nested)})

    @staticmethod
    def _extract_fields(node: Tag | BeautifulSoup, fields: dict[str, FieldSpec]) -> dict[str, DataValue]:
        """Extract every field in ``fields`` relative to ``node``.

        Fields whose extraction yields ``None`` are left out of the result.
        """
        result: dict[str, DataValue] = {}
        for name, field_spec in fields.items():
            field_out = ParseEngine._extract_field(node, field_spec)
            if field_out is not None:
                result[name] = field_out
        return result

    @staticmethod
    def _extract_field(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract one field, delegating to ``_extract_nested`` when it has children.

        Returns ``[]`` (multiple) or ``None`` (single) when the selector matches
        nothing. For a single-valued field, matches that are all strings are
        concatenated; otherwise the first match is used.
        """
        if field_spec.fields is not None:
            # if this field has child fields, extraction is slightly different
            # we ignore ::text / ::attr on the parent selector.
            return ParseEngine._extract_nested(node, field_spec)

        values = select(node, field_spec.selector)
        if not values:
            return [] if field_spec.multiple else None

        # Resolve once; the processor list does not depend on the individual value.
        processors = field_spec.resolved_processors()
        if field_spec.multiple:
            return cast(DataValue, [ParseEngine.apply_processors(v, processors) for v in values])

        if all(isinstance(v, str) for v in values):
            values = ["".join(cast(list[str], values))]
        elif len(values) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine.apply_processors(values[0], processors)

    @staticmethod
    def _extract_nested(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract a field with child fields, applying child selectors relative to parent elements.

        In this case we ignore ::text / ::attr on the parent selector since it doesn't make sense
        to apply these to a parent element that we're extracting child fields from.
        """
        assert field_spec.fields is not None

        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return [] if field_spec.multiple else None

        if field_spec.multiple:
            # One dict of child values per matched parent element.
            return [ParseEngine._extract_fields(sub, field_spec.fields) for sub in sub_nodes]

        if len(sub_nodes) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine._extract_fields(sub_nodes[0], field_spec.fields)

    @staticmethod
    def apply_processors(value: object, processors: list[ProcessorSpec]) -> str | float | None:
        """Run ``value`` through ``processors`` in order and return the result.

        A ``None`` input is returned untouched without running any processor.

        Raises:
            ValueError: If the final value is not a ``str``, ``float`` or ``None``.
        """
        if value is None:
            return None
        for proc in processors:
            # An empty argument list is passed on as None.
            value = apply_processor(proc.name, value, proc.args or None)
        if value is None or isinstance(value, (str, float)):
            return value
        raise ValueError(f"Unsupported value type after processing: {type(value)}")
@@ -0,0 +1,17 @@
1
+ from .parser_spec import FieldSpec, FieldType, ParseSpec, ProcessorName, ProcessorSpec
2
+ from .output import DataValue, EngineOutput
3
+ from .validation import SpecValidationResult, ValidationMismatch, ExpectedResults, FileExpectedItems
4
+
5
+ __all__ = [
6
+ "FieldSpec",
7
+ "FieldType",
8
+ "ParseSpec",
9
+ "ProcessorName",
10
+ "ProcessorSpec",
11
+ "DataValue",
12
+ "EngineOutput",
13
+ "SpecValidationResult",
14
+ "ValidationMismatch",
15
+ "ExpectedResults",
16
+ "FileExpectedItems",
17
+ ]
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Union # pylint: disable=unused-import
4
+ from typing_extensions import TypeAliasType
5
+
6
+ from .yaml import BaseModelWithYamlSupport
7
+
8
+ from .parser_spec import ParseSpec
9
+
10
# Recursive alias for extracted values: None, a float, a string, a dict of string
# keys to further values, or a list of further values. The definition is given as a
# string so the alias can refer to itself; presumably this is also why ``Union`` is
# imported above despite looking unused.
DataValue = TypeAliasType(
    "DataValue",
    "Union[None, float, str, dict[str, DataValue], list[DataValue]]",
)
14
+
15
+
16
class EngineOutput(BaseModelWithYamlSupport):
    """Output from the scraping engine for a single file,
    including the parser spec used and the extracted data."""

    # Spec that produced ``data``; optional, defaults to None.
    spec: ParseSpec | None = None
    # Extracted values (required); the engine keys this dict by field name.
    data: dict[str, DataValue]
@@ -0,0 +1,85 @@
1
+ """
2
+ Pydantic models for the declarative scraper.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import enum
8
+ from typing import Union
9
+
10
+ from pydantic import Field
11
+
12
+ from .yaml import BaseModelWithYamlSupport
13
+ from ..processors import ProcessorName
14
+
15
+
16
class ProcessorSpec(BaseModelWithYamlSupport):
    """A single field processor.

    Can be a simple named processor (e.g. "strip") or a parameterised one, eg a regex.
    """

    # Which processor to run; must be a valid ProcessorName.
    name: ProcessorName
    # Positional arguments for the processor; empty by default (a fresh list per instance).
    args: list[Union[str, int]] = Field(default_factory=list)
24
+
25
+
26
class FieldType(enum.Enum):
    """Type of field to extract, used to determine how to extract it from HTML."""

    # The lowercase values are the spellings used when a spec is written as data.
    TEXT = "text"
    LINK = "link"
    NUMBER = "number"
    DATE = "date"
    # NOTE(review): OBJECT appears intended for fields that carry child ``fields`` — confirm.
    OBJECT = "object"
34
+
35
+
36
class FieldSpec(BaseModelWithYamlSupport):
    """Specification for extracting a single field from HTML."""

    # NOTE(review): the multi-line description literals below are reproduced as they
    # appear in the published diff; confirm their inner whitespace against the repo.
    selector: str = Field(
        description="""
CSS selector or XPATH selector for the field.
If the fields attribute is not None, this selector should return a parent element or a list of parent elements.
The selector attribute of the child fields will be applied relative to each parent element.
"""
    )
    type: FieldType = Field(
        default=FieldType.TEXT,
        description="Type of field to extract, used to validate extraction.",
    )
    required: bool = Field(default=True, description="Whether this field is required. Used for validation.")

    multiple: bool = Field(
        default=False,
        description="Whether to extract multiple values from this field (i.e. return a list).",
    )
    processors: list[Union[ProcessorName, dict[ProcessorName, list[Union[str, int]]]]] = Field(
        default_factory=list,
        description="List of processors to apply to the extracted value(s). Each processor can be a string (processor name) or a dict mapping processor name to argument list. These are applied in order.",
    )
    fields: dict[str, "FieldSpec"] | None = Field(
        default=None,
        description="Child fields to extract from the element(s) selected by this field. \
Keys in this dict will be keys in the output data.",
    )

    def resolved_processors(self) -> list[ProcessorSpec]:
        """Expand ``processors`` into a flat, ordered list of ProcessorSpec objects.

        A bare name becomes a spec without arguments; a dict contributes one spec
        per ``name -> args`` entry, in the dict's own order.

        Raises:
            ValueError: If an entry is neither a name nor a dict.
        """
        resolved: list[ProcessorSpec] = []
        for entry in self.processors:
            if isinstance(entry, dict):
                resolved.extend(ProcessorSpec(name=key, args=params) for key, params in entry.items())
            elif isinstance(entry, (str, ProcessorName)):
                resolved.append(ProcessorSpec(name=entry))
            else:
                raise ValueError(f"Invalid processor spec: {entry}")
        return resolved
78
+
79
+
80
class ParseSpec(BaseModelWithYamlSupport):
    """Top-level declarative parser specification."""

    # Version number of the spec; defaults to 1.
    version: int = 1
    # Name of this parser spec (required).
    name: str
    # Top-level fields to extract; each key becomes a key in the extracted data.
    fields: dict[str, FieldSpec]
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from pydantic import BaseModel, Field, field_serializer
6
+
7
+ from .output import DataValue, EngineOutput
8
+ from .yaml import BaseModelWithYamlSupport
9
+
10
+
11
# One discrepancy recorded while validating extracted data against a spec.
class ValidationMismatch(BaseModel):
    # NOTE(review): presumably the name or path of the offending field — confirm with the validator.
    field: str
    # Expected and actual types are carried as plain strings.
    expected_type: str
    actual_type: str
    # Free-text description of the mismatch.
    message: str
16
+
17
+
18
# Collects the mismatches found when checking extracted data against a spec.
class SpecValidationResult(BaseModel):
    # Every mismatch found; stays empty when the data conforms.
    mismatches: list[ValidationMismatch] = Field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when no mismatch was recorded."""
        return not self.mismatches
24
+
25
+
26
class FileValidationResult(BaseModel):
    """Validation result for a single HTML file."""

    # Name of the file this result belongs to.
    file_name: str
    # Number of values checked for this file (reported by the CLI as "Values checked").
    item_count: int
    # Human-readable error lines; empty when the file passed.
    errors: list[str] = Field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when no error was recorded for this file."""
        return len(self.errors) == 0
36
+
37
+
38
class TrueValidationResult(BaseModel):
    """Aggregate validation result across all files."""

    # One entry per validated file.
    file_results: list[FileValidationResult] = Field(default_factory=list)

    @property
    def total_files(self) -> int:
        """Number of files that were validated."""
        return len(self.file_results)

    @property
    def total_items(self) -> int:
        """Sum of ``item_count`` over all files."""
        counts = [entry.item_count for entry in self.file_results]
        return sum(counts)

    @property
    def failures(self) -> int:
        """Number of files that did not pass."""
        failed = [entry for entry in self.file_results if not entry.passed]
        return len(failed)

    @property
    def passed(self) -> bool:
        """True when every file passed (also true when there are no files)."""
        return not self.failures
58
+
59
+
60
class FileExpectedItems(BaseModel):
    """Expected extraction results for a single example file."""

    # File the values belong to; the CLI passes the bare file name here.
    file: str
    # Expected (or captured) values keyed by field name.
    items: dict[str, DataValue]

    @classmethod
    def from_engine_output(cls, file_name: str, output: EngineOutput) -> FileExpectedItems:
        # Alternate constructor: capture an engine run's ``data`` as the expected items.
        return cls(file=file_name, items=output.data)
69
+
70
+
71
class ExpectedResults(BaseModelWithYamlSupport):
    """Expected extraction output used to validate parser correctness.

    Stored as YAML alongside example data files.
    """

    # Version number of the expected-results document; defaults to 1.
    version: int = 1
    # Directory holding the example files; the CLI fills in the parsed directory
    # (or the parent directory of a single parsed file).
    data_path: Path | None = None
    # Per-file expected values.
    files: list[FileExpectedItems]

    @field_serializer("data_path")
    def serialize_data_path(self, value: Path | None) -> str | None:
        """Serialize data_path as a string in YAML."""
        if value is None:
            return None
        return str(value)