spextract 0.5.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spextract/__init__.py +27 -0
- spextract/cli.py +85 -0
- spextract/engine.py +152 -0
- spextract/models/__init__.py +17 -0
- spextract/models/output.py +21 -0
- spextract/models/parser_spec.py +85 -0
- spextract/models/validation.py +84 -0
- spextract/models/yaml.py +23 -0
- spextract/processors.py +112 -0
- spextract/py.typed +0 -0
- spextract/uni_selector.py +125 -0
- spextract/validation/__init__.py +15 -0
- spextract/validation/spec_validate.py +138 -0
- spextract/validation/true_validate.py +204 -0
- spextract/validation/validators.py +110 -0
- spextract-0.5.16.dist-info/LICENSE.md +7 -0
- spextract-0.5.16.dist-info/METADATA +83 -0
- spextract-0.5.16.dist-info/RECORD +20 -0
- spextract-0.5.16.dist-info/WHEEL +4 -0
- spextract-0.5.16.dist-info/entry_points.txt +3 -0
spextract/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from .models import (
|
|
2
|
+
ParseSpec,
|
|
3
|
+
FieldSpec,
|
|
4
|
+
FieldType,
|
|
5
|
+
ProcessorSpec,
|
|
6
|
+
EngineOutput,
|
|
7
|
+
SpecValidationResult,
|
|
8
|
+
ValidationMismatch,
|
|
9
|
+
ExpectedResults,
|
|
10
|
+
FileExpectedItems,
|
|
11
|
+
)
|
|
12
|
+
from .engine import ParseEngine
|
|
13
|
+
from .validation.spec_validate import validate_spec_output
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"FieldSpec",
|
|
17
|
+
"FieldType",
|
|
18
|
+
"ParseEngine",
|
|
19
|
+
"ParseSpec",
|
|
20
|
+
"ProcessorSpec",
|
|
21
|
+
"EngineOutput",
|
|
22
|
+
"SpecValidationResult",
|
|
23
|
+
"ValidationMismatch",
|
|
24
|
+
"ExpectedResults",
|
|
25
|
+
"FileExpectedItems",
|
|
26
|
+
"validate_spec_output",
|
|
27
|
+
]
|
spextract/cli.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from .engine import ParseEngine
|
|
9
|
+
from .models.output import EngineOutput
|
|
10
|
+
from .models.parser_spec import ParseSpec
|
|
11
|
+
from .models.validation import ExpectedResults, FileExpectedItems
|
|
12
|
+
from .validation.true_validate import validate_files
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Root click command group for the package. The ``parse`` and ``validate``
# subcommands below register themselves on it through ``@cli.command()``.
@click.group()
def cli() -> None:
    """Declarative scraper utilities."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("html_path", type=click.Path(exists=True, dir_okay=True, path_type=Path))
@click.option("--output", "output_path", type=click.Path(dir_okay=False, path_type=Path))
def parse(spec_path: Path, html_path: Path, output_path: Path | None) -> None:
    """Apply a spec to one or more HTML files."""
    # spec_path:   YAML file holding the ParseSpec to apply.
    # html_path:   a single HTML file, or a directory whose top-level
    #              ``*.html`` files are processed in sorted order.
    # output_path: when given, results are written there as YAML; otherwise
    #              the YAML is echoed to stdout.
    engine = ParseEngine(ParseSpec.from_yaml_file(spec_path))

    html_files = [html_path] if html_path.is_file() else sorted(html_path.glob("*.html"))

    # Results are keyed by bare file name.
    results: dict[str, EngineOutput] = {}
    for html_file in html_files:
        results[html_file.name] = engine.parse(html_file.read_text(encoding="utf-8"))

    # The recorded data path is the directory containing the HTML input(s).
    base_dir = html_path if html_path.is_dir() else html_path.parent
    expected_items = [
        FileExpectedItems.from_engine_output(file_name, output) for file_name, output in results.items()
    ]
    file_results = ExpectedResults(data_path=base_dir, files=expected_items)

    if output_path is None:
        rendered_yaml = yaml.safe_dump(file_results.model_dump(), sort_keys=False, allow_unicode=True)
        click.echo(rendered_yaml, nl=False)
    else:
        file_results.to_yaml_file(output_path)
        click.echo(f"Wrote parsed results to {output_path}")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("expected_results_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--field-path", "-f", type=str, help="Optional dot path to a specific field to validate (e.g. 'items.name')."
)
def validate(spec_path: Path, expected_results_path: Path, field_path: str | None) -> None:
    """Validate a spec against YAML expected extraction results."""
    # On success a one-line summary is echoed. On failure every file's status
    # is listed and a ClickException is raised so the command exits non-zero.
    outcome = validate_files(
        expected_values_path=expected_results_path,
        spec_file_path=spec_path,
        field_path=field_path,
    )

    if not outcome.passed:
        for per_file in outcome.file_results:
            click.echo(f"{per_file.file_name}:")
            if per_file.passed:
                click.echo(f" Passed. Values checked: {per_file.item_count}")
            else:
                for problem in per_file.errors:
                    click.echo(f" - {problem}")

        raise click.ClickException(
            f"Validation failed for {outcome.failures} of {outcome.total_files} file(s)."
        )

    click.echo(
        f"Validation passed for {outcome.total_files} file(s) and {outcome.total_items} extracted field(s)."
    )
|
spextract/engine.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Core extraction engine that applies a ParseSpec to HTML content."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import cast
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup, Tag
|
|
8
|
+
|
|
9
|
+
from spextract.models.validation import SpecValidationResult
|
|
10
|
+
|
|
11
|
+
from .models import EngineOutput, FieldSpec, ParseSpec, ProcessorSpec, DataValue
|
|
12
|
+
from .processors import apply_processor
|
|
13
|
+
from .uni_selector import select
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ParseEngine:
    """Applies a ParseSpec to HTML content and returns extracted items."""

    def __init__(self, spec: ParseSpec) -> None:
        """Store the spec that every subsequent parse/validate call will use."""
        self.spec = spec

    def parse(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse HTML string and return EngineOutput with typed fields.

        Args:
            html: Raw HTML content to parse.
            field_path: Optional dot-separated path (e.g. ``"person.name"``) to
                extract only a single field instead of the full spec. Each
                segment must match a key in the ``fields`` dict at that level of
                nesting. Raises ``KeyError`` if a segment is not found or if a
                non-leaf segment has no child fields.
        """
        root = BeautifulSoup(html, "html.parser")
        if field_path is not None:
            data = self._extract_field_path(root, self.spec.fields, field_path.split("."))
        else:
            data = self._extract_fields(root, self.spec.fields)
        return EngineOutput(spec=self.spec, data=data if data is not None else {})

    def validate(self, html: str, field_path: str | None = None) -> SpecValidationResult:
        """Validate HTML against spec, returning SpecValidationResult with details."""
        # Imported lazily, as in the original code (presumably to avoid an
        # import cycle with the validation package).
        from .validation.spec_validate import validate_spec_output  # pylint: disable=import-outside-toplevel

        output = self.parse(html, field_path=field_path)
        return validate_spec_output(self.spec, output.data, raise_=False)

    def parse_and_validate(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse HTML and validate the output against the spec.

        Returns the parsed EngineOutput when validation succeeds.

        Raises:
            ValueError: if validation fails; the message carries the first mismatch.
        """
        from .validation.spec_validate import validate_spec_output  # pylint: disable=import-outside-toplevel

        output = self.parse(html, field_path=field_path)
        # Validate the output we already have instead of calling
        # ``self.validate``, which would parse the same HTML a second time.
        validation_result = validate_spec_output(self.spec, output.data, raise_=False)
        if not validation_result.is_valid:
            raise ValueError(f"Validation failed: {validation_result.mismatches[0]}")
        return output

    @staticmethod
    def _warn_multiple_matches(selector: str) -> None:
        """Report that a single-valued field matched more than one element.

        Written to stderr so the message cannot end up inside data that a
        caller (e.g. the CLI) emits on stdout.
        """
        import sys  # pylint: disable=import-outside-toplevel

        print(
            f"Warning: Multiple elements matched for single field: {selector}. Using first match.",
            file=sys.stderr,
        )

    @staticmethod
    def _extract_field_path(
        node: Tag | BeautifulSoup,
        fields: dict[str, FieldSpec],
        path: list[str],
    ) -> dict[str, DataValue]:
        """Navigate ``fields`` and the HTML tree simultaneously following ``path``.

        Returns a ``{leaf_name: value}`` dict containing only the targeted field
        (wrapped in one dict level per intermediate segment). Returns ``{}`` when
        the leaf value is ``None`` or an intermediate selector matches nothing.

        Raises ``KeyError`` if any path segment is missing or if a non-leaf
        segment has no child ``fields``.
        """
        name, *rest = path
        if name not in fields:
            raise KeyError(f"Field {name!r} not found. Available: {list(fields.keys())}")
        field_spec = fields[name]

        if not rest:
            # Leaf: extract just this field.
            value = ParseEngine._extract_field(node, field_spec)
            return {name: value} if value is not None else {}

        # Intermediate: must have child fields and a navigable selector.
        if field_spec.fields is None:
            raise KeyError(f"Field {name!r} has no child fields; cannot navigate to {'.'.join(rest)!r}")
        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return {}

        if field_spec.multiple:
            items = [ParseEngine._extract_field_path(sub, field_spec.fields, rest) for sub in sub_nodes]
            return cast(dict[str, DataValue], {name: cast(DataValue, items)})

        inner = ParseEngine._extract_field_path(sub_nodes[0], field_spec.fields, rest)
        return cast(dict[str, DataValue], {name: cast(DataValue, inner)})

    @staticmethod
    def _extract_fields(node: Tag | BeautifulSoup, fields: dict[str, FieldSpec]) -> dict[str, DataValue]:
        """Extract every field in ``fields`` relative to ``node``; ``None`` results are omitted."""
        result: dict[str, DataValue] = {}
        for name, field_spec in fields.items():
            field_out = ParseEngine._extract_field(node, field_spec)
            if field_out is not None:
                result[name] = field_out
        return result

    @staticmethod
    def _extract_field(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract one field: a processed scalar, a list (``multiple``), or a nested structure.

        Returns ``[]`` (multiple) or ``None`` (single) when the selector matches nothing.
        """
        if field_spec.fields is not None:
            # A field with child fields is extracted differently:
            # ::text / ::attr on the parent selector is not applied.
            return ParseEngine._extract_nested(node, field_spec)

        values = select(node, field_spec.selector)
        if not values:
            return [] if field_spec.multiple else None

        # Resolve once; the processor list is the same for every matched value.
        processors = field_spec.resolved_processors()
        if field_spec.multiple:
            out_values = [ParseEngine.apply_processors(v, processors) for v in values]
            return cast(DataValue, out_values)

        # For a single-valued field, several matched strings are concatenated
        # into one string before processing.
        if all(isinstance(v, str) for v in values):
            values = ["".join(cast(list[str], values))]
        if len(values) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine.apply_processors(values[0], processors)

    @staticmethod
    def _extract_nested(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract a field with child fields, applying child selectors relative to parent elements.

        ::text / ::attr on the parent selector is ignored here, since it makes no
        sense to apply these to a parent element that child fields are extracted from.
        """
        assert field_spec.fields is not None

        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return [] if field_spec.multiple else None

        if field_spec.multiple:
            # One dict of child-field values per matched parent element.
            return [ParseEngine._extract_fields(sub, field_spec.fields) for sub in sub_nodes]

        if len(sub_nodes) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine._extract_fields(sub_nodes[0], field_spec.fields)

    @staticmethod
    def apply_processors(value: object, processors: list[ProcessorSpec]) -> str | float | None:
        """Run ``value`` through ``processors`` in order and return the final scalar.

        ``None`` input is returned unchanged without running any processor.

        Raises:
            ValueError: if the final value is not ``None``, a string or a number.
        """
        if value is None:
            return value
        for proc in processors:
            value = apply_processor(proc.name, value, proc.args if proc.args else None)
        if value is None or isinstance(value, str):
            return value
        # ``int`` must be accepted alongside ``float``: the ``to_int`` processor
        # produces ints, which were previously rejected here. ``bool`` is an
        # ``int`` subclass and stays unsupported, as before.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return value
        raise ValueError(f"Unsupported value type after processing: {type(value)}")
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .parser_spec import FieldSpec, FieldType, ParseSpec, ProcessorName, ProcessorSpec
|
|
2
|
+
from .output import DataValue, EngineOutput
|
|
3
|
+
from .validation import SpecValidationResult, ValidationMismatch, ExpectedResults, FileExpectedItems
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"FieldSpec",
|
|
7
|
+
"FieldType",
|
|
8
|
+
"ParseSpec",
|
|
9
|
+
"ProcessorName",
|
|
10
|
+
"ProcessorSpec",
|
|
11
|
+
"DataValue",
|
|
12
|
+
"EngineOutput",
|
|
13
|
+
"SpecValidationResult",
|
|
14
|
+
"ValidationMismatch",
|
|
15
|
+
"ExpectedResults",
|
|
16
|
+
"FileExpectedItems",
|
|
17
|
+
]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Union # pylint: disable=unused-import
|
|
4
|
+
from typing_extensions import TypeAliasType
|
|
5
|
+
|
|
6
|
+
from .yaml import BaseModelWithYamlSupport
|
|
7
|
+
|
|
8
|
+
from .parser_spec import ParseSpec
|
|
9
|
+
|
|
10
|
+
# Recursive alias for the values that may appear in extracted data: None, a
# float, a string, a dict of further values keyed by string, or a list of
# further values. The definition is passed as a string because it refers to
# ``DataValue`` itself.
DataValue = TypeAliasType(
    "DataValue",
    "Union[None, float, str, dict[str, DataValue], list[DataValue]]",
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EngineOutput(BaseModelWithYamlSupport):
    """Output from the scraping engine for a single file,
    including the parser spec used and the extracted data."""

    # Spec that produced ``data``; optional, defaults to None.
    spec: ParseSpec | None = None
    # Extracted values keyed by field name (required).
    data: dict[str, DataValue]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pydantic models for the declarative scraper.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import enum
|
|
8
|
+
from typing import Union
|
|
9
|
+
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
from .yaml import BaseModelWithYamlSupport
|
|
13
|
+
from ..processors import ProcessorName
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ProcessorSpec(BaseModelWithYamlSupport):
    """A single field processor.

    Can be a simple named processor (e.g. "strip") or a parameterised one, eg a regex.
    """

    # Which built-in processor to run.
    name: ProcessorName
    # Extra arguments for the processor; empty for simple named processors.
    args: list[Union[str, int]] = Field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FieldType(enum.Enum):
    """Type of field to extract, used to determine how to extract it from HTML."""

    # NOTE(review): FieldSpec.type describes this value as being used to
    # validate extraction rather than to drive it - confirm which is intended.
    TEXT = "text"
    LINK = "link"
    NUMBER = "number"
    DATE = "date"
    OBJECT = "object"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FieldSpec(BaseModelWithYamlSupport):
    """Specification for extracting a single field from HTML."""

    # Descriptions are built with implicit string concatenation so they carry
    # no stray newlines or source indentation (the previous triple-quoted and
    # backslash-continued literals embedded both).
    selector: str = Field(
        description=(
            "CSS selector or XPATH selector for the field. "
            "If the fields attribute is not None, this selector should return a parent element "
            "or a list of parent elements. "
            "The selector attribute of the child fields will be applied relative to each parent element."
        )
    )
    type: FieldType = Field(
        default=FieldType.TEXT,
        description="Type of field to extract, used to validate extraction.",
    )
    required: bool = Field(default=True, description="Whether this field is required. Used for validation.")

    multiple: bool = Field(
        default=False,
        description="Whether to extract multiple values from this field (i.e. return a list).",
    )
    processors: list[Union[ProcessorName, dict[ProcessorName, list[Union[str, int]]]]] = Field(
        default_factory=list,
        description="List of processors to apply to the extracted value(s). Each processor can be a string (processor name) or a dict mapping processor name to argument list. These are applied in order.",
    )
    fields: dict[str, "FieldSpec"] | None = Field(
        default=None,
        description=(
            "Child fields to extract from the element(s) selected by this field. "
            "Keys in this dict will be keys in the output data."
        ),
    )

    def resolved_processors(self) -> list[ProcessorSpec]:
        """Normalise the processor list into ProcessorSpec objects.

        A bare name becomes a ProcessorSpec without arguments; a dict
        contributes one ProcessorSpec per ``name -> argument list`` entry.
        Order is preserved.

        Raises:
            ValueError: if an entry is neither a name nor a dict.
        """
        result: list[ProcessorSpec] = []
        for p in self.processors:
            if isinstance(p, (str, ProcessorName)):
                result.append(ProcessorSpec(name=p))
            elif isinstance(p, dict):
                result.extend(ProcessorSpec(name=name, args=args) for name, args in p.items())
            else:
                raise ValueError(f"Invalid processor spec: {p}")
        return result
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class ParseSpec(BaseModelWithYamlSupport):
    """Top-level declarative parser specification."""

    # Version number of the spec; defaults to 1.
    version: int = 1
    # Name of the spec (required).
    name: str
    # Top-level fields to extract, keyed by the name used in the output data.
    fields: dict[str, FieldSpec]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, field_serializer
|
|
6
|
+
|
|
7
|
+
from .output import DataValue, EngineOutput
|
|
8
|
+
from .yaml import BaseModelWithYamlSupport
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ValidationMismatch(BaseModel):
    """One discrepancy found when checking engine output against a spec."""

    # Path/name of the field the mismatch refers to.
    field: str
    # Type the spec expected for the field, as a string.
    expected_type: str
    # Type actually found, as a string.
    actual_type: str
    # Human-readable explanation of the mismatch.
    message: str
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SpecValidationResult(BaseModel):
    """Collected mismatches from validating engine output against a spec."""

    mismatches: list[ValidationMismatch] = Field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when no mismatches were recorded."""
        return not self.mismatches
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FileValidationResult(BaseModel):
    """Validation result for a single HTML file."""

    file_name: str
    item_count: int
    errors: list[str] = Field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when the file produced no errors."""
        return len(self.errors) == 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class TrueValidationResult(BaseModel):
    """Aggregate validation result across all files."""

    file_results: list[FileValidationResult] = Field(default_factory=list)

    @property
    def total_files(self) -> int:
        """Number of per-file results collected."""
        return len(self.file_results)

    @property
    def total_items(self) -> int:
        """Sum of ``item_count`` over every file result."""
        running_total = 0
        for entry in self.file_results:
            running_total += entry.item_count
        return running_total

    @property
    def failures(self) -> int:
        """Number of file results that did not pass."""
        return len([entry for entry in self.file_results if not entry.passed])

    @property
    def passed(self) -> bool:
        """True when no file result failed."""
        return not self.failures
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class FileExpectedItems(BaseModel):
    """Expected extraction results for a single example file."""

    file: str
    items: dict[str, DataValue]

    @classmethod
    def from_engine_output(cls, file_name: str, output: EngineOutput) -> FileExpectedItems:
        """Build an instance from a file name and the ``data`` of an engine output."""
        extracted = output.data
        return cls(items=extracted, file=file_name)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class ExpectedResults(BaseModelWithYamlSupport):
    """Expected extraction output used to validate parser correctness.

    Stored as YAML alongside example data files.
    """

    version: int = 1
    data_path: Path | None = None
    files: list[FileExpectedItems]

    @field_serializer("data_path")
    def serialize_data_path(self, value: Path | None) -> str | None:
        """Serialize data_path as a string in YAML."""
        if value is None:
            return None
        return str(value)
|
spextract/models/yaml.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import TypeVar
|
|
3
|
+
|
|
4
|
+
import yaml
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
T = TypeVar("T", bound="BaseModelWithYamlSupport")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseModelWithYamlSupport(BaseModel):
    """BaseModel subclass that can be loaded from and saved to YAML files."""

    @classmethod
    def from_yaml_file(cls: type[T], file_path: Path) -> T:
        """Load a model instance from a YAML file.

        The file is read as UTF-8, parsed with ``yaml.safe_load`` and the
        result is validated by ``model_validate``.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        return cls.model_validate(data)

    def to_yaml_file(self, file_path: Path) -> None:
        """Save a model instance to a YAML file (UTF-8).

        The model is dumped in JSON mode so that values such as enums and
        paths become plain strings; ``yaml.safe_dump`` cannot represent those
        objects and would raise otherwise. Field order is kept and non-ASCII
        text is written unescaped, matching how the CLI renders results to
        stdout.
        """
        # Serialise before opening so a failure here cannot truncate an
        # existing file.
        payload = self.model_dump(mode="json")
        with open(file_path, "w", encoding="utf-8") as f:
            yaml.safe_dump(payload, f, sort_keys=False, allow_unicode=True)
|
spextract/processors.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Built-in field processors for transforming extracted values."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Callable, Literal, Union, overload
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ProcessorName(Enum):
    """Names of the built-in processors; each value is the string form of the name."""

    STRIP = "strip"
    TO_INT = "to_int"
    TO_FLOAT = "to_float"
    LOWERCASE = "lowercase"
    UPPERCASE = "uppercase"
    JOIN = "join"
    REGEX = "regex"
    SPLIT = "split"
    INDEX = "index"
    REPLACE = "replace"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def regex_extract(value: str, pattern: str) -> str:
    """Extract the first regex match from value. Returns empty string if no match.

    If the match captured its first group, that group is returned.
    Otherwise (no capturing groups, or the first group did not take part in
    the match) the full match is returned, so the result is always a string.
    """
    match = re.search(pattern, value)
    if not match:
        return ""
    if match.lastindex:
        # Group 1 can be None even though a later group matched, e.g.
        # ``(a)?(b)`` against "b"; fall through to the full match then.
        first_group = match.group(1)
        if first_group is not None:
            return first_group
    return match.group(0)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def split_string(value: str, separator: str = " ") -> list[str]:
    """Return the pieces of ``value`` obtained by splitting on ``separator``."""
    pieces = value.split(separator)
    return pieces
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def select_index(value: list[object], idx: str) -> object:
    """Return the element of ``value`` at position ``int(idx)``.

    Negative positions count from the end. ``None`` is returned when the
    position lies outside the list.
    """
    position = int(idx)
    size = len(value)
    if position < -size or position >= size:
        return None
    return value[position]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def join_strings(value: list[str], separator: str = " ") -> str:
    """Concatenate the strings in ``value``, placing ``separator`` between them.

    Raises:
        ValueError: if ``value`` is not a list, or contains a non-string element.
    """
    if not isinstance(value, list):
        raise ValueError(
            f"Expected a list of strings for join processor, got {type(value).__name__}, with value: {value!r}"
        )
    if any(not isinstance(element, str) for element in value):
        raise ValueError(f"Expected all elements to be strings for join processor, got: {value!r}")
    return separator.join(value)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Maps each processor name to the callable that implements it. Every callable
# receives the value first; parameterised processors take their arguments as
# further positional parameters.
PROCESSOR_REGISTRY: dict[ProcessorName, Callable[..., object]] = {
    # Non-string input yields an empty string rather than an error.
    ProcessorName.STRIP: lambda value: value.strip() if isinstance(value, str) else "",
    # Plain built-in conversions; they raise on input they cannot convert.
    ProcessorName.TO_INT: int,
    ProcessorName.TO_FLOAT: float,
    # No type guard here: the value must provide .lower() / .upper().
    ProcessorName.LOWERCASE: lambda value: value.lower(),
    ProcessorName.UPPERCASE: lambda value: value.upper(),
    ProcessorName.JOIN: join_strings,
    ProcessorName.REGEX: regex_extract,
    ProcessorName.SPLIT: split_string,
    ProcessorName.INDEX: select_index,
    # Non-string input is passed through unchanged.
    ProcessorName.REPLACE: lambda value, old, new: value.replace(old, new) if isinstance(value, str) else value,
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@overload
def apply_processor(name: Literal[ProcessorName.STRIP], value: str) -> str: ...
@overload
def apply_processor(name: Literal[ProcessorName.TO_INT], value: str) -> int: ...
@overload
def apply_processor(name: Literal[ProcessorName.TO_FLOAT], value: str) -> float: ...
@overload
def apply_processor(name: Literal[ProcessorName.LOWERCASE], value: str) -> str: ...
@overload
def apply_processor(name: Literal[ProcessorName.UPPERCASE], value: str) -> str: ...
@overload
def apply_processor(name: Literal[ProcessorName.JOIN], value: list[str], args: list[str] | None = None) -> str: ...
@overload
def apply_processor(name: Literal[ProcessorName.REGEX], value: str, args: list[str]) -> str: ...
@overload
def apply_processor(name: Literal[ProcessorName.SPLIT], value: str, args: list[str] | None = None) -> list[str]: ...
@overload
def apply_processor(name: Literal[ProcessorName.REPLACE], value: str, args: list[str]) -> str: ...
@overload
def apply_processor(
    name: Literal[ProcessorName.INDEX],
    value: list[object],
    args: list[Union[str, int]],
) -> object: ...
@overload
def apply_processor(name: ProcessorName, value: object, args: list[Union[str, int]] | None = None) -> object: ...


def apply_processor(
    name: ProcessorName, value: object, args: list[Union[str, int]] | list[str] | None = None
) -> object:
    """Look up the processor called ``name`` and run it on ``value``.

    ``args``, when non-empty, are passed to the processor as extra positional
    arguments after ``value``.

    Raises:
        ValueError: if ``name`` is not a valid ``ProcessorName`` value.
        KeyError: if the processor name is not registered.
    """
    key = ProcessorName(name)
    if key in PROCESSOR_REGISTRY:
        handler = PROCESSOR_REGISTRY[key]
        # An empty or missing argument list means "call with the value only".
        extra = tuple(args) if args else ()
        return handler(value, *extra)
    raise KeyError(f"Unknown processor: {name!r}. Available: {list(PROCESSOR_REGISTRY.keys())}")
|
spextract/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Literal, cast, overload
|
|
3
|
+
|
|
4
|
+
import lxml.etree
|
|
5
|
+
import lxml.html
|
|
6
|
+
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
7
|
+
from soupsieve import SelectorSyntaxError
|
|
8
|
+
|
|
9
|
+
_PSEUDO_RE = re.compile(r"::(text|attr\(([^)]+)\))\s*$")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _xpath_results_to_tags(results: list[object] | str) -> list[Tag] | list[str]:
    """Normalise an lxml XPath result into strings or BeautifulSoup Tags.

    A lone string is wrapped in a list; a list of strings is returned as is;
    a list of lxml elements is re-parsed element by element into BeautifulSoup
    nodes. Any other mix raises ``ValueError``.
    """
    if isinstance(results, str):
        return [results]

    if all(isinstance(item, str) for item in results):
        return cast(list[str], results)

    only_elements = all(
        isinstance(item, lxml.etree._Element) for item in results  # pylint: disable=protected-access
    )
    if not only_elements:
        raise ValueError(f"Expected all XPath results to be either strings or elements, but got: {results}")

    converted: list[Tag] | list[str] = []
    for element in results:
        # Round-trip through markup: serialise with lxml, re-parse with
        # BeautifulSoup, and keep the first top-level node if there is one.
        markup = lxml.html.tostring(element, encoding="unicode")  # type: ignore
        reparsed = BeautifulSoup(markup, "html.parser")
        if reparsed.contents:
            converted.append(reparsed.contents[0])  # type: ignore
    return converted
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _parse_selector(css: str) -> tuple[str, str | None]:
    """Separate a trailing ``::text`` / ``::attr(name)`` from a selector.

    Returns ``(base, mode)`` where ``mode`` is ``None`` when there is no such
    suffix, ``"text"`` for ``::text`` and ``"attr:<name>"`` for ``::attr(<name>)``.
    """
    found = _PSEUDO_RE.search(css)
    if found is None:
        return css, None

    prefix = css[: found.start()]
    pseudo, attr_name = found.group(1), found.group(2)
    mode = "text" if pseudo == "text" else f"attr:{attr_name}"
    return prefix, mode
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _select_css(node: Tag | BeautifulSoup, css: str) -> list[str] | list[Tag]:
    """Run a CSS selector against ``node``.

    With a ``::text`` suffix the direct string children of every matched tag
    are returned; with ``::attr(name)`` the attribute values of matched tags
    that have the attribute; without a suffix the matched tags themselves.
    """
    base, mode = _parse_selector(css)
    # An empty/blank base selector matches nothing.
    matched = node.select(base) if base.strip() else []

    if mode is None:
        return matched

    if mode == "text":
        texts: list[str] = []
        for tag in matched:
            texts.extend(
                str(child)
                for child in tag.children
                if isinstance(child, NavigableString) and not isinstance(child, Tag)
            )
        return texts

    if mode.startswith("attr:"):
        attr_name = mode[5:]
        attr_values: list[str] = []
        for tag in matched:
            raw = tag.get(attr_name)
            if raw is None:
                continue
            # List-valued attributes are flattened to a space-separated string.
            attr_values.append(" ".join(raw) if isinstance(raw, list) else str(raw))
        return attr_values

    return matched
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
) -> list[Tag] | list[str]: ...
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
    assert_tags: Literal[True],
    assert_strings: Literal[False] = False,
) -> list[Tag]: ...
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
    assert_tags: Literal[False] = False,
    assert_strings: Literal[True] = True,
) -> list[str]: ...
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
    *,
    as_strings: Literal[True] = True,
) -> list[str]: ...


def select(
    node: Tag | BeautifulSoup,
    selector: str,
    assert_tags: bool = False,
    assert_strings: bool = False,
    as_strings: bool = False,
) -> list[Tag] | list[str]:
    """Select from ``node`` using a CSS selector, falling back to XPath.

    The selector is first tried as CSS. If that raises ``SelectorSyntaxError``
    or ``NotImplementedError`` it is evaluated as XPath against an lxml parse
    of ``str(node)``.

    Args:
        assert_tags: raise ``ValueError`` unless every result is a Tag.
        assert_strings: raise ``ValueError`` unless every result is a string.
        as_strings: return ``str(result)`` for every result.
    """
    matches: list[Tag] | list[str]
    try:
        matches = _select_css(node, selector)
    except (SelectorSyntaxError, NotImplementedError):
        lxml_root = lxml.html.fromstring(str(node))
        raw = cast(list[object] | str, lxml_root.xpath(selector))
        matches = _xpath_results_to_tags(raw)

    if assert_tags and any(not isinstance(item, Tag) for item in matches):
        raise ValueError(f"Expected all results to be Tags, but got: {matches}")
    if assert_strings and any(not isinstance(item, str) for item in matches):
        raise ValueError(f"Expected all results to be strings, but got: {matches}")

    if not as_strings:
        return matches
    return [str(item) for item in matches]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .true_validate import (
|
|
2
|
+
validate_spec_against_expected,
|
|
3
|
+
validate_files,
|
|
4
|
+
validate_items_against_expected,
|
|
5
|
+
TrueValidationResult,
|
|
6
|
+
FileValidationResult,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"validate_spec_against_expected",
|
|
11
|
+
"validate_files",
|
|
12
|
+
"validate_items_against_expected",
|
|
13
|
+
"TrueValidationResult",
|
|
14
|
+
"FileValidationResult",
|
|
15
|
+
]
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Validation logic for checking if engine output matches the declared spec."""
|
|
3
|
+
|
|
4
|
+
from typing import Any, List
|
|
5
|
+
|
|
6
|
+
from ..models.output import DataValue
|
|
7
|
+
from ..models.parser_spec import FieldSpec, ParseSpec
|
|
8
|
+
from ..models.validation import SpecValidationResult, ValidationMismatch
|
|
9
|
+
from .validators import validate_value, type_name
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _expectation_for_field(field_spec: FieldSpec) -> str:
    """Return the type label used in mismatch reports for *field_spec*.

    A spec with nested ``fields`` is labelled ``object``; any other spec uses
    the ``value`` of its declared type. When ``multiple`` is set the label is
    wrapped as ``array<label>``.
    """
    label = "object" if field_spec.fields is not None else field_spec.type.value
    return f"array<{label}>" if field_spec.multiple else label
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _validate_field_recursive(name_path: str, field_spec: FieldSpec, value: Any) -> List[ValidationMismatch]:
    """Validate one field value against its spec, descending into nested specs.

    Args:
        name_path: Path of the field, used in reported mismatches.
        field_spec: Declaration the value must satisfy.
        value: Value found in the output; ``None`` means the field is absent.

    Returns:
        The mismatches found. A missing optional field yields none; a missing
        required field yields a single "missing" mismatch.
    """
    if value is None:
        if not field_spec.required:
            # Optional and absent: nothing to report.
            return []
        return [
            ValidationMismatch(
                field=name_path,
                expected_type=_expectation_for_field(field_spec),
                actual_type="missing",
                message=f"Field '{name_path}' missing in output.",
            )
        ]

    # Specs that declare child fields are validated structurally, the rest by value.
    validator = _validate_nested if field_spec.fields is not None else _validate_non_nested
    return validator(name_path, field_spec, value)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _validate_object_children(
    object_path: str, field_spec: FieldSpec, obj: dict[str, Any]
) -> List[ValidationMismatch]:
    """Validate every child declared on *field_spec* against the mapping *obj*."""
    child_specs = field_spec.fields
    assert child_specs is not None  # for mypy - callers only pass nested specs
    mismatches: List[ValidationMismatch] = []
    for child_name, child_spec in child_specs.items():
        mismatches.extend(
            _validate_field_recursive(f"{object_path}.{child_name}", child_spec, obj.get(child_name))
        )
    return mismatches


def _validate_nested(name_path: str, field_spec: FieldSpec, value: object) -> List[ValidationMismatch]:
    """Validate a value against a spec that declares nested ``fields``.

    Args:
        name_path: Path of the field, used in reported mismatches.
        field_spec: Nested spec (``fields`` is not ``None``).
        value: A dict, or a list of dicts when ``field_spec.multiple`` is set.

    Returns:
        The mismatches found. A wrong container type yields a single mismatch
        and stops descent. For lists, each item is checked under the path
        ``name_path[idx]`` so children are reported as ``name_path[idx].child``.
    """
    expected = _expectation_for_field(field_spec)
    actual = type_name(value)

    if not field_spec.multiple:
        # single nested object expected
        if not isinstance(value, dict):
            return [
                ValidationMismatch(
                    field=name_path,
                    expected_type=expected,
                    actual_type=actual,
                    message=f"Field '{name_path}' expected object, got {actual}",
                )
            ]
        return _validate_object_children(name_path, field_spec, value)

    if not isinstance(value, list):
        return [
            ValidationMismatch(
                field=name_path,
                expected_type=expected,
                actual_type=actual,
                message=f"Field '{name_path}' expected list of objects, got {actual}",
            )
        ]

    mismatches: List[ValidationMismatch] = []
    for idx, item in enumerate(value):
        # Keep the index in the path so mismatches from different items can be told apart.
        item_path = f"{name_path}[{idx}]"
        if not isinstance(item, dict):
            mismatches.append(
                ValidationMismatch(
                    field=item_path,
                    expected_type="object",
                    actual_type=type_name(item),
                    message=f"Expected object at '{item_path}', got {type_name(item)}",
                )
            )
            continue
        mismatches.extend(_validate_object_children(item_path, field_spec, item))
    return mismatches
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _validate_non_nested(name_path: str, field_spec: FieldSpec, value: object) -> List[ValidationMismatch]:
    """Validate a primitive value, or a list of primitives, against *field_spec*.

    Args:
        name_path: Path of the field, used in reported mismatches.
        field_spec: Spec without nested ``fields``; its ``type`` selects the check.
        value: The value to check; must be a list when ``field_spec.multiple`` is set.

    Returns:
        The mismatches found; list items are reported under ``name_path[idx]``.
    """
    base_type = field_spec.type

    if not field_spec.multiple:
        # single value expected
        problem = validate_value(name_path, base_type, value)
        return [] if problem is None else [problem]

    if not isinstance(value, list):
        actual = type_name(value)
        return [
            ValidationMismatch(
                field=name_path,
                expected_type=_expectation_for_field(field_spec),
                actual_type=actual,
                message=f"Field '{name_path}' expected list of {base_type.value}, got {actual}",
            )
        ]

    item_results = (validate_value(f"{name_path}[{idx}]", base_type, item) for idx, item in enumerate(value))
    return [problem for problem in item_results if problem is not None]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def validate_spec_output(spec: ParseSpec, data: dict[str, DataValue], raise_: bool = False) -> SpecValidationResult:
    """Check engine output *data* against the fields declared in *spec*.

    Args:
        spec: Spec whose top-level ``fields`` are checked one by one.
        data: Engine output keyed by field name; absent keys count as missing.
        raise_: When true, raise instead of returning an invalid result.

    Returns:
        A `SpecValidationResult` holding every mismatch found.

    Raises:
        ValueError: If *raise_* is set and the result is not valid.
    """
    collected: List[ValidationMismatch] = []
    for field_name, field_spec in spec.fields.items():
        collected += _validate_field_recursive(field_name, field_spec, data.get(field_name))

    outcome = SpecValidationResult(mismatches=collected)
    if not raise_ or outcome.is_valid:
        return outcome
    raise ValueError(f"Validation failed for output: {outcome}")
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import cast
|
|
4
|
+
|
|
5
|
+
from ..engine import ParseEngine
|
|
6
|
+
from ..models.output import DataValue
|
|
7
|
+
from ..models.parser_spec import ParseSpec
|
|
8
|
+
from ..models.validation import ExpectedResults, FileValidationResult, TrueValidationResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _path_matches_target(path: str, target: str | None) -> bool:
|
|
12
|
+
"""Return True if *path* matches, is a descendant of, or is an ancestor of *target*.
|
|
13
|
+
|
|
14
|
+
`items.name` matches `items[0].name`, `items[1].name`, etc.
|
|
15
|
+
`items[0].name` matches only `items[0].name`.
|
|
16
|
+
`items.meta` matches `items[0].meta.color` (target is ancestor of path).
|
|
17
|
+
`items` matches `items.meta` (path is ancestor of target — structural errors).
|
|
18
|
+
"""
|
|
19
|
+
if not target:
|
|
20
|
+
return True
|
|
21
|
+
target_parts = target.split(".")
|
|
22
|
+
path_parts = path.split(".")
|
|
23
|
+
# Compare the overlapping prefix
|
|
24
|
+
for target_part, path_part in zip(target_parts, path_parts):
|
|
25
|
+
if "[" in target_part:
|
|
26
|
+
if path_part != target_part:
|
|
27
|
+
return False
|
|
28
|
+
else:
|
|
29
|
+
bracket = path_part.find("[")
|
|
30
|
+
path_base = path_part[:bracket] if bracket != -1 else path_part
|
|
31
|
+
if path_base != target_part:
|
|
32
|
+
return False
|
|
33
|
+
return True
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _compare_values(
    actual: object,
    expected: DataValue,
    path: str,
    errors: list[str],
    target_field_path: str | None = None,
) -> int:
    """Recursively compare an actual parsed value against an expected value.

    Args:
        actual: Value produced by the parser at *path*.
        expected: Expected value; its type decides how the comparison is done.
        path: Dotted path of the value, used in error messages and for target filtering.
        errors: Output list; one message is appended per difference found.
        target_field_path: Optional path filter; values outside it are neither
            compared nor counted.

    Returns:
        The number of checks performed: one per compared leaf value (whether it
        matched or not) and one per reported structural problem.
    """
    in_target = _path_matches_target(path, target_field_path)

    if expected is None:
        if not in_target:
            return 0
        # Any falsy actual (None, "", [], {}, but also 0 and False) is accepted as "empty".
        if actual:
            errors.append(f"{path}: expected None/empty, got {actual!r}")
        return 1

    if isinstance(expected, str):
        if not in_target:
            return 0
        # Treat empty string and None as equivalent for convenience.
        missing_for_empty = expected == "" and actual is None
        if actual != expected and not missing_for_empty:
            errors.append(f"{path}: expected {expected!r}, got {actual!r}")
        return 1

    if isinstance(expected, dict):
        if not isinstance(actual, dict):
            if not in_target:
                return 0
            errors.append(f"{path}: expected dict for target field, got {type(actual).__name__}: {actual!r}")
            return 1
        # Always descend: a child may be in the target even when this path is not.
        return sum(
            _compare_values(actual.get(key), exp_val, f"{path}.{key}", errors, target_field_path)
            for key, exp_val in expected.items()
        )

    if isinstance(expected, list):
        exp_list = cast(list[DataValue], expected)
        if not isinstance(actual, list):
            if not in_target:
                return 0
            errors.append(f"{path}: expected list, got {type(actual).__name__}: {actual!r}")
            return 1
        if len(actual) != len(exp_list) and in_target:
            errors.append(f"{path}: expected {len(exp_list)} items, got {len(actual)}")
            return 1
        return sum(
            _compare_values(act_item, exp_item, f"{path}[{i}]", errors, target_field_path)
            for i, (act_item, exp_item) in enumerate(zip(actual, exp_list))
        )

    # Remaining scalars (numbers, booleans, ...): count the check whether or not it
    # matches, in line with the string branch above.
    if not in_target:
        return 0
    if actual != expected:
        errors.append(f"{path}: expected {expected!r}, got {actual!r}")
    return 1
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def validate_spec_against_expected(
    spec: ParseSpec,
    html: str,
    expected: dict[str, DataValue],
    field_path: str | None = None,
) -> FileValidationResult:
    """Parse *html* with *spec* and compare the parsed data against *expected*.

    Args:
        spec: Parser spec used to build the ``ParseEngine``.
        html: Document to parse.
        expected: Expected values keyed by top-level field name.
        field_path: Optional dotted path restricting the comparison. A leading
            ``fields.`` is dropped and ``[N]`` suffixes are ignored when the
            path is looked up in the spec.

    Returns:
        The result of ``validate_items_against_expected`` for the parsed data.

    Raises:
        ValueError: If *field_path* does not name a field declared in *spec*;
            this is checked before any parsing happens.
    """
    if field_path:
        field_path = field_path.removeprefix("fields.")

    if field_path is not None:
        # Walk the spec one segment at a time, descending through nested fields.
        scope = spec.fields
        for segment in field_path.split("."):
            bare_name = re.sub(r"\[\d+\]$", "", segment)
            if not (isinstance(scope, dict) and bare_name in scope):
                raise ValueError(f"Field path '{field_path}' does not exist in the spec.")
            children = scope[bare_name].fields
            # A leaf field has no children, so any further segment will fail the lookup.
            scope = {} if children is None else children

    parsed = ParseEngine(spec).parse(html).data
    return validate_items_against_expected(parsed, expected, field_path)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def validate_items_against_expected(
    items: dict[str, DataValue],
    expected: dict[str, DataValue],
    field_path: str | None = None,
) -> FileValidationResult:
    """Compare already-parsed *items* against *expected* values.

    No parsing happens here; every key of *expected* is looked up in *items*
    and compared recursively. Keys present only in *items* are ignored.

    Args:
        items: Parsed data keyed by field name; a falsy value is treated as empty.
        expected: Expected values keyed by field name.
        field_path: Optional dotted path restricting which values are compared.

    Returns:
        A `FileValidationResult` with an empty ``file_name``, the number of
        checks performed as ``item_count`` and one error message per difference.
    """
    errors: list[str] = []
    actual = items or {}
    item_count = sum(
        _compare_values(actual.get(key), exp_val, key, errors, field_path) for key, exp_val in expected.items()
    )
    return FileValidationResult(file_name="", item_count=item_count, errors=errors)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def validate_files(
    expected_values_path: Path,
    spec_file_path: Path,
    data_dir: Path | None = None,
    field_path: str | None = None,
    skip_unexpected_files: bool = False,
) -> TrueValidationResult:
    """Validate every ``*.html`` file in a data directory against expected values.

    Args:
        expected_values_path: Path to the expected values YAML file.
        spec_file_path: Path to the parser spec YAML file.
        data_dir: Override for the data directory. Defaults to the ``data_path``
            recorded in the expected values file. A relative directory is
            resolved against the parent directory of *expected_values_path*.
        field_path: Optional dot path to a specific field to validate.
        skip_unexpected_files: When true, HTML files without expected results
            are silently skipped instead of being reported as errors.

    Returns:
        A `TrueValidationResult` with one file result per reported HTML file, or
        a single error entry when the directory contains no HTML files.

    Raises:
        ValueError: If no data directory is given and the expected values file
            does not provide one.
    """
    expected_values = ExpectedResults.from_yaml_file(expected_values_path)
    spec = ParseSpec.from_yaml_file(spec_file_path)

    if data_dir is None:
        data_dir = expected_values.data_path

    if data_dir is None:
        raise ValueError(
            "Data path not specified. Either include 'data_path' in the expected values YAML or provide --data-dir."
        )

    if not data_dir.is_absolute():
        data_dir = expected_values_path.parent / data_dir

    expected_by_file: dict[str, dict[str, DataValue]] = {fe.file: fe.items for fe in expected_values.files}

    html_files = sorted(data_dir.glob("*.html"))
    result = TrueValidationResult()

    if not html_files:
        result.file_results.append(
            FileValidationResult(
                file_name=str(data_dir),
                item_count=0,
                errors=[f"No HTML files found in {data_dir}"],
            )
        )
        return result

    for html_file in html_files:
        file_expected = expected_by_file.get(html_file.name)
        if not file_expected:
            if not skip_unexpected_files:
                result.file_results.append(
                    FileValidationResult(
                        file_name=html_file.name,
                        item_count=0,
                        errors=[f"No expected results defined for {html_file.name}"],
                    )
                )
            continue

        # Read the document only once we know it will actually be validated.
        html = html_file.read_text(encoding="utf-8")
        file_result = validate_spec_against_expected(spec, html, file_expected, field_path=field_path)
        # The per-document result carries no file name, so re-wrap it with this file's name.
        result.file_results.append(
            FileValidationResult(
                file_name=html_file.name,
                item_count=file_result.item_count,
                errors=file_result.errors,
            )
        )

    return result
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Validators for primitive (non-nested) field values.
|
|
2
|
+
Factored out from validation logic so tests and code can reuse semantic checks.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
|
|
9
|
+
from ..models.parser_spec import FieldType
|
|
10
|
+
from ..models.validation import ValidationMismatch
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def type_name(value: Any) -> str:
    """Return the label used in validation messages for the type of *value*.

    ``None`` is "missing", dicts are "object", lists are "array", ints and
    floats are "number" and strings are "string". Anything else, including
    booleans, is reported under its Python type name.
    """
    if value is None:
        return "missing"
    if isinstance(value, dict):
        return "object"
    if isinstance(value, list):
        return "array"
    # bool is a subclass of int; exclude it so True/False are not labelled "number".
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return "number"
    if isinstance(value, str):
        return "string"
    return type(value).__name__
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def validate_text(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Check that *value* is a string.

    Returns:
        ``None`` when *value* is a ``str``, otherwise a mismatch for *name_path*.
    """
    if isinstance(value, str):
        return None
    found = type_name(value)
    return ValidationMismatch(
        field=name_path,
        expected_type="text",
        actual_type=found,
        message=f"Field '{name_path}' expected text, got {found}",
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def validate_link(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Check that *value* is a string that looks like a link.

    A string is accepted when it starts with ``http://``, ``https://`` or ``/``.

    Returns:
        ``None`` when the value is accepted, otherwise a mismatch for *name_path*.
    """
    if not isinstance(value, str):
        found = type_name(value)
        return ValidationMismatch(
            field=name_path,
            expected_type="link",
            actual_type=found,
            message=f"Field '{name_path}' expected link string, got {found}",
        )

    if value.startswith(("http://", "https://", "/")):
        return None

    return ValidationMismatch(
        field=name_path,
        expected_type="link",
        actual_type="string",
        message=f"Field '{name_path}' does not look like a link: '{value}'",
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def validate_number(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Check that *value* is an ``int`` or ``float`` (booleans are rejected).

    Returns:
        ``None`` when the value is numeric, otherwise a mismatch for *name_path*.
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    is_bool = isinstance(value, bool)
    if is_bool or not isinstance(value, (int, float)):
        found = "bool" if is_bool else type_name(value)
        return ValidationMismatch(
            field=name_path,
            expected_type="number",
            actual_type=found,
            message=f"Field '{name_path}' expected number, got {found}",
        )
    return None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def validate_date(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Check that *value* is a string accepted by ``datetime.fromisoformat``.

    NOTE(review): the set of ISO 8601 forms that ``fromisoformat`` accepts
    depends on the running Python version — confirm against the supported range.

    Returns:
        ``None`` when the value parses, otherwise a mismatch for *name_path*.
    """
    if not isinstance(value, str):
        return ValidationMismatch(
            field=name_path,
            expected_type="date",
            actual_type=type_name(value),
            message=f"Field '{name_path}' expected date string, got {type_name(value)}",
        )
    try:
        datetime.fromisoformat(value)
    except ValueError:
        # value is known to be a str here, so an unparseable date is the only failure to expect.
        return ValidationMismatch(
            field=name_path,
            expected_type="date",
            actual_type="string",
            message=f"Field '{name_path}' is not a valid ISO date: '{value}'",
        )
    return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def validate_value(name_path: str, base_type: FieldType, value: Any) -> Optional[ValidationMismatch]:
    """Run the validator that belongs to `base_type` on *value*.

    ``None`` is always reported as missing. Types without a dedicated
    validator are checked as text.

    Returns a `ValidationMismatch` if invalid, or None if valid.
    """
    if value is None:
        return ValidationMismatch(
            field=name_path,
            expected_type=base_type.value,
            actual_type="missing",
            message=f"Field '{name_path}' is missing",
        )

    dispatch = (
        (FieldType.TEXT, validate_text),
        (FieldType.LINK, validate_link),
        (FieldType.NUMBER, validate_number),
        (FieldType.DATE, validate_date),
    )
    for field_type, validator in dispatch:
        if base_type == field_type:
            return validator(name_path, value)

    # Fallback: treat as text
    return validate_text(name_path, value)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright © 2026 Vincent Lonij
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: spextract
|
|
3
|
+
Version: 0.5.16
|
|
4
|
+
Summary: A declarative html scraper for python
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Vincent Lonij
|
|
7
|
+
Author-email: 29819815+vincentropy@users.noreply.github.com
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Dist: beautifulsoup4 (>=4.14.3,<5.0.0)
|
|
16
|
+
Requires-Dist: click (>=8.3.2,<9.0.0)
|
|
17
|
+
Requires-Dist: lxml (>=6.0.4,<7.0.0)
|
|
18
|
+
Requires-Dist: pydantic (>=2.12.5,<3.0.0)
|
|
19
|
+
Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# A Declarative HTML Scraper for Python
|
|
23
|
+
|
|
24
|
+
This package provides a simple way to declare what data should be extracted from an HTML document in a configuration file.
|
|
25
|
+
|
|
26
|
+
This enables sharing of scraping logic across projects and teams without the risk of executing untrusted code. It also allows for easier maintenance and updates to scraping logic without needing to modify the underlying codebase.
|
|
27
|
+
|
|
28
|
+
## CLI
|
|
29
|
+
|
|
30
|
+
The package includes a Click-based CLI with two commands:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
decs parse spec.yaml <path to html file or directory>
|
|
34
|
+
decs validate spec.yaml expected-results.yaml
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
`parse` emits YAML in the same expected-results format used by `validate`, so you can capture known-good output and re-run validation later.
|
|
38
|
+
|
|
39
|
+
## How to use
|
|
40
|
+
|
|
41
|
+
### Build a configuration file
|
|
42
|
+
|
|
43
|
+
You can write a configuration file with the provided ParserSpec class.
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import py_decs
|
|
47
|
+
|
|
48
|
+
spec = py_decs.ParserSpec(
|
|
49
|
+
name="example_parser",
|
|
50
|
+
description="An example parser for demonstration purposes.",
|
|
51
|
+
fields=[
|
|
52
|
+
py_decs.FieldSpec(
|
|
53
|
+
name="title",
|
|
54
|
+
selector="h1.title::text",
|
|
55
|
+
type=py_decs.FieldType.TEXT,
|
|
56
|
+
),
|
|
57
|
+
py_decs.FieldSpec(
|
|
58
|
+
name="links",
|
|
59
|
+
selector="a.link::attr(href)",
|
|
60
|
+
type=py_decs.FieldType.LINK,
|
|
61
|
+
multiple=True,
|
|
62
|
+
),
|
|
63
|
+
py_decs.FieldSpec(
|
|
64
|
+
name="author",
|
|
65
|
+
selector="div.author",
|
|
66
|
+
type=py_decs.FieldType.OBJECT,
|
|
67
|
+
fields=[
|
|
68
|
+
py_decs.FieldSpec(
|
|
69
|
+
name="name",
|
|
70
|
+
selector="span.name::text",
|
|
71
|
+
type=py_decs.FieldType.TEXT,
|
|
72
|
+
),
|
|
73
|
+
py_decs.FieldSpec(
|
|
74
|
+
name="profile_url",
|
|
75
|
+
selector="a.profile::attr(href)",
|
|
76
|
+
type=py_decs.FieldType.LINK,
|
|
77
|
+
),
|
|
78
|
+
]
|
|
79
|
+
),
|
|
80
|
+
]
|
|
81
|
+
)
|
|
82
|
+
```
|
|
83
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
spextract/__init__.py,sha256=4yowBnR5WkwlRe0_Ad27TkKTPmVa4mGouVudMctyON8,547
|
|
2
|
+
spextract/cli.py,sha256=bOSDgqgnT3N1bQwKNLg8LtcsSSzWGwHNfqyH6_gjhXI,3088
|
|
3
|
+
spextract/engine.py,sha256=yIH26wYiQPsVzhc39AZxJX83MBl4FetA6olXLV2L3f0,6979
|
|
4
|
+
spextract/models/__init__.py,sha256=QeHguokPrbZ75TVEBfCueq1ASo2_0rN8JTsFKyjDHqI,479
|
|
5
|
+
spextract/models/output.py,sha256=T-zygB2x5eUhXWvkR_eW5hfEDfWAA43kDPl-mbMNDdk,569
|
|
6
|
+
spextract/models/parser_spec.py,sha256=pZbFnEnBtQdw34OrY0TvPreRIJoCI3Q5ZGvyX71rVog,2936
|
|
7
|
+
spextract/models/validation.py,sha256=1_K5y94CKRqSMwZBysTDbV4njfI3x57j_9g23Si3S8k,2163
|
|
8
|
+
spextract/models/yaml.py,sha256=xjSi3dDehF00X97nZDx1TIHJMTvGg9iVyMD8c-hi320,744
|
|
9
|
+
spextract/processors.py,sha256=y41nSSWy8ch3gzlDifcGlGXLp-jHzLihlT-_rLGdyIU,4067
|
|
10
|
+
spextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
spextract/uni_selector.py,sha256=R4q2gVxoJAlyJ7FbgqPAiMQSGOSsqbCjfPjAlUqux1w,3956
|
|
12
|
+
spextract/validation/__init__.py,sha256=k13bHZzZAQ0zjXrp-gz98zOyBcfREPAH2_O5thTbIu8,346
|
|
13
|
+
spextract/validation/spec_validate.py,sha256=GWyDjqx0ElDlqJkNrqQm6ys8yXaS6E1SFwTuZPEZRdk,5388
|
|
14
|
+
spextract/validation/true_validate.py,sha256=H8c_lBZ9ClxCnqnaOZjM6nYoClFa6H9VB8h4fmVFDkE,7991
|
|
15
|
+
spextract/validation/validators.py,sha256=FXeejkc-IQnsQxrsldz2f6nWHp6DFjFVBPbd0jF5HHE,3647
|
|
16
|
+
spextract-0.5.16.dist-info/LICENSE.md,sha256=fboAcycaR0hzaSgMDYqsxxyyF1Yq3Vt7R28krR1lgXc,1064
|
|
17
|
+
spextract-0.5.16.dist-info/METADATA,sha256=wuSrhbRRpMYGp0NaskcreQYqfaf8fw_uNJra6cBJpn0,2653
|
|
18
|
+
spextract-0.5.16.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
19
|
+
spextract-0.5.16.dist-info/entry_points.txt,sha256=uzZaXL_t9zdWp3Y-P8wkZhSJmjfE7jrw8mJJmQN3-z4,40
|
|
20
|
+
spextract-0.5.16.dist-info/RECORD,,
|