spextract 0.5.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spextract/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ from .models import (
2
+ ParseSpec,
3
+ FieldSpec,
4
+ FieldType,
5
+ ProcessorSpec,
6
+ EngineOutput,
7
+ SpecValidationResult,
8
+ ValidationMismatch,
9
+ ExpectedResults,
10
+ FileExpectedItems,
11
+ )
12
+ from .engine import ParseEngine
13
+ from .validation.spec_validate import validate_spec_output
14
+
15
+ __all__ = [
16
+ "FieldSpec",
17
+ "FieldType",
18
+ "ParseEngine",
19
+ "ParseSpec",
20
+ "ProcessorSpec",
21
+ "EngineOutput",
22
+ "SpecValidationResult",
23
+ "ValidationMismatch",
24
+ "ExpectedResults",
25
+ "FileExpectedItems",
26
+ "validate_spec_output",
27
+ ]
spextract/cli.py ADDED
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import click
6
+ import yaml
7
+
8
+ from .engine import ParseEngine
9
+ from .models.output import EngineOutput
10
+ from .models.parser_spec import ParseSpec
11
+ from .models.validation import ExpectedResults, FileExpectedItems
12
+ from .validation.true_validate import validate_files
13
+
14
+
15
# Root command group of the CLI; subcommands register themselves on it
# through the ``@cli.command()`` decorator.
@click.group()
def cli() -> None:
    """Declarative scraper utilities."""
18
+
19
+
20
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("html_path", type=click.Path(exists=True, dir_okay=True, path_type=Path))
@click.option("--output", "output_path", type=click.Path(dir_okay=False, path_type=Path))
def parse(spec_path: Path, html_path: Path, output_path: Path | None) -> None:
    """Apply a spec to one or more HTML files."""

    engine = ParseEngine(ParseSpec.from_yaml_file(spec_path))

    # A single file is parsed on its own; a directory contributes its direct
    # ``*.html`` children in sorted order.
    sources = [html_path] if html_path.is_file() else sorted(html_path.glob("*.html"))

    # Outputs are keyed by bare file name, in the order the files were parsed.
    outputs: dict[str, EngineOutput] = {
        source.name: engine.parse(source.read_text(encoding="utf-8")) for source in sources
    }

    expected = ExpectedResults(
        data_path=html_path if html_path.is_dir() else html_path.parent,
        files=[FileExpectedItems.from_engine_output(name, output) for name, output in outputs.items()],
    )

    if output_path is None:
        # No --output given: render the results as YAML on stdout.
        click.echo(yaml.safe_dump(expected.model_dump(), sort_keys=False, allow_unicode=True), nl=False)
        return

    expected.to_yaml_file(output_path)
    click.echo(f"Wrote parsed results to {output_path}")
52
+
53
+
54
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("expected_results_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--field-path", "-f", type=str, help="Optional dot path to a specific field to validate (e.g. 'items.name')."
)
def validate(spec_path: Path, expected_results_path: Path, field_path: str | None) -> None:
    """Validate a spec against YAML expected extraction results."""

    outcome = validate_files(
        expected_values_path=expected_results_path,
        spec_file_path=spec_path,
        field_path=field_path,
    )

    if outcome.passed:
        click.echo(
            f"Validation passed for {outcome.total_files} file(s) and {outcome.total_items} extracted field(s)."
        )
        return

    # At least one file failed: print a per-file report, then exit through a
    # ClickException carrying the failure summary.
    for per_file in outcome.file_results:
        click.echo(f"{per_file.file_name}:")
        if not per_file.passed:
            for problem in per_file.errors:
                click.echo(f" - {problem}")
        else:
            click.echo(f" Passed. Values checked: {per_file.item_count}")

    raise click.ClickException(f"Validation failed for {outcome.failures} of {outcome.total_files} file(s).")
spextract/engine.py ADDED
@@ -0,0 +1,152 @@
1
+ """Core extraction engine that applies a ParseSpec to HTML content."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import cast
6
+
7
+ from bs4 import BeautifulSoup, Tag
8
+
9
+ from spextract.models.validation import SpecValidationResult
10
+
11
+ from .models import EngineOutput, FieldSpec, ParseSpec, ProcessorSpec, DataValue
12
+ from .processors import apply_processor
13
+ from .uni_selector import select
14
+
15
+
16
class ParseEngine:
    """Applies a ParseSpec to HTML content and returns extracted items."""

    def __init__(self, spec: ParseSpec) -> None:
        """Remember the spec that every later parse/validate call applies."""
        self.spec = spec

    def parse(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse HTML string and return EngineOutput with typed fields.

        Args:
            html: Raw HTML content to parse.
            field_path: Optional dot-separated path (e.g. ``"person.name"``) to
                extract only a single field instead of the full spec. Each
                segment must match a key in the ``fields`` dict at that level of
                nesting. Raises ``KeyError`` if a segment is not found or if a
                non-leaf segment has no child fields.
        """
        root = BeautifulSoup(html, "html.parser")
        if field_path is not None:
            data = self._extract_field_path(root, self.spec.fields, field_path.split("."))
        else:
            data = self._extract_fields(root, self.spec.fields)
        # Both extraction helpers always return a dict, so no None fallback is needed.
        return EngineOutput(spec=self.spec, data=data)

    def validate(self, html: str, field_path: str | None = None) -> SpecValidationResult:
        """Validate HTML against spec, returning SpecValidationResult with details.

        NOTE(review): when ``field_path`` is given, only that field is present in
        the parsed data, yet the data is checked against every top-level field of
        the spec, so other required fields are reported as missing -- confirm
        this is intended.
        """
        return self._validate_output(self.parse(html, field_path=field_path))

    def parse_and_validate(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse HTML and validate the output against the spec.

        Returns the EngineOutput when validation succeeds.

        Raises:
            ValueError: if validation fails; the message carries the first mismatch.
        """
        output = self.parse(html, field_path=field_path)
        # Validate the data that was just extracted instead of parsing the same
        # HTML a second time.
        validation_result = self._validate_output(output)
        if not validation_result.is_valid:
            raise ValueError(f"Validation failed: {validation_result.mismatches[0]}")
        return output

    def _validate_output(self, output: EngineOutput) -> SpecValidationResult:
        """Check already-extracted data against ``self.spec`` without raising."""
        from .validation.spec_validate import validate_spec_output  # pylint: disable=import-outside-toplevel

        return validate_spec_output(self.spec, output.data, raise_=False)

    @staticmethod
    def _warn_multiple_matches(selector: str) -> None:
        """Report that a single-valued field matched several elements.

        Uses logging rather than ``print`` so the message cannot end up inside
        results that callers write to stdout.
        """
        import logging  # pylint: disable=import-outside-toplevel

        logging.getLogger(__name__).warning(
            "Multiple elements matched for single field: %s. Using first match.", selector
        )

    @staticmethod
    def _extract_field_path(
        node: Tag | BeautifulSoup,
        fields: dict[str, FieldSpec],
        path: list[str],
    ) -> dict[str, DataValue]:
        """Navigate ``fields`` and the HTML tree simultaneously following ``path``.

        Returns a ``{leaf_name: value}`` dict containing only the targeted field.
        Raises ``KeyError`` if any path segment is missing or if a non-leaf
        segment has no child ``fields``.
        """
        name, *rest = path
        if name not in fields:
            raise KeyError(f"Field {name!r} not found. Available: {list(fields.keys())}")
        field_spec = fields[name]

        if not rest:
            # Leaf: extract just this field.
            value = ParseEngine._extract_field(node, field_spec)
            return {name: value} if value is not None else {}

        # Intermediate: must have child fields and a selector that yields tags.
        if field_spec.fields is None:
            raise KeyError(f"Field {name!r} has no child fields; cannot navigate to {'.'.join(rest)!r}")
        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return {}

        if field_spec.multiple:
            items = [ParseEngine._extract_field_path(sub, field_spec.fields, rest) for sub in sub_nodes]
            return cast(dict[str, DataValue], {name: cast(DataValue, items)})

        inner = ParseEngine._extract_field_path(sub_nodes[0], field_spec.fields, rest)
        return cast(dict[str, DataValue], {name: cast(DataValue, inner)})

    @staticmethod
    def _extract_fields(node: Tag | BeautifulSoup, fields: dict[str, FieldSpec]) -> dict[str, DataValue]:
        """Extract every field in ``fields`` relative to ``node``, omitting fields that yield None."""
        result: dict[str, DataValue] = {}
        for name, field_spec in fields.items():
            field_out = ParseEngine._extract_field(node, field_spec)
            if field_out is not None:
                result[name] = field_out
        return result

    @staticmethod
    def _extract_field(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract one field: a nested object, a list of values, or a single value.

        Returns ``[]`` (multiple) or ``None`` (single) when the selector matches nothing.
        """
        if field_spec.fields is not None:
            # If this field has child fields, extraction is slightly different:
            # we ignore ::text / ::attr on the parent selector.
            return ParseEngine._extract_nested(node, field_spec)

        values = select(node, field_spec.selector)
        if not values:
            return [] if field_spec.multiple else None

        # Resolve once; the processor list is the same for every matched value.
        processors = field_spec.resolved_processors()
        if field_spec.multiple:
            return cast(DataValue, [ParseEngine.apply_processors(v, processors) for v in values])

        if all(isinstance(v, str) for v in values):
            # Several text fragments for a single-valued field are concatenated.
            values = ["".join(cast(list[str], values))]
        if len(values) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine.apply_processors(values[0], processors)

    @staticmethod
    def _extract_nested(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract a field with child fields, applying child selectors relative to parent elements.

        In this case we ignore ::text / ::attr on the parent selector since it doesn't make sense
        to apply these to a parent element that we're extracting child fields from.
        """
        assert field_spec.fields is not None

        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return [] if field_spec.multiple else None

        if field_spec.multiple:
            # One dict of extracted child fields per matched parent element.
            return [ParseEngine._extract_fields(sub, field_spec.fields) for sub in sub_nodes]

        if len(sub_nodes) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine._extract_fields(sub_nodes[0], field_spec.fields)

    @staticmethod
    def apply_processors(value: object, processors: list[ProcessorSpec]) -> str | float | None:
        """Run ``processors`` over ``value`` in order and return the final value.

        ``None`` input is returned untouched. The final value must be ``None``,
        a string or a number; integers (e.g. produced by ``to_int``) are
        accepted alongside floats.

        Raises:
            ValueError: if the final value is of any other type.
        """
        if value is None:
            return value
        for proc in processors:
            value = apply_processor(proc.name, value, proc.args if proc.args else None)
        # ``int`` is listed explicitly: isinstance(5, float) is False, so checking
        # only (str, float) would reject every integer result.
        if value is None or isinstance(value, (str, int, float)):
            return value
        raise ValueError(f"Unsupported value type after processing: {type(value)}")
@@ -0,0 +1,17 @@
1
+ from .parser_spec import FieldSpec, FieldType, ParseSpec, ProcessorName, ProcessorSpec
2
+ from .output import DataValue, EngineOutput
3
+ from .validation import SpecValidationResult, ValidationMismatch, ExpectedResults, FileExpectedItems
4
+
5
+ __all__ = [
6
+ "FieldSpec",
7
+ "FieldType",
8
+ "ParseSpec",
9
+ "ProcessorName",
10
+ "ProcessorSpec",
11
+ "DataValue",
12
+ "EngineOutput",
13
+ "SpecValidationResult",
14
+ "ValidationMismatch",
15
+ "ExpectedResults",
16
+ "FileExpectedItems",
17
+ ]
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Union # pylint: disable=unused-import
4
+ from typing_extensions import TypeAliasType
5
+
6
+ from .yaml import BaseModelWithYamlSupport
7
+
8
+ from .parser_spec import ParseSpec
9
+
10
# Recursive alias for any value held in extracted data: ``None``, a float, a
# string, a dict of string keys to further values, or a list of further values.
# The union is written as a string so that the alias can refer to itself.
DataValue = TypeAliasType(
    "DataValue",
    "Union[None, float, str, dict[str, DataValue], list[DataValue]]",
)
14
+
15
+
16
class EngineOutput(BaseModelWithYamlSupport):
    """Output from the scraping engine for a single file,
    including the parser spec used and the extracted data."""

    # Spec that produced ``data``; optional, defaults to None.
    spec: ParseSpec | None = None
    # Extracted values keyed by field name.
    data: dict[str, DataValue]
@@ -0,0 +1,85 @@
1
+ """
2
+ Pydantic models for the declarative scraper.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import enum
8
+ from typing import Union
9
+
10
+ from pydantic import Field
11
+
12
+ from .yaml import BaseModelWithYamlSupport
13
+ from ..processors import ProcessorName
14
+
15
+
16
class ProcessorSpec(BaseModelWithYamlSupport):
    """A single field processor.

    Can be a simple named processor (e.g. "strip") or a parameterised one, eg a regex.
    """

    # Which built-in processor to run.
    name: ProcessorName
    # Extra arguments for the processor; empty for simple named processors.
    args: list[Union[str, int]] = Field(default_factory=list)
24
+
25
+
26
class FieldType(enum.Enum):
    """Type of field to extract, used to determine how to extract it from HTML."""

    # The string values are what a spec uses to name a type.
    TEXT = "text"
    LINK = "link"
    NUMBER = "number"
    DATE = "date"
    OBJECT = "object"
34
+
35
+
36
class FieldSpec(BaseModelWithYamlSupport):
    """Specification for extracting a single field from HTML."""

    selector: str = Field(
        description="""
        CSS selector or XPATH selector for the field.
        If the fields attribute is not None, this selector should return a parent element or a list of parent elements.
        The selector attribute of the child fields will be applied relative to each parent element.
        """
    )
    type: FieldType = Field(
        default=FieldType.TEXT,
        description="Type of field to extract, used to validate extraction.",
    )
    required: bool = Field(default=True, description="Whether this field is required. Used for validation.")

    multiple: bool = Field(
        default=False,
        description="Whether to extract multiple values from this field (i.e. return a list).",
    )
    processors: list[Union[ProcessorName, dict[ProcessorName, list[Union[str, int]]]]] = Field(
        default_factory=list,
        description="List of processors to apply to the extracted value(s). Each processor can be a string (processor name) or a dict mapping processor name to argument list. These are applied in order.",
    )
    fields: dict[str, "FieldSpec"] | None = Field(
        default=None,
        description="Child fields to extract from the element(s) selected by this field. \
            Keys in this dict will be keys in the output data.",
    )

    def resolved_processors(self) -> list[ProcessorSpec]:
        """Return ``processors`` as a flat, ordered list of ProcessorSpec objects.

        A bare name becomes a ProcessorSpec without arguments; a dict yields one
        ProcessorSpec per ``name -> argument list`` entry.

        Raises:
            ValueError: if an entry is neither a name nor a dict.
        """
        specs: list[ProcessorSpec] = []
        for entry in self.processors:
            if isinstance(entry, dict):
                specs.extend(ProcessorSpec(name=key, args=params) for key, params in entry.items())
            elif isinstance(entry, (str, ProcessorName)):
                specs.append(ProcessorSpec(name=entry))
            else:
                raise ValueError(f"Invalid processor spec: {entry}")
        return specs
78
+
79
+
80
class ParseSpec(BaseModelWithYamlSupport):
    """Top-level declarative parser specification."""

    # Spec format version; defaults to 1.
    version: int = 1
    # Name of the spec.
    name: str
    # Top-level fields to extract; each key becomes a key of the output data.
    fields: dict[str, FieldSpec]
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from pydantic import BaseModel, Field, field_serializer
6
+
7
+ from .output import DataValue, EngineOutput
8
+ from .yaml import BaseModelWithYamlSupport
9
+
10
+
11
class ValidationMismatch(BaseModel):
    # One discrepancy between extracted data and what the spec declares.

    # Path of the offending field.
    field: str
    # Type the spec expects for that field.
    expected_type: str
    # Type that was actually found.
    actual_type: str
    # Human-readable description of the mismatch.
    message: str
16
+
17
+
18
class SpecValidationResult(BaseModel):
    # Collected outcome of checking extracted data against a spec.

    mismatches: list[ValidationMismatch] = Field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when no mismatch was recorded."""
        return not self.mismatches
24
+
25
+
26
class FileValidationResult(BaseModel):
    """Validation result for a single HTML file."""

    file_name: str
    item_count: int
    errors: list[str] = Field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when the file produced no errors."""
        return len(self.errors) == 0
36
+
37
+
38
class TrueValidationResult(BaseModel):
    """Aggregate validation result across all files."""

    file_results: list[FileValidationResult] = Field(default_factory=list)

    @property
    def total_files(self) -> int:
        """Number of per-file results held."""
        return len(self.file_results)

    @property
    def total_items(self) -> int:
        """Sum of ``item_count`` over all files."""
        checked = 0
        for entry in self.file_results:
            checked += entry.item_count
        return checked

    @property
    def failures(self) -> int:
        """Number of files that did not pass."""
        return len([entry for entry in self.file_results if not entry.passed])

    @property
    def passed(self) -> bool:
        """True when no file failed."""
        return not self.failures
58
+
59
+
60
class FileExpectedItems(BaseModel):
    """Expected extraction results for a single example file."""

    # Name of the example file these items belong to.
    file: str
    # Expected values keyed by field name.
    items: dict[str, DataValue]

    @classmethod
    def from_engine_output(cls, file_name: str, output: EngineOutput) -> FileExpectedItems:
        """Build an instance from ``file_name`` and the ``data`` of an engine output."""
        return cls(file=file_name, items=output.data)
69
+
70
+
71
class ExpectedResults(BaseModelWithYamlSupport):
    """Expected extraction output used to validate parser correctness.

    Stored as YAML alongside example data files.
    """

    version: int = 1
    data_path: Path | None = None
    files: list[FileExpectedItems]

    @field_serializer("data_path")
    def serialize_data_path(self, value: Path | None) -> str | None:
        """Emit ``data_path`` as a plain string, keeping ``None`` as ``None``."""
        if value is None:
            return None
        return str(value)
@@ -0,0 +1,23 @@
1
+ from pathlib import Path
2
+ from typing import TypeVar
3
+
4
+ import yaml
5
+ from pydantic import BaseModel
6
+
7
+ T = TypeVar("T", bound="BaseModelWithYamlSupport")
8
+
9
+
10
class BaseModelWithYamlSupport(BaseModel):
    """BaseModel subclass that can be loaded from and saved to YAML files."""

    @classmethod
    def from_yaml_file(cls: type[T], file_path: Path) -> T:
        """Load and validate a model instance from a YAML file.

        Args:
            file_path: Path of the UTF-8 encoded YAML file to read.

        Returns:
            An instance of the class the method was called on.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        return cls.model_validate(data)

    def to_yaml_file(self, file_path: Path) -> None:
        """Save this model instance to a YAML file.

        Args:
            file_path: Destination path; written as UTF-8.
        """
        # Dump in JSON mode so Enum members and other rich types are reduced to
        # plain scalars; ``yaml.safe_dump`` cannot represent Enum objects.
        # Serialising before opening also leaves an existing file untouched if
        # the dump fails.
        payload = self.model_dump(mode="json")
        with open(file_path, "w", encoding="utf-8") as f:
            # Keep field declaration order and write non-ASCII text verbatim.
            yaml.safe_dump(payload, f, sort_keys=False, allow_unicode=True)
@@ -0,0 +1,112 @@
1
+ """Built-in field processors for transforming extracted values."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from enum import Enum
7
+ from typing import Callable, Literal, Union, overload
8
+
9
+
10
+ class ProcessorName(Enum):
11
+ STRIP = "strip"
12
+ TO_INT = "to_int"
13
+ TO_FLOAT = "to_float"
14
+ LOWERCASE = "lowercase"
15
+ UPPERCASE = "uppercase"
16
+ JOIN = "join"
17
+ REGEX = "regex"
18
+ SPLIT = "split"
19
+ INDEX = "index"
20
+ REPLACE = "replace"
21
+
22
+
23
+ def regex_extract(value: str, pattern: str) -> str:
24
+ """Extract the first regex match from value. Returns empty string if no match.
25
+
26
+ If the pattern contains capturing groups, returns the first captured group.
27
+ Otherwise, returns the full match.
28
+ """
29
+ match = re.search(pattern, value)
30
+ if not match:
31
+ return ""
32
+ return match.group(1) if match.lastindex else match.group(0)
33
+
34
+
35
+ def split_string(value: str, separator: str = " ") -> list[str]:
36
+ """Split a string by the given separator."""
37
+ return value.split(separator)
38
+
39
+
40
+ def select_index(value: list[object], idx: str) -> object:
41
+ """Select an element from a list by index."""
42
+ int_idx = int(idx)
43
+ return value[int_idx] if -len(value) <= int_idx < len(value) else None
44
+
45
+
46
+ def join_strings(value: list[str], separator: str = " ") -> str:
47
+ """Join a list of strings with the given separator."""
48
+ if not isinstance(value, list):
49
+ raise ValueError(
50
+ f"Expected a list of strings for join processor, got {type(value).__name__}, with value: {value!r}"
51
+ )
52
+ if not all(isinstance(v, str) for v in value):
53
+ raise ValueError(f"Expected all elements to be strings for join processor, got: {value!r}")
54
+ return separator.join(value)
55
+
56
+
57
+ PROCESSOR_REGISTRY: dict[ProcessorName, Callable[..., object]] = {
58
+ ProcessorName.STRIP: lambda value: value.strip() if isinstance(value, str) else "",
59
+ ProcessorName.TO_INT: int,
60
+ ProcessorName.TO_FLOAT: float,
61
+ ProcessorName.LOWERCASE: lambda value: value.lower(),
62
+ ProcessorName.UPPERCASE: lambda value: value.upper(),
63
+ ProcessorName.JOIN: join_strings,
64
+ ProcessorName.REGEX: regex_extract,
65
+ ProcessorName.SPLIT: split_string,
66
+ ProcessorName.INDEX: select_index,
67
+ ProcessorName.REPLACE: lambda value, old, new: value.replace(old, new) if isinstance(value, str) else value,
68
+ }
69
+
70
+
71
+ @overload
72
+ def apply_processor(name: Literal[ProcessorName.STRIP], value: str) -> str: ...
73
+ @overload
74
+ def apply_processor(name: Literal[ProcessorName.TO_INT], value: str) -> int: ...
75
+ @overload
76
+ def apply_processor(name: Literal[ProcessorName.TO_FLOAT], value: str) -> float: ...
77
+ @overload
78
+ def apply_processor(name: Literal[ProcessorName.LOWERCASE], value: str) -> str: ...
79
+ @overload
80
+ def apply_processor(name: Literal[ProcessorName.UPPERCASE], value: str) -> str: ...
81
+ @overload
82
+ def apply_processor(name: Literal[ProcessorName.JOIN], value: list[str], args: list[str] | None = None) -> str: ...
83
+ @overload
84
+ def apply_processor(name: Literal[ProcessorName.REGEX], value: str, args: list[str]) -> str: ...
85
+ @overload
86
+ def apply_processor(name: Literal[ProcessorName.SPLIT], value: str, args: list[str] | None = None) -> list[str]: ...
87
+ @overload
88
+ def apply_processor(name: Literal[ProcessorName.REPLACE], value: str, args: list[str]) -> str: ...
89
+ @overload
90
+ def apply_processor(
91
+ name: Literal[ProcessorName.INDEX],
92
+ value: list[object],
93
+ args: list[Union[str, int]],
94
+ ) -> object: ...
95
+ @overload
96
+ def apply_processor(name: ProcessorName, value: object, args: list[Union[str, int]] | None = None) -> object: ...
97
+
98
+
99
+ def apply_processor(
100
+ name: ProcessorName, value: object, args: list[Union[str, int]] | list[str] | None = None
101
+ ) -> object:
102
+ """Apply a named processor to a value.
103
+
104
+ Raises KeyError if the processor name is not registered.
105
+ """
106
+ processor_name = ProcessorName(name)
107
+ if processor_name not in PROCESSOR_REGISTRY:
108
+ raise KeyError(f"Unknown processor: {name!r}. Available: {list(PROCESSOR_REGISTRY.keys())}")
109
+ func = PROCESSOR_REGISTRY[processor_name]
110
+ if args:
111
+ return func(value, *args)
112
+ return func(value)
spextract/py.typed ADDED
File without changes
@@ -0,0 +1,125 @@
1
+ import re
2
+ from typing import Literal, cast, overload
3
+
4
+ import lxml.etree
5
+ import lxml.html
6
+ from bs4 import BeautifulSoup, NavigableString, Tag
7
+ from soupsieve import SelectorSyntaxError
8
+
9
# Matches a trailing ``::text`` or ``::attr(NAME)`` pseudo-element at the end of
# a selector; group 1 is the whole pseudo-element, group 2 the attribute name.
_PSEUDO_RE = re.compile(r"::(text|attr\(([^)]+)\))\s*$")
10
+
11
+
12
+ def _xpath_results_to_tags(results: list[object] | str) -> list[Tag] | list[str]:
13
+ """Convert lxml XPath results to BeautifulSoup Tags."""
14
+ if isinstance(results, str):
15
+ return [results]
16
+ tags: list[Tag] | list[str] = []
17
+ all_elements = all(isinstance(r, lxml.etree._Element) for r in results) # pylint: disable=protected-access
18
+ all_strings = all(isinstance(r, str) for r in results)
19
+ if all_strings:
20
+ return cast(list[str], results)
21
+ if all_elements:
22
+ for r in results:
23
+ html = lxml.html.tostring(r, encoding="unicode") # type: ignore
24
+ soup = BeautifulSoup(html, "html.parser")
25
+ if soup.contents:
26
+ tags.append(soup.contents[0]) # type: ignore
27
+ return tags
28
+
29
+ raise ValueError(f"Expected all XPath results to be either strings or elements, but got: {results}")
30
+
31
+
32
def _parse_selector(css: str) -> tuple[str, str | None]:
    """Separate a selector from a trailing ``::text`` / ``::attr(...)`` suffix.

    Returns ``(base, mode)`` where ``mode`` is ``None`` when there is no suffix,
    ``"text"`` for ``::text`` and ``"attr:NAME"`` for ``::attr(NAME)``.
    """
    found = _PSEUDO_RE.search(css)
    if found is None:
        return css, None
    stem = css[: found.start()]
    mode = "text" if found.group(1) == "text" else f"attr:{found.group(2)}"
    return stem, mode
41
+
42
+
43
def _select_css(node: Tag | BeautifulSoup, css: str) -> list[str] | list[Tag]:
    """Run a CSS selector against *node*.

    Without a pseudo-element the matched tags are returned. ``::text`` yields
    the direct string children of every match; ``::attr(NAME)`` yields the
    value of that attribute for every match that has it, with list-valued
    attributes joined by a space.
    """
    base, mode = _parse_selector(css)
    # A selector consisting only of a pseudo-element selects nothing.
    matched = node.select(base) if base.strip() else []

    if mode is None:
        return matched

    if mode == "text":
        return [
            str(child)
            for tag in matched
            for child in tag.children
            if isinstance(child, NavigableString) and not isinstance(child, Tag)
        ]

    if mode.startswith("attr:"):
        attr_name = mode[5:]
        attr_values: list[str] = []
        for tag in matched:
            raw = tag.get(attr_name)
            if raw is None:
                continue
            attr_values.append(" ".join(raw) if isinstance(raw, list) else str(raw))
        return attr_values

    return matched
67
+
68
+
69
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
) -> list[Tag] | list[str]: ...
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
    assert_tags: Literal[True],
    assert_strings: Literal[False] = False,
) -> list[Tag]: ...
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
    assert_tags: Literal[False] = False,
    assert_strings: Literal[True] = True,
) -> list[str]: ...
@overload
def select(
    node: Tag | BeautifulSoup,
    selector: str,
    *,
    as_strings: Literal[True] = True,
) -> list[str]: ...


def select(
    node: Tag | BeautifulSoup,
    selector: str,
    assert_tags: bool = False,
    assert_strings: bool = False,
    as_strings: bool = False,
) -> list[Tag] | list[str]:
    """Select elements using CSS or XPath selector.

    The selector is tried as CSS first. If that raises ``SelectorSyntaxError``
    or ``NotImplementedError``, it is evaluated as XPath against *node*
    re-parsed with lxml.

    Args:
        node: Tag or document to search.
        selector: CSS or XPath selector.
        assert_tags: Raise unless every result is a Tag.
        assert_strings: Raise unless every result is a string.
        as_strings: Convert every result with ``str`` before returning.

    Raises:
        ValueError: if a requested assertion does not hold.
    """
    try:
        matches: list[Tag] | list[str] = _select_css(node, selector)
    except (SelectorSyntaxError, NotImplementedError):
        tree = lxml.html.fromstring(str(node))
        matches = _xpath_results_to_tags(cast(list[object] | str, tree.xpath(selector)))

    if assert_tags and any(not isinstance(match, Tag) for match in matches):
        raise ValueError(f"Expected all results to be Tags, but got: {matches}")
    if assert_strings and any(not isinstance(match, str) for match in matches):
        raise ValueError(f"Expected all results to be strings, but got: {matches}")
    if as_strings:
        return [str(match) for match in matches]
    return matches
@@ -0,0 +1,15 @@
1
+ from .true_validate import (
2
+ validate_spec_against_expected,
3
+ validate_files,
4
+ validate_items_against_expected,
5
+ TrueValidationResult,
6
+ FileValidationResult,
7
+ )
8
+
9
+ __all__ = [
10
+ "validate_spec_against_expected",
11
+ "validate_files",
12
+ "validate_items_against_expected",
13
+ "TrueValidationResult",
14
+ "FileValidationResult",
15
+ ]
@@ -0,0 +1,138 @@
1
+ """
2
+ Validation logic for checking if engine output matches the declared spec."""
3
+
4
+ from typing import Any, List
5
+
6
+ from ..models.output import DataValue
7
+ from ..models.parser_spec import FieldSpec, ParseSpec
8
+ from ..models.validation import SpecValidationResult, ValidationMismatch
9
+ from .validators import validate_value, type_name
10
+
11
+
12
+ def _expectation_for_field(field_spec: FieldSpec) -> str:
13
+ if field_spec.fields is not None:
14
+ if field_spec.multiple:
15
+ return "array<object>"
16
+ return "object"
17
+ base = field_spec.type.value
18
+ if field_spec.multiple:
19
+ return f"array<{base}>"
20
+ return base
21
+
22
+
23
+ def _validate_field_recursive(name_path: str, field_spec: FieldSpec, value: Any) -> List[ValidationMismatch]:
24
+ if field_spec.required and value is None:
25
+ return [
26
+ ValidationMismatch(
27
+ field=name_path,
28
+ expected_type=_expectation_for_field(field_spec),
29
+ actual_type="missing",
30
+ message=f"Field '{name_path}' missing in output.",
31
+ )
32
+ ]
33
+ if value is None:
34
+ # not required and missing is fine, no mismatches
35
+ return []
36
+ # Delegate to nested or non-nested validators
37
+ if field_spec.fields is not None:
38
+ return _validate_nested(name_path, field_spec, value)
39
+ return _validate_non_nested(name_path, field_spec, value)
40
+
41
+
42
def _validate_nested(name_path: str, field_spec: FieldSpec, value: object) -> List[ValidationMismatch]:
    """Validate a value for a field that declares child fields.

    For a ``multiple`` field the value must be a list of dicts; otherwise it
    must be a single dict. Every child field of every dict is validated
    recursively. Mismatches inside list items carry the item index in their
    path (e.g. ``items[2].name``) so the offending item can be identified.

    Args:
        name_path: Path of the field being validated, used in mismatch reports.
        field_spec: Spec of the field; its ``fields`` must not be None.
        value: The extracted value (never None here).

    Returns:
        All mismatches found; empty when the value conforms.
    """
    expected = _expectation_for_field(field_spec)
    actual = type_name(value)

    if field_spec.multiple:
        if not isinstance(value, list):
            return [
                ValidationMismatch(
                    field=name_path,
                    expected_type=expected,
                    actual_type=actual,
                    message=f"Field '{name_path}' expected list of objects, got {actual}",
                )
            ]
        mismatches: List[ValidationMismatch] = []
        for idx, item in enumerate(value):
            item_path = f"{name_path}[{idx}]"
            if not isinstance(item, dict):
                mismatches.append(
                    ValidationMismatch(
                        field=item_path,
                        expected_type="object",
                        actual_type=type_name(item),
                        message=f"Expected object at '{item_path}', got {type_name(item)}",
                    )
                )
                continue
            # Use the indexed path so child mismatches name the exact list item.
            mismatches.extend(_validate_children(item_path, field_spec, item))
        return mismatches

    # Single nested object expected.
    if not isinstance(value, dict):
        return [
            ValidationMismatch(
                field=name_path,
                expected_type=expected,
                actual_type=actual,
                message=f"Field '{name_path}' expected object, got {actual}",
            )
        ]
    return _validate_children(name_path, field_spec, value)


def _validate_children(parent_path: str, field_spec: FieldSpec, obj: dict) -> List[ValidationMismatch]:
    """Validate every child field of ``field_spec`` against the dict ``obj``."""
    assert field_spec.fields is not None  # for mypy - callers only pass nested specs
    mismatches: List[ValidationMismatch] = []
    for child_name, child_spec in field_spec.fields.items():
        mismatches.extend(
            _validate_field_recursive(f"{parent_path}.{child_name}", child_spec, obj.get(child_name))
        )
    return mismatches
92
+
93
+
94
def _validate_non_nested(name_path: str, field_spec: FieldSpec, value: object) -> List[ValidationMismatch]:
    """Validate a value for a field without child fields.

    A ``multiple`` field must hold a list whose items are each checked against
    the field's type; any other field has its single value checked.
    """
    expected = _expectation_for_field(field_spec)
    actual = type_name(value)
    base_type = field_spec.type

    if not field_spec.multiple:
        # Single value expected.
        single = validate_value(name_path, base_type, value)
        return [] if single is None else [single]

    if not isinstance(value, list):
        return [
            ValidationMismatch(
                field=name_path,
                expected_type=expected,
                actual_type=actual,
                message=f"Field '{name_path}' expected list of {base_type.value}, got {actual}",
            )
        ]

    checks = (validate_value(f"{name_path}[{idx}]", base_type, item) for idx, item in enumerate(value))
    return [check for check in checks if check is not None]
122
+
123
+
124
def validate_spec_output(spec: ParseSpec, data: dict[str, DataValue], raise_: bool = False) -> SpecValidationResult:
    """Check that the engine output `data` conforms to `spec`.

    Every top-level field of the spec is validated; a field absent from `data`
    is treated as ``None``.

    Returns a `SpecValidationResult` holding any mismatches found.

    Raises:
        ValueError: if `raise_` is true and at least one mismatch was found.
    """
    found: List[ValidationMismatch] = [
        mismatch
        for field_name, field_spec in spec.fields.items()
        for mismatch in _validate_field_recursive(field_name, field_spec, data.get(field_name))
    ]

    result = SpecValidationResult(mismatches=found)
    if raise_ and not result.is_valid:
        raise ValueError(f"Validation failed for output: {result}")
    return result
@@ -0,0 +1,204 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import cast
4
+
5
+ from ..engine import ParseEngine
6
+ from ..models.output import DataValue
7
+ from ..models.parser_spec import ParseSpec
8
+ from ..models.validation import ExpectedResults, FileValidationResult, TrueValidationResult
9
+
10
+
11
+ def _path_matches_target(path: str, target: str | None) -> bool:
12
+ """Return True if *path* matches, is a descendant of, or is an ancestor of *target*.
13
+
14
+ `items.name` matches `items[0].name`, `items[1].name`, etc.
15
+ `items[0].name` matches only `items[0].name`.
16
+ `items.meta` matches `items[0].meta.color` (target is ancestor of path).
17
+ `items` matches `items.meta` (path is ancestor of target — structural errors).
18
+ """
19
+ if not target:
20
+ return True
21
+ target_parts = target.split(".")
22
+ path_parts = path.split(".")
23
+ # Compare the overlapping prefix
24
+ for target_part, path_part in zip(target_parts, path_parts):
25
+ if "[" in target_part:
26
+ if path_part != target_part:
27
+ return False
28
+ else:
29
+ bracket = path_part.find("[")
30
+ path_base = path_part[:bracket] if bracket != -1 else path_part
31
+ if path_base != target_part:
32
+ return False
33
+ return True
34
+
35
+
36
+ def _compare_values(
37
+ actual: object,
38
+ expected: DataValue,
39
+ path: str,
40
+ errors: list[str],
41
+ target_field_path: str | None = None,
42
+ ) -> int:
43
+ """Recursively compare an actual parsed value against an expected value."""
44
+ num_items_checked = 0
45
+ if expected is None:
46
+ if not _path_matches_target(path, target_field_path):
47
+ return 0
48
+ if not actual:
49
+ # Treat None, empty string, empty list, and empty dict as equivalent for convenience
50
+ return 1
51
+ if actual:
52
+ errors.append(f"{path}: expected None/empty, got {actual!r}")
53
+ return 1
54
+ elif isinstance(expected, str):
55
+ if not _path_matches_target(path, target_field_path):
56
+ return 0
57
+ if expected == "" and (actual is None or actual == ""):
58
+ return 1 # Treat empty string and None as equivalent for convenience
59
+ if actual != expected:
60
+ errors.append(f"{path}: expected {expected!r}, got {actual!r}")
61
+ return 1
62
+ elif isinstance(expected, dict):
63
+ if not isinstance(actual, dict):
64
+ if _path_matches_target(path, target_field_path):
65
+ errors.append(f"{path}: expected dict for target field, got {type(actual).__name__}: {actual!r}")
66
+ return 1
67
+ return 0
68
+ for key, exp_val in expected.items():
69
+ actual_val = actual.get(key)
70
+ num_items_checked += _compare_values(actual_val, exp_val, f"{path}.{key}", errors, target_field_path)
71
+ elif isinstance(expected, list):
72
+ exp_list = cast(list[DataValue], expected)
73
+ if not isinstance(actual, list):
74
+ if _path_matches_target(path, target_field_path):
75
+ errors.append(f"{path}: expected list, got {type(actual).__name__}: {actual!r}")
76
+ return 1
77
+ return 0
78
+ if len(actual) != len(expected):
79
+ if _path_matches_target(path, target_field_path):
80
+ errors.append(f"{path}: expected {len(expected)} items, got {len(actual)}")
81
+ return 1
82
+ for i, (act_item, exp_item) in enumerate(zip(actual, exp_list)):
83
+ num_items_checked += _compare_values(act_item, exp_item, f"{path}[{i}]", errors, target_field_path)
84
+ elif actual != expected and (not target_field_path or _path_matches_target(path, target_field_path)):
85
+ errors.append(f"{path}: expected {expected!r}, got {actual!r}")
86
+ return 1
87
+ return num_items_checked
88
+
89
+
90
def validate_spec_against_expected(
    spec: ParseSpec,
    html: str,
    expected: dict[str, DataValue],
    field_path: str | None = None,
) -> FileValidationResult:
    """Parse ``html`` with ``spec`` and compare the result against ``expected``.

    Args:
        spec: The parser spec to run.
        html: The HTML document to parse.
        expected: Expected values keyed by field name.
        field_path: Optional dot path limiting the comparison to one field.
            A leading ``fields.`` is dropped, and a trailing ``[N]`` index on
            a segment is ignored when looking the segment up in the spec.

    Raises:
        ValueError: If ``field_path`` does not name a field in the spec. This
            is checked before any parsing happens.
    """
    if field_path:
        field_path = field_path.removeprefix("fields.")

    if field_path is not None:
        scope = spec.fields
        for segment in field_path.split("."):
            key = re.sub(r"\[\d+\]$", "", segment)
            if not isinstance(scope, dict) or key not in scope:
                raise ValueError(f"Field path '{field_path}' does not exist in the spec.")
            # Descend into the nested fields; a leaf leaves nothing to descend into.
            children = scope[key].fields
            scope = {} if children is None else children

    parsed = ParseEngine(spec).parse(html).data
    return validate_items_against_expected(parsed, expected, field_path)
115
+
116
+
117
def validate_items_against_expected(
    items: dict[str, DataValue],
    expected: dict[str, DataValue],
    field_path: str | None = None,
) -> FileValidationResult:
    """Compare already-parsed ``items`` against ``expected`` values.

    Only keys present in ``expected`` are compared; a key missing from
    ``items`` is compared as ``None``. When ``field_path`` is given it is
    forwarded to the comparison as the target path.

    Returns:
        A ``FileValidationResult`` with an empty ``file_name``, the number of
        checks performed, and the collected error messages.
    """
    errors: list[str] = []
    parsed = items or {}

    checks_done = sum(
        _compare_values(parsed.get(key), exp_val, key, errors, field_path)
        for key, exp_val in expected.items()
    )
    return FileValidationResult(file_name="", item_count=checks_done, errors=errors)
136
+
137
+
138
def validate_files(
    expected_values_path: Path,
    spec_file_path: Path,
    data_dir: Path | None = None,
    field_path: str | None = None,
    skip_unexpected_files: bool = False,
) -> TrueValidationResult:
    """Validate every ``*.html`` file in a data directory against expected values.

    Args:
        expected_values_path: Path to the expected values YAML file.
        spec_file_path: Path to the parser spec YAML file.
        data_dir: Directory holding the HTML files. When omitted, the
            ``data_path`` from the expected values file is used. A relative
            directory is resolved against the folder of ``expected_values_path``.
        field_path: Optional dot path to a specific field to validate.
        skip_unexpected_files: When true, HTML files that have no expected
            results are skipped silently instead of being reported as errors.

    Returns:
        A ``TrueValidationResult`` with one entry per reported file, or a
        single error entry when the directory contains no HTML files.

    Raises:
        ValueError: If no data directory is given and the expected values
            file does not provide one.
    """
    expectations = ExpectedResults.from_yaml_file(expected_values_path)
    spec = ParseSpec.from_yaml_file(spec_file_path)

    resolved_dir = expectations.data_path if data_dir is None else data_dir
    if resolved_dir is None:
        raise ValueError(
            "Data path not specified. Either include 'data_path' in the expected values YAML or provide --data-dir."
        )
    if not resolved_dir.is_absolute():
        resolved_dir = expected_values_path.parent / resolved_dir

    items_by_name: dict[str, dict[str, DataValue]] = {}
    for entry in expectations.files:
        items_by_name[entry.file] = entry.items

    pages = sorted(resolved_dir.glob("*.html"))
    summary = TrueValidationResult()

    def _record(file_name: str, item_count: int, errors: list[str]) -> None:
        summary.file_results.append(
            FileValidationResult(file_name=file_name, item_count=item_count, errors=errors)
        )

    if not pages:
        _record(str(resolved_dir), 0, [f"No HTML files found in {resolved_dir}"])
        return summary

    for page in pages:
        markup = page.read_text(encoding="utf-8")
        wanted = items_by_name.get(page.name)
        if wanted:
            outcome = validate_spec_against_expected(spec, markup, wanted, field_path=field_path)
            # Re-wrap so the entry carries the file name.
            _record(page.name, outcome.item_count, outcome.errors)
        elif not skip_unexpected_files:
            _record(page.name, 0, [f"No expected results defined for {page.name}"])

    return summary
@@ -0,0 +1,110 @@
1
+ """Validators for primitive (non-nested) field values.
2
+ Factored out from validation logic so tests and code can reuse semantic checks.
3
+ """
4
+
5
+
6
+ from typing import Any, Optional
7
+ from datetime import datetime
8
+
9
+ from ..models.parser_spec import FieldType
10
+ from ..models.validation import ValidationMismatch
11
+
12
+
13
def type_name(value: Any) -> str:
    """Return a short, human-readable type label for ``value``.

    Labels: ``"missing"`` for ``None``, ``"object"`` for dicts, ``"array"``
    for lists, ``"boolean"`` for bools, ``"number"`` for ints and floats,
    ``"string"`` for strs; anything else gets its Python type name.
    """
    if value is None:
        return "missing"
    if isinstance(value, dict):
        return "object"
    if isinstance(value, list):
        return "array"
    # bool is a subclass of int, so it must be tested before the numeric
    # check or True/False would be reported as "number".
    if isinstance(value, bool):
        return "boolean"
    if isinstance(value, (int, float)):
        return "number"
    if isinstance(value, str):
        return "string"
    return type(value).__name__
25
+
26
+
27
def validate_text(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Accept any ``str``; otherwise return a mismatch describing ``value``'s type."""
    if isinstance(value, str):
        return None

    observed = type_name(value)
    return ValidationMismatch(
        field=name_path,
        expected_type="text",
        actual_type=observed,
        message=f"Field '{name_path}' expected text, got {observed}",
    )
36
+
37
+
38
def validate_link(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Check that ``value`` is a string that looks like a link.

    A link is accepted when it starts with ``http://``, ``https://`` or ``/``.
    Returns a mismatch for a non-string or for a string with none of those
    prefixes, and ``None`` otherwise.
    """
    if not isinstance(value, str):
        observed = type_name(value)
        return ValidationMismatch(
            field=name_path,
            expected_type="link",
            actual_type=observed,
            message=f"Field '{name_path}' expected link string, got {observed}",
        )

    if value.startswith(("http://", "https://", "/")):
        return None

    return ValidationMismatch(
        field=name_path,
        expected_type="link",
        actual_type="string",
        message=f"Field '{name_path}' does not look like a link: '{value}'",
    )
54
+
55
+
56
def validate_number(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Accept an ``int`` or ``float``; otherwise return a mismatch describing ``value``'s type."""
    if isinstance(value, (int, float)):
        return None

    observed = type_name(value)
    return ValidationMismatch(
        field=name_path,
        expected_type="number",
        actual_type=observed,
        message=f"Field '{name_path}' expected number, got {observed}",
    )
65
+
66
+
67
def validate_date(name_path: str, value: Any) -> Optional[ValidationMismatch]:
    """Check that ``value`` is a string ``datetime.fromisoformat`` can parse.

    Args:
        name_path: Path used to label the mismatch.
        value: The value to check.

    Returns:
        A ``ValidationMismatch`` when ``value`` is not a string or cannot be
        parsed as an ISO date; ``None`` when it is valid.
    """
    if not isinstance(value, str):
        return ValidationMismatch(
            field=name_path,
            expected_type="date",
            actual_type=type_name(value),
            message=f"Field '{name_path}' expected date string, got {type_name(value)}",
        )
    try:
        datetime.fromisoformat(value)
    except ValueError:
        # ValueError is how fromisoformat reports an unparsable string. The
        # argument is known to be a str here, so nothing broader needs
        # catching and unrelated failures are no longer masked.
        return ValidationMismatch(
            field=name_path,
            expected_type="date",
            actual_type="string",
            message=f"Field '{name_path}' is not a valid ISO date: '{value}'",
        )
    return None
85
+
86
+
87
def validate_value(name_path: str, base_type: FieldType, value: Any) -> Optional[ValidationMismatch]:
    """Run the validator that belongs to ``base_type`` on ``value``.

    ``None`` is always reported as a missing field. A ``base_type`` without a
    dedicated validator is checked as text.

    Returns a `ValidationMismatch` if invalid, or None if valid.
    """
    if value is None:
        return ValidationMismatch(
            field=name_path,
            expected_type=base_type.value,
            actual_type="missing",
            message=f"Field '{name_path}' is missing",
        )

    checkers = (
        (FieldType.TEXT, validate_text),
        (FieldType.LINK, validate_link),
        (FieldType.NUMBER, validate_number),
        (FieldType.DATE, validate_date),
    )
    for kind, checker in checkers:
        if base_type == kind:
            return checker(name_path, value)

    # Fallback: treat as text
    return validate_text(name_path, value)
@@ -0,0 +1,7 @@
1
+ Copyright © 2026 Vincent Lonij
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.3
2
+ Name: spextract
3
+ Version: 0.5.16
4
+ Summary: A declarative html scraper for python
5
+ License: MIT
6
+ Author: Vincent Lonij
7
+ Author-email: 29819815+vincentropy@users.noreply.github.com
8
+ Requires-Python: >=3.10
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Dist: beautifulsoup4 (>=4.14.3,<5.0.0)
16
+ Requires-Dist: click (>=8.3.2,<9.0.0)
17
+ Requires-Dist: lxml (>=6.0.4,<7.0.0)
18
+ Requires-Dist: pydantic (>=2.12.5,<3.0.0)
19
+ Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
20
+ Description-Content-Type: text/markdown
21
+
22
+ # A Declarative HTML Scraper for Python
23
+
24
+ This package provides a simple way to declare what data should be extracted from an HTML document in a configuration file.
25
+
26
+ This enables sharing of scraping logic across projects and teams without the risk of executing untrusted code. It also allows for easier maintenance and updates to scraping logic without needing to modify the underlying codebase.
27
+
28
+ ## CLI
29
+
30
+ The package includes a Click-based CLI with two commands:
31
+
32
+ ```bash
33
+ decs parse spec.yaml <path to html file or directory>
34
+ decs validate spec.yaml expected-results.yaml
35
+ ```
36
+
37
+ `parse` emits YAML in the same expected-results format used by `validate`, so you can capture known-good output and re-run validation later.
38
+
39
+ ## How to use
40
+
41
+ ### Build a configuration file
42
+
43
+ You can build a configuration in Python with the provided `ParseSpec` class.
44
+
45
+ ```python
46
+ import spextract
47
+
48
+ spec = spextract.ParseSpec(
49
+ name="example_parser",
50
+ description="An example parser for demonstration purposes.",
51
+ fields=[
52
+ spextract.FieldSpec(
53
+ name="title",
54
+ selector="h1.title::text",
55
+ type=spextract.FieldType.TEXT,
56
+ ),
57
+ spextract.FieldSpec(
58
+ name="links",
59
+ selector="a.link::attr(href)",
60
+ type=spextract.FieldType.LINK,
61
+ multiple=True,
62
+ ),
63
+ spextract.FieldSpec(
64
+ name="author",
65
+ selector="div.author",
66
+ type=spextract.FieldType.OBJECT,
67
+ fields=[
68
+ spextract.FieldSpec(
69
+ name="name",
70
+ selector="span.name::text",
71
+ type=spextract.FieldType.TEXT,
72
+ ),
73
+ spextract.FieldSpec(
74
+ name="profile_url",
75
+ selector="a.profile::attr(href)",
76
+ type=spextract.FieldType.LINK,
77
+ ),
78
+ ]
79
+ ),
80
+ ]
81
+ )
82
+ ```
83
+
@@ -0,0 +1,20 @@
1
+ spextract/__init__.py,sha256=4yowBnR5WkwlRe0_Ad27TkKTPmVa4mGouVudMctyON8,547
2
+ spextract/cli.py,sha256=bOSDgqgnT3N1bQwKNLg8LtcsSSzWGwHNfqyH6_gjhXI,3088
3
+ spextract/engine.py,sha256=yIH26wYiQPsVzhc39AZxJX83MBl4FetA6olXLV2L3f0,6979
4
+ spextract/models/__init__.py,sha256=QeHguokPrbZ75TVEBfCueq1ASo2_0rN8JTsFKyjDHqI,479
5
+ spextract/models/output.py,sha256=T-zygB2x5eUhXWvkR_eW5hfEDfWAA43kDPl-mbMNDdk,569
6
+ spextract/models/parser_spec.py,sha256=pZbFnEnBtQdw34OrY0TvPreRIJoCI3Q5ZGvyX71rVog,2936
7
+ spextract/models/validation.py,sha256=1_K5y94CKRqSMwZBysTDbV4njfI3x57j_9g23Si3S8k,2163
8
+ spextract/models/yaml.py,sha256=xjSi3dDehF00X97nZDx1TIHJMTvGg9iVyMD8c-hi320,744
9
+ spextract/processors.py,sha256=y41nSSWy8ch3gzlDifcGlGXLp-jHzLihlT-_rLGdyIU,4067
10
+ spextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ spextract/uni_selector.py,sha256=R4q2gVxoJAlyJ7FbgqPAiMQSGOSsqbCjfPjAlUqux1w,3956
12
+ spextract/validation/__init__.py,sha256=k13bHZzZAQ0zjXrp-gz98zOyBcfREPAH2_O5thTbIu8,346
13
+ spextract/validation/spec_validate.py,sha256=GWyDjqx0ElDlqJkNrqQm6ys8yXaS6E1SFwTuZPEZRdk,5388
14
+ spextract/validation/true_validate.py,sha256=H8c_lBZ9ClxCnqnaOZjM6nYoClFa6H9VB8h4fmVFDkE,7991
15
+ spextract/validation/validators.py,sha256=FXeejkc-IQnsQxrsldz2f6nWHp6DFjFVBPbd0jF5HHE,3647
16
+ spextract-0.5.16.dist-info/LICENSE.md,sha256=fboAcycaR0hzaSgMDYqsxxyyF1Yq3Vt7R28krR1lgXc,1064
17
+ spextract-0.5.16.dist-info/METADATA,sha256=wuSrhbRRpMYGp0NaskcreQYqfaf8fw_uNJra6cBJpn0,2653
18
+ spextract-0.5.16.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
19
+ spextract-0.5.16.dist-info/entry_points.txt,sha256=uzZaXL_t9zdWp3Y-P8wkZhSJmjfE7jrw8mJJmQN3-z4,40
20
+ spextract-0.5.16.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ decs=spextract.cli:cli
3
+