spextract 0.5.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spextract-0.5.16/LICENSE.md +7 -0
- spextract-0.5.16/PKG-INFO +83 -0
- spextract-0.5.16/README.md +61 -0
- spextract-0.5.16/pyproject.toml +48 -0
- spextract-0.5.16/spextract/__init__.py +27 -0
- spextract-0.5.16/spextract/cli.py +85 -0
- spextract-0.5.16/spextract/engine.py +152 -0
- spextract-0.5.16/spextract/models/__init__.py +17 -0
- spextract-0.5.16/spextract/models/output.py +21 -0
- spextract-0.5.16/spextract/models/parser_spec.py +85 -0
- spextract-0.5.16/spextract/models/validation.py +84 -0
- spextract-0.5.16/spextract/models/yaml.py +23 -0
- spextract-0.5.16/spextract/processors.py +112 -0
- spextract-0.5.16/spextract/py.typed +0 -0
- spextract-0.5.16/spextract/uni_selector.py +125 -0
- spextract-0.5.16/spextract/validation/__init__.py +15 -0
- spextract-0.5.16/spextract/validation/spec_validate.py +138 -0
- spextract-0.5.16/spextract/validation/true_validate.py +204 -0
- spextract-0.5.16/spextract/validation/validators.py +110 -0
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright © 2026 Vincent Lonij
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: spextract
|
|
3
|
+
Version: 0.5.16
|
|
4
|
+
Summary: A declarative html scraper for python
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Vincent Lonij
|
|
7
|
+
Author-email: 29819815+vincentropy@users.noreply.github.com
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Dist: beautifulsoup4 (>=4.14.3,<5.0.0)
|
|
16
|
+
Requires-Dist: click (>=8.3.2,<9.0.0)
|
|
17
|
+
Requires-Dist: lxml (>=6.0.4,<7.0.0)
|
|
18
|
+
Requires-Dist: pydantic (>=2.12.5,<3.0.0)
|
|
19
|
+
Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# A Declarative HTML Scraper for Python
|
|
23
|
+
|
|
24
|
+
This package provides a simple way to declare what data should be extracted from an HTML document in a configuration file.
|
|
25
|
+
|
|
26
|
+
This enables sharing of scraping logic across projects and teams without the risk of executing untrusted code. It also allows for easier maintenance and updates to scraping logic without needing to modify the underlying codebase.
|
|
27
|
+
|
|
28
|
+
## CLI
|
|
29
|
+
|
|
30
|
+
The package includes a Click-based CLI with two commands:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
decs parse spec.yaml <path to html file or directory>
|
|
34
|
+
decs validate spec.yaml expected-results.yaml
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
`parse` emits YAML in the same expected-results format used by `validate`, so you can capture known-good output and re-run validation later.
|
|
38
|
+
|
|
39
|
+
## How to use
|
|
40
|
+
|
|
41
|
+
### Build a configuration file
|
|
42
|
+
|
|
43
|
+
You can write a configuration file with the provided `ParseSpec` class. Fields are given as a dict that maps each output key to a `FieldSpec`.

```python
import spextract

# An example parser for demonstration purposes.
spec = spextract.ParseSpec(
    name="example_parser",
    fields={
        "title": spextract.FieldSpec(
            selector="h1.title::text",
            type=spextract.FieldType.TEXT,
        ),
        "links": spextract.FieldSpec(
            selector="a.link::attr(href)",
            type=spextract.FieldType.LINK,
            multiple=True,
        ),
        "author": spextract.FieldSpec(
            selector="div.author",
            type=spextract.FieldType.OBJECT,
            fields={
                "name": spextract.FieldSpec(
                    selector="span.name::text",
                    type=spextract.FieldType.TEXT,
                ),
                "profile_url": spextract.FieldSpec(
                    selector="a.profile::attr(href)",
                    type=spextract.FieldType.LINK,
                ),
            },
        ),
    },
)
```
|
|
83
|
+
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# A Declarative HTML Scraper for Python
|
|
2
|
+
|
|
3
|
+
This package provides a simple way to declare what data should be extracted from an HTML document in a configuration file.
|
|
4
|
+
|
|
5
|
+
This enables sharing of scraping logic across projects and teams without the risk of executing untrusted code. It also allows for easier maintenance and updates to scraping logic without needing to modify the underlying codebase.
|
|
6
|
+
|
|
7
|
+
## CLI
|
|
8
|
+
|
|
9
|
+
The package includes a Click-based CLI with two commands:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
decs parse spec.yaml <path to html file or directory>
|
|
13
|
+
decs validate spec.yaml expected-results.yaml
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
`parse` emits YAML in the same expected-results format used by `validate`, so you can capture known-good output and re-run validation later.
|
|
17
|
+
|
|
18
|
+
## How to use
|
|
19
|
+
|
|
20
|
+
### Build a configuration file
|
|
21
|
+
|
|
22
|
+
You can write a configuration file with the provided `ParseSpec` class. Fields are given as a dict that maps each output key to a `FieldSpec`.

```python
import spextract

# An example parser for demonstration purposes.
spec = spextract.ParseSpec(
    name="example_parser",
    fields={
        "title": spextract.FieldSpec(
            selector="h1.title::text",
            type=spextract.FieldType.TEXT,
        ),
        "links": spextract.FieldSpec(
            selector="a.link::attr(href)",
            type=spextract.FieldType.LINK,
            multiple=True,
        ),
        "author": spextract.FieldSpec(
            selector="div.author",
            type=spextract.FieldType.OBJECT,
            fields={
                "name": spextract.FieldSpec(
                    selector="span.name::text",
                    type=spextract.FieldType.TEXT,
                ),
                "profile_url": spextract.FieldSpec(
                    selector="a.profile::attr(href)",
                    type=spextract.FieldType.LINK,
                ),
            },
        ),
    },
)
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "spextract"
|
|
3
|
+
version = "0.5.16"
|
|
4
|
+
description = "A declarative html scraper for python"
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "Vincent Lonij", email = "29819815+vincentropy@users.noreply.github.com" },
|
|
7
|
+
]
|
|
8
|
+
license = "MIT"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"beautifulsoup4 (>=4.14.3,<5.0.0)",
|
|
13
|
+
"pydantic (>=2.12.5,<3.0.0)",
|
|
14
|
+
"pyyaml (>=6.0.3,<7.0.0)",
|
|
15
|
+
"lxml (>=6.0.4,<7.0.0)",
|
|
16
|
+
"click (>=8.3.2,<9.0.0)",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
# The distribution ships its code under the "spextract" package (spextract/cli.py),
# so the console script must import the click group from there.
decs = "spextract.cli:cli"
|
|
21
|
+
|
|
22
|
+
[tool.poetry]
|
|
23
|
+
|
|
24
|
+
[tool.poetry.group.dev.dependencies]
|
|
25
|
+
mypy = "^1.20.0"
|
|
26
|
+
types-pyyaml = "^6.0.12.20260408"
|
|
27
|
+
pylint = "^4.0.5"
|
|
28
|
+
pytest = "^9.0.3"
|
|
29
|
+
types-lxml = "^2026.2.16"
|
|
30
|
+
pylint-pydantic = "^0.4.1"
|
|
31
|
+
twine = "^6.2.0"
|
|
32
|
+
|
|
33
|
+
[build-system]
|
|
34
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
35
|
+
build-backend = "poetry.core.masonry.api"
|
|
36
|
+
|
|
37
|
+
[tool.pylint]
|
|
38
|
+
load-plugins = ["pylint_pydantic"]
|
|
39
|
+
disable = [
|
|
40
|
+
"missing-module-docstring",
|
|
41
|
+
"missing-class-docstring",
|
|
42
|
+
"missing-function-docstring",
|
|
43
|
+
"line-too-long",
|
|
44
|
+
]
|
|
45
|
+
max-line-length = 120
|
|
46
|
+
|
|
47
|
+
[tool.black]
|
|
48
|
+
line-length = 120
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from .models import (
|
|
2
|
+
ParseSpec,
|
|
3
|
+
FieldSpec,
|
|
4
|
+
FieldType,
|
|
5
|
+
ProcessorSpec,
|
|
6
|
+
EngineOutput,
|
|
7
|
+
SpecValidationResult,
|
|
8
|
+
ValidationMismatch,
|
|
9
|
+
ExpectedResults,
|
|
10
|
+
FileExpectedItems,
|
|
11
|
+
)
|
|
12
|
+
from .engine import ParseEngine
|
|
13
|
+
from .validation.spec_validate import validate_spec_output
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"FieldSpec",
|
|
17
|
+
"FieldType",
|
|
18
|
+
"ParseEngine",
|
|
19
|
+
"ParseSpec",
|
|
20
|
+
"ProcessorSpec",
|
|
21
|
+
"EngineOutput",
|
|
22
|
+
"SpecValidationResult",
|
|
23
|
+
"ValidationMismatch",
|
|
24
|
+
"ExpectedResults",
|
|
25
|
+
"FileExpectedItems",
|
|
26
|
+
"validate_spec_output",
|
|
27
|
+
]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from .engine import ParseEngine
|
|
9
|
+
from .models.output import EngineOutput
|
|
10
|
+
from .models.parser_spec import ParseSpec
|
|
11
|
+
from .models.validation import ExpectedResults, FileExpectedItems
|
|
12
|
+
from .validation.true_validate import validate_files
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.group()
def cli() -> None:
    """Declarative scraper utilities."""
    # The docstring above is also the help text click displays for this command group,
    # so it is kept short. Subcommands register themselves below via ``@cli.command()``.
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("html_path", type=click.Path(exists=True, dir_okay=True, path_type=Path))
@click.option("--output", "output_path", type=click.Path(dir_okay=False, path_type=Path))
def parse(spec_path: Path, html_path: Path, output_path: Path | None) -> None:
    """Apply a spec to one or more HTML files."""
    # NOTE: the docstring is the command's help text in click; keep it user-facing.

    engine = ParseEngine(ParseSpec.from_yaml_file(spec_path))

    # A single file is parsed on its own; a directory contributes its direct
    # ``*.html`` children in sorted order (no recursion).
    sources = [html_path] if html_path.is_file() else sorted(html_path.glob("*.html"))

    # Results are keyed by bare file name, in the order the files were parsed.
    parsed_by_name: dict[str, EngineOutput] = {
        source.name: engine.parse(source.read_text(encoding="utf-8")) for source in sources
    }

    expected = ExpectedResults(
        data_path=html_path if html_path.is_dir() else html_path.parent,
        files=[FileExpectedItems.from_engine_output(name, output) for name, output in parsed_by_name.items()],
    )

    if output_path is None:
        # No --output given: render the same structure as YAML on stdout.
        click.echo(yaml.safe_dump(expected.model_dump(), sort_keys=False, allow_unicode=True), nl=False)
        return

    expected.to_yaml_file(output_path)
    click.echo(f"Wrote parsed results to {output_path}")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@cli.command()
@click.argument("spec_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("expected_results_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--field-path", "-f", type=str, help="Optional dot path to a specific field to validate (e.g. 'items.name')."
)
def validate(spec_path: Path, expected_results_path: Path, field_path: str | None) -> None:
    """Validate a spec against YAML expected extraction results."""
    # NOTE: the docstring is the command's help text in click; keep it user-facing.

    outcome = validate_files(
        expected_values_path=expected_results_path,
        spec_file_path=spec_path,
        field_path=field_path,
    )

    if not outcome.passed:
        # Print a per-file report, then signal failure through click so the
        # process exits with a non-zero status.
        for per_file in outcome.file_results:
            click.echo(f"{per_file.file_name}:")
            if per_file.passed:
                click.echo(f" Passed. Values checked: {per_file.item_count}")
            else:
                for problem in per_file.errors:
                    click.echo(f" - {problem}")
        raise click.ClickException(f"Validation failed for {outcome.failures} of {outcome.total_files} file(s).")

    click.echo(
        f"Validation passed for {outcome.total_files} file(s) and {outcome.total_items} extracted field(s)."
    )
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Core extraction engine that applies a ParseSpec to HTML content."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import cast
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup, Tag
|
|
8
|
+
|
|
9
|
+
from spextract.models.validation import SpecValidationResult
|
|
10
|
+
|
|
11
|
+
from .models import EngineOutput, FieldSpec, ParseSpec, ProcessorSpec, DataValue
|
|
12
|
+
from .processors import apply_processor
|
|
13
|
+
from .uni_selector import select
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ParseEngine:
    """Applies a ParseSpec to HTML content and returns extracted items.

    Diagnostics about ambiguous matches are written to stderr so that callers
    which emit structured data on stdout (such as the CLI) are not disturbed.
    """

    def __init__(self, spec: ParseSpec) -> None:
        self.spec = spec

    def parse(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse HTML string and return EngineOutput with typed fields.

        Args:
            html: Raw HTML content to parse.
            field_path: Optional dot-separated path (e.g. ``"person.name"``) to
                extract only a single field instead of the full spec. Each
                segment must match a key in the ``fields`` dict at that level of
                nesting. Raises ``KeyError`` if a segment is not found or if a
                non-leaf segment has no child fields.
        """
        root = BeautifulSoup(html, "html.parser")
        if field_path is not None:
            data = self._extract_field_path(root, self.spec.fields, field_path.split("."))
        else:
            data = self._extract_fields(root, self.spec.fields)
        # Both helpers return a dict on every path, so no None fallback is needed.
        return EngineOutput(spec=self.spec, data=data)

    def validate(self, html: str, field_path: str | None = None) -> SpecValidationResult:
        """Validate HTML against spec, returning SpecValidationResult with details."""
        return self._validate_output(self.parse(html, field_path=field_path))

    def parse_and_validate(self, html: str, field_path: str | None = None) -> EngineOutput:
        """Parse HTML and validate the parsed output against the spec.

        The HTML is parsed once and that same output is validated.

        Returns:
            The parsed EngineOutput when validation succeeds.

        Raises:
            ValueError: if validation fails; the message carries the first mismatch.
        """
        output = self.parse(html, field_path=field_path)
        validation_result = self._validate_output(output)
        if not validation_result.is_valid:
            raise ValueError(f"Validation failed: {validation_result.mismatches[0]}")
        return output

    def _validate_output(self, output: EngineOutput) -> SpecValidationResult:
        """Check already-extracted data against ``self.spec`` without raising."""
        # Imported lazily, as in the original code (the top-level import is deliberately avoided).
        from .validation.spec_validate import validate_spec_output  # pylint: disable=import-outside-toplevel

        return validate_spec_output(self.spec, output.data, raise_=False)

    @staticmethod
    def _warn_multiple_matches(selector: str) -> None:
        """Report on stderr that a single-valued field matched several elements."""
        import sys  # pylint: disable=import-outside-toplevel

        print(f"Warning: Multiple elements matched for single field: {selector}. Using first match.", file=sys.stderr)

    @staticmethod
    def _extract_field_path(
        node: Tag | BeautifulSoup,
        fields: dict[str, FieldSpec],
        path: list[str],
    ) -> dict[str, DataValue]:
        """Navigate ``fields`` and the HTML tree simultaneously following ``path``.

        Returns a ``{leaf_name: value}`` dict containing only the targeted field,
        nested under the names of its ancestors. An empty dict is returned when
        the leaf yields no value or an intermediate selector matches nothing.
        Raises ``KeyError`` if any path segment is missing or if a non-leaf
        segment has no child ``fields``.
        """
        name, *rest = path
        if name not in fields:
            raise KeyError(f"Field {name!r} not found. Available: {list(fields.keys())}")
        field_spec = fields[name]

        if not rest:
            # Leaf — extract just this field
            value = ParseEngine._extract_field(node, field_spec)
            return {name: value} if value is not None else {}

        # Intermediate — must have child fields and a navigable selector
        if field_spec.fields is None:
            raise KeyError(f"Field {name!r} has no child fields; cannot navigate to {'.'.join(rest)!r}")
        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return {}

        if field_spec.multiple:
            # One partial result per matched parent element.
            items = [ParseEngine._extract_field_path(sub, field_spec.fields, rest) for sub in sub_nodes]
            return cast(dict[str, DataValue], {name: cast(DataValue, items)})

        inner = ParseEngine._extract_field_path(sub_nodes[0], field_spec.fields, rest)
        return cast(dict[str, DataValue], {name: cast(DataValue, inner)})

    @staticmethod
    def _extract_fields(node: Tag | BeautifulSoup, fields: dict[str, FieldSpec]) -> dict[str, DataValue]:
        """Extract every field in ``fields`` relative to ``node``, omitting fields that yield ``None``."""
        result: dict[str, DataValue] = {}
        for name, field_spec in fields.items():
            field_out = ParseEngine._extract_field(node, field_spec)
            if field_out is not None:
                result[name] = field_out
        return result

    @staticmethod
    def _extract_field(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract one field relative to ``node``.

        Returns a list for ``multiple`` fields (empty when nothing matches),
        otherwise a single processed value or ``None`` when nothing matches.
        """
        if field_spec.fields is not None:
            # if this field has child fields, extraction is slightly different
            # we ignore ::text / ::attr on the parent selector.
            return ParseEngine._extract_nested(node, field_spec)

        values = select(node, field_spec.selector)
        if not values:
            return [] if field_spec.multiple else None

        # Resolve once rather than once per matched value.
        processors = field_spec.resolved_processors()
        if field_spec.multiple:
            return cast(DataValue, [ParseEngine.apply_processors(v, processors) for v in values])

        if all(isinstance(v, str) for v in values):
            # Several string matches for a single-valued field are concatenated into one value.
            values = ["".join(cast(list[str], values))]
        if len(values) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine.apply_processors(values[0], processors)

    @staticmethod
    def _extract_nested(node: Tag | BeautifulSoup, field_spec: FieldSpec) -> DataValue | None:
        """Extract a field with child fields, applying child selectors relative to parent elements.

        In this case we ignore ::text / ::attr on the parent selector since it doesn't make sense
        to apply these to a parent element that we're extracting child fields from.
        """
        # Internal invariant (callers check this first); also narrows the type for the checker.
        assert field_spec.fields is not None

        sub_nodes = select(node, field_spec.selector, assert_tags=True)
        if not sub_nodes:
            return [] if field_spec.multiple else None

        if field_spec.multiple:
            # One dict of child-field values per matched parent element.
            return [ParseEngine._extract_fields(sub, field_spec.fields) for sub in sub_nodes]

        if len(sub_nodes) > 1:
            ParseEngine._warn_multiple_matches(field_spec.selector)
        return ParseEngine._extract_fields(sub_nodes[0], field_spec.fields)

    @staticmethod
    def apply_processors(value: object, processors: list[ProcessorSpec]) -> str | float | None:
        """Run ``value`` through ``processors`` in order and return the result.

        ``None`` is returned unchanged without running any processor.

        Raises:
            ValueError: if the final value is not a ``str``, ``float`` or ``None``.
        """
        if value is None:
            return value
        for proc in processors:
            # An empty argument list is passed on as None.
            value = apply_processor(proc.name, value, proc.args if proc.args else None)
        if isinstance(value, (str, float)) or value is None:
            return value
        raise ValueError(f"Unsupported value type after processing: {type(value)}")
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .parser_spec import FieldSpec, FieldType, ParseSpec, ProcessorName, ProcessorSpec
|
|
2
|
+
from .output import DataValue, EngineOutput
|
|
3
|
+
from .validation import SpecValidationResult, ValidationMismatch, ExpectedResults, FileExpectedItems
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"FieldSpec",
|
|
7
|
+
"FieldType",
|
|
8
|
+
"ParseSpec",
|
|
9
|
+
"ProcessorName",
|
|
10
|
+
"ProcessorSpec",
|
|
11
|
+
"DataValue",
|
|
12
|
+
"EngineOutput",
|
|
13
|
+
"SpecValidationResult",
|
|
14
|
+
"ValidationMismatch",
|
|
15
|
+
"ExpectedResults",
|
|
16
|
+
"FileExpectedItems",
|
|
17
|
+
]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Union # pylint: disable=unused-import
|
|
4
|
+
from typing_extensions import TypeAliasType
|
|
5
|
+
|
|
6
|
+
from .yaml import BaseModelWithYamlSupport
|
|
7
|
+
|
|
8
|
+
from .parser_spec import ParseSpec
|
|
9
|
+
|
|
10
|
+
# Recursive alias for any value that can appear in extracted data: None, a float,
# a string, a dict of further values keyed by string, or a list of further values.
# The definition is passed as a string so the alias can refer to itself.
# NOTE(review): the ``Union`` import above is presumably kept so this string can be
# resolved at runtime — confirm before removing it.
DataValue = TypeAliasType(
    "DataValue",
    "Union[None, float, str, dict[str, DataValue], list[DataValue]]",
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EngineOutput(BaseModelWithYamlSupport):
    """Output from the scraping engine for a single file,
    including the parser spec used and the extracted data."""

    # Spec that produced ``data``; optional and defaults to None.
    spec: ParseSpec | None = None
    # Extracted values keyed by field name; nested fields appear as nested dicts/lists.
    data: dict[str, DataValue]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pydantic models for the declarative scraper.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import enum
|
|
8
|
+
from typing import Union
|
|
9
|
+
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
from .yaml import BaseModelWithYamlSupport
|
|
13
|
+
from ..processors import ProcessorName
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ProcessorSpec(BaseModelWithYamlSupport):
    """A single field processor.

    Can be a simple named processor (e.g. "strip") or a parameterised one, eg a regex.
    """

    # Which processor to run; restricted to the names declared by ``ProcessorName``.
    name: ProcessorName
    # Positional arguments for the processor; empty for simple named processors.
    args: list[Union[str, int]] = Field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FieldType(enum.Enum):
    """Type of field to extract, used to determine how to extract it from HTML."""

    # NOTE(review): ``FieldSpec.type`` describes this value as being used to validate
    # extraction; confirm which of the two descriptions is current.
    # The string values are the spellings used when a spec is written as text.
    TEXT = "text"
    LINK = "link"
    NUMBER = "number"
    DATE = "date"
    OBJECT = "object"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FieldSpec(BaseModelWithYamlSupport):
    """Specification for extracting a single field from HTML."""

    selector: str = Field(
        description="""
CSS selector or XPATH selector for the field.
If the fields attribute is not None, this selector should return a parent element or a list of parent elements.
The selector attribute of the child fields will be applied relative to each parent element.
"""
    )
    type: FieldType = Field(
        default=FieldType.TEXT,
        description="Type of field to extract, used to validate extraction.",
    )
    required: bool = Field(default=True, description="Whether this field is required. Used for validation.")

    multiple: bool = Field(
        default=False,
        description="Whether to extract multiple values from this field (i.e. return a list).",
    )
    processors: list[Union[ProcessorName, dict[ProcessorName, list[Union[str, int]]]]] = Field(
        default_factory=list,
        description="List of processors to apply to the extracted value(s). Each processor can be a string (processor name) or a dict mapping processor name to argument list. These are applied in order.",
    )
    fields: dict[str, "FieldSpec"] | None = Field(
        default=None,
        description="Child fields to extract from the element(s) selected by this field. \
Keys in this dict will be keys in the output data.",
    )

    def resolved_processors(self) -> list[ProcessorSpec]:
        """Return ``processors`` as a flat, ordered list of ProcessorSpec objects.

        A bare processor name becomes a spec without arguments; a dict entry
        contributes one spec per ``name -> args`` pair, in dict order.

        Raises:
            ValueError: if an entry is neither a processor name nor a dict.
        """
        resolved: list[ProcessorSpec] = []
        for entry in self.processors:
            if isinstance(entry, (str, ProcessorName)):
                resolved.append(ProcessorSpec(name=entry))
                continue
            if not isinstance(entry, dict):
                raise ValueError(f"Invalid processor spec: {entry}")
            resolved.extend(ProcessorSpec(name=key, args=value) for key, value in entry.items())
        return resolved
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class ParseSpec(BaseModelWithYamlSupport):
    """Top-level declarative parser specification."""

    # Spec format version; defaults to 1.
    version: int = 1
    # Name of this parser spec.
    name: str
    # Top-level fields to extract; each key becomes a key in the extracted data.
    fields: dict[str, FieldSpec]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, field_serializer
|
|
6
|
+
|
|
7
|
+
from .output import DataValue, EngineOutput
|
|
8
|
+
from .yaml import BaseModelWithYamlSupport
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ValidationMismatch(BaseModel):
    """One discrepancy recorded in a ``SpecValidationResult``."""

    # NOTE(review): the meanings below are inferred from the attribute names — confirm
    # against the code that constructs mismatches.
    # Field the mismatch refers to.
    field: str
    # Type the spec called for.
    expected_type: str
    # Type that was actually found.
    actual_type: str
    # Human-readable description of the problem.
    message: str
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SpecValidationResult(BaseModel):
    """Collected mismatches from validating extracted data against a spec."""

    mismatches: list[ValidationMismatch] = Field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when no mismatch was recorded."""
        return not self.mismatches
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FileValidationResult(BaseModel):
    """Validation result for a single HTML file."""

    file_name: str
    item_count: int
    errors: list[str] = Field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when the file produced no errors."""
        return len(self.errors) == 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class TrueValidationResult(BaseModel):
    """Aggregate validation result across all files."""

    file_results: list[FileValidationResult] = Field(default_factory=list)

    @property
    def total_files(self) -> int:
        """Number of per-file results collected."""
        return len(self.file_results)

    @property
    def total_items(self) -> int:
        """Sum of ``item_count`` over all per-file results."""
        counts = [entry.item_count for entry in self.file_results]
        return sum(counts)

    @property
    def failures(self) -> int:
        """Number of per-file results that did not pass."""
        failed = [entry for entry in self.file_results if not entry.passed]
        return len(failed)

    @property
    def passed(self) -> bool:
        """True when no file failed (also true when there are no files)."""
        return not self.failures
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class FileExpectedItems(BaseModel):
    """Expected extraction results for a single example file."""

    # Name of the example file these items belong to.
    file: str
    # Expected extracted values, keyed by field name.
    items: dict[str, DataValue]

    @classmethod
    def from_engine_output(cls, file_name: str, output: EngineOutput) -> FileExpectedItems:
        """Build an instance from ``file_name`` and the ``data`` of an engine output."""
        return cls(file=file_name, items=output.data)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class ExpectedResults(BaseModelWithYamlSupport):
    """Expected extraction output used to validate parser correctness.

    Stored as YAML alongside example data files.
    """

    version: int = 1
    data_path: Path | None = None
    files: list[FileExpectedItems]

    @field_serializer("data_path")
    def serialize_data_path(self, value: Path | None) -> str | None:
        """Emit ``data_path`` as a plain string, or None when it is unset."""
        if value is None:
            return None
        return str(value)
|