ahorn-loader 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ahorn-loader might be problematic.
- ahorn_loader-0.1.0/PKG-INFO +53 -0
- ahorn_loader-0.1.0/README.md +42 -0
- ahorn_loader-0.1.0/pyproject.toml +98 -0
- ahorn_loader-0.1.0/src/ahorn_loader/__init__.py +3 -0
- ahorn_loader-0.1.0/src/ahorn_loader/api.py +111 -0
- ahorn_loader-0.1.0/src/ahorn_loader/cli.py +83 -0
- ahorn_loader-0.1.0/src/ahorn_loader/py.typed +0 -0
- ahorn_loader-0.1.0/src/ahorn_loader/validator/__init__.py +3 -0
- ahorn_loader-0.1.0/src/ahorn_loader/validator/rules.py +119 -0
- ahorn_loader-0.1.0/src/ahorn_loader/validator/validator.py +54 -0
ahorn_loader-0.1.0/PKG-INFO
@@ -0,0 +1,53 @@
Metadata-Version: 2.3
Name: ahorn-loader
Version: 0.1.0
Summary: Add your description here
Author: Florian Frantzen
Author-email: Florian Frantzen <florian.frantzen@cs.rwth-aachen.de>
Requires-Dist: requests>=2.32.4
Requires-Dist: typer>=0.16.0
Requires-Python: >=3.12
Description-Content-Type: text/markdown

# `ahorn-loader`

Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).

## Usage

`ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.

### Command-Line Usage

To install and use `ahorn-loader` from the command line, you can run the following command:

```bash
uvx ahorn-loader [command] [args]
```

Commands include:
- `ls`: List available datasets in AHORN.
- `download`: Download a dataset from AHORN.
- `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).

For full help on available commands and options, run `ahorn-loader --help`.

### Python Package Usage

To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):

```bash
pip install ahorn-loader
```

Then, you can use it in your Python scripts:

```python
import ahorn_loader

# download a dataset
ahorn_loader.download('dataset_name', 'target_path')

# validate a specific dataset (e.g., before adding it to AHORN)
ahorn_loader.validate('path_to_dataset_file')
```
ahorn_loader-0.1.0/README.md
@@ -0,0 +1,42 @@
# `ahorn-loader`

Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).

## Usage

`ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.

### Command-Line Usage

To install and use `ahorn-loader` from the command line, you can run the following command:

```bash
uvx ahorn-loader [command] [args]
```

Commands include:
- `ls`: List available datasets in AHORN.
- `download`: Download a dataset from AHORN.
- `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).

For full help on available commands and options, run `ahorn-loader --help`.

### Python Package Usage

To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):

```bash
pip install ahorn-loader
```

Then, you can use it in your Python scripts:

```python
import ahorn_loader

# download a dataset
ahorn_loader.download('dataset_name', 'target_path')

# validate a specific dataset (e.g., before adding it to AHORN)
ahorn_loader.validate('path_to_dataset_file')
```
ahorn_loader-0.1.0/pyproject.toml
@@ -0,0 +1,98 @@
[project]
name = "ahorn-loader"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
    { name = "Florian Frantzen", email = "florian.frantzen@cs.rwth-aachen.de" }
]
requires-python = ">=3.12"
dependencies = [
    "requests>=2.32.4",
    "typer>=0.16.0",
]

[project.scripts]
ahorn-loader = "ahorn_loader.cli:app"

[build-system]
requires = ["uv_build>=0.8.4,<0.9.0"]
build-backend = "uv_build"

[dependency-groups]
dev = [
    "mypy>=1.17.1",
    "pytest>=8.4.1",
    "ruff>=0.12.7",
    "types-requests>=2.32.4",
]

[tool.mypy]
warn_redundant_casts = true
warn_unreachable = true
warn_unused_ignores = true
enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]

[tool.pytest.ini_options]
minversion = "7.0"
addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
xfail_strict = true
filterwarnings = ["error"]
log_cli_level = "info"
testpaths = ["tests"]

[tool.ruff.format]
docstring-code-format = true

[tool.ruff.lint]
select = [
    "F",    # pyflakes errors
    "E",    # code style
    "W",    # warnings
    "I",    # import order
    "D",    # pydocstyle rules
    "UP",   # pyupgrade rules
    "YTT",  # flake8-2020 rules
    "S",    # bandit rules
    "BLE",  # blind except
    "B",    # bugbear rules
    "A",    # builtin shadowing
    "COM",  # comma rules
    "C4",   # comprehensions
    "DTZ",  # datetime rules
    "T10",  # debugger calls
    "FA",   # future annotations
    "ISC",  # implicit str concatenation
    "ICN",  # import conventions
    "LOG",  # logging rules
    "G",    # logging format rules
    "PIE",  # pie rules
    "Q",    # quote rules
    "RSE",  # raise rules
    "RET",  # return rules
    "SLOT", # slot rules
    "SIM",  # code simplifications
    "TID",  # tidy imports
    "TC",   # type checking rules
    "PTH",  # use pathlib
    "PD",   # pandas rules
    "PLC",  # pylint conventions
    "PLE",  # pylint errors
    "FLY",  # flynt
    "NPY",  # numpy rules
    "PERF", # performance rules
    "FURB", # refurb
    "RUF",  # miscellaneous rules
]
ignore = [
    "E501",   # line too long
    "COM812", # trailing commas; conflict with `ruff format`
    "ISC001", # implicitly single-line str concat; conflict with `ruff format`
]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F403"]
"tests/**.py" = ["S101"]

[tool.ruff.lint.pydocstyle]
convention = "numpy"
ahorn_loader-0.1.0/src/ahorn_loader/api.py
@@ -0,0 +1,111 @@
"""Module to interact with the Ahorn dataset API."""

import json
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import requests

__all__ = ["download_dataset", "load_dataset_data", "load_datasets_data"]

DATASET_API_URL = "https://ahorn.rwth-aachen.de/api/datasets.json"
CACHE_PATH = Path(__file__).parent.parent.parent / "cache" / "datasets.json"


def load_datasets_data(*, cache_lifetime: int | None = None) -> dict[str, Any]:
    """Load dataset data from the Ahorn API.

    Parameters
    ----------
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    dict[str, Any]
        Dictionary containing dataset information, where the keys are dataset slugs
        and the values are dictionaries with dataset details such as title, tags, and
        attachments.
    """
    if CACHE_PATH.exists() and cache_lifetime is not None:
        with CACHE_PATH.open("r", encoding="utf-8") as cache_file:
            cache = json.load(cache_file)
        if (
            cache.get("time")
            and (
                datetime.now(tz=UTC) - datetime.fromisoformat(cache["time"])
            ).total_seconds()
            < cache_lifetime
        ):
            return cache["datasets"]

    response = requests.get(DATASET_API_URL, timeout=10)
    response.raise_for_status()

    CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    with CACHE_PATH.open("w", encoding="utf-8") as cache_file:
        cache_file.write(response.text)

    return response.json()["datasets"]


def load_dataset_data(
    slug: str, *, cache_lifetime: int | None = None
) -> dict[str, Any]:
    """Load data for a specific dataset by its slug.

    Parameters
    ----------
    slug : str
        The slug of the dataset to load.
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    dict[str, Any]
        Dictionary containing the dataset details.
    """
    datasets = load_datasets_data(cache_lifetime=cache_lifetime)
    if "error" in datasets:
        return {"error": datasets["error"]}

    return datasets.get(slug, {"error": f"Dataset '{slug}' not found."})


def download_dataset(
    slug: str, folder: Path | str, *, cache_lifetime: int | None = None
) -> None:
    """Download a dataset by its slug to the specified folder.

    Parameters
    ----------
    slug : str
        The slug of the dataset to download.
    folder : Path | str
        The folder where the dataset should be saved.
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.
    """
    if isinstance(folder, str):
        folder = Path(folder)

    data = load_dataset_data(slug, cache_lifetime=cache_lifetime)
    if "error" in data:
        raise ValueError(f"Error loading dataset '{slug}': {data['error']}")
    if "attachments" not in data or "dataset" not in data["attachments"]:
        raise KeyError(f"Dataset '{slug}' does not contain required 'attachments/dataset' keys.")
    dataset_attachment = data["attachments"]["dataset"]

    response = requests.get(dataset_attachment["url"], timeout=10, stream=True)
    response.raise_for_status()
    folder.mkdir(parents=True, exist_ok=True)
    filepath = folder / dataset_attachment["name"]
    with filepath.open("wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
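For orientation, a minimal sketch of how the three functions above fit together. The slug `example-dataset` is a hypothetical placeholder, not a dataset known to exist in AHORN, and the sketch imports from `ahorn_loader.api` directly because the three-line `__init__.py` is not shown in this diff:

```python
from pathlib import Path

from ahorn_loader.api import download_dataset, load_dataset_data, load_datasets_data

# List all datasets, reusing the cached index if it is younger than one hour.
datasets = load_datasets_data(cache_lifetime=3600)
for slug, details in datasets.items():
    print(slug, "-", details.get("title", ""))

# Inspect a single dataset by slug ("example-dataset" is a hypothetical placeholder).
info = load_dataset_data("example-dataset", cache_lifetime=3600)

# Download its attachment into ./data; missing datasets raise ValueError/KeyError.
if "error" not in info:
    download_dataset("example-dataset", Path("data"), cache_lifetime=3600)
```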
ahorn_loader-0.1.0/src/ahorn_loader/cli.py
@@ -0,0 +1,83 @@
"""Entry point for the ``ahorn-loader`` command-line application."""

from pathlib import Path
from typing import Annotated

import typer
from rich import print as rich_print
from rich.table import Table

from .api import download_dataset, load_datasets_data
from .validator import Validator

app = typer.Typer()


@app.command()
def ls() -> None:
    """List available datasets in AHORN."""
    try:
        datasets = load_datasets_data(cache_lifetime=3600)
    except Exception as e:
        typer.echo(f"Failed to load datasets: {e}")
        raise typer.Exit(code=1) from e

    if "error" in datasets:
        typer.echo(f"Error: {datasets['error']}")
        raise typer.Exit(code=1)

    table = Table(title="Available Datasets")
    table.add_column("Slug", style="cyan")
    table.add_column("Title", style="magenta")
    table.add_column("Tags", style="green")

    for slug, details in datasets.items():
        table.add_row(slug, details["title"], ", ".join(details["tags"]))
    rich_print(table)


@app.command()
def download(
    name: Annotated[str, typer.Argument(help="The name of the dataset to download.")],
    folder: Annotated[
        Path, typer.Argument(help="Folder where the dataset should be saved.")
    ] = Path(),
) -> None:
    """Download the specified dataset from AHORN.

    Parameters
    ----------
    name : str
        The name of the dataset to download.
    folder : Path
        The folder where the dataset should be saved. Defaults to the current directory.
    """
    try:
        download_dataset(name, folder, cache_lifetime=3600)
        typer.echo(f"Downloaded dataset to {folder}")
    except Exception as e:
        typer.echo(f"Failed to download dataset: {e}")
        raise typer.Exit(code=1) from e


@app.command()
def validate(
    path: Annotated[
        Path, typer.Argument(help="The path to the dataset file to validate.")
    ],
) -> None:
    """Validate whether a given file is a valid AHORN dataset.

    Parameters
    ----------
    path : Path
        The path to the dataset file to validate.
    """
    validator = Validator()
    if not validator.validate(path):
        typer.echo("Validation failed.")
        raise typer.Exit(code=1)


if __name__ == "__main__":
    app()
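The Typer app above can also be driven programmatically, for example from tests. A minimal sketch using Typer's bundled test runner; this release ships no tests, so the snippet is purely illustrative:

```python
from typer.testing import CliRunner

from ahorn_loader.cli import app

runner = CliRunner()

# Invoke the `ls` command as if it were called from the shell.
result = runner.invoke(app, ["ls"])

print(result.exit_code)  # 0 on success, 1 if loading the dataset index failed
print(result.output)     # the rendered "Available Datasets" table
```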
ahorn_loader-0.1.0/src/ahorn_loader/py.typed
File without changes (empty PEP 561 marker file).
ahorn_loader-0.1.0/src/ahorn_loader/validator/rules.py
@@ -0,0 +1,119 @@
"""Module with validation rules for an AHORN dataset."""

import json
import logging
from abc import ABC, abstractmethod
from pathlib import Path

__all__ = [
    "DatasetRule",
    "FileNameRule",
    "NetworkLevelMetadataRule",
    "PreFlightRule",
]


class PreFlightRule(ABC):
    """Base class for validation rules that run before the dataset file is loaded."""

    def __init__(self) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    def validate(self, file_path: Path) -> bool:
        """
        Validate the dataset before loading it.

        Parameters
        ----------
        file_path : pathlib.Path
            The path of the file to validate.

        Returns
        -------
        bool
            True if the dataset is valid, False otherwise.
        """


class DatasetRule(ABC):
    """Base class for validation rules that validate the dataset content."""

    def __init__(self) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    def validate(self, content: list[str]) -> bool:
        """
        Validate the dataset content.

        Parameters
        ----------
        content : list[str]
            The content of the dataset file to validate.

        Returns
        -------
        bool
            True if the dataset content is valid, False otherwise.
        """


class FileNameRule(PreFlightRule):
    """Rule to validate file names."""

    def validate(self, file_path: Path) -> bool:
        """
        Validate the file name against a specific pattern.

        Parameters
        ----------
        file_path : pathlib.Path
            The path of the file to validate.

        Returns
        -------
        bool
            True if the file name is valid, False otherwise.
        """
        if not (file_path.suffix == ".txt" or file_path.name.endswith(".txt.gz")):
            self.logger.error("Dataset must be a .txt or .txt.gz file.")
            return False

        # TODO: Check that the file can be read as plain text or as gzipped text.

        self.logger.debug("File name %s is valid.", file_path.name)
        return True


class NetworkLevelMetadataRule(DatasetRule):
    """Rule to validate network-level metadata."""

    def validate(self, content: list[str]) -> bool:
        """
        Validate the network-level metadata.

        Parameters
        ----------
        content : list[str]
            The content of the dataset file to validate.

        Returns
        -------
        bool
            True if the metadata is valid, False otherwise.
        """
        try:
            metadata = json.loads(content[0])
        except json.JSONDecodeError:
            self.logger.error("First line of the dataset must be valid JSON metadata.")
            return False
        self.logger.debug(
            "Parsed network-level metadata successfully.", extra={"metadata": metadata}
        )

        if "_format_version" not in metadata:
            self.logger.error("Network-level metadata must contain '_format_version'.")
            return False

        return True
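Additional checks plug into this scheme by subclassing one of the two base classes above. A minimal sketch of a hypothetical content rule that is not part of this release, requiring at least one data line after the metadata header:

```python
from ahorn_loader.validator.rules import DatasetRule


class NonEmptyBodyRule(DatasetRule):
    """Hypothetical rule: require at least one data line after the JSON metadata."""

    def validate(self, content: list[str]) -> bool:
        # content[0] holds the network-level metadata; the rest is the dataset body.
        if len(content) < 2 or not any(line.strip() for line in content[1:]):
            self.logger.error("Dataset contains no data lines after the metadata.")
            return False
        return True
```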
ahorn_loader-0.1.0/src/ahorn_loader/validator/validator.py
@@ -0,0 +1,54 @@
"""Module containing the validator for AHORN datasets."""

import gzip
from pathlib import Path

from .rules import DatasetRule, FileNameRule, NetworkLevelMetadataRule, PreFlightRule

__all__ = ["Validator"]


class Validator:
    """Validator class to manage validation rules."""

    pre_flight_rules: list[PreFlightRule]
    dataset_rules: list[DatasetRule]

    def __init__(self) -> None:
        self.pre_flight_rules = [
            FileNameRule(),
        ]

        self.dataset_rules = [
            NetworkLevelMetadataRule(),
        ]

    def validate(self, dataset_path: Path | str) -> bool:
        """Run all validation rules.

        Parameters
        ----------
        dataset_path : Path | str
            The path to the dataset file to validate.

        Returns
        -------
        bool
            True if all validation rules pass, False otherwise.
        """
        if isinstance(dataset_path, str):
            dataset_path = Path(dataset_path)

        if not all(
            rule.validate(file_path=dataset_path) for rule in self.pre_flight_rules
        ):
            return False

        if dataset_path.suffix == ".gz":
            with gzip.open(dataset_path, "rt") as f:
                content = f.readlines()
        else:
            with dataset_path.open() as f:
                content = f.readlines()

        return all(rule.validate(content=content) for rule in self.dataset_rules)
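Used directly, the validator mirrors what the `validate` CLI command does. A minimal sketch, with logging enabled so the rules' error messages are visible; the file name is a hypothetical placeholder:

```python
import logging

from ahorn_loader.validator import Validator

# Surface the rules' error and debug messages on the console.
logging.basicConfig(level=logging.INFO)

validator = Validator()

# "my_dataset.txt.gz" is a hypothetical placeholder path.
if validator.validate("my_dataset.txt.gz"):
    print("Dataset passed all validation rules.")
else:
    print("Dataset is not a valid AHORN dataset.")
```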