ahorn-loader 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ahorn-loader might be problematic.

@@ -0,0 +1,53 @@
+ Metadata-Version: 2.3
+ Name: ahorn-loader
+ Version: 0.1.0
+ Summary: Add your description here
+ Author: Florian Frantzen
+ Author-email: Florian Frantzen <florian.frantzen@cs.rwth-aachen.de>
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: typer>=0.16.0
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+
+ # `ahorn-loader`
+
+ Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
+
+ ## Usage
+
+ `ahorn-loader` is both a command-line application and a Python package for interacting with the AHORN repository for higher-order datasets.
+
+ ### Command-Line Usage
+
+ To use `ahorn-loader` from the command line without installing it, run:
+
+ ```bash
+ uvx ahorn-loader [command] [args]
+ ```
+
+ Commands include:
+ - `ls`: List available datasets in AHORN.
+ - `download`: Download a dataset from AHORN.
+ - `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
+
+ For full help on all available commands and options, run `ahorn-loader --help`.
+
+ ### Python Package Usage
+
+ To use `ahorn-loader` as a Python package, install it via `pip` (or another package manager of your choice):
+
+ ```bash
+ pip install ahorn-loader
+ ```
+
+ Then, you can use it in your Python scripts:
+
+ ```python
+ import ahorn_loader
+ from ahorn_loader.validator import Validator
+
+ # download a dataset
+ ahorn_loader.download_dataset('dataset_name', 'target_path')
+
+ # validate a specific dataset (e.g., before adding it to AHORN)
+ Validator().validate('path_to_dataset_file')
+ ```
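To make the command list concrete, the invocations below follow the documented `uvx ahorn-loader [command] [args]` pattern; the dataset slug and file paths are hypothetical placeholders, not datasets known to exist in AHORN.

```bash
# list all datasets available in AHORN
uvx ahorn-loader ls

# download a dataset (hypothetical slug) into ./data
uvx ahorn-loader download some-dataset-slug ./data

# validate a local dataset file before submitting it to AHORN
uvx ahorn-loader validate ./some-dataset.txt.gz

# show the full help text
uvx ahorn-loader --help
```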
@@ -0,0 +1,42 @@
+ # `ahorn-loader`
+
+ Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
+
+ ## Usage
+
+ `ahorn-loader` is both a command-line application and a Python package for interacting with the AHORN repository for higher-order datasets.
+
+ ### Command-Line Usage
+
+ To use `ahorn-loader` from the command line without installing it, run:
+
+ ```bash
+ uvx ahorn-loader [command] [args]
+ ```
+
+ Commands include:
+ - `ls`: List available datasets in AHORN.
+ - `download`: Download a dataset from AHORN.
+ - `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
+
+ For full help on all available commands and options, run `ahorn-loader --help`.
+
+ ### Python Package Usage
+
+ To use `ahorn-loader` as a Python package, install it via `pip` (or another package manager of your choice):
+
+ ```bash
+ pip install ahorn-loader
+ ```
+
+ Then, you can use it in your Python scripts:
+
+ ```python
+ import ahorn_loader
+ from ahorn_loader.validator import Validator
+
+ # download a dataset
+ ahorn_loader.download_dataset('dataset_name', 'target_path')
+
+ # validate a specific dataset (e.g., before adding it to AHORN)
+ Validator().validate('path_to_dataset_file')
+ ```
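Datasets can also be listed programmatically. A minimal sketch using the functions exported by the `api` module shown later in this diff; the slug is a hypothetical placeholder:

```python
import ahorn_loader

# fetch the dataset index, reusing a cached copy that is at most an hour old
datasets = ahorn_loader.load_datasets_data(cache_lifetime=3600)
for slug, details in datasets.items():
    print(slug, "-", details["title"])

# download one of the listed datasets into ./data
ahorn_loader.download_dataset("some-dataset-slug", "./data")
```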
@@ -0,0 +1,98 @@
+ [project]
+ name = "ahorn-loader"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ authors = [
+     { name = "Florian Frantzen", email = "florian.frantzen@cs.rwth-aachen.de" }
+ ]
+ requires-python = ">=3.12"
+ dependencies = [
+     "requests>=2.32.4",
+     "typer>=0.16.0",
+ ]
+
+ [project.scripts]
+ ahorn-loader = "ahorn_loader.cli:app"
+
+ [build-system]
+ requires = ["uv_build>=0.8.4,<0.9.0"]
+ build-backend = "uv_build"
+
+ [dependency-groups]
+ dev = [
+     "mypy>=1.17.1",
+     "pytest>=8.4.1",
+     "ruff>=0.12.7",
+     "types-requests>=2.32.4",
+ ]
+
+ [tool.mypy]
+ warn_redundant_casts = true
+ warn_unreachable = true
+ warn_unused_ignores = true
+ enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
+
+ [tool.pytest.ini_options]
+ minversion = "7.0"
+ addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
+ xfail_strict = true
+ filterwarnings = ["error"]
+ log_cli_level = "info"
+ testpaths = ["tests"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
+
+ [tool.ruff.lint]
+ select = [
+     "F",    # pyflakes errors
+     "E",    # code style
+     "W",    # warnings
+     "I",    # import order
+     "D",    # pydocstyle rules
+     "UP",   # pyupgrade rules
+     "YTT",  # flake8-2020 rules
+     "S",    # bandit rules
+     "BLE",  # blind except
+     "B",    # bugbear rules
+     "A",    # builtin shadowing
+     "COM",  # comma rules
+     "C4",   # comprehensions
+     "DTZ",  # datetime rules
+     "T10",  # debugger calls
+     "FA",   # future annotations
+     "ISC",  # implicit str concatenation
+     "ICN",  # import conventions
+     "LOG",  # logging rules
+     "G",    # logging format rules
+     "PIE",  # pie rules
+     "Q",    # quote rules
+     "RSE",  # raise rules
+     "RET",  # return rules
+     "SLOT", # slot rules
+     "SIM",  # code simplifications
+     "TID",  # tidy imports
+     "TC",   # type checking rules
+     "PTH",  # use pathlib
+     "PD",   # pandas rules
+     "PLC",  # pylint conventions
+     "PLE",  # pylint errors
+     "FLY",  # flynt
+     "NPY",  # numpy rules
+     "PERF", # performance rules
+     "FURB", # refurb
+     "RUF",  # miscellaneous rules
+ ]
+ ignore = [
+     "E501",   # line too long
+     "COM812", # trailing commas; conflict with `ruff format`
+     "ISC001", # implicitly single-line str concat; conflict with `ruff format`
+ ]
+
+ [tool.ruff.lint.per-file-ignores]
+ "__init__.py" = ["F403"]
+ "tests/**.py" = ["S101"]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "numpy"
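Given the `uv_build` backend and the `dev` dependency group declared above, a typical local workflow would rely on standard `uv` commands. These are not documented by the package itself; they are just the usual tooling implied by this configuration:

```bash
# create a virtual environment and install the project plus the `dev` group
uv sync

# run the configured development tools
uv run pytest
uv run ruff check .
uv run mypy .

# build an sdist and wheel with the declared uv_build backend
uv build
```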
@@ -0,0 +1,3 @@
+ """Library and CLI for loading and managing AHORN datasets."""
+
+ from .api import *
@@ -0,0 +1,111 @@
+ """Module to interact with the Ahorn dataset API."""
+
+ import json
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+
+ import requests
+
+ __all__ = ["download_dataset", "load_dataset_data", "load_datasets_data"]
+
+ DATASET_API_URL = "https://ahorn.rwth-aachen.de/api/datasets.json"
+ CACHE_PATH = Path(__file__).parent.parent.parent / "cache" / "datasets.json"
+
+
+ def load_datasets_data(*, cache_lifetime: int | None = None) -> dict[str, Any]:
+     """Load dataset data from the Ahorn API.
+
+     Parameters
+     ----------
+     cache_lifetime : int, optional
+         How long to reuse cached data in seconds. If not provided, the cache will not
+         be used.
+
+     Returns
+     -------
+     dict[str, Any]
+         Dictionary containing dataset information, where the keys are dataset slugs
+         and the values are dictionaries with dataset details such as title, tags, and
+         attachments.
+     """
+     if CACHE_PATH.exists() and cache_lifetime is not None:
+         with CACHE_PATH.open("r", encoding="utf-8") as cache_file:
+             cache = json.load(cache_file)
+         if (
+             cache.get("time")
+             and (
+                 datetime.now(tz=UTC) - datetime.fromisoformat(cache["time"])
+             ).total_seconds()
+             < cache_lifetime
+         ):
+             return cache["datasets"]
+
+     response = requests.get(DATASET_API_URL, timeout=10)
+     response.raise_for_status()
+
+     CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
+     with CACHE_PATH.open("w", encoding="utf-8") as cache_file:
+         cache_file.write(response.text)
+
+     return response.json()["datasets"]
+
+
+ def load_dataset_data(
+     slug: str, *, cache_lifetime: int | None = None
+ ) -> dict[str, Any]:
+     """Load data for a specific dataset by its slug.
+
+     Parameters
+     ----------
+     slug : str
+         The slug of the dataset to load.
+     cache_lifetime : int, optional
+         How long to reuse cached data in seconds. If not provided, the cache will not
+         be used.
+
+     Returns
+     -------
+     dict[str, Any]
+         Dictionary containing the dataset details.
+     """
+     datasets = load_datasets_data(cache_lifetime=cache_lifetime)
+     if "error" in datasets:
+         return {"error": datasets["error"]}
+
+     return datasets.get(slug, {"error": f"Dataset '{slug}' not found."})
+
+
+ def download_dataset(
+     slug: str, folder: Path | str, *, cache_lifetime: int | None = None
+ ) -> None:
+     """Download a dataset by its slug to the specified folder.
+
+     Parameters
+     ----------
+     slug : str
+         The slug of the dataset to download.
+     folder : Path | str
+         The folder where the dataset should be saved.
+     cache_lifetime : int, optional
+         How long to reuse cached data in seconds. If not provided, the cache will not
+         be used.
+     """
+     if isinstance(folder, str):
+         folder = Path(folder)
+
+     data = load_dataset_data(slug, cache_lifetime=cache_lifetime)
+     if "error" in data:
+         raise ValueError(f"Error loading dataset '{slug}': {data['error']}")
+     if "attachments" not in data or "dataset" not in data["attachments"]:
+         raise KeyError(f"Dataset '{slug}' does not contain required 'attachments/dataset' keys.")
+     dataset_attachment = data["attachments"]["dataset"]
+
+     response = requests.get(dataset_attachment["url"], timeout=10, stream=True)
+     response.raise_for_status()
+     folder.mkdir(parents=True, exist_ok=True)
+     filepath = folder / dataset_attachment["name"]
+     with filepath.open("wb") as f:
+         for chunk in response.iter_content(chunk_size=8192):
+             if chunk:
+                 f.write(chunk)
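A short sketch of using this module directly, based only on the functions defined above: `load_dataset_data` signals a missing dataset through an `"error"` key, `download_dataset` raises on errors, and `cache_lifetime` controls reuse of `cache/datasets.json`. The slug is a hypothetical placeholder:

```python
from ahorn_loader.api import download_dataset, load_dataset_data

# A cache_lifetime of one hour reuses the cached index if it is fresh enough;
# omitting it forces a new request to the AHORN API.
details = load_dataset_data("some-dataset-slug", cache_lifetime=3600)
if "error" in details:
    print(details["error"])
else:
    # Raises ValueError/KeyError if the dataset or its attachment is missing.
    download_dataset("some-dataset-slug", "./data", cache_lifetime=3600)
```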
@@ -0,0 +1,83 @@
+ """Entry point for the ``ahorn-loader`` command-line application."""
+
+ from pathlib import Path
+ from typing import Annotated
+
+ import typer
+ from rich import print as rich_print
+ from rich.table import Table
+
+ from .api import download_dataset, load_datasets_data
+ from .validator import Validator
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def ls() -> None:
+     """List available datasets in AHORN."""
+     try:
+         datasets = load_datasets_data(cache_lifetime=3600)
+     except Exception as e:
+         typer.echo(f"Failed to load datasets: {e}")
+         raise typer.Exit(code=1) from e
+     if "error" in datasets:
+         typer.echo(f"Error: {datasets['error']}")
+         raise typer.Exit(code=1)
+
+     table = Table(title="Available Datasets")
+     table.add_column("Slug", style="cyan")
+     table.add_column("Title", style="magenta")
+     table.add_column("Tags", style="green")
+
+     for slug, details in datasets.items():
+         table.add_row(slug, details["title"], ", ".join(details["tags"]))
+     rich_print(table)
+
+
+ @app.command()
+ def download(
+     name: Annotated[str, typer.Argument(help="The name of the dataset to download.")],
+     folder: Annotated[
+         Path, typer.Argument(help="Folder where the dataset should be saved.")
+     ] = Path(),
+ ) -> None:
+     """Download the specified dataset from AHORN.
+
+     Parameters
+     ----------
+     name : str
+         The name of the dataset to download.
+     folder : Path
+         The folder where the dataset should be saved. Defaults to the current directory.
+     """
+     try:
+         download_dataset(name, folder, cache_lifetime=3600)
+         typer.echo(f"Downloaded dataset to {folder}")
+     except Exception as e:
+         typer.echo(f"Failed to download dataset: {e}")
+         raise typer.Exit(code=1) from e
+
+
+ @app.command()
+ def validate(
+     path: Annotated[
+         Path, typer.Argument(help="The path to the dataset file to validate.")
+     ],
+ ) -> None:
+     """Validate whether a given file is a valid AHORN dataset.
+
+     Parameters
+     ----------
+     path : Path
+         The path to the dataset file to validate.
+     """
+     validator = Validator()
+     if not validator.validate(path):
+         typer.echo("Validation failed.")
+         raise typer.Exit(code=1)
+
+
+ if __name__ == "__main__":
+     app()
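The `app` object above can be exercised without installing the console script, for example with Typer's bundled test runner. This is only an illustration, not a test shipped with the package:

```python
from typer.testing import CliRunner

from ahorn_loader.cli import app

runner = CliRunner()
result = runner.invoke(app, ["ls"])
print(result.exit_code)
print(result.output)
```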
@@ -0,0 +1,3 @@
+ """Module containing the validator for AHORN datasets."""
+
+ from .validator import *
@@ -0,0 +1,119 @@
+ """Module with validation rules for an AHORN dataset."""
+
+ import json
+ import logging
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+
+ __all__ = [
+     "DatasetRule",
+     "FileNameRule",
+     "NetworkLevelMetadataRule",
+     "PreFlightRule",
+ ]
+
+
+ class PreFlightRule(ABC):
+     """Base class for validation rules that run before the dataset file is loaded."""
+
+     def __init__(self) -> None:
+         self.logger = logging.getLogger(self.__class__.__name__)
+
+     @abstractmethod
+     def validate(self, file_path: Path) -> bool:
+         """
+         Validate the dataset before loading it.
+
+         Parameters
+         ----------
+         file_path : pathlib.Path
+             The path of the file to validate.
+
+         Returns
+         -------
+         bool
+             True if the dataset is valid, False otherwise.
+         """
+
+
+ class DatasetRule(ABC):
+     """Base class for validation rules that validate the dataset content."""
+
+     def __init__(self) -> None:
+         self.logger = logging.getLogger(self.__class__.__name__)
+
+     @abstractmethod
+     def validate(self, content: list[str]) -> bool:
+         """
+         Validate the dataset content.
+
+         Parameters
+         ----------
+         content : list[str]
+             The content of the dataset file to validate.
+
+         Returns
+         -------
+         bool
+             True if the dataset content is valid, False otherwise.
+         """
+
+
+ class FileNameRule(PreFlightRule):
+     """Rule to validate file names."""
+
+     def validate(self, file_path: Path) -> bool:
+         """
+         Validate the file name against a specific pattern.
+
+         Parameters
+         ----------
+         file_path : pathlib.Path
+             The path of the file to validate.
+
+         Returns
+         -------
+         bool
+             True if the file name is valid, False otherwise.
+         """
+         if not (file_path.suffix == ".txt" or file_path.name.endswith(".txt.gz")):
+             self.logger.error("Dataset must be a .txt or .txt.gz file.")
+             return False
+
+         # TODO: Check that the file can be read as plain text or as gzipped text.
+
+         self.logger.debug("File name %s is valid.", file_path.name)
+         return True
+
+
+ class NetworkLevelMetadataRule(DatasetRule):
+     """Rule to validate network-level metadata."""
+
+     def validate(self, content: list[str]) -> bool:
+         """
+         Validate the network-level metadata.
+
+         Parameters
+         ----------
+         content : list[str]
+             The content of the dataset file to validate.
+
+         Returns
+         -------
+         bool
+             True if the metadata is valid, False otherwise.
+         """
+         try:
+             metadata = json.loads(content[0])
+         except json.JSONDecodeError:
+             self.logger.error("First line of the dataset must be valid JSON metadata.")
+             return False
+         self.logger.debug(
+             "Parsed network-level metadata successfully.", extra={"metadata": metadata}
+         )
+
+         if "_format_version" not in metadata:
+             self.logger.error("Network-level metadata must contain '_format_version'.")
+             return False
+
+         return True
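New checks can be added by subclassing the abstract rule classes above. A sketch of a hypothetical extra rule (not part of the package) that requires some content after the metadata line:

```python
from ahorn_loader.validator.rules import DatasetRule


class NonEmptyBodyRule(DatasetRule):
    """Require at least one line of data after the metadata header."""

    def validate(self, content: list[str]) -> bool:
        # content holds the raw lines of the dataset file; line 0 is the metadata
        if len(content) < 2:
            self.logger.error("Dataset must contain data after the metadata line.")
            return False
        return True
```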
@@ -0,0 +1,54 @@
+ """Module containing the validator for AHORN datasets."""
+
+ import gzip
+ from pathlib import Path
+
+ from .rules import DatasetRule, FileNameRule, NetworkLevelMetadataRule, PreFlightRule
+
+ __all__ = ["Validator"]
+
+
+ class Validator:
+     """Validator class to manage validation rules."""
+
+     pre_flight_rules: list[PreFlightRule]
+     dataset_rules: list[DatasetRule]
+
+     def __init__(self) -> None:
+         self.pre_flight_rules = [
+             FileNameRule(),
+         ]
+
+         self.dataset_rules = [
+             NetworkLevelMetadataRule(),
+         ]
+
+     def validate(self, dataset_path: Path | str) -> bool:
+         """Run all validation rules.
+
+         Parameters
+         ----------
+         dataset_path : Path | str
+             The path to the dataset file to validate.
+
+         Returns
+         -------
+         bool
+             True if all validation rules pass, False otherwise.
+         """
+         if isinstance(dataset_path, str):
+             dataset_path = Path(dataset_path)
+
+         if not all(
+             rule.validate(file_path=dataset_path) for rule in self.pre_flight_rules
+         ):
+             return False
+
+         if dataset_path.suffix == ".gz":
+             with gzip.open(dataset_path, "rt") as f:
+                 content = f.readlines()
+         else:
+             with dataset_path.open() as f:
+                 content = f.readlines()
+
+         return all(rule.validate(content=content) for rule in self.dataset_rules)
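Putting the pieces together: a minimal sketch that writes a file satisfying both registered rules (a `.txt` name and a first line of JSON metadata containing `_format_version`) and runs the `Validator` on it; the metadata value is a made-up example:

```python
import json
from pathlib import Path

from ahorn_loader.validator import Validator

# a minimal file that passes FileNameRule and NetworkLevelMetadataRule
path = Path("example-dataset.txt")
path.write_text(json.dumps({"_format_version": 1}) + "\n")

print(Validator().validate(path))  # True if both rules pass
```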