ahorn-loader 0.4.1 (py3-none-any.whl)
- ahorn_loader/__init__.py +3 -0
- ahorn_loader/api.py +261 -0
- ahorn_loader/cli.py +82 -0
- ahorn_loader/py.typed +0 -0
- ahorn_loader/utils/__init__.py +3 -0
- ahorn_loader/utils/cache.py +29 -0
- ahorn_loader/validator/__init__.py +3 -0
- ahorn_loader/validator/rules.py +119 -0
- ahorn_loader/validator/validator.py +54 -0
- ahorn_loader-0.4.1.dist-info/METADATA +85 -0
- ahorn_loader-0.4.1.dist-info/RECORD +13 -0
- ahorn_loader-0.4.1.dist-info/WHEEL +4 -0
- ahorn_loader-0.4.1.dist-info/entry_points.txt +3 -0
ahorn_loader/__init__.py
ADDED
ahorn_loader/api.py
ADDED
@@ -0,0 +1,261 @@
"""Module to interact with the Ahorn dataset API."""

import contextlib
import gzip
import json
import logging
from collections.abc import Generator, Iterator
from datetime import UTC, datetime
from pathlib import Path
from typing import TypedDict
from urllib.parse import ParseResult, urlparse

import httpx
from httpx_retries import Retry, RetryTransport

from .utils import get_cache_dir

__all__ = [
    "download_dataset",
    "load_dataset_data",
    "load_datasets_data",
    "read_dataset",
]

DATASET_API_URL = "https://ahorn.rwth-aachen.de/api/datasets.json"

logger = logging.getLogger(__name__)


class AttachmentDict(TypedDict):
    url: str
    size: int


class DatasetDict(TypedDict):
    slug: str
    title: str
    tags: list[str]
    attachments: dict[str, AttachmentDict]


class DatasetsDataDict(TypedDict):
    datasets: dict[str, DatasetDict]
    time: str


def load_datasets_data(*, cache_lifetime: int | None = None) -> dict[str, DatasetDict]:
    """Load dataset data from the Ahorn API.

    Parameters
    ----------
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    dict[str, DatasetDict]
        Dictionary containing dataset information, where the keys are dataset slugs
        and the values are dictionaries with dataset details such as title, tags, and
        attachments.
    """
    datasets_data_cache = get_cache_dir() / "datasets.json"
    if datasets_data_cache.exists() and cache_lifetime is not None:
        cache_mtime = datetime.fromtimestamp(
            datasets_data_cache.stat().st_mtime, tz=UTC
        )
        age_seconds = (datetime.now(tz=UTC) - cache_mtime).total_seconds()
        if age_seconds < cache_lifetime:
            logger.info(
                "Using cached datasets list (age=%.1fs, lifetime=%ss)",
                age_seconds,
                cache_lifetime,
            )
            with datasets_data_cache.open("r", encoding="utf-8") as cache_file:
                cache: DatasetsDataDict = json.load(cache_file)
            return cache["datasets"]

    logger.info("Fetching datasets list from %s", DATASET_API_URL)
    response = httpx.get(DATASET_API_URL, timeout=10)
    response.raise_for_status()

    datasets_data_cache.parent.mkdir(parents=True, exist_ok=True)
    with datasets_data_cache.open("w", encoding="utf-8") as cache_file:
        cache_file.write(response.text)

    response_json: DatasetsDataDict = response.json()
    return response_json["datasets"]


def load_dataset_data(slug: str, *, cache_lifetime: int | None = None) -> DatasetDict:
    """Load data for a specific dataset by its slug.

    Parameters
    ----------
    slug : str
        The slug of the dataset to load.
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    DatasetDict
        Dictionary containing the dataset details.

    Raises
    ------
    KeyError
        If the dataset with the given `slug` does not exist.
    """
    datasets = load_datasets_data(cache_lifetime=cache_lifetime)

    if slug not in datasets:
        raise KeyError(f"Dataset with slug '{slug}' does not exist in AHORN.")

    return datasets[slug]


def get_dataset_url(slug: str, *, cache_lifetime: int | None = None) -> str:
    """Get the download URL for a specific dataset by its slug.

    Parameters
    ----------
    slug : str
        The slug of the dataset.
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    str
        The download URL of the dataset.

    Raises
    ------
    KeyError
        If the dataset with the given `slug` does not exist.
    RuntimeError
        If the dataset does not contain the required attachment information.
    """
    data = load_dataset_data(slug, cache_lifetime=cache_lifetime)
    if "dataset" not in data["attachments"]:
        raise RuntimeError(
            f"Dataset '{slug}' does not contain required 'attachments/dataset' keys."
        )
    return data["attachments"]["dataset"]["url"]


def download_dataset(
    slug: str, folder: Path | str, *, cache_lifetime: int | None = None
) -> Path:
    """Download a dataset by its slug to the specified folder.

    This function implements an exponential backoff strategy when encountering HTTP 429
    (Too Many Requests) responses. If available, it respects the 'Retry-After' header to
    determine the wait time before retrying.

    Parameters
    ----------
    slug : str
        The slug of the dataset to download.
    folder : Path | str
        The folder where the dataset should be saved.
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    Path
        The path to the downloaded dataset file.

    Raises
    ------
    KeyError
        If the dataset with the given `slug` does not exist.
    HTTPError
        If the server responds with an error status while downloading the file.
    RuntimeError
        If the dataset does not contain the required attachment information.
    """
    if isinstance(folder, str):
        folder = Path(folder)

    logger.info("Preparing download for dataset '%s' into %s", slug, folder)
    download_url = get_dataset_url(slug, cache_lifetime=cache_lifetime)

    url: ParseResult = urlparse(download_url)
    folder.mkdir(parents=True, exist_ok=True)
    filepath = folder / url.path.split("/")[-1]

    # Use RetryTransport to automatically handle rate limiting (429) with exponential
    # backoff. This also automatically respects 'Retry-After' headers if provided.
    retry = Retry(
        total=5,
        backoff_factor=2.0,
    )
    retry_transport = RetryTransport(retry=retry)

    with (
        httpx.Client(transport=retry_transport, timeout=10) as client,
        client.stream("GET", download_url) as response,
    ):
        response.raise_for_status()

        with filepath.open("wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)

    logger.info("Downloaded dataset '%s' to %s", slug, filepath)

    return filepath


@contextlib.contextmanager
def read_dataset(slug: str) -> Generator[Iterator[str], None, None]:
    """Download and yield a context-managed file object for the dataset lines by slug.

    The dataset file will be stored in your system cache and can be deleted according
    to your system's cache policy. To ensure that costly re-downloads do not occur, use
    the `download_dataset` function to store the dataset file at a more permanent
    location.

    Parameters
    ----------
    slug : str
        The slug of the dataset to download.

    Returns
    -------
    Context manager yielding an open file object (iterator over lines).

    Raises
    ------
    KeyError
        If the dataset with the given `slug` does not exist.
    RuntimeError
        If the dataset file could not be downloaded due to other errors.

    Examples
    --------
    >>> import ahorn_loader
    >>> with ahorn_loader.read_dataset("contact-high-school") as dataset:
    ...     for line in dataset:
    ...         ...
    """
    download_url = get_dataset_url(slug)
    filepath = get_cache_dir() / download_url.split("/")[-1]

    # Download the dataset if it is not already cached
    if not filepath.exists():
        filepath = download_dataset(slug, get_cache_dir())

    if filepath.suffix == ".gz":
        with gzip.open(filepath, mode="rt", encoding="utf-8") as f:
            yield f
    else:
        with filepath.open("r", encoding="utf-8") as f:
            yield f
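For orientation, a minimal usage sketch of the API above; it assumes the package is installed and the AHORN server is reachable, and reuses the `contact-high-school` slug from the module's own doctest (which may or may not match a dataset available to you):

```python
import logging
from pathlib import Path

import ahorn_loader

logging.basicConfig(level=logging.INFO)

# Fetch the dataset index, reusing a cached copy that is at most an hour old.
datasets = ahorn_loader.load_datasets_data(cache_lifetime=3600)
for slug, details in datasets.items():
    print(f"{slug}: {details['title']} (tags: {', '.join(details['tags'])})")

# Download a single dataset into ./data; the returned path points at the saved file.
path = ahorn_loader.download_dataset("contact-high-school", Path("data"))
print(f"Saved to {path}")
```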
ahorn_loader/cli.py
ADDED
@@ -0,0 +1,82 @@
"""Entry point for the ``ahorn-loader`` command-line application."""

from pathlib import Path
from typing import Annotated

import typer
from rich import print as rich_print
from rich.table import Table

from .api import download_dataset, load_datasets_data
from .validator import Validator

app = typer.Typer()


@app.command()
def ls() -> None:
    """List available datasets in AHORN."""
    try:
        datasets = load_datasets_data(cache_lifetime=3600)
    except Exception as e:
        typer.echo(f"Failed to load datasets: {e}")
        raise typer.Exit(code=1) from e

    # Check for an API error outside the `try` block so that the `typer.Exit`
    # raised here is not swallowed by the `except Exception` handler above.
    if "error" in datasets:
        typer.echo(f"Error: {datasets['error']}")
        raise typer.Exit(code=1)

    table = Table(title="Available Datasets")
    table.add_column("Slug", style="cyan")
    table.add_column("Title", style="magenta")
    table.add_column("Tags", style="green")

    for slug, details in datasets.items():
        table.add_row(slug, details["title"], ", ".join(details["tags"]))
    rich_print(table)


@app.command()
def download(
    name: Annotated[str, typer.Argument(help="The name of the dataset to download.")],
    folder: Annotated[
        Path, typer.Argument(help="Folder where the dataset should be saved.")
    ] = Path(),
) -> None:
    """Download the specified dataset from AHORN.

    Parameters
    ----------
    name : str
        The name of the dataset to download.
    folder : Path
        The folder where the dataset should be saved. Defaults to the current directory.
    """
    try:
        download_dataset(name, folder, cache_lifetime=3600)
        typer.echo(f"Downloaded dataset to {folder.absolute()}")
    except Exception as e:
        typer.echo(f"Failed to download dataset: {e}")
        raise typer.Exit(code=1) from e


@app.command()
def validate(
    path: Annotated[
        Path, typer.Argument(help="The path to the dataset file to validate.")
    ],
) -> None:
    """Validate whether a given file is a valid AHORN dataset.

    Parameters
    ----------
    path : Path
        The path to the dataset file to validate.
    """
    validator = Validator()
    if not validator.validate(path):
        typer.echo("Validation failed.")
        raise typer.Exit(code=1)


if __name__ == "__main__":
    app()
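Because the subcommands are plain Typer commands, they can also be exercised from Python without a shell. A minimal sketch using Typer's bundled test runner; nothing here is specific to ahorn-loader beyond the `app` object defined above:

```python
from typer.testing import CliRunner

from ahorn_loader.cli import app

runner = CliRunner()

# Equivalent to running `ahorn-loader ls` in a shell.
result = runner.invoke(app, ["ls"])
print(result.exit_code)
print(result.output)
```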
ahorn_loader/py.typed
ADDED
File without changes
ahorn_loader/utils/cache.py
ADDED
@@ -0,0 +1,29 @@
"""Module with cache-related utility functions."""

import os
import sys
from pathlib import Path

__all__ = ["get_cache_dir"]


def get_cache_dir() -> Path:
    """Return an appropriate cache location for the current platform.

    Returns
    -------
    pathlib.Path
        Platform-dependent cache directory.
    """
    match sys.platform:
        case "win32":
            base = os.getenv("LOCALAPPDATA") or Path("~\\AppData\\Local").expanduser()
            return Path(base) / "ahorn-loader" / "Cache"
        case "darwin":
            return Path.home() / "Library" / "Caches" / "ahorn-loader"
        case _:
            # Linux and other Unix
            xdg = os.getenv("XDG_CACHE_HOME")
            if xdg:
                return Path(xdg) / "ahorn-loader"
            return Path.home() / ".cache" / "ahorn-loader"
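A minimal sketch of the XDG branch, assuming a Linux host (on Windows and macOS the other `match` arms apply); the `/tmp/ahorn-cache` path is illustrative:

```python
import os
from pathlib import Path

from ahorn_loader.utils import get_cache_dir

# XDG_CACHE_HOME, when set, takes precedence over the ~/.cache fallback.
os.environ["XDG_CACHE_HOME"] = "/tmp/ahorn-cache"
assert get_cache_dir() == Path("/tmp/ahorn-cache") / "ahorn-loader"
```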
ahorn_loader/validator/rules.py
ADDED
@@ -0,0 +1,119 @@
"""Module with validation rules for an AHORN dataset."""

import json
import logging
from abc import ABC, abstractmethod
from pathlib import Path

__all__ = [
    "DatasetRule",
    "FileNameRule",
    "NetworkLevelMetadataRule",
    "PreFlightRule",
]


class PreFlightRule(ABC):
    """Base class for validation rules that run before the dataset file is loaded."""

    def __init__(self) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    def validate(self, file_path: Path) -> bool:
        """
        Validate the dataset before loading it.

        Parameters
        ----------
        file_path : pathlib.Path
            The path of the file to validate.

        Returns
        -------
        bool
            True if the dataset is valid, False otherwise.
        """


class DatasetRule(ABC):
    """Base class for validation rules that validate the dataset content."""

    def __init__(self) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    def validate(self, content: list[str]) -> bool:
        """
        Validate the dataset content.

        Parameters
        ----------
        content : list[str]
            The content of the dataset file to validate.

        Returns
        -------
        bool
            True if the dataset content is valid, False otherwise.
        """


class FileNameRule(PreFlightRule):
    """Rule to validate file names."""

    def validate(self, file_path: Path) -> bool:
        """
        Validate the file name against a specific pattern.

        Parameters
        ----------
        file_path : pathlib.Path
            The path of the file to validate.

        Returns
        -------
        bool
            True if the file name is valid, False otherwise.
        """
        if not (file_path.suffix == ".txt" or file_path.name.endswith(".txt.gz")):
            self.logger.error("Dataset must be a .txt or .txt.gz file.")
            return False

        # TODO: Check that the file can be read as plain text or as gzipped text.

        self.logger.debug("File name %s is valid.", file_path.name)
        return True


class NetworkLevelMetadataRule(DatasetRule):
    """Rule to validate network-level metadata."""

    def validate(self, content: list[str]) -> bool:
        """
        Validate the network-level metadata.

        Parameters
        ----------
        content : list[str]
            The content of the dataset file to validate.

        Returns
        -------
        bool
            True if the metadata is valid, False otherwise.
        """
        try:
            metadata = json.loads(content[0])
        except json.JSONDecodeError:
            self.logger.error("First line of the dataset must be valid JSON metadata.")
            return False
        self.logger.debug(
            "Parsed network-level metadata successfully.", extra={"metadata": metadata}
        )

        if "_format_version" not in metadata:
            self.logger.error("Network-level metadata must contain '_format_version'.")
            return False

        return True
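The two ABCs above are the extension points for further checks. A hypothetical example rule; the `MaxSizeRule` name and the 100 MiB limit are illustrative and not part of the package:

```python
from pathlib import Path

from ahorn_loader.validator.rules import PreFlightRule


class MaxSizeRule(PreFlightRule):
    """Hypothetical pre-flight rule: reject dataset files larger than 100 MiB."""

    def validate(self, file_path: Path) -> bool:
        # self.logger is provided by the PreFlightRule base class.
        if file_path.stat().st_size > 100 * 1024 * 1024:
            self.logger.error("Dataset file exceeds 100 MiB.")
            return False
        return True
```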
ahorn_loader/validator/validator.py
ADDED
@@ -0,0 +1,54 @@
"""Module containing the validator for AHORN datasets."""

import gzip
from pathlib import Path

from .rules import DatasetRule, FileNameRule, NetworkLevelMetadataRule, PreFlightRule

__all__ = ["Validator"]


class Validator:
    """Validator class to manage validation rules."""

    pre_flight_rules: list[PreFlightRule]
    dataset_rules: list[DatasetRule]

    def __init__(self) -> None:
        self.pre_flight_rules = [
            FileNameRule(),
        ]

        self.dataset_rules = [
            NetworkLevelMetadataRule(),
        ]

    def validate(self, dataset_path: Path | str) -> bool:
        """Run all validation rules.

        Parameters
        ----------
        dataset_path : Path | str
            The path to the dataset file to validate.

        Returns
        -------
        bool
            True if all validation rules pass, False otherwise.
        """
        if isinstance(dataset_path, str):
            dataset_path = Path(dataset_path)

        if not all(
            rule.validate(file_path=dataset_path) for rule in self.pre_flight_rules
        ):
            return False

        if dataset_path.suffix == ".gz":
            with gzip.open(dataset_path, "rt") as f:
                content = f.readlines()
        else:
            with dataset_path.open() as f:
                content = f.readlines()

        return all(rule.validate(content=content) for rule in self.dataset_rules)
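A minimal sketch of using the validator standalone, mirroring what the `validate` CLI command does; the file name is illustrative:

```python
from ahorn_loader.validator import Validator

validator = Validator()

# Pre-flight rules run on the path first; only then is the file read and its
# content passed to the dataset rules.
if validator.validate("my-dataset.txt.gz"):
    print("Dataset passed all validation rules.")
else:
    print("Dataset failed validation; see log output for details.")
```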
ahorn_loader-0.4.1.dist-info/METADATA
ADDED
@@ -0,0 +1,85 @@
Metadata-Version: 2.4
Name: ahorn-loader
Version: 0.4.1
Summary: Library and command-line application to interact with datasets in the Aachen Higher-Order Repository of Networks.
Author: Florian Frantzen
Author-email: Florian Frantzen <frantzen@netsci.rwth-aachen.de>
License-Expression: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Topic :: Scientific/Engineering
Requires-Dist: httpx>=0.27.0
Requires-Dist: httpx-retries>=0.4.5
Requires-Dist: typer>=0.16.0
Requires-Python: >=3.12
Description-Content-Type: text/markdown

# `ahorn-loader`

Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).

<div align="center">

[](https://www.python.org/)
[](https://github.com/netsci-rwth/ahorn-loader/blob/main/LICENSE)

</div>

## Usage

`ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.

### Command-Line Usage

To install and use `ahorn-loader` from the command line, you can run the following command:

```bash
uvx ahorn-loader [command] [args]
```

Commands include:

- `ls`: List available datasets in AHORN.
- `download`: Download a dataset from AHORN.
- `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).

For a full overview of available commands and options, run `ahorn-loader --help`.

### Python Package Usage

To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):

```bash
pip install ahorn-loader
```

Then, you can use it in your Python scripts:

```python
import ahorn_loader

# Download a dataset:
ahorn_loader.download_dataset("dataset_name", "target_path")

# Download and read a dataset:
# The dataset will be stored in your system's cache. For a more permanent storage
# location, use `ahorn_loader.download_dataset` instead.
with ahorn_loader.read_dataset("dataset_name") as dataset:
    for line in dataset:
        ...
```

## Funding

<img align="right" width="200" src="https://raw.githubusercontent.com/netsci-rwth/ahorn/main/public/images/erc_logo.png">

Funded by the European Union (ERC, HIGH-HOPeS, 101039827).
Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Research Council Executive Agency.
Neither the European Union nor the granting authority can be held responsible for them.
ahorn_loader-0.4.1.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
ahorn_loader/__init__.py,sha256=kEDhV6uY5P7i2ceFDSPi7CCR9GekRszv7EzvYx4RDEw,83
ahorn_loader/api.py,sha256=emtkn7dEvFZaNspRLdnO94St0-84qRrQIY4cF3BEP04,7831
ahorn_loader/cli.py,sha256=4fFIQVhE-Zzvq47JMghKoMFAzhZXJ8lXRdtyAjvYzBY,2272
ahorn_loader/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
ahorn_loader/utils/__init__.py,sha256=kIYHc-9ExuESHM2TIXlh9-YF7r7hFiRfjAKYTQG4gGg,81
ahorn_loader/utils/cache.py,sha256=rRsn5z6LM1aFLufZGM4uppHVP553iR8cP3JTNxZiEKY,832
ahorn_loader/validator/__init__.py,sha256=tyGbqMMzzkGPI3pEb9uBAJoNMGUds_WdU_5575vGBM8,84
ahorn_loader/validator/rules.py,sha256=djiWi4_Y-UlC2XhwPGrZywyr56AoPfAcNpOnNMZ6w8I,3155
ahorn_loader/validator/validator.py,sha256=qfooTPfjZ2ieqraJ3CqdqADfDFlODHm-OU_LRPK0gmM,1437
ahorn_loader-0.4.1.dist-info/WHEEL,sha256=XV0cjMrO7zXhVAIyyc8aFf1VjZ33Fen4IiJk5zFlC3g,80
ahorn_loader-0.4.1.dist-info/entry_points.txt,sha256=oyQAA_k5r0sAD_lBKgQLPhpxqk0-UTagDJlsU97AJ4s,55
ahorn_loader-0.4.1.dist-info/METADATA,sha256=k_zmzKGL7SpBu-m8sR1QMmwhQKf8-ubAEYPaVdreCAM,2973
ahorn_loader-0.4.1.dist-info/RECORD,,