ahorn-loader 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ahorn-loader might be problematic. Click here for more details.
- ahorn_loader-0.2.0/PKG-INFO +85 -0
- ahorn_loader-0.2.0/README.md +64 -0
- {ahorn_loader-0.1.1 → ahorn_loader-0.2.0}/pyproject.toml +17 -4
- ahorn_loader-0.2.0/src/ahorn_loader/api.py +199 -0
- {ahorn_loader-0.1.1 → ahorn_loader-0.2.0}/src/ahorn_loader/cli.py +1 -0
- ahorn_loader-0.2.0/src/ahorn_loader/utils/__init__.py +3 -0
- ahorn_loader-0.2.0/src/ahorn_loader/utils/cache.py +29 -0
- ahorn_loader-0.1.1/PKG-INFO +0 -53
- ahorn_loader-0.1.1/README.md +0 -42
- ahorn_loader-0.1.1/src/ahorn_loader/api.py +0 -115
- {ahorn_loader-0.1.1 → ahorn_loader-0.2.0}/src/ahorn_loader/__init__.py +0 -0
- {ahorn_loader-0.1.1 → ahorn_loader-0.2.0}/src/ahorn_loader/py.typed +0 -0
- {ahorn_loader-0.1.1 → ahorn_loader-0.2.0}/src/ahorn_loader/validator/__init__.py +0 -0
- {ahorn_loader-0.1.1 → ahorn_loader-0.2.0}/src/ahorn_loader/validator/rules.py +0 -0
- {ahorn_loader-0.1.1 → ahorn_loader-0.2.0}/src/ahorn_loader/validator/validator.py +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: ahorn-loader
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Library and command-line application to interact with datasets in the Aachen Higher-Order Repository of Networks.
|
|
5
|
+
Author: Florian Frantzen
|
|
6
|
+
Author-email: Florian Frantzen <frantzen@netsci.rwth-aachen.de>
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Requires-Dist: requests>=2.32.4
|
|
18
|
+
Requires-Dist: typer>=0.16.0
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# `ahorn-loader`
|
|
23
|
+
|
|
24
|
+
Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
|
|
25
|
+
|
|
26
|
+
<div align="center">
|
|
27
|
+
|
|
28
|
+
[Python](https://www.python.org/)
|
|
29
|
+
[License](https://github.com/pyt-team/TopoNetX/blob/main/LICENSE)
|
|
30
|
+
|
|
31
|
+
</div>
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
`ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
|
|
36
|
+
|
|
37
|
+
### Command-Line Usage
|
|
38
|
+
|
|
39
|
+
To install and use `ahorn-loader` from the command line, you can run the following command:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uvx ahorn-loader [command] [args]
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Commands include:
|
|
46
|
+
- `ls`: List available datasets in AHORN.
|
|
47
|
+
- `download`: Download a dataset from AHORN.
|
|
48
|
+
- `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
|
|
49
|
+
|
|
50
|
+
For full help on the available commands and options, run `ahorn-loader --help`.
|
|
51
|
+
|
|
52
|
+
### Python Package Usage
|
|
53
|
+
|
|
54
|
+
To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install ahorn-loader
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Then, you can use it in your Python scripts:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import ahorn_loader
|
|
64
|
+
|
|
65
|
+
# Download a dataset:
|
|
66
|
+
ahorn_loader.download_dataset("dataset_name", "target_path")
|
|
67
|
+
|
|
68
|
+
# Download and read a dataset:
|
|
69
|
+
# The dataset will be stored in your system's cache. For a more permanent storage
|
|
70
|
+
# location, use `ahorn_loader.download_dataset` instead.
|
|
71
|
+
with ahorn_loader.read_dataset("dataset_name") as dataset:
|
|
72
|
+
for line in dataset:
|
|
73
|
+
...
|
|
74
|
+
|
|
75
|
+
# Validate a specific dataset (e.g., before adding it to AHORN):
|
|
76
|
+
ahorn_loader.validate("path_to_dataset_file")
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Funding
|
|
80
|
+
|
|
81
|
+
<img align="right" width="200" src="https://raw.githubusercontent.com/netsci-rwth/ahorn/main/public/images/erc_logo.png">
|
|
82
|
+
|
|
83
|
+
Funded by the European Union (ERC, HIGH-HOPeS, 101039827).
|
|
84
|
+
Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Research Council Executive Agency.
|
|
85
|
+
Neither the European Union nor the granting authority can be held responsible for them.
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# `ahorn-loader`
|
|
2
|
+
|
|
3
|
+
Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
|
|
4
|
+
|
|
5
|
+
<div align="center">
|
|
6
|
+
|
|
7
|
+
[Python](https://www.python.org/)
|
|
8
|
+
[License](https://github.com/pyt-team/TopoNetX/blob/main/LICENSE)
|
|
9
|
+
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
## Usage
|
|
13
|
+
|
|
14
|
+
`ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
|
|
15
|
+
|
|
16
|
+
### Command-Line Usage
|
|
17
|
+
|
|
18
|
+
To install and use `ahorn-loader` from the command line, you can run the following command:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uvx ahorn-loader [command] [args]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Commands include:
|
|
25
|
+
- `ls`: List available datasets in AHORN.
|
|
26
|
+
- `download`: Download a dataset from AHORN.
|
|
27
|
+
- `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
|
|
28
|
+
|
|
29
|
+
For full help on the available commands and options, run `ahorn-loader --help`.
|
|
30
|
+
|
|
31
|
+
### Python Package Usage
|
|
32
|
+
|
|
33
|
+
To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install ahorn-loader
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Then, you can use it in your Python scripts:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import ahorn_loader
|
|
43
|
+
|
|
44
|
+
# Download a dataset:
|
|
45
|
+
ahorn_loader.download_dataset("dataset_name", "target_path")
|
|
46
|
+
|
|
47
|
+
# Download and read a dataset:
|
|
48
|
+
# The dataset will be stored in your system's cache. For a more permanent storage
|
|
49
|
+
# location, use `ahorn_loader.download_dataset` instead.
|
|
50
|
+
with ahorn_loader.read_dataset("dataset_name") as dataset:
|
|
51
|
+
for line in dataset:
|
|
52
|
+
...
|
|
53
|
+
|
|
54
|
+
# Validate a specific dataset (e.g., before adding it to AHORN):
|
|
55
|
+
ahorn_loader.validate("path_to_dataset_file")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Funding
|
|
59
|
+
|
|
60
|
+
<img align="right" width="200" src="https://raw.githubusercontent.com/netsci-rwth/ahorn/main/public/images/erc_logo.png">
|
|
61
|
+
|
|
62
|
+
Funded by the European Union (ERC, HIGH-HOPeS, 101039827).
|
|
63
|
+
Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Research Council Executive Agency.
|
|
64
|
+
Neither the European Union nor the granting authority can be held responsible for them.
|
|
@@ -1,12 +1,24 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "ahorn-loader"
|
|
3
|
-
version = "0.1.1"
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "Library and command-line application to interact with datasets in the Aachen Higher-Order Repository of Networks."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
7
|
-
{ name = "Florian Frantzen", email = "florian.frantzen@cs.rwth-aachen.de" }
|
|
7
|
+
{ name = "Florian Frantzen", email = "frantzen@netsci.rwth-aachen.de" }
|
|
8
8
|
]
|
|
9
|
-
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 4 - Beta",
|
|
11
|
+
"Intended Audience :: Developers",
|
|
12
|
+
"Intended Audience :: Science/Research",
|
|
13
|
+
"Operating System :: OS Independent",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Topic :: Scientific/Engineering",
|
|
20
|
+
]
|
|
21
|
+
requires-python = ">=3.11"
|
|
10
22
|
dependencies = [
|
|
11
23
|
"requests>=2.32.4",
|
|
12
24
|
"typer>=0.16.0",
|
|
@@ -28,6 +40,7 @@ dev = [
|
|
|
28
40
|
]
|
|
29
41
|
|
|
30
42
|
[tool.mypy]
|
|
43
|
+
strict = true
|
|
31
44
|
warn_redundant_casts = true
|
|
32
45
|
warn_unreachable = true
|
|
33
46
|
warn_unused_ignores = true
|
|
@@ -38,7 +51,7 @@ minversion = "7.0"
|
|
|
38
51
|
addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
|
|
39
52
|
xfail_strict = true
|
|
40
53
|
filterwarnings = ["error"]
|
|
41
|
-
log_cli_level = "
|
|
54
|
+
log_cli_level = "INFO"
|
|
42
55
|
testpaths = ["tests"]
|
|
43
56
|
|
|
44
57
|
[tool.ruff.format]
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Module to interact with the Ahorn dataset API."""
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import gzip
|
|
5
|
+
import json
|
|
6
|
+
from collections.abc import Generator, Iterable
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TypedDict
|
|
10
|
+
from urllib.parse import ParseResult, urlparse
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
from .utils import get_cache_dir
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"download_dataset",
|
|
18
|
+
"load_dataset_data",
|
|
19
|
+
"load_datasets_data",
|
|
20
|
+
"read_dataset",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
DATASET_API_URL = "https://ahorn.rwth-aachen.de/api/datasets.json"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AttachmentDict(TypedDict):
|
|
27
|
+
url: str
|
|
28
|
+
size: int
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DatasetDict(TypedDict):
|
|
32
|
+
slug: str
|
|
33
|
+
title: str
|
|
34
|
+
tags: list[str]
|
|
35
|
+
attachments: dict[str, AttachmentDict]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DatasetsDataDict(TypedDict):
|
|
39
|
+
datasets: dict[str, DatasetDict]
|
|
40
|
+
time: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load_datasets_data(*, cache_lifetime: int | None = None) -> dict[str, DatasetDict]:
|
|
44
|
+
"""Load dataset data from the Ahorn API.
|
|
45
|
+
|
|
46
|
+
Parameters
|
|
47
|
+
----------
|
|
48
|
+
cache_lifetime : int, optional
|
|
49
|
+
How long to reuse cached data in seconds. If not provided, the cache will not
|
|
50
|
+
be used.
|
|
51
|
+
|
|
52
|
+
Returns
|
|
53
|
+
-------
|
|
54
|
+
dict[str, Any]
|
|
55
|
+
Dictionary containing dataset information, where the keys are dataset slugs
|
|
56
|
+
and the values are dictionaries with dataset details such as title, tags, and
|
|
57
|
+
attachments.
|
|
58
|
+
"""
|
|
59
|
+
datasets_data_cache = get_cache_dir() / "datasets.json"
|
|
60
|
+
if datasets_data_cache.exists() and cache_lifetime is not None:
|
|
61
|
+
cache_mtime = datetime.fromtimestamp(
|
|
62
|
+
datasets_data_cache.stat().st_mtime, tz=UTC
|
|
63
|
+
)
|
|
64
|
+
if (datetime.now(tz=UTC) - cache_mtime).total_seconds() < cache_lifetime:
|
|
65
|
+
with datasets_data_cache.open("r", encoding="utf-8") as cache_file:
|
|
66
|
+
cache: DatasetsDataDict = json.load(cache_file)
|
|
67
|
+
return cache["datasets"]
|
|
68
|
+
|
|
69
|
+
response = requests.get(DATASET_API_URL, timeout=10)
|
|
70
|
+
response.raise_for_status()
|
|
71
|
+
|
|
72
|
+
datasets_data_cache.parent.mkdir(parents=True, exist_ok=True)
|
|
73
|
+
with datasets_data_cache.open("w", encoding="utf-8") as cache_file:
|
|
74
|
+
cache_file.write(response.text)
|
|
75
|
+
|
|
76
|
+
response_json: DatasetsDataDict = response.json()
|
|
77
|
+
return response_json["datasets"]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def load_dataset_data(slug: str, *, cache_lifetime: int | None = None) -> DatasetDict:
|
|
81
|
+
"""Load data for a specific dataset by its slug.
|
|
82
|
+
|
|
83
|
+
Parameters
|
|
84
|
+
----------
|
|
85
|
+
slug : str
|
|
86
|
+
The slug of the dataset to load.
|
|
87
|
+
cache_lifetime : int, optional
|
|
88
|
+
How long to reuse cached data in seconds. If not provided, the cache will not
|
|
89
|
+
be used.
|
|
90
|
+
|
|
91
|
+
Returns
|
|
92
|
+
-------
|
|
93
|
+
DatasetDict
|
|
94
|
+
Dictionary containing the dataset details.
|
|
95
|
+
|
|
96
|
+
Raises
|
|
97
|
+
------
|
|
98
|
+
KeyError
|
|
99
|
+
If the dataset with the given `slug` does not exist.
|
|
100
|
+
"""
|
|
101
|
+
datasets = load_datasets_data(cache_lifetime=cache_lifetime)
|
|
102
|
+
|
|
103
|
+
if slug not in datasets:
|
|
104
|
+
raise KeyError(f"Dataset with slug '{slug}' does not exist in AHORN.")
|
|
105
|
+
|
|
106
|
+
return datasets[slug]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def download_dataset(
|
|
110
|
+
slug: str, folder: Path | str, *, cache_lifetime: int | None = None
|
|
111
|
+
) -> Path:
|
|
112
|
+
"""Download a dataset by its slug to the specified folder.
|
|
113
|
+
|
|
114
|
+
Parameters
|
|
115
|
+
----------
|
|
116
|
+
slug : str
|
|
117
|
+
The slug of the dataset to download.
|
|
118
|
+
folder : Path | str
|
|
119
|
+
The folder where the dataset should be saved.
|
|
120
|
+
cache_lifetime : int, optional
|
|
121
|
+
How long to reuse cached data in seconds. If not provided, the cache will not
|
|
122
|
+
be used.
|
|
123
|
+
|
|
124
|
+
Returns
|
|
125
|
+
-------
|
|
126
|
+
Path
|
|
127
|
+
The path to the downloaded dataset file.
|
|
128
|
+
|
|
129
|
+
Raises
|
|
130
|
+
------
|
|
131
|
+
KeyError
|
|
132
|
+
If the dataset with the given `slug` does not exist.
|
|
133
|
+
RuntimeError
|
|
134
|
+
If the dataset file could not be downloaded due to some error.
|
|
135
|
+
"""
|
|
136
|
+
if isinstance(folder, str):
|
|
137
|
+
folder = Path(folder)
|
|
138
|
+
|
|
139
|
+
data = load_dataset_data(slug, cache_lifetime=cache_lifetime)
|
|
140
|
+
if "dataset" not in data["attachments"]:
|
|
141
|
+
raise RuntimeError(
|
|
142
|
+
f"Dataset '{slug}' does not contain required 'attachments/dataset' keys."
|
|
143
|
+
)
|
|
144
|
+
dataset_attachment = data["attachments"]["dataset"]
|
|
145
|
+
|
|
146
|
+
url: ParseResult = urlparse(dataset_attachment["url"])
|
|
147
|
+
folder.mkdir(parents=True, exist_ok=True)
|
|
148
|
+
filepath = folder / url.path.split("/")[-1]
|
|
149
|
+
|
|
150
|
+
response = requests.get(dataset_attachment["url"], timeout=10, stream=True)
|
|
151
|
+
response.raise_for_status()
|
|
152
|
+
|
|
153
|
+
with filepath.open("wb") as f:
|
|
154
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
155
|
+
if chunk:
|
|
156
|
+
f.write(chunk)
|
|
157
|
+
|
|
158
|
+
return filepath
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@contextlib.contextmanager
|
|
162
|
+
def read_dataset(slug: str) -> Generator[Iterable[str], None, None]:
|
|
163
|
+
"""Download and yield a context-managed file object for the dataset lines by slug.
|
|
164
|
+
|
|
165
|
+
The dataset file will be stored in your system cache and can be deleted according
|
|
166
|
+
to your system's cache policy. To ensure that costly re-downloads do not occur, use
|
|
167
|
+
the `download_dataset` function to store the dataset file at a more permanent
|
|
168
|
+
location.
|
|
169
|
+
|
|
170
|
+
Parameters
|
|
171
|
+
----------
|
|
172
|
+
slug : str
|
|
173
|
+
The slug of the dataset to download.
|
|
174
|
+
|
|
175
|
+
Returns
|
|
176
|
+
-------
|
|
177
|
+
Context manager yielding an open file object (iterator over lines).
|
|
178
|
+
|
|
179
|
+
Raises
|
|
180
|
+
------
|
|
181
|
+
KeyError
|
|
182
|
+
If the dataset with the given `slug` does not exist.
|
|
183
|
+
RuntimeError
|
|
184
|
+
If the dataset file could not be downloaded due to other errors.
|
|
185
|
+
|
|
186
|
+
Examples
|
|
187
|
+
--------
|
|
188
|
+
>>> import ahorn_loader
|
|
189
|
+
>>> with ahorn_loader.read_dataset("contact-high-school") as f:
|
|
190
|
+
>>> for line in f:
|
|
191
|
+
>>> ...
|
|
192
|
+
"""
|
|
193
|
+
filepath = download_dataset(slug, get_cache_dir())
|
|
194
|
+
if filepath.suffix == ".gz":
|
|
195
|
+
with gzip.open(filepath, mode="rt", encoding="utf-8") as f:
|
|
196
|
+
yield f
|
|
197
|
+
else:
|
|
198
|
+
with filepath.open("r", encoding="utf-8") as f:
|
|
199
|
+
yield f
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Module with cache-related utility functions."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
__all__ = ["get_cache_dir"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_cache_dir() -> Path:
|
|
11
|
+
"""Return an appropriate cache location for the current platform.
|
|
12
|
+
|
|
13
|
+
Returns
|
|
14
|
+
-------
|
|
15
|
+
pathlib.Path
|
|
16
|
+
Platform-dependent cache directory.
|
|
17
|
+
"""
|
|
18
|
+
match sys.platform:
|
|
19
|
+
case "win32":
|
|
20
|
+
base = os.getenv("LOCALAPPDATA") or Path("~\\AppData\\Local").expanduser()
|
|
21
|
+
return Path(base) / "ahorn-loader" / "Cache"
|
|
22
|
+
case "darwin":
|
|
23
|
+
return Path.home() / "Library" / "Caches" / "ahorn-loader"
|
|
24
|
+
case _:
|
|
25
|
+
# Linux and other Unix
|
|
26
|
+
xdg = os.getenv("XDG_CACHE_HOME")
|
|
27
|
+
if xdg:
|
|
28
|
+
return Path(xdg) / "ahorn-loader"
|
|
29
|
+
return Path.home() / ".cache" / "ahorn-loader"
|
ahorn_loader-0.1.1/PKG-INFO
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: ahorn-loader
|
|
3
|
-
Version: 0.1.1
|
|
4
|
-
Summary: Library and command-line application to interact with datasets in the Aachen Higher-Order Repository of Networks.
|
|
5
|
-
Author: Florian Frantzen
|
|
6
|
-
Author-email: Florian Frantzen <florian.frantzen@cs.rwth-aachen.de>
|
|
7
|
-
Requires-Dist: requests>=2.32.4
|
|
8
|
-
Requires-Dist: typer>=0.16.0
|
|
9
|
-
Requires-Python: >=3.12
|
|
10
|
-
Description-Content-Type: text/markdown
|
|
11
|
-
|
|
12
|
-
# `ahorn-loader`
|
|
13
|
-
|
|
14
|
-
Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
|
|
15
|
-
|
|
16
|
-
## Usage
|
|
17
|
-
|
|
18
|
-
`ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
|
|
19
|
-
|
|
20
|
-
### Command-Line Usage
|
|
21
|
-
|
|
22
|
-
To install and use `ahorn-loader` from the command line, you can run the following command:
|
|
23
|
-
|
|
24
|
-
```bash
|
|
25
|
-
uvx ahorn-loader [command] [args]
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
Commands include:
|
|
29
|
-
- `ls`: List available datasets in AHORN.
|
|
30
|
-
- `download`: Download a dataset from AHORN.
|
|
31
|
-
- `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
|
|
32
|
-
|
|
33
|
-
For full help on the available commands and options, run `ahorn-loader --help`.
|
|
34
|
-
|
|
35
|
-
### Python Package Usage
|
|
36
|
-
|
|
37
|
-
To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
|
|
38
|
-
|
|
39
|
-
```bash
|
|
40
|
-
pip install ahorn-loader
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
Then, you can use it in your Python scripts:
|
|
44
|
-
|
|
45
|
-
```python
|
|
46
|
-
import ahorn_loader
|
|
47
|
-
|
|
48
|
-
# download a dataset
|
|
49
|
-
ahorn_loader.download('dataset_name', 'target_path')
|
|
50
|
-
|
|
51
|
-
# validate a specific dataset (e.g., before adding it to AHORN)
|
|
52
|
-
ahorn_loader.validate('path_to_dataset_file')
|
|
53
|
-
```
|
ahorn_loader-0.1.1/README.md
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
# `ahorn-loader`
|
|
2
|
-
|
|
3
|
-
Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
|
|
4
|
-
|
|
5
|
-
## Usage
|
|
6
|
-
|
|
7
|
-
`ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
|
|
8
|
-
|
|
9
|
-
### Command-Line Usage
|
|
10
|
-
|
|
11
|
-
To install and use `ahorn-loader` from the command line, you can run the following command:
|
|
12
|
-
|
|
13
|
-
```bash
|
|
14
|
-
uvx ahorn-loader [command] [args]
|
|
15
|
-
```
|
|
16
|
-
|
|
17
|
-
Commands include:
|
|
18
|
-
- `ls`: List available datasets in AHORN.
|
|
19
|
-
- `download`: Download a dataset from AHORN.
|
|
20
|
-
- `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
|
|
21
|
-
|
|
22
|
-
For full help on the available commands and options, run `ahorn-loader --help`.
|
|
23
|
-
|
|
24
|
-
### Python Package Usage
|
|
25
|
-
|
|
26
|
-
To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
|
|
27
|
-
|
|
28
|
-
```bash
|
|
29
|
-
pip install ahorn-loader
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
Then, you can use it in your Python scripts:
|
|
33
|
-
|
|
34
|
-
```python
|
|
35
|
-
import ahorn_loader
|
|
36
|
-
|
|
37
|
-
# download a dataset
|
|
38
|
-
ahorn_loader.download('dataset_name', 'target_path')
|
|
39
|
-
|
|
40
|
-
# validate a specific dataset (e.g., before adding it to AHORN)
|
|
41
|
-
ahorn_loader.validate('path_to_dataset_file')
|
|
42
|
-
```
|
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
"""Module to interact with the Ahorn dataset API."""
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
from datetime import UTC, datetime
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Any
|
|
7
|
-
from urllib.parse import ParseResult, urlparse
|
|
8
|
-
|
|
9
|
-
import requests
|
|
10
|
-
|
|
11
|
-
__all__ = ["download_dataset", "load_dataset_data", "load_datasets_data"]
|
|
12
|
-
|
|
13
|
-
DATASET_API_URL = "https://ahorn.rwth-aachen.de/api/datasets.json"
|
|
14
|
-
CACHE_PATH = Path(__file__).parent.parent.parent / "cache" / "datasets.json"
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def load_datasets_data(*, cache_lifetime: int | None = None) -> dict[str, Any]:
|
|
18
|
-
"""Load dataset data from the Ahorn API.
|
|
19
|
-
|
|
20
|
-
Parameters
|
|
21
|
-
----------
|
|
22
|
-
cache_lifetime : int, optional
|
|
23
|
-
How long to reuse cached data in seconds. If not provided, the cache will not
|
|
24
|
-
be used.
|
|
25
|
-
|
|
26
|
-
Returns
|
|
27
|
-
-------
|
|
28
|
-
dict[str, Any]
|
|
29
|
-
Dictionary containing dataset information, where the keys are dataset slugs
|
|
30
|
-
and the values are dictionaries with dataset details such as title, tags, and
|
|
31
|
-
attachments.
|
|
32
|
-
"""
|
|
33
|
-
if CACHE_PATH.exists() and cache_lifetime is not None:
|
|
34
|
-
with CACHE_PATH.open("r", encoding="utf-8") as cache_file:
|
|
35
|
-
cache = json.load(cache_file)
|
|
36
|
-
if (
|
|
37
|
-
cache.get("time")
|
|
38
|
-
and (
|
|
39
|
-
datetime.now(tz=UTC) - datetime.fromisoformat(cache["time"])
|
|
40
|
-
).total_seconds()
|
|
41
|
-
< cache_lifetime
|
|
42
|
-
):
|
|
43
|
-
return cache["datasets"]
|
|
44
|
-
|
|
45
|
-
response = requests.get(DATASET_API_URL, timeout=10)
|
|
46
|
-
response.raise_for_status()
|
|
47
|
-
|
|
48
|
-
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
49
|
-
with CACHE_PATH.open("w", encoding="utf-8") as cache_file:
|
|
50
|
-
cache_file.write(response.text)
|
|
51
|
-
|
|
52
|
-
return response.json()["datasets"]
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def load_dataset_data(
|
|
56
|
-
slug: str, *, cache_lifetime: int | None = None
|
|
57
|
-
) -> dict[str, Any]:
|
|
58
|
-
"""Load data for a specific dataset by its slug.
|
|
59
|
-
|
|
60
|
-
Parameters
|
|
61
|
-
----------
|
|
62
|
-
slug : str
|
|
63
|
-
The slug of the dataset to load.
|
|
64
|
-
cache_lifetime : int, optional
|
|
65
|
-
How long to reuse cached data in seconds. If not provided, the cache will not
|
|
66
|
-
be used.
|
|
67
|
-
|
|
68
|
-
Returns
|
|
69
|
-
-------
|
|
70
|
-
dict[str, Any]
|
|
71
|
-
Dictionary containing the dataset details.
|
|
72
|
-
"""
|
|
73
|
-
datasets = load_datasets_data(cache_lifetime=cache_lifetime)
|
|
74
|
-
if "error" in datasets:
|
|
75
|
-
return {"error": datasets["error"]}
|
|
76
|
-
|
|
77
|
-
return datasets.get(slug, {"error": f"Dataset '{slug}' not found."})
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def download_dataset(
|
|
81
|
-
slug: str, folder: Path | str, *, cache_lifetime: int | None = None
|
|
82
|
-
) -> None:
|
|
83
|
-
"""Download a dataset by its slug to the specified folder.
|
|
84
|
-
|
|
85
|
-
Parameters
|
|
86
|
-
----------
|
|
87
|
-
slug : str
|
|
88
|
-
The slug of the dataset to download.
|
|
89
|
-
folder : Path | str
|
|
90
|
-
The folder where the dataset should be saved.
|
|
91
|
-
cache_lifetime : int, optional
|
|
92
|
-
How long to reuse cached data in seconds. If not provided, the cache will not
|
|
93
|
-
be used.
|
|
94
|
-
"""
|
|
95
|
-
if isinstance(folder, str):
|
|
96
|
-
folder = Path(folder)
|
|
97
|
-
|
|
98
|
-
data = load_dataset_data(slug, cache_lifetime=cache_lifetime)
|
|
99
|
-
if "error" in data:
|
|
100
|
-
raise ValueError(f"Error loading dataset '{slug}': {data['error']}")
|
|
101
|
-
if "attachments" not in data or "dataset" not in data["attachments"]:
|
|
102
|
-
raise KeyError(f"Dataset '{slug}' does not contain required 'attachments/dataset' keys.")
|
|
103
|
-
dataset_attachment = data["attachments"]["dataset"]
|
|
104
|
-
|
|
105
|
-
url: ParseResult = urlparse(dataset_attachment["url"])
|
|
106
|
-
folder.mkdir(parents=True, exist_ok=True)
|
|
107
|
-
filepath = folder / url.path.split("/")[-1]
|
|
108
|
-
|
|
109
|
-
response = requests.get(dataset_attachment["url"], timeout=10, stream=True)
|
|
110
|
-
response.raise_for_status()
|
|
111
|
-
|
|
112
|
-
with filepath.open("wb") as f:
|
|
113
|
-
for chunk in response.iter_content(chunk_size=8192):
|
|
114
|
-
if chunk:
|
|
115
|
-
f.write(chunk)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|