ahorn-loader 0.1.1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ahorn-loader might be problematic. Click here for more details.

@@ -0,0 +1,85 @@
1
+ Metadata-Version: 2.3
2
+ Name: ahorn-loader
3
+ Version: 0.2.0
4
+ Summary: Library and command-line application to interact with datasets in the Aachen Higher-Order Repository of Networks.
5
+ Author: Florian Frantzen
6
+ Author-email: Florian Frantzen <frantzen@netsci.rwth-aachen.de>
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Requires-Dist: requests>=2.32.4
18
+ Requires-Dist: typer>=0.16.0
19
+ Requires-Python: >=3.11
20
+ Description-Content-Type: text/markdown
21
+
22
+ # `ahorn-loader`
23
+
24
+ Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
25
+
26
+ <div align="center">
27
+
28
+ [![Python](https://img.shields.io/badge/python-3.11+-blue)](https://www.python.org/)
29
+ [![license](https://badgen.net/github/license/netsci-rwth/ahorn-loader)](https://github.com/netsci-rwth/ahorn-loader/blob/main/LICENSE)
30
+
31
+ </div>
32
+
33
+ ## Usage
34
+
35
+ `ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
36
+
37
+ ### Command-Line Usage
38
+
39
+ To install and use `ahorn-loader` from the command line, you can run the following command:
40
+
41
+ ```bash
42
+ uvx ahorn-loader [command] [args]
43
+ ```
44
+
45
+ Commands include:
46
+ - `ls`: List available datasets in AHORN.
47
+ - `download`: Download a dataset from AHORN.
48
+ - `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
49
+
50
+ To get a full help of available commands and options, run `ahorn-loader --help`.
51
+
52
+ ### Python Package Usage
53
+
54
+ To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
55
+
56
+ ```bash
57
+ pip install ahorn-loader
58
+ ```
59
+
60
+ Then, you can use it in your Python scripts:
61
+
62
+ ```python
63
+ import ahorn_loader
64
+
65
+ # Download a dataset:
66
+ ahorn_loader.download_dataset("dataset_name", "target_path")
67
+
68
+ # Download and read a dataset:
69
+ # The dataset will be stored in your system's cache. For a more permanent storage
70
+ # location, use `ahorn_loader.download_dataset` instead.
71
+ with ahorn_loader.read_dataset("dataset_name") as dataset:
72
+ for line in dataset:
73
+ ...
74
+
75
+ # Validate a specific dataset (e.g., before adding it to AHORN):
76
+ ahorn_loader.validate("path_to_dataset_file")
77
+ ```
78
+
79
+ ## Funding
80
+
81
+ <img align="right" width="200" src="https://raw.githubusercontent.com/netsci-rwth/ahorn/main/public/images/erc_logo.png">
82
+
83
+ Funded by the European Union (ERC, HIGH-HOPeS, 101039827).
84
+ Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Research Council Executive Agency.
85
+ Neither the European Union nor the granting authority can be held responsible for them.
@@ -0,0 +1,64 @@
1
+ # `ahorn-loader`
2
+
3
+ Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
4
+
5
+ <div align="center">
6
+
7
+ [![Python](https://img.shields.io/badge/python-3.11+-blue)](https://www.python.org/)
8
+ [![license](https://badgen.net/github/license/netsci-rwth/ahorn-loader)](https://github.com/netsci-rwth/ahorn-loader/blob/main/LICENSE)
9
+
10
+ </div>
11
+
12
+ ## Usage
13
+
14
+ `ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
15
+
16
+ ### Command-Line Usage
17
+
18
+ To install and use `ahorn-loader` from the command line, you can run the following command:
19
+
20
+ ```bash
21
+ uvx ahorn-loader [command] [args]
22
+ ```
23
+
24
+ Commands include:
25
+ - `ls`: List available datasets in AHORN.
26
+ - `download`: Download a dataset from AHORN.
27
+ - `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
28
+
29
+ To get a full help of available commands and options, run `ahorn-loader --help`.
30
+
31
+ ### Python Package Usage
32
+
33
+ To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
34
+
35
+ ```bash
36
+ pip install ahorn-loader
37
+ ```
38
+
39
+ Then, you can use it in your Python scripts:
40
+
41
+ ```python
42
+ import ahorn_loader
43
+
44
+ # Download a dataset:
45
+ ahorn_loader.download_dataset("dataset_name", "target_path")
46
+
47
+ # Download and read a dataset:
48
+ # The dataset will be stored in your system's cache. For a more permanent storage
49
+ # location, use `ahorn_loader.download_dataset` instead.
50
+ with ahorn_loader.read_dataset("dataset_name") as dataset:
51
+ for line in dataset:
52
+ ...
53
+
54
+ # Validate a specific dataset (e.g., before adding it to AHORN):
55
+ ahorn_loader.validate("path_to_dataset_file")
56
+ ```
57
+
58
+ ## Funding
59
+
60
+ <img align="right" width="200" src="https://raw.githubusercontent.com/netsci-rwth/ahorn/main/public/images/erc_logo.png">
61
+
62
+ Funded by the European Union (ERC, HIGH-HOPeS, 101039827).
63
+ Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Research Council Executive Agency.
64
+ Neither the European Union nor the granting authority can be held responsible for them.
@@ -1,12 +1,24 @@
1
1
  [project]
2
2
  name = "ahorn-loader"
3
- version = "0.1.1"
3
+ version = "0.2.0"
4
4
  description = "Library and command-line application to interact with datasets in the Aachen Higher-Order Repository of Networks."
5
5
  readme = "README.md"
6
6
  authors = [
7
- { name = "Florian Frantzen", email = "florian.frantzen@cs.rwth-aachen.de" }
7
+ { name = "Florian Frantzen", email = "frantzen@netsci.rwth-aachen.de" }
8
8
  ]
9
- requires-python = ">=3.12"
9
+ classifiers = [
10
+ "Development Status :: 4 - Beta",
11
+ "Intended Audience :: Developers",
12
+ "Intended Audience :: Science/Research",
13
+ "Operating System :: OS Independent",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3 :: Only",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Programming Language :: Python :: 3.13",
19
+ "Topic :: Scientific/Engineering",
20
+ ]
21
+ requires-python = ">=3.11"
10
22
  dependencies = [
11
23
  "requests>=2.32.4",
12
24
  "typer>=0.16.0",
@@ -28,6 +40,7 @@ dev = [
28
40
  ]
29
41
 
30
42
  [tool.mypy]
43
+ strict = true
31
44
  warn_redundant_casts = true
32
45
  warn_unreachable = true
33
46
  warn_unused_ignores = true
@@ -38,7 +51,7 @@ minversion = "7.0"
38
51
  addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
39
52
  xfail_strict = true
40
53
  filterwarnings = ["error"]
41
- log_cli_level = "info"
54
+ log_cli_level = "INFO"
42
55
  testpaths = ["tests"]
43
56
 
44
57
  [tool.ruff.format]
@@ -0,0 +1,199 @@
1
+ """Module to interact with the Ahorn dataset API."""
2
+
3
+ import contextlib
4
+ import gzip
5
+ import json
6
+ from collections.abc import Generator, Iterable
7
+ from datetime import UTC, datetime
8
+ from pathlib import Path
9
+ from typing import TypedDict
10
+ from urllib.parse import ParseResult, urlparse
11
+
12
+ import requests
13
+
14
+ from .utils import get_cache_dir
15
+
16
+ __all__ = [
17
+ "download_dataset",
18
+ "load_dataset_data",
19
+ "load_datasets_data",
20
+ "read_dataset",
21
+ ]
22
+
23
+ DATASET_API_URL = "https://ahorn.rwth-aachen.de/api/datasets.json"
24
+
25
+
26
class AttachmentDict(TypedDict):
    """Typed description of a single dataset attachment as returned by the API."""

    url: str  # direct download URL of the attachment file
    size: int  # attachment size -- presumably in bytes; TODO confirm against the API
29
+
30
+
31
class DatasetDict(TypedDict):
    """Metadata record for a single AHORN dataset."""

    slug: str  # unique identifier, used as lookup key and in URLs
    title: str  # human-readable dataset title
    tags: list[str]  # free-form tags attached to the dataset
    attachments: dict[str, AttachmentDict]  # keyed by attachment kind, e.g. "dataset"
36
+
37
+
38
class DatasetsDataDict(TypedDict):
    """Top-level payload of the datasets API endpoint (and on-disk cache file)."""

    datasets: dict[str, DatasetDict]  # all datasets, keyed by slug
    time: str  # server timestamp of the payload -- presumably ISO 8601; verify
41
+
42
+
43
def load_datasets_data(*, cache_lifetime: int | None = None) -> dict[str, DatasetDict]:
    """Load dataset data from the Ahorn API.

    Parameters
    ----------
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    dict[str, DatasetDict]
        Dictionary containing dataset information, where the keys are dataset slugs
        and the values are dictionaries with dataset details such as title, tags, and
        attachments.
    """
    datasets_data_cache = get_cache_dir() / "datasets.json"
    if datasets_data_cache.exists() and cache_lifetime is not None:
        cache_mtime = datetime.fromtimestamp(
            datasets_data_cache.stat().st_mtime, tz=UTC
        )
        if (datetime.now(tz=UTC) - cache_mtime).total_seconds() < cache_lifetime:
            try:
                with datasets_data_cache.open("r", encoding="utf-8") as cache_file:
                    cache: DatasetsDataDict = json.load(cache_file)
                return cache["datasets"]
            except (json.JSONDecodeError, KeyError):
                # Corrupt or truncated cache file; fall through and re-download.
                pass

    response = requests.get(DATASET_API_URL, timeout=10)
    response.raise_for_status()

    # Refresh the on-disk cache with the raw server response.
    datasets_data_cache.parent.mkdir(parents=True, exist_ok=True)
    with datasets_data_cache.open("w", encoding="utf-8") as cache_file:
        cache_file.write(response.text)

    response_json: DatasetsDataDict = response.json()
    return response_json["datasets"]
78
+
79
+
80
def load_dataset_data(slug: str, *, cache_lifetime: int | None = None) -> DatasetDict:
    """Return the metadata record of a single AHORN dataset.

    Parameters
    ----------
    slug : str
        The slug of the dataset to load.
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    DatasetDict
        Dictionary containing the dataset details.

    Raises
    ------
    KeyError
        If the dataset with the given `slug` does not exist.
    """
    all_datasets = load_datasets_data(cache_lifetime=cache_lifetime)

    # Guard clause: surface a descriptive error for unknown slugs.
    if slug in all_datasets:
        return all_datasets[slug]
    raise KeyError(f"Dataset with slug '{slug}' does not exist in AHORN.")
107
+
108
+
109
def download_dataset(
    slug: str, folder: Path | str, *, cache_lifetime: int | None = None
) -> Path:
    """Download a dataset by its slug to the specified folder.

    Parameters
    ----------
    slug : str
        The slug of the dataset to download.
    folder : Path | str
        The folder where the dataset should be saved.
    cache_lifetime : int, optional
        How long to reuse cached data in seconds. If not provided, the cache will not
        be used.

    Returns
    -------
    Path
        The path to the downloaded dataset file.

    Raises
    ------
    KeyError
        If the dataset with the given `slug` does not exist.
    RuntimeError
        If the dataset file could not be downloaded due to some error.
    """
    folder = Path(folder)

    metadata = load_dataset_data(slug, cache_lifetime=cache_lifetime)
    attachments = metadata["attachments"]
    if "dataset" not in attachments:
        raise RuntimeError(
            f"Dataset '{slug}' does not contain required 'attachments/dataset' keys."
        )
    attachment_url = attachments["dataset"]["url"]

    # Name the local file after the last segment of the attachment URL path.
    parsed: ParseResult = urlparse(attachment_url)
    filename = parsed.path.rsplit("/", 1)[-1]
    folder.mkdir(parents=True, exist_ok=True)
    target = folder / filename

    # Stream the download so large datasets never sit fully in memory.
    response = requests.get(attachment_url, timeout=10, stream=True)
    response.raise_for_status()

    with target.open("wb") as out:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                out.write(chunk)

    return target
159
+
160
+
161
@contextlib.contextmanager
def read_dataset(slug: str) -> Generator[Iterable[str], None, None]:
    """Download a dataset and yield an open, line-iterable file object for it.

    The downloaded file lands in the system cache directory and may be evicted
    according to your system's cache policy. To keep a permanent copy and avoid
    costly re-downloads, use the `download_dataset` function to store the dataset
    file at a more permanent location.

    Parameters
    ----------
    slug : str
        The slug of the dataset to download.

    Returns
    -------
    Context manager yielding an open file object (iterator over lines).

    Raises
    ------
    KeyError
        If the dataset with the given `slug` does not exist.
    RuntimeError
        If the dataset file could not be downloaded due to other errors.

    Examples
    --------
    >>> import ahorn_loader
    >>> with ahorn_loader.read_dataset("contact-high-school") as f:
    >>>     for line in f:
    >>>         ...
    """
    filepath = download_dataset(slug, get_cache_dir())
    # Gzip-compressed files are transparently opened in text mode.
    opener = (
        gzip.open(filepath, mode="rt", encoding="utf-8")
        if filepath.suffix == ".gz"
        else filepath.open("r", encoding="utf-8")
    )
    with opener as lines:
        yield lines
@@ -58,6 +58,7 @@ def download(
58
58
  typer.echo(f"Failed to download dataset: {e}")
59
59
  raise typer.Exit(code=1) from e
60
60
 
61
+
61
62
  @app.command()
62
63
  def validate(
63
64
  path: Annotated[
@@ -0,0 +1,3 @@
1
+ """Utility functions used internally in `ahorn-loader`."""
2
+
3
+ from .cache import *
@@ -0,0 +1,29 @@
1
+ """Module with cache-related utility functions."""
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ __all__ = ["get_cache_dir"]
8
+
9
+
10
def get_cache_dir() -> Path:
    """Return an appropriate cache location for the current platform.

    Returns
    -------
    pathlib.Path
        Platform-dependent cache directory.
    """
    if sys.platform == "win32":
        # Prefer %LOCALAPPDATA%; fall back to the conventional per-user path.
        local_app_data = os.getenv("LOCALAPPDATA")
        base = local_app_data if local_app_data else Path("~\\AppData\\Local").expanduser()
        return Path(base) / "ahorn-loader" / "Cache"
    if sys.platform == "darwin":
        return Path.home() / "Library" / "Caches" / "ahorn-loader"
    # Linux and other Unix: honour the XDG base-directory convention.
    xdg_cache = os.getenv("XDG_CACHE_HOME")
    if xdg_cache:
        return Path(xdg_cache) / "ahorn-loader"
    return Path.home() / ".cache" / "ahorn-loader"
@@ -1,53 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: ahorn-loader
3
- Version: 0.1.1
4
- Summary: Library and command-line application to interact with datasets in the Aachen Higher-Order Repository of Networks.
5
- Author: Florian Frantzen
6
- Author-email: Florian Frantzen <florian.frantzen@cs.rwth-aachen.de>
7
- Requires-Dist: requests>=2.32.4
8
- Requires-Dist: typer>=0.16.0
9
- Requires-Python: >=3.12
10
- Description-Content-Type: text/markdown
11
-
12
- # `ahorn-loader`
13
-
14
- Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
15
-
16
- ## Usage
17
-
18
- `ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
19
-
20
- ### Command-Line Usage
21
-
22
- To install and use `ahorn-loader` from the command line, you can run the following command:
23
-
24
- ```bash
25
- uvx ahorn-loader [command] [args]
26
- ```
27
-
28
- Commands include:
29
- - `ls`: List available datasets in AHORN.
30
- - `download`: Download a dataset from AHORN.
31
- - `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
32
-
33
- To get a full help of available commands and options, run `ahorn-loader --help`.
34
-
35
- ### Python Package Usage
36
-
37
- To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
38
-
39
- ```bash
40
- pip install ahorn-loader
41
- ```
42
-
43
- Then, you can use it in your Python scripts:
44
-
45
- ```python
46
- import ahorn_loader
47
-
48
- # download a dataset
49
- ahorn_loader.download('dataset_name', 'target_path')
50
-
51
- # validate a specific dataset (e.g., before adding it to AHORN)
52
- ahorn_loader.validate('path_to_dataset_file')
53
- ```
@@ -1,42 +0,0 @@
1
- # `ahorn-loader`
2
-
3
- Library and command-line application to interact with datasets in [AHORN](https://ahorn.rwth-aachen.de/).
4
-
5
- ## Usage
6
-
7
- `ahorn-loader` is both a command-line application and a Python package to interact with the AHORN repository for higher-order datasets.
8
-
9
- ### Command-Line Usage
10
-
11
- To install and use `ahorn-loader` from the command line, you can run the following command:
12
-
13
- ```bash
14
- uvx ahorn-loader [command] [args]
15
- ```
16
-
17
- Commands include:
18
- - `ls`: List available datasets in AHORN.
19
- - `download`: Download a dataset from AHORN.
20
- - `validate`: Validate a specific dataset file (e.g., before adding it to AHORN).
21
-
22
- To get a full help of available commands and options, run `ahorn-loader --help`.
23
-
24
- ### Python Package Usage
25
-
26
- To use `ahorn-loader` as a Python package, you can install it via `pip` (or some other package manager of your choice):
27
-
28
- ```bash
29
- pip install ahorn-loader
30
- ```
31
-
32
- Then, you can use it in your Python scripts:
33
-
34
- ```python
35
- import ahorn_loader
36
-
37
- # download a dataset
38
- ahorn_loader.download('dataset_name', 'target_path')
39
-
40
- # validate a specific dataset (e.g., before adding it to AHORN)
41
- ahorn_loader.validate('path_to_dataset_file')
42
- ```
@@ -1,115 +0,0 @@
1
- """Module to interact with the Ahorn dataset API."""
2
-
3
- import json
4
- from datetime import UTC, datetime
5
- from pathlib import Path
6
- from typing import Any
7
- from urllib.parse import ParseResult, urlparse
8
-
9
- import requests
10
-
11
- __all__ = ["download_dataset", "load_dataset_data", "load_datasets_data"]
12
-
13
- DATASET_API_URL = "https://ahorn.rwth-aachen.de/api/datasets.json"
14
- CACHE_PATH = Path(__file__).parent.parent.parent / "cache" / "datasets.json"
15
-
16
-
17
- def load_datasets_data(*, cache_lifetime: int | None = None) -> dict[str, Any]:
18
- """Load dataset data from the Ahorn API.
19
-
20
- Parameters
21
- ----------
22
- cache_lifetime : int, optional
23
- How long to reuse cached data in seconds. If not provided, the cache will not
24
- be used.
25
-
26
- Returns
27
- -------
28
- dict[str, Any]
29
- Dictionary containing dataset information, where the keys are dataset slugs
30
- and the values are dictionaries with dataset details such as title, tags, and
31
- attachments.
32
- """
33
- if CACHE_PATH.exists() and cache_lifetime is not None:
34
- with CACHE_PATH.open("r", encoding="utf-8") as cache_file:
35
- cache = json.load(cache_file)
36
- if (
37
- cache.get("time")
38
- and (
39
- datetime.now(tz=UTC) - datetime.fromisoformat(cache["time"])
40
- ).total_seconds()
41
- < cache_lifetime
42
- ):
43
- return cache["datasets"]
44
-
45
- response = requests.get(DATASET_API_URL, timeout=10)
46
- response.raise_for_status()
47
-
48
- CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
49
- with CACHE_PATH.open("w", encoding="utf-8") as cache_file:
50
- cache_file.write(response.text)
51
-
52
- return response.json()["datasets"]
53
-
54
-
55
- def load_dataset_data(
56
- slug: str, *, cache_lifetime: int | None = None
57
- ) -> dict[str, Any]:
58
- """Load data for a specific dataset by its slug.
59
-
60
- Parameters
61
- ----------
62
- slug : str
63
- The slug of the dataset to load.
64
- cache_lifetime : int, optional
65
- How long to reuse cached data in seconds. If not provided, the cache will not
66
- be used.
67
-
68
- Returns
69
- -------
70
- dict[str, Any]
71
- Dictionary containing the dataset details.
72
- """
73
- datasets = load_datasets_data(cache_lifetime=cache_lifetime)
74
- if "error" in datasets:
75
- return {"error": datasets["error"]}
76
-
77
- return datasets.get(slug, {"error": f"Dataset '{slug}' not found."})
78
-
79
-
80
- def download_dataset(
81
- slug: str, folder: Path | str, *, cache_lifetime: int | None = None
82
- ) -> None:
83
- """Download a dataset by its slug to the specified folder.
84
-
85
- Parameters
86
- ----------
87
- slug : str
88
- The slug of the dataset to download.
89
- folder : Path | str
90
- The folder where the dataset should be saved.
91
- cache_lifetime : int, optional
92
- How long to reuse cached data in seconds. If not provided, the cache will not
93
- be used.
94
- """
95
- if isinstance(folder, str):
96
- folder = Path(folder)
97
-
98
- data = load_dataset_data(slug, cache_lifetime=cache_lifetime)
99
- if "error" in data:
100
- raise ValueError(f"Error loading dataset '{slug}': {data['error']}")
101
- if "attachments" not in data or "dataset" not in data["attachments"]:
102
- raise KeyError(f"Dataset '{slug}' does not contain required 'attachments/dataset' keys.")
103
- dataset_attachment = data["attachments"]["dataset"]
104
-
105
- url: ParseResult = urlparse(dataset_attachment["url"])
106
- folder.mkdir(parents=True, exist_ok=True)
107
- filepath = folder / url.path.split("/")[-1]
108
-
109
- response = requests.get(dataset_attachment["url"], timeout=10, stream=True)
110
- response.raise_for_status()
111
-
112
- with filepath.open("wb") as f:
113
- for chunk in response.iter_content(chunk_size=8192):
114
- if chunk:
115
- f.write(chunk)