ggh4x-python 0.3.1.9000__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ggh4x/__init__.py +140 -0
- ggh4x/_aimed_text_grob.py +432 -0
- ggh4x/_borrowed_ggplot2.py +273 -0
- ggh4x/_cli.py +84 -0
- ggh4x/_datasets.py +106 -0
- ggh4x/_download.py +111 -0
- ggh4x/_facet_helpers.py +313 -0
- ggh4x/_facet_utils.py +649 -0
- ggh4x/_gap_grobs.py +606 -0
- ggh4x/_registry.py +10 -0
- ggh4x/_rlang.py +93 -0
- ggh4x/_utils.py +150 -0
- ggh4x/_vctrs.py +233 -0
- ggh4x/conveniences.py +601 -0
- ggh4x/coord_axes_inside.py +380 -0
- ggh4x/element_part_rect.py +545 -0
- ggh4x/facet_grid2.py +1018 -0
- ggh4x/facet_manual.py +901 -0
- ggh4x/facet_nested.py +776 -0
- ggh4x/facet_nested_wrap.py +193 -0
- ggh4x/facet_wrap2.py +896 -0
- ggh4x/geom_box.py +536 -0
- ggh4x/geom_outline_point.py +444 -0
- ggh4x/geom_pointpath.py +259 -0
- ggh4x/geom_polygonraster.py +252 -0
- ggh4x/geom_rectrug.py +489 -0
- ggh4x/geom_text_aimed.py +279 -0
- ggh4x/guide_stringlegend.py +354 -0
- ggh4x/help_secondary.py +549 -0
- ggh4x/multiscale/__init__.py +51 -0
- ggh4x/multiscale/_multiscale_add.py +207 -0
- ggh4x/multiscale/scale_listed.py +167 -0
- ggh4x/multiscale/scale_manual.py +478 -0
- ggh4x/multiscale/scale_multi.py +393 -0
- ggh4x/panel_scales/__init__.py +58 -0
- ggh4x/panel_scales/at_panel.py +115 -0
- ggh4x/panel_scales/facetted_pos_scales.py +647 -0
- ggh4x/panel_scales/force_panelsize.py +411 -0
- ggh4x/panel_scales/scale_facet.py +222 -0
- ggh4x/position_disjoint_ranges.py +229 -0
- ggh4x/position_lineartrans.py +242 -0
- ggh4x/py.typed +0 -0
- ggh4x/resources/faithful.csv +273 -0
- ggh4x/resources/iris.csv +151 -0
- ggh4x/resources/mtcars.csv +33 -0
- ggh4x/resources/pressure.csv +20 -0
- ggh4x/resources/volcano.csv +87 -0
- ggh4x/save.py +255 -0
- ggh4x/stat_difference.py +388 -0
- ggh4x/stat_funxy.py +436 -0
- ggh4x/stat_rle.py +290 -0
- ggh4x/stat_rollingkernel.py +369 -0
- ggh4x/stat_theodensity.py +681 -0
- ggh4x/strip_nested.py +448 -0
- ggh4x/strip_split.py +687 -0
- ggh4x/strip_tag.py +636 -0
- ggh4x/strip_themed.py +232 -0
- ggh4x/strip_vanilla.py +1464 -0
- ggh4x/themes.py +31 -0
- ggh4x/themes_ggh4x.py +67 -0
- ggh4x_python-0.3.1.9000.dist-info/METADATA +40 -0
- ggh4x_python-0.3.1.9000.dist-info/RECORD +64 -0
- ggh4x_python-0.3.1.9000.dist-info/WHEEL +4 -0
- ggh4x_python-0.3.1.9000.dist-info/licenses/LICENSE +3 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
"""ggplot2-internal helpers borrowed by ggh4x (R source: borrowed_ggplot2.R).
|
|
2
|
+
|
|
3
|
+
These are ggplot2 internals that ggh4x copies because they are not exported. Only the ones
|
|
4
|
+
ggh4x actually uses for facet layout / strip assembly are ported here; the rest are sourced
|
|
5
|
+
from ``ggplot2_py`` where available. The crown jewel is ``id``/``id_var`` — the radix-based
|
|
6
|
+
panel-id assignment that defines facet panel ordering. Ported verbatim and verified against R.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, List, Sequence
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"id_var",
|
|
18
|
+
"id",
|
|
19
|
+
"empty",
|
|
20
|
+
"is_zero",
|
|
21
|
+
"snake_class",
|
|
22
|
+
"ulevels",
|
|
23
|
+
"unique_combs",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _is_factor(x: Any) -> bool:
    """Report whether *x* is categorical data (the pandas analogue of an R factor).

    Parameters
    ----------
    x : Any
        Object to inspect.

    Returns
    -------
    bool
        ``True`` for a ``pandas.Categorical`` or for a ``pandas.Series`` whose dtype
        is a ``pandas.CategoricalDtype``; ``False`` for everything else.
    """
    if isinstance(x, pd.Categorical):
        return True
    if not isinstance(x, pd.Series):
        return False
    return isinstance(x.dtype, pd.CategoricalDtype)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def id_var(x: Sequence[Any], drop: bool = False) -> np.ndarray:
    """Code one variable as 1-based integer ids, mirroring ggplot2's ``id_var``.

    Parameters
    ----------
    x : sequence
        A vector; may be a pandas Categorical or a categorical Series (a "factor").
    drop : bool
        If ``True``, only values that actually occur receive an id (unused factor
        levels are dropped).

    Returns
    -------
    np.ndarray
        An ``_IdArray`` of 1-based ids. Its ``n`` attribute holds the number of
        distinct ids that can occur; a missing value, when present, is counted as
        one extra id placed after all others.
    """

    def _wrap(values: np.ndarray, count: int) -> "_IdArray":
        # Attach the distinct-count to the id vector (R's attr(id, "n")).
        wrapped = _IdArray(values)
        wrapped.n = count
        return wrapped

    if len(x) == 0:
        return _wrap(np.array([], dtype=int), 0)

    factor_input = _is_factor(x)

    if factor_input and not drop:
        factor = x if isinstance(x, pd.Categorical) else pd.Categorical(x)
        n_levels = len(list(factor.categories))
        raw_codes = factor.codes.astype(int)
        # addNA(x, ifany=TRUE): pandas marks missing values with code -1; when any
        # are present they get one additional id after the declared levels.
        if bool((raw_codes < 0).any()):
            n_levels += 1
            ids = np.where(raw_codes < 0, n_levels, raw_codes + 1)
        else:
            ids = raw_codes + 1
        return _wrap(ids.astype(int), n_levels)

    # Remaining cases: drop=True (factor or not) or a non-factor input.
    # R: levels <- sort(unique0(x), na.last = TRUE); id <- match(x, levels).
    values = pd.Series(list(x))
    missing_present = bool(values.isna().any())

    if factor_input:
        # For a factor the surviving levels keep their declared LEVEL order rather
        # than being sorted by value, which is what fixes panel ordering for
        # factors whose levels are not alphabetical.
        factor = x if isinstance(x, pd.Categorical) else pd.Categorical(x)
        seen_codes = {int(code) for code in np.asarray(factor.codes) if code >= 0}
        ordered = [factor.categories[pos] for pos in sorted(seen_codes)]
    else:
        distinct = pd.unique(values.dropna())
        ordered = list(np.sort(distinct)) if len(distinct) else []

    rank_of = {level: rank for rank, level in enumerate(ordered, start=1)}
    # A missing value sorts last (na.last = TRUE), i.e. one past the real levels.
    missing_id = len(ordered) + 1 if missing_present else 0
    ids = np.array(
        [missing_id if pd.isna(value) else rank_of[value] for value in values],
        dtype=int,
    )
    all_levels = list(ordered) + ([np.nan] if missing_present else [])
    return _wrap(ids, int(len(all_levels)))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class _IdArray(np.ndarray):
    """Integer ndarray that also carries an ``n`` attribute (R's ``attr(id, 'n')``).

    ``n`` starts at ``0`` on construction and is expected to be assigned by the
    caller. Arrays derived from an instance (views, slices) inherit its ``n``.
    """

    # Number of distinct ids associated with this id vector.
    n: int

    def __new__(cls, input_array: np.ndarray) -> "_IdArray":
        # Coerce to an integer ndarray first, then re-type it as this subclass.
        base = np.asarray(input_array, dtype=int)
        instance = base.view(cls)
        instance.n = 0
        return instance

    def __array_finalize__(self, obj: Any) -> None:
        # numpy calls this for every new instance; ``obj`` is the array it was
        # derived from (``None`` when there is no source array).
        if obj is not None:
            self.n = getattr(obj, "n", 0)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def id(variables: pd.DataFrame | Sequence[Any], drop: bool = False) -> _IdArray:
    """Assign one id per row across several variables, mirroring ggplot2's ``id``.

    Every variable is id-coded with ``id_var`` and the per-variable ids are mixed
    like digits of a mixed-radix number. The variable list is reversed before
    mixing, so the LAST variable gets weight 1 (varies fastest) and the FIRST
    variable varies slowest.

    Parameters
    ----------
    variables : pandas.DataFrame or sequence of vectors
        The variables to combine; for a DataFrame each column is one variable.
        Zero-length variables are ignored.
    drop : bool
        If ``True``, the combined ids are re-coded so that only combinations that
        actually occur are numbered.

    Returns
    -------
    _IdArray
        1-based row ids; ``.n`` is the number of distinct combinations. With no
        usable variables the ids are ``1..nrows`` for a DataFrame input and empty
        otherwise.
    """
    frame_rows = None
    if isinstance(variables, pd.DataFrame):
        frame_rows = len(variables)
        columns = [variables[label] for label in variables.columns]
    else:
        columns = list(variables)
    columns = [column for column in columns if len(column) > 0]

    if not columns:
        total = 0 if frame_rows is None else frame_rows
        result = _IdArray(np.arange(1, total + 1))
        result.n = total
        return result

    if len(columns) == 1:
        return id_var(columns[0], drop=drop)

    # rev(): after reversal the last original variable is the lowest-order digit.
    coded = list(reversed([id_var(column, drop=drop) for column in columns]))
    radices = np.array([part.n for part in coded], dtype=float)
    n_combinations = int(np.prod(radices))
    # Place value of each digit: 1, n_1, n_1 * n_2, ...
    place_values = np.concatenate([[1.0], np.cumprod(radices[: len(coded) - 1])])
    digits = np.column_stack([np.asarray(part, dtype=float) for part in coded])
    mixed = ((digits - 1.0) @ place_values + 1.0).astype(int)

    if drop:
        return id_var(mixed, drop=True)

    result = _IdArray(mixed)
    result.n = n_combinations
    return result
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def empty(df: Any) -> bool:
    """Decide whether a data frame counts as "empty", mirroring ggplot2's ``empty``.

    Parameters
    ----------
    df : Any
        Usually a ``pandas.DataFrame`` or ``None``.

    Returns
    -------
    bool
        ``True`` when *df* is ``None`` or is a DataFrame with zero rows or zero
        columns. Any other object (including non-DataFrame containers) yields
        ``False``.
    """
    if df is None:
        return True
    if not isinstance(df, pd.DataFrame):
        return False
    n_rows, n_cols = df.shape
    return n_rows == 0 or n_cols == 0
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def is_zero(x: Any) -> bool:
    """Detect a zero/empty grob, mirroring ggplot2's ``is.zero``.

    Parameters
    ----------
    x : Any
        Object to inspect.

    Returns
    -------
    bool
        ``True`` when *x* is ``None``, when its class is named ``ZeroGrob``,
        ``zeroGrob`` or ``NullGrob``, or when its ``_grid_class`` attribute is
        ``"zeroGrob"`` or ``"null"``.
    """
    if x is None:
        return True
    # Matching is done on the class NAME, so no grob class has to be imported.
    if type(x).__name__ in ("ZeroGrob", "zeroGrob", "NullGrob"):
        return True
    return getattr(x, "_grid_class", None) in ("zeroGrob", "null")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def snake_class(x: Any) -> str:
    """Turn a class name into snake_case, mirroring ggplot2's ``snake_class``.

    Parameters
    ----------
    x : Any
        Either a class-name string, or any object whose type name is used.

    Returns
    -------
    str
        The lower-cased name with ``_`` inserted at word boundaries and ``.``
        replaced by ``_``, e.g. ``FacetGrid2`` -> ``facet_grid2``.
    """
    import re

    label = x if isinstance(x, str) else type(x).__name__

    # Boundary before a capital that starts a new word (capital followed by a
    # lower-case letter), e.g. "HTTPServer" -> "HTTP_Server".
    word_start = re.compile(r"([A-Za-z])([A-Z])([a-z])")
    # Any remaining lower-to-upper transition, e.g. "aB" -> "a_B".
    camel_hump = re.compile(r"([a-z])([A-Z])")

    label = word_start.sub(r"\1_\2\3", label)
    label = label.replace(".", "_")
    label = camel_hump.sub(r"\1_\2", label)
    return label.lower()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def ulevels(x: Sequence[Any]) -> np.ndarray:
    """Return the distinct levels of *x*, mirroring ggplot2's ``ulevels``.

    Parameters
    ----------
    x : sequence
        A vector; may be a pandas Categorical or a categorical Series.

    Returns
    -------
    np.ndarray
        For categorical input: all declared categories in their declared order.
        Otherwise: the distinct non-missing values, sorted. In both cases a
        trailing ``nan`` is appended (and the array becomes ``object`` dtype)
        when *x* contains a missing value.
    """
    if _is_factor(x):
        factor = x if isinstance(x, pd.Categorical) else pd.Categorical(x)
        declared = list(factor.categories)
        # R addNA(x, ifany=TRUE): pandas encodes a missing value as code -1.
        if not bool((np.asarray(factor.codes) < 0).any()):
            return np.asarray(declared)
        return np.asarray(declared + [np.nan], dtype=object)

    values = pd.Series(list(x))
    distinct = pd.unique(values.dropna())
    ordered = np.sort(distinct) if len(distinct) else np.array([])
    # R sort(..., na.last = TRUE): the missing value, if any, goes last.
    if not bool(values.isna().any()):
        return ordered
    return np.asarray([*ordered, np.nan], dtype=object)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def unique_combs(df: pd.DataFrame) -> pd.DataFrame:
    """Build every combination of the columns' levels, mirroring ``unique_combs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns supply the levels (via ``ulevels``).

    Returns
    -------
    pandas.DataFrame
        One row per element of the cross-product of the per-column levels, with
        the same column order as *df*. The FIRST column varies slowest and the
        last varies fastest, matching R's ``rev(expand.grid(rev(unique_values)))``.
        A frame without columns yields an empty DataFrame.
    """
    if df.shape[1] == 0:
        return pd.DataFrame()

    from itertools import product

    names = list(df.columns)
    levels_by_name = {name: ulevels(df[name]) for name in df.columns}

    # itertools.product advances its LAST argument fastest, which gives the
    # "first column slowest" ordering described above.
    grid = list(product(*(levels_by_name[name] for name in names)))

    table = {name: [] for name in names}
    for combination in grid:
        for name, level in zip(names, combination):
            table[name].append(level)
    return pd.DataFrame(table)[names]
|
ggh4x/_cli.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""cli message shims (R source: cli package usage in ggh4x).
|
|
2
|
+
|
|
3
|
+
ggh4x calls ``cli::cli_abort`` / ``cli::cli_warn`` / ``cli::cli_inform`` for user-facing
|
|
4
|
+
messages. The Python port maps these to standard exceptions / ``warnings`` so failures stay
|
|
5
|
+
loud (per the error-handling discipline) while stripping cli's ``{.arg}`` glue markup.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
import warnings
|
|
12
|
+
from typing import NoReturn, Type
|
|
13
|
+
|
|
14
|
+
__all__ = ["cli_abort", "cli_warn", "cli_inform", "strip_cli_markup"]
|
|
15
|
+
|
|
16
|
+
# cli inline-markup spans like {.arg foo}, {.code x}, {.field y}, {.val 3}, {.cls C}.
_CLI_SPAN = re.compile(r"\{\.[a-zA-Z_]+\s+([^{}]*)\}")
# Leftover interpolation braces {x} -> x (we cannot evaluate R glue, just unwrap).
_CLI_BRACE = re.compile(r"\{([^{}]*)\}")


def strip_cli_markup(message: str) -> str:
    """Reduce a cli-formatted message to plain text.

    Styled spans such as ``{.arg x}`` are replaced by their content, and any
    remaining ``{...}`` interpolation braces are unwrapped (their content is kept
    verbatim, not evaluated).

    Parameters
    ----------
    message : str
        A message that may contain cli inline markup.

    Returns
    -------
    str
        The message with all markup layers removed.
    """
    previous, current = None, message
    # Neither pattern matches across nested braces, so one pass only peels the
    # innermost layer; repeat until a pass leaves the text unchanged.
    while current != previous:
        previous = current
        current = _CLI_BRACE.sub(r"\1", _CLI_SPAN.sub(r"\1", previous))
    return current
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def cli_abort(message: str, error_class: Type[Exception] = ValueError) -> NoReturn:
    """Raise an exception carrying *message*, mirroring ``cli::cli_abort``.

    Parameters
    ----------
    message : str
        Error text; cli markup is stripped before raising.
    error_class : type[Exception]
        Exception type to raise. Defaults to ``ValueError``; pass e.g.
        ``TypeError`` where the R error concerns an input type.

    Raises
    ------
    Exception
        Always raises an instance of *error_class*.
    """
    plain_text = strip_cli_markup(message)
    raise error_class(plain_text)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def cli_warn(message: str, category: Type[Warning] = UserWarning) -> None:
    """Issue a warning carrying *message*, mirroring ``cli::cli_warn``.

    Parameters
    ----------
    message : str
        Warning text; cli markup is stripped before the warning is emitted.
    category : type[Warning]
        Warning category to use (default ``UserWarning``).
    """
    plain_text = strip_cli_markup(message)
    # stacklevel=2 attributes the warning to the code that called cli_warn rather
    # than to this shim.
    warnings.warn(plain_text, category, stacklevel=2)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def cli_inform(message: str) -> None:
    """Print an informational message, mirroring ``cli::cli_inform``.

    Parameters
    ----------
    message : str
        Text to print to standard output; cli markup is stripped first.
    """
    plain_text = strip_cli_markup(message)
    print(plain_text)
|
ggh4x/_datasets.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Dataset loaders for tutorials/tests (R source: base ``datasets`` + ggplot2 datasets).
|
|
2
|
+
|
|
3
|
+
ggh4x bundles no data; its vignettes use standard R datasets. This module loads the
|
|
4
|
+
base-R datasets (bundled as CSVs in ``ggh4x/resources/``) and re-exports the ggplot2
|
|
5
|
+
datasets from ``ggplot2_py`` so tutorial/validation code is self-contained and R-faithful.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from importlib import resources
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"load_iris",
|
|
17
|
+
"load_mtcars",
|
|
18
|
+
"load_faithful",
|
|
19
|
+
"load_pressure",
|
|
20
|
+
"load_volcano",
|
|
21
|
+
"mpg",
|
|
22
|
+
"diamonds",
|
|
23
|
+
"economics",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _resource_path(filename: str):
    """Return the packaged-resource handle for *filename* inside ``ggh4x.resources``.

    The result is what ``importlib.resources.files(...).joinpath(...)`` yields; pass
    it to ``resources.as_file`` to obtain a real filesystem path.
    """
    resource_root = resources.files("ggh4x.resources")
    return resource_root.joinpath(filename)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_iris() -> pd.DataFrame:
    """Load the bundled ``iris`` dataset (150x5 in base R).

    Returns
    -------
    pandas.DataFrame
        The contents of ``iris.csv`` with the ``Species`` column converted to a
        categorical. The numeric columns are ``Sepal.Length``, ``Sepal.Width``,
        ``Petal.Length`` and ``Petal.Width``.
    """
    resource = _resource_path("iris.csv")
    with resources.as_file(resource) as csv_path:
        iris = pd.read_csv(csv_path)
    iris["Species"] = pd.Categorical(iris["Species"])
    return iris
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def load_mtcars() -> pd.DataFrame:
    """Load the bundled ``mtcars`` dataset (32x11 in base R).

    Returns
    -------
    pandas.DataFrame
        The contents of ``mtcars.csv`` with the first CSV column used as the
        index (named ``model``, standing in for R's row names).
    """
    resource = _resource_path("mtcars.csv")
    with resources.as_file(resource) as csv_path:
        mtcars = pd.read_csv(csv_path, index_col=0)
    mtcars.index.name = "model"
    return mtcars
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_faithful() -> pd.DataFrame:
    """Load the bundled ``faithful`` dataset (272x2 in base R).

    Returns
    -------
    pandas.DataFrame
        The contents of ``faithful.csv`` (columns ``eruptions`` and ``waiting``).
    """
    resource = _resource_path("faithful.csv")
    with resources.as_file(resource) as csv_path:
        faithful = pd.read_csv(csv_path)
    return faithful
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_pressure() -> pd.DataFrame:
    """Load the bundled ``pressure`` dataset (19x2 in base R).

    Returns
    -------
    pandas.DataFrame
        The contents of ``pressure.csv`` (columns ``temperature`` and ``pressure``).
    """
    resource = _resource_path("pressure.csv")
    with resources.as_file(resource) as csv_path:
        pressure = pd.read_csv(csv_path)
    return pressure
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def load_volcano() -> np.ndarray:
    """Load the bundled ``volcano`` matrix (87x61 in base R).

    Returns
    -------
    numpy.ndarray
        The header-less ``volcano.csv`` converted to a float array.
    """
    resource = _resource_path("volcano.csv")
    with resources.as_file(resource) as csv_path:
        # The CSV has no header row: every line is a row of the matrix.
        grid = pd.read_csv(csv_path, header=None)
        return grid.to_numpy(dtype=float)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _ggplot2_dataset(name: str) -> pd.DataFrame:
    """Look up the dataset called *name* on ``ggplot2_py.datasets``.

    The import happens inside the function so that ``ggplot2_py`` is only loaded
    when one of its datasets is actually requested.
    """
    from ggplot2_py import datasets as ggplot2_datasets

    return getattr(ggplot2_datasets, name)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ggplot2 datasets re-exported from ggplot2_py (loaded lazily on attribute access).
|
|
103
|
+
def __getattr__(name: str):  # pragma: no cover - thin re-export
    """Module-level attribute hook that lazily re-exports the ggplot2 datasets.

    Only ``mpg``, ``diamonds`` and ``economics`` are served (via
    ``_ggplot2_dataset``); any other name raises ``AttributeError``.
    """
    if name not in ("mpg", "diamonds", "economics"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return _ggplot2_dataset(name)
|
ggh4x/_download.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Transparent download and caching for remote data assets."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import sys
|
|
5
|
+
import urllib.request
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ._registry import CACHE_DIR_NAME, DATA_DIR_NAME, REGISTRY
|
|
9
|
+
|
|
10
|
+
__all__ = ["resolve_data_path"]
|
|
11
|
+
|
|
12
|
+
# <pkg>-python/<import_name>/_download.py → parent.parent = <pkg>-python/
|
|
13
|
+
_PKG_ROOT = Path(__file__).resolve().parent.parent
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def resolve_data_path(filename: str) -> Path:
    """Map a registered data asset to a file on the local disk.

    Lookup order:

    1. ``<work_dir>/<DATA_DIR_NAME>/<filename>`` — a local staging copy
    2. ``~/.cache/<CACHE_DIR_NAME>/<filename>`` — a previous download
    3. the URL recorded in ``REGISTRY`` — downloaded into the cache and, when the
       registry entry has a ``sha256`` value, checked against it

    Parameters
    ----------
    filename : str
        Asset name, as used for the keys of ``REGISTRY``.

    Returns
    -------
    Path
        Path of the resolved local file.

    Raises
    ------
    FileNotFoundError
        If the file is neither available locally nor downloadable (unknown to the
        registry, or registered without a URL).
    """
    # 1. Staging directory that sits next to the package root.
    staged = _PKG_ROOT.parent / DATA_DIR_NAME / filename
    if staged.exists():
        return staged

    # 2. Per-user download cache.
    cache_root = Path.home() / ".cache" / CACHE_DIR_NAME
    cache_entry = cache_root / filename
    if cache_entry.exists():
        return cache_entry

    # 3. Fetch from the registry URL.
    if filename not in REGISTRY:
        raise FileNotFoundError(
            f"\'{filename}\' not found locally and not in registry.\n"
            f"Place it in: {staged}"
        )

    record = REGISTRY[filename]
    source_url = record.get("url")
    if not source_url:
        raise FileNotFoundError(
            f"\'{filename}\' has no download URL in registry.\n"
            f"Place it manually in: {staged}"
        )

    cache_root.mkdir(parents=True, exist_ok=True)
    _download(source_url, cache_entry)
    _verify_sha256(cache_entry, record.get("sha256"))
    return cache_entry
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _download(url: str, dest: Path) -> None:
    """Stream-download *url* to *dest*, reporting progress on stderr.

    The payload is written to a sibling ``<dest name>.part`` file and only moved
    onto *dest* once the whole transfer has succeeded. An interrupted download
    therefore never leaves a truncated file at *dest*, where a later existence
    check would mistake it for a complete cached copy.

    Parameters
    ----------
    url : str
        Location to fetch; handed to ``urllib.request.urlopen``.
    dest : Path
        Final location of the downloaded file. Its parent directory must exist.

    Raises
    ------
    Exception
        Whatever the network or file operations raise; the partial file is
        removed before the error propagates.
    """
    print(f"Downloading {dest.name} …", file=sys.stderr, flush=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        with urllib.request.urlopen(url) as resp, open(partial, "wb") as fout:
            # A missing Content-Length yields 0, which disables the progress line.
            total = int(resp.headers.get("Content-Length", 0))
            received = 0
            while chunk := resp.read(1 << 16):  # 64 KiB
                fout.write(chunk)
                received += len(chunk)
                if total:
                    pct = received * 100 // total
                    print(
                        f"\r  {received / 1e6:.1f}/{total / 1e6:.1f} MB ({pct}%)",
                        end="",
                        file=sys.stderr,
                        flush=True,
                    )
        # Publish the finished file in one step (atomic on the same filesystem).
        partial.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt: clean up the fragment, then re-raise.
        partial.unlink(missing_ok=True)
        raise
    if total:
        # Terminate the "\r"-rewritten progress line.
        print(file=sys.stderr)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _verify_sha256(path: Path, expected: str | None) -> None:
    """Check the SHA-256 digest of *path*; delete the file and raise on mismatch.

    Parameters
    ----------
    path : Path
        File to hash.
    expected : str or None
        Expected hex digest. Surrounding whitespace and letter case are ignored,
        because hex digests are case-insensitive while ``hexdigest()`` always
        returns lower case. A falsy value (``None`` or ``""``) skips the check.

    Raises
    ------
    RuntimeError
        If the computed digest differs from *expected*; *path* has been removed
        by the time the error is raised.
    """
    if not expected:
        return
    wanted = expected.strip().lower()
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # Hash in 64 KiB chunks so large files are never held in memory at once.
        for chunk in iter(lambda: stream.read(1 << 16), b""):
            digest.update(chunk)
    actual = digest.hexdigest()
    if actual != wanted:
        path.unlink(missing_ok=True)
        raise RuntimeError(
            f"SHA-256 mismatch for {path.name}: "
            f"expected {wanted[:16]}…, got {actual[:16]}…"
        )
|