hyperview 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hyperview-0.2.0 → hyperview-0.3.1}/.gitignore +3 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/LICENSE +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/PKG-INFO +7 -6
- {hyperview-0.2.0 → hyperview-0.3.1}/README.md +6 -5
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/_version.py +2 -2
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/api.py +26 -18
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/cli.py +73 -25
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/core/dataset.py +353 -185
- hyperview-0.3.1/src/hyperview/core/selection.py +309 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/embeddings/__init__.py +2 -3
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/embeddings/engine.py +63 -2
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/embeddings/pipelines.py +108 -39
- hyperview-0.3.1/src/hyperview/embeddings/projection.py +467 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/embeddings/providers/lancedb_providers.py +178 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/app.py +157 -31
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/404/index.html +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/404.html +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/__next.__PAGE__.txt +2 -2
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/__next._full.txt +2 -2
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/__next._head.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/__next._index.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/__next._tree.txt +1 -1
- hyperview-0.3.1/src/hyperview/server/static/_next/static/chunks/077b38561d6ea80d.js +13 -0
- hyperview-0.3.1/src/hyperview/server/static/_next/static/chunks/6ab4c63fd83a6bdc.js +1 -0
- hyperview-0.3.1/src/hyperview/server/static/_next/static/chunks/6adcb3a43c287a0a.js +407 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/__next._full.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/__next._head.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/__next._index.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/__next._not-found.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/__next._tree.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/index.html +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_not-found/index.txt +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/index.html +1 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/index.txt +2 -2
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/storage/backend.py +2 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/storage/lancedb_backend.py +226 -23
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/storage/memory_backend.py +35 -6
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/storage/schema.py +53 -13
- hyperview-0.2.0/src/hyperview/core/selection.py +0 -53
- hyperview-0.2.0/src/hyperview/embeddings/projection.py +0 -267
- hyperview-0.2.0/src/hyperview/server/static/_next/static/chunks/4543baba6321cb86.js +0 -301
- hyperview-0.2.0/src/hyperview/server/static/_next/static/chunks/7f11a0afb44e4703.js +0 -13
- hyperview-0.2.0/src/hyperview/server/static/_next/static/chunks/80cd550edf03d788.js +0 -1
- {hyperview-0.2.0 → hyperview-0.3.1}/pyproject.toml +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/__init__.py +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/core/__init__.py +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/core/sample.py +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/embeddings/compute.py +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/embeddings/providers/__init__.py +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/__init__.py +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/462c5e072cd14e02.css +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/567993cf36cd4ab1.js +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/86c1fc4cf542f408.js +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/a6dad97d9634a72d.js +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/a6dad97d9634a72d.js.map +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/e954ba82c0a04100.js +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/f29dd35a99c216ea.js +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/chunks/turbopack-cb59e03a04a579d1.js +0 -0
- {hyperview-0.2.0/src/hyperview/server/static/_next/static/u9HWgMoM1R5w0owC62Blr → hyperview-0.3.1/src/hyperview/server/static/_next/static/gMy4JPL2K0MjiU7F71me_}/_buildManifest.js +0 -0
- {hyperview-0.2.0/src/hyperview/server/static/_next/static/u9HWgMoM1R5w0owC62Blr → hyperview-0.3.1/src/hyperview/server/static/_next/static/gMy4JPL2K0MjiU7F71me_}/_clientMiddlewareManifest.json +0 -0
- {hyperview-0.2.0/src/hyperview/server/static/_next/static/u9HWgMoM1R5w0owC62Blr → hyperview-0.3.1/src/hyperview/server/static/_next/static/gMy4JPL2K0MjiU7F71me_}/_ssgManifest.js +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/media/1bffadaabf893a1e-s.7cd81963.woff2 +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/media/2bbe8d2671613f1f-s.76dcb0b2.woff2 +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/media/2c55a0e60120577a-s.2a48534a.woff2 +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/media/5476f68d60460930-s.c995e352.woff2 +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/media/83afe278b6a6bb3c-s.p.3a6ba036.woff2 +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/media/9c72aa0f40e4eef8-s.18a48cbc.woff2 +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/server/static/_next/static/media/ad66f9afd8947f86-s.7a40eb73.woff2 +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/storage/__init__.py +0 -0
- {hyperview-0.2.0 → hyperview-0.3.1}/src/hyperview/storage/config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hyperview
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Open-source dataset curation with hyperbolic embeddings visualization
|
|
5
5
|
Project-URL: Homepage, https://github.com/Hyper3Labs/HyperView
|
|
6
6
|
Project-URL: Documentation, https://github.com/Hyper3Labs/HyperView#readme
|
|
@@ -48,7 +48,7 @@ Description-Content-Type: text/markdown
|
|
|
48
48
|
|
|
49
49
|
> **Open-source dataset curation + embedding visualization (Euclidean + Poincaré disk)**
|
|
50
50
|
|
|
51
|
-
[](https://opensource.org/licenses/MIT) [](https://deepwiki.com/Hyper3Labs/HyperView) [](https://huggingface.co/spaces/hyper3labs/HyperView) [](https://discord.gg/
|
|
51
|
+
[](https://opensource.org/licenses/MIT) [](https://deepwiki.com/Hyper3Labs/HyperView) [](https://huggingface.co/spaces/hyper3labs/HyperView) [](https://discord.gg/Za3rBkTPSf)
|
|
52
52
|
|
|
53
53
|
<p align="center">
|
|
54
54
|
<a href="https://huggingface.co/spaces/hyper3labs/HyperView" target="_blank">
|
|
@@ -63,7 +63,7 @@ Description-Content-Type: text/markdown
|
|
|
63
63
|
## Features
|
|
64
64
|
|
|
65
65
|
- **Dual-Panel UI**: Image grid + scatter plot with bidirectional selection
|
|
66
|
-
- **
|
|
66
|
+
- **Multi-Layout Visualizations**: Explore Euclidean, Poincare, and spherical layouts in 2D or 3D with UMAP or PCA projections
|
|
67
67
|
- **HuggingFace Integration**: Load datasets directly from HuggingFace Hub
|
|
68
68
|
- **Fast Embeddings**: Uses EmbedAnything for CLIP-based image embeddings
|
|
69
69
|
|
|
@@ -94,14 +94,15 @@ hyperview \
|
|
|
94
94
|
--label-key label \
|
|
95
95
|
--samples 500 \
|
|
96
96
|
--model openai/clip-vit-base-patch32 \
|
|
97
|
-
--
|
|
97
|
+
--layout euclidean \
|
|
98
|
+
--layout poincare
|
|
98
99
|
```
|
|
99
100
|
|
|
100
101
|
This will:
|
|
101
102
|
1. Use dataset `cifar10_demo`
|
|
102
103
|
2. Load up to 500 samples from CIFAR-10
|
|
103
104
|
3. Compute CLIP embeddings
|
|
104
|
-
4. Generate Euclidean and
|
|
105
|
+
4. Generate Euclidean and Poincare visualizations
|
|
105
106
|
5. Start the server at **http://127.0.0.1:6262**
|
|
106
107
|
|
|
107
108
|
You can also launch with explicit dataset/model/projection args:
|
|
@@ -116,7 +117,7 @@ hyperview \
|
|
|
116
117
|
--samples 1000 \
|
|
117
118
|
--model openai/clip-vit-base-patch32 \
|
|
118
119
|
--method umap \
|
|
119
|
-
--
|
|
120
|
+
--layout euclidean
|
|
120
121
|
```
|
|
121
122
|
|
|
122
123
|
### Python API
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
> **Open-source dataset curation + embedding visualization (Euclidean + Poincaré disk)**
|
|
4
4
|
|
|
5
|
-
[](https://opensource.org/licenses/MIT) [](https://deepwiki.com/Hyper3Labs/HyperView) [](https://huggingface.co/spaces/hyper3labs/HyperView) [](https://discord.gg/
|
|
5
|
+
[](https://opensource.org/licenses/MIT) [](https://deepwiki.com/Hyper3Labs/HyperView) [](https://huggingface.co/spaces/hyper3labs/HyperView) [](https://discord.gg/Za3rBkTPSf)
|
|
6
6
|
|
|
7
7
|
<p align="center">
|
|
8
8
|
<a href="https://huggingface.co/spaces/hyper3labs/HyperView" target="_blank">
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
## Features
|
|
18
18
|
|
|
19
19
|
- **Dual-Panel UI**: Image grid + scatter plot with bidirectional selection
|
|
20
|
-
- **
|
|
20
|
+
- **Multi-Layout Visualizations**: Explore Euclidean, Poincare, and spherical layouts in 2D or 3D with UMAP or PCA projections
|
|
21
21
|
- **HuggingFace Integration**: Load datasets directly from HuggingFace Hub
|
|
22
22
|
- **Fast Embeddings**: Uses EmbedAnything for CLIP-based image embeddings
|
|
23
23
|
|
|
@@ -48,14 +48,15 @@ hyperview \
|
|
|
48
48
|
--label-key label \
|
|
49
49
|
--samples 500 \
|
|
50
50
|
--model openai/clip-vit-base-patch32 \
|
|
51
|
-
--
|
|
51
|
+
--layout euclidean \
|
|
52
|
+
--layout poincare
|
|
52
53
|
```
|
|
53
54
|
|
|
54
55
|
This will:
|
|
55
56
|
1. Use dataset `cifar10_demo`
|
|
56
57
|
2. Load up to 500 samples from CIFAR-10
|
|
57
58
|
3. Compute CLIP embeddings
|
|
58
|
-
4. Generate Euclidean and
|
|
59
|
+
4. Generate Euclidean and Poincare visualizations
|
|
59
60
|
5. Start the server at **http://127.0.0.1:6262**
|
|
60
61
|
|
|
61
62
|
You can also launch with explicit dataset/model/projection args:
|
|
@@ -70,7 +71,7 @@ hyperview \
|
|
|
70
71
|
--samples 1000 \
|
|
71
72
|
--model openai/clip-vit-base-patch32 \
|
|
72
73
|
--method umap \
|
|
73
|
-
--
|
|
74
|
+
--layout euclidean
|
|
74
75
|
```
|
|
75
76
|
|
|
76
77
|
### Python API
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.2.0'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 2, 0)
|
|
31
|
+
__version__ = version = '0.3.1'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 3, 1)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -7,6 +7,7 @@ import threading
|
|
|
7
7
|
import time
|
|
8
8
|
import webbrowser
|
|
9
9
|
from dataclasses import dataclass
|
|
10
|
+
from importlib.util import find_spec
|
|
10
11
|
from urllib.error import URLError
|
|
11
12
|
from urllib.request import Request, urlopen
|
|
12
13
|
from uuid import uuid4
|
|
@@ -55,6 +56,16 @@ def _read_health(url: str, timeout_s: float) -> _HealthResponse:
|
|
|
55
56
|
)
|
|
56
57
|
|
|
57
58
|
|
|
59
|
+
def _resolve_default_launch_layout(dataset: Dataset) -> str:
|
|
60
|
+
spaces = dataset.list_spaces()
|
|
61
|
+
|
|
62
|
+
if any(space.geometry not in ("hyperboloid", "hypersphere") for space in spaces):
|
|
63
|
+
return "euclidean:2d"
|
|
64
|
+
if any(space.geometry == "hypersphere" for space in spaces):
|
|
65
|
+
return "spherical:3d"
|
|
66
|
+
return "poincare:2d"
|
|
67
|
+
|
|
68
|
+
|
|
58
69
|
class Session:
|
|
59
70
|
"""A session for the HyperView visualizer."""
|
|
60
71
|
|
|
@@ -228,9 +239,9 @@ def launch(
|
|
|
228
239
|
"""Launch the HyperView visualization server.
|
|
229
240
|
|
|
230
241
|
Note:
|
|
231
|
-
HyperView
|
|
232
|
-
embedding spaces
|
|
233
|
-
automatically
|
|
242
|
+
HyperView needs at least one visualization to display. If no layouts
|
|
243
|
+
exist yet but embedding spaces do, this function computes one default
|
|
244
|
+
layout automatically.
|
|
234
245
|
|
|
235
246
|
Args:
|
|
236
247
|
dataset: The dataset to visualize.
|
|
@@ -318,26 +329,26 @@ def launch(
|
|
|
318
329
|
"port or stop the process listening on that port."
|
|
319
330
|
)
|
|
320
331
|
|
|
321
|
-
# The frontend requires 2D coords from /api/embeddings.
|
|
322
|
-
# Ensure at least one layout exists; do not auto-generate optional geometries.
|
|
323
332
|
layouts = dataset.list_layouts()
|
|
324
333
|
spaces = dataset.list_spaces()
|
|
325
334
|
|
|
326
|
-
if not spaces:
|
|
335
|
+
if not layouts and not spaces:
|
|
327
336
|
raise ValueError(
|
|
328
|
-
"HyperView launch requires
|
|
329
|
-
"No
|
|
337
|
+
"HyperView launch requires at least one visualization or embedding space. "
|
|
338
|
+
"No visualizations or embedding spaces were found. "
|
|
330
339
|
"Call `dataset.compute_embeddings()` and `dataset.compute_visualization()` "
|
|
331
|
-
"before `hv.launch()`."
|
|
340
|
+
"or `dataset.set_coords()` before `hv.launch()`."
|
|
332
341
|
)
|
|
333
342
|
|
|
334
343
|
if not layouts:
|
|
335
|
-
|
|
336
|
-
default_geometry = "euclidean" if has_euclidean_space else "poincare"
|
|
344
|
+
default_layout = _resolve_default_launch_layout(dataset)
|
|
337
345
|
|
|
338
|
-
print(f"No
|
|
346
|
+
print(f"No visualizations found. Computing {default_layout} visualization...")
|
|
339
347
|
# Let compute_visualization pick the most appropriate default space.
|
|
340
|
-
dataset.compute_visualization(
|
|
348
|
+
dataset.compute_visualization(
|
|
349
|
+
space_key=None,
|
|
350
|
+
layout=default_layout,
|
|
351
|
+
)
|
|
341
352
|
|
|
342
353
|
session = Session(dataset, host, port)
|
|
343
354
|
|
|
@@ -390,9 +401,6 @@ def _is_colab() -> bool:
|
|
|
390
401
|
"""Check if running inside a Google Colab notebook runtime."""
|
|
391
402
|
if os.environ.get("COLAB_RELEASE_TAG"):
|
|
392
403
|
return True
|
|
393
|
-
|
|
394
|
-
import google.colab # type: ignore[import-not-found]
|
|
395
|
-
|
|
404
|
+
if find_spec("google.colab") is not None:
|
|
396
405
|
return True
|
|
397
|
-
|
|
398
|
-
return False
|
|
406
|
+
return False
|
|
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|
|
5
5
|
import argparse
|
|
6
6
|
|
|
7
7
|
from hyperview import Dataset, launch
|
|
8
|
+
from hyperview.core.dataset import parse_visualization_layout
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
def _build_parser() -> argparse.ArgumentParser:
|
|
@@ -38,6 +39,12 @@ def _build_parser() -> argparse.ArgumentParser:
|
|
|
38
39
|
default=None,
|
|
39
40
|
help="HuggingFace split to use (required with --hf-dataset)",
|
|
40
41
|
)
|
|
42
|
+
parser.add_argument(
|
|
43
|
+
"--hf-config",
|
|
44
|
+
type=str,
|
|
45
|
+
default=None,
|
|
46
|
+
help="Optional HuggingFace subset/configuration to use",
|
|
47
|
+
)
|
|
41
48
|
parser.add_argument(
|
|
42
49
|
"--image-key",
|
|
43
50
|
type=str,
|
|
@@ -72,6 +79,14 @@ def _build_parser() -> argparse.ArgumentParser:
|
|
|
72
79
|
default=None,
|
|
73
80
|
help="Maximum number of ingested samples (omit to load all)",
|
|
74
81
|
)
|
|
82
|
+
parser.add_argument(
|
|
83
|
+
"--hf-streaming",
|
|
84
|
+
action="store_true",
|
|
85
|
+
help=(
|
|
86
|
+
"Stream HuggingFace rows instead of materializing the full split first. "
|
|
87
|
+
"Useful for loading subsets without eager full-split downloads."
|
|
88
|
+
),
|
|
89
|
+
)
|
|
75
90
|
parser.add_argument(
|
|
76
91
|
"--shuffle",
|
|
77
92
|
action="store_true",
|
|
@@ -83,6 +98,15 @@ def _build_parser() -> argparse.ArgumentParser:
|
|
|
83
98
|
default=42,
|
|
84
99
|
help="Random seed used when --shuffle is enabled (default: 42)",
|
|
85
100
|
)
|
|
101
|
+
parser.add_argument(
|
|
102
|
+
"--hf-shuffle-buffer-size",
|
|
103
|
+
type=int,
|
|
104
|
+
default=1000,
|
|
105
|
+
help=(
|
|
106
|
+
"Shuffle buffer size used with --hf-streaming and --shuffle. "
|
|
107
|
+
"Streaming shuffle is approximate and trades larger buffers for more read-ahead."
|
|
108
|
+
),
|
|
109
|
+
)
|
|
86
110
|
|
|
87
111
|
parser.add_argument(
|
|
88
112
|
"--model",
|
|
@@ -95,17 +119,20 @@ def _build_parser() -> argparse.ArgumentParser:
|
|
|
95
119
|
)
|
|
96
120
|
parser.add_argument(
|
|
97
121
|
"--method",
|
|
98
|
-
choices=["umap"],
|
|
122
|
+
choices=["umap", "pca"],
|
|
99
123
|
default="umap",
|
|
100
|
-
help="Projection method (
|
|
124
|
+
help="Projection method: 'umap' (default) or 'pca'",
|
|
101
125
|
)
|
|
102
126
|
parser.add_argument(
|
|
103
|
-
"--
|
|
104
|
-
|
|
105
|
-
|
|
127
|
+
"--layout",
|
|
128
|
+
action="append",
|
|
129
|
+
dest="layouts",
|
|
130
|
+
metavar="GEOMETRY[:2d|3d]",
|
|
106
131
|
help=(
|
|
107
|
-
"
|
|
108
|
-
"
|
|
132
|
+
"Visualization layout to compute. Repeat this flag to request multiple layouts, "
|
|
133
|
+
"for example '--layout euclidean --layout spherical'. "
|
|
134
|
+
"Omitting the suffix defaults to 2D for euclidean/poincare and 3D for spherical. "
|
|
135
|
+
"If omitted, HyperView picks one sensible default layout for the selected embedding space."
|
|
109
136
|
),
|
|
110
137
|
)
|
|
111
138
|
parser.add_argument(
|
|
@@ -162,6 +189,23 @@ def _build_parser() -> argparse.ArgumentParser:
|
|
|
162
189
|
|
|
163
190
|
|
|
164
191
|
def _validate_args(parser: argparse.ArgumentParser, args: argparse.Namespace) -> None:
|
|
192
|
+
if args.layouts:
|
|
193
|
+
canonical_layouts: list[str] = []
|
|
194
|
+
seen_layouts: set[str] = set()
|
|
195
|
+
for layout_spec in args.layouts:
|
|
196
|
+
try:
|
|
197
|
+
geometry, layout_dimension = parse_visualization_layout(layout_spec)
|
|
198
|
+
except ValueError as exc:
|
|
199
|
+
parser.error(str(exc))
|
|
200
|
+
|
|
201
|
+
canonical_layout = f"{geometry}:{layout_dimension}d"
|
|
202
|
+
if canonical_layout in seen_layouts:
|
|
203
|
+
continue
|
|
204
|
+
seen_layouts.add(canonical_layout)
|
|
205
|
+
canonical_layouts.append(canonical_layout)
|
|
206
|
+
|
|
207
|
+
args.layouts = canonical_layouts
|
|
208
|
+
|
|
165
209
|
if args.hf_dataset and args.images_dir:
|
|
166
210
|
parser.error("Use either --hf-dataset or --images-dir, not both.")
|
|
167
211
|
|
|
@@ -181,6 +225,8 @@ def _validate_args(parser: argparse.ArgumentParser, args: argparse.Namespace) ->
|
|
|
181
225
|
parser.error("--split is required when using --hf-dataset.")
|
|
182
226
|
if not args.image_key:
|
|
183
227
|
parser.error("--image-key is required when using --hf-dataset.")
|
|
228
|
+
if args.hf_shuffle_buffer_size < 1:
|
|
229
|
+
parser.error("--hf-shuffle-buffer-size must be at least 1.")
|
|
184
230
|
|
|
185
231
|
|
|
186
232
|
def _print_ingestion_result(added: int, skipped: int) -> None:
|
|
@@ -191,9 +237,11 @@ def _print_ingestion_result(added: int, skipped: int) -> None:
|
|
|
191
237
|
|
|
192
238
|
|
|
193
239
|
def _ingest_huggingface(dataset: Dataset, args: argparse.Namespace, dataset_name: str) -> None:
|
|
194
|
-
|
|
240
|
+
config_suffix = f" [{args.hf_config}]" if args.hf_config else ""
|
|
241
|
+
print(f"Loading HuggingFace dataset {dataset_name}{config_suffix}...")
|
|
195
242
|
added, skipped = dataset.add_from_huggingface(
|
|
196
243
|
dataset_name,
|
|
244
|
+
config=args.hf_config,
|
|
197
245
|
split=args.split,
|
|
198
246
|
image_key=args.image_key,
|
|
199
247
|
label_key=args.label_key,
|
|
@@ -201,6 +249,8 @@ def _ingest_huggingface(dataset: Dataset, args: argparse.Namespace, dataset_name
|
|
|
201
249
|
max_samples=args.samples,
|
|
202
250
|
shuffle=args.shuffle,
|
|
203
251
|
seed=args.seed,
|
|
252
|
+
streaming=args.hf_streaming,
|
|
253
|
+
shuffle_buffer_size=args.hf_shuffle_buffer_size,
|
|
204
254
|
)
|
|
205
255
|
_print_ingestion_result(added, skipped)
|
|
206
256
|
|
|
@@ -228,37 +278,35 @@ def _prepare_dataset(args: argparse.Namespace) -> Dataset:
|
|
|
228
278
|
return dataset
|
|
229
279
|
|
|
230
280
|
|
|
231
|
-
def
|
|
281
|
+
def _resolve_default_layouts(
|
|
232
282
|
dataset: Dataset,
|
|
233
|
-
geometry: str,
|
|
234
283
|
space_key: str | None,
|
|
235
284
|
) -> list[str]:
|
|
236
|
-
if geometry == "both":
|
|
237
|
-
return ["euclidean", "poincare"]
|
|
238
|
-
|
|
239
|
-
if geometry in ("euclidean", "poincare"):
|
|
240
|
-
return [geometry]
|
|
241
|
-
|
|
242
|
-
if space_key is None:
|
|
243
|
-
return ["euclidean"]
|
|
244
|
-
|
|
245
285
|
spaces = dataset.list_spaces()
|
|
246
286
|
selected = next((space for space in spaces if space.space_key == space_key), None)
|
|
247
|
-
if selected is not None and selected.geometry == "hyperboloid":
|
|
248
|
-
return ["poincare"]
|
|
249
287
|
|
|
250
|
-
|
|
288
|
+
if selected is not None:
|
|
289
|
+
if selected.geometry == "hyperboloid":
|
|
290
|
+
return ["poincare:2d"]
|
|
291
|
+
if selected.geometry == "hypersphere":
|
|
292
|
+
return ["spherical:3d"]
|
|
293
|
+
return ["euclidean:2d"]
|
|
251
294
|
|
|
295
|
+
if any(space.geometry not in ("hyperboloid", "hypersphere") for space in spaces):
|
|
296
|
+
return ["euclidean:2d"]
|
|
297
|
+
if any(space.geometry == "hypersphere" for space in spaces):
|
|
298
|
+
return ["spherical:3d"]
|
|
299
|
+
return ["poincare:2d"]
|
|
252
300
|
|
|
253
301
|
def _compute_layouts(dataset: Dataset, args: argparse.Namespace, space_key: str | None) -> None:
|
|
254
|
-
|
|
302
|
+
target_layouts = args.layouts or _resolve_default_layouts(dataset, space_key)
|
|
255
303
|
|
|
256
304
|
print("Computing visualizations...")
|
|
257
|
-
for
|
|
305
|
+
for target_layout in target_layouts:
|
|
258
306
|
dataset.compute_visualization(
|
|
259
307
|
space_key=space_key,
|
|
260
308
|
method=args.method,
|
|
261
|
-
|
|
309
|
+
layout=target_layout,
|
|
262
310
|
n_neighbors=args.n_neighbors,
|
|
263
311
|
min_dist=args.min_dist,
|
|
264
312
|
metric=args.metric,
|