hf2vespa-0.1.0-py3-none-any.whl → hf2vespa-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hf2vespa/cli.py +41 -21
- {hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/METADATA +102 -20
- {hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/RECORD +6 -6
- {hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/WHEEL +0 -0
- {hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/entry_points.txt +0 -0
- {hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/top_level.txt +0 -0
hf2vespa/cli.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# Suppress warnings that occur during cleanup. These are harmless but confusing to users.
|
|
2
2
|
# 1. HuggingFace HTTP retry warnings: https://github.com/apache/arrow/issues/45214
|
|
3
3
|
# 2. Multiprocessing resource tracker warnings (leaked semaphores from HF datasets)
|
|
4
|
+
# 3. Python 3.14+ SyntaxWarning for 'return' in 'finally' blocks (multiprocess package)
|
|
5
|
+
# See: https://peps.python.org/pep-0765/
|
|
4
6
|
#
|
|
5
7
|
# IMPORTANT: Set PYTHONWARNINGS env var BEFORE any imports that might trigger multiprocessing.
|
|
6
8
|
# The resource_tracker runs as a separate daemon process and inherits the env at spawn time.
|
|
@@ -8,18 +10,23 @@
|
|
|
8
10
|
import os as _os
|
|
9
11
|
|
|
10
12
|
_existing_warnings = _os.environ.get("PYTHONWARNINGS", "")
|
|
11
|
-
|
|
13
|
+
_new_filters = [
|
|
14
|
+
"ignore::UserWarning:multiprocessing.resource_tracker",
|
|
15
|
+
"ignore::SyntaxWarning:multiprocess",
|
|
16
|
+
]
|
|
17
|
+
_combined = ",".join(_new_filters)
|
|
12
18
|
if _existing_warnings:
|
|
13
|
-
_os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{
|
|
19
|
+
_os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_combined}"
|
|
14
20
|
else:
|
|
15
|
-
_os.environ["PYTHONWARNINGS"] =
|
|
16
|
-
del _os, _existing_warnings,
|
|
21
|
+
_os.environ["PYTHONWARNINGS"] = _combined
|
|
22
|
+
del _os, _existing_warnings, _new_filters, _combined
|
|
17
23
|
|
|
18
24
|
import logging as _logging
|
|
19
25
|
import warnings as _warnings
|
|
20
26
|
|
|
21
27
|
_logging.getLogger("huggingface_hub.utils._http").setLevel(_logging.CRITICAL)
|
|
22
28
|
_warnings.filterwarnings("ignore", message="resource_tracker:", category=UserWarning)
|
|
29
|
+
_warnings.filterwarnings("ignore", category=SyntaxWarning, module="multiprocess")
|
|
23
30
|
del _logging, _warnings
|
|
24
31
|
|
|
25
32
|
"""CLI for streaming HuggingFace datasets to Vespa JSON format."""
|
|
@@ -280,20 +287,26 @@ def feed(
|
|
|
280
287
|
|
|
281
288
|
Examples:
|
|
282
289
|
|
|
283
|
-
# Basic
|
|
284
|
-
|
|
290
|
+
# Basic streaming
|
|
291
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --limit 10
|
|
285
292
|
|
|
286
|
-
#
|
|
287
|
-
|
|
293
|
+
# Rename columns
|
|
294
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
|
|
295
|
+
|
|
296
|
+
# Filter specific columns
|
|
297
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --include title --include text --limit 5
|
|
288
298
|
|
|
289
299
|
# Custom namespace and doctype
|
|
290
|
-
|
|
300
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --namespace search --doctype passage --limit 10
|
|
301
|
+
|
|
302
|
+
# Use config file for complex mappings
|
|
303
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --config-file vespa-config.yaml --limit 10
|
|
291
304
|
|
|
292
|
-
#
|
|
293
|
-
|
|
305
|
+
# Skip errors instead of failing
|
|
306
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --on-error skip --limit 10
|
|
294
307
|
|
|
295
|
-
#
|
|
296
|
-
|
|
308
|
+
# Pipe directly to Vespa
|
|
309
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --limit 1000 | vespa feed -
|
|
297
310
|
"""
|
|
298
311
|
feed_impl(
|
|
299
312
|
dataset=dataset,
|
|
@@ -336,14 +349,14 @@ def init(
|
|
|
336
349
|
|
|
337
350
|
Examples:
|
|
338
351
|
|
|
339
|
-
# Generate config for
|
|
340
|
-
|
|
352
|
+
# Generate config for MS MARCO corpus
|
|
353
|
+
hf2vespa init mteb/msmarco-v2 --config corpus -o msmarco-config.yaml
|
|
341
354
|
|
|
342
355
|
# Specify output file
|
|
343
|
-
|
|
356
|
+
hf2vespa init mteb/msmarco-v2 --config corpus --output my-config.yaml
|
|
344
357
|
|
|
345
|
-
#
|
|
346
|
-
|
|
358
|
+
# Generate config for Cohere embeddings dataset
|
|
359
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
|
|
347
360
|
"""
|
|
348
361
|
from hf2vespa.init import init_command
|
|
349
362
|
|
|
@@ -363,11 +376,18 @@ def install_completion(
|
|
|
363
376
|
|
|
364
377
|
Detects your shell automatically, or specify explicitly.
|
|
365
378
|
|
|
379
|
+
After installation, restart your shell or source your shell config file
|
|
380
|
+
(e.g., source ~/.bashrc).
|
|
381
|
+
|
|
366
382
|
Examples:
|
|
367
383
|
|
|
368
|
-
|
|
369
|
-
hf2vespa install-completion
|
|
370
|
-
|
|
384
|
+
# Auto-detect shell
|
|
385
|
+
hf2vespa install-completion
|
|
386
|
+
|
|
387
|
+
# Explicit shell
|
|
388
|
+
hf2vespa install-completion bash
|
|
389
|
+
hf2vespa install-completion zsh
|
|
390
|
+
hf2vespa install-completion fish
|
|
371
391
|
"""
|
|
372
392
|
from typer._completion_shared import Shells, install
|
|
373
393
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hf2vespa
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Stream HuggingFace datasets to Vespa JSON format
|
|
5
5
|
Author-email: Thomas Thoresen <thomas.h.thoresen@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -28,7 +28,7 @@ Requires-Dist: ruamel.yaml>=0.19.0
|
|
|
28
28
|
|
|
29
29
|
Stream HuggingFace datasets to Vespa JSON format
|
|
30
30
|
|
|
31
|
-
[asciinema demo](https://asciinema.org/a/VGODiMSiXua4FX7w?speed=2)
|
|
32
32
|
|
|
33
33
|
## Description
|
|
34
34
|
|
|
@@ -73,23 +73,23 @@ uv tool install .
|
|
|
73
73
|
Stream a HuggingFace dataset to Vespa JSON format:
|
|
74
74
|
|
|
75
75
|
```bash
|
|
76
|
-
hf2vespa feed
|
|
76
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 3
|
|
77
77
|
```
|
|
78
78
|
|
|
79
79
|
**Output:**
|
|
80
80
|
```json
|
|
81
|
-
{"put":"id:doc:doc::0","fields":{"
|
|
82
|
-
{"put":"id:doc:doc::1","fields":{"
|
|
83
|
-
{"put":"id:doc:doc::2","fields":{"
|
|
81
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews."}}
|
|
82
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"This allow for a more accurate measure, as does running the test first in one direction and then in the exact opposite direction..."}}
|
|
83
|
+
{"put":"id:doc:doc::2","fields":{"id":"00_587","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"Instead, some believe the measure should include a range of times rather than one finite mark..."}}
|
|
84
84
|
```
|
|
85
85
|
|
|
86
86
|
```
|
|
87
87
|
--- Completion Statistics ---
|
|
88
|
-
Total records processed:
|
|
89
|
-
Successful:
|
|
88
|
+
Total records processed: 3
|
|
89
|
+
Successful: 3
|
|
90
90
|
Errors: 0
|
|
91
|
-
Throughput:
|
|
92
|
-
Elapsed time:
|
|
91
|
+
Throughput: 4.5 records/sec
|
|
92
|
+
Elapsed time: 0.67s
|
|
93
93
|
```
|
|
94
94
|
|
|
95
95
|
### Preview Dataset Schema
|
|
@@ -97,33 +97,115 @@ Elapsed time: 2.38s
|
|
|
97
97
|
Inspect a dataset and generate a YAML configuration template:
|
|
98
98
|
|
|
99
99
|
```bash
|
|
100
|
-
hf2vespa init
|
|
100
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
|
|
101
101
|
```
|
|
102
102
|
|
|
103
|
-
**Generated config.yaml:**
|
|
103
|
+
**Generated cohere-config.yaml:**
|
|
104
104
|
```yaml
|
|
105
105
|
namespace: doc
|
|
106
106
|
doctype: doc
|
|
107
|
-
id_column:
|
|
107
|
+
id_column:
|
|
108
108
|
|
|
109
109
|
mappings:
|
|
110
|
-
- source:
|
|
111
|
-
target:
|
|
110
|
+
- source: _id
|
|
111
|
+
target: _id
|
|
112
112
|
type: # string
|
|
113
|
-
- source:
|
|
114
|
-
target:
|
|
113
|
+
- source: url
|
|
114
|
+
target: url
|
|
115
|
+
type: # string
|
|
116
|
+
- source: title
|
|
117
|
+
target: title
|
|
118
|
+
type: # string
|
|
119
|
+
- source: text
|
|
120
|
+
target: text
|
|
115
121
|
type: # string
|
|
122
|
+
- source: emb
|
|
123
|
+
target: emb
|
|
124
|
+
type: tensor # Sequence[float32] -> suggested: tensor
|
|
116
125
|
```
|
|
117
126
|
|
|
118
127
|
### Use Config File
|
|
119
128
|
|
|
120
|
-
|
|
129
|
+
Edit the config to customize type conversions, then apply it:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 5
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
See [Two Modes of Operation](#two-modes-of-operation) below for a complete example with bfloat16 hex encoding.
|
|
136
|
+
|
|
137
|
+
## Two Modes of Operation
|
|
138
|
+
|
|
139
|
+
hf2vespa supports two modes depending on your needs:
|
|
140
|
+
|
|
141
|
+
### CLI Mode (Quick & Simple)
|
|
142
|
+
|
|
143
|
+
Use CLI arguments when you need:
|
|
144
|
+
- Column renaming (`--rename old:new`)
|
|
145
|
+
- Column filtering (`--include col1 --include col2`)
|
|
146
|
+
- Custom namespace/doctype (`--namespace`, `--doctype`)
|
|
147
|
+
- Preview data structure
|
|
148
|
+
|
|
149
|
+
**Example:** Rename columns and stream MS MARCO corpus:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**Output:**
|
|
156
|
+
```json
|
|
157
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times...","text":"0-60 Times - 0-60 | 0 to 60 Times..."}}
|
|
158
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times...","text":"This allow for a more accurate measure..."}}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Config File Mode (Advanced)
|
|
162
|
+
|
|
163
|
+
Use `hf2vespa init` + YAML config when you need:
|
|
164
|
+
- Type conversions (tensor, hex-encoded formats)
|
|
165
|
+
- bfloat16/int8 quantized embeddings
|
|
166
|
+
- Sparse or mixed tensors
|
|
167
|
+
- Complex multi-field transformations
|
|
168
|
+
|
|
169
|
+
**Example:** Convert Cohere embeddings to hex-encoded bfloat16:
|
|
170
|
+
|
|
171
|
+
1. Generate a config template:
|
|
172
|
+
```bash
|
|
173
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --output cohere-config.yaml
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
2. Edit the config to use `tensor_bfloat16_hex` for the embedding field:
|
|
177
|
+
```yaml
|
|
178
|
+
# cohere-config.yaml
|
|
179
|
+
namespace: doc
|
|
180
|
+
doctype: doc
|
|
181
|
+
id_column:
|
|
182
|
+
|
|
183
|
+
mappings:
|
|
184
|
+
- source: _id
|
|
185
|
+
target: _id
|
|
186
|
+
- source: url
|
|
187
|
+
target: url
|
|
188
|
+
- source: title
|
|
189
|
+
target: title
|
|
190
|
+
- source: text
|
|
191
|
+
target: text
|
|
192
|
+
- source: emb
|
|
193
|
+
target: emb
|
|
194
|
+
type: tensor_bfloat16_hex # Convert to hex-encoded bfloat16
|
|
195
|
+
```
|
|
121
196
|
|
|
197
|
+
3. Stream with the config file:
|
|
122
198
|
```bash
|
|
123
|
-
hf2vespa feed
|
|
199
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 2
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
**Output:**
|
|
203
|
+
```json
|
|
204
|
+
{"put":"id:doc:doc::0","fields":{"_id":"20231101.en_13194570_0","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"The British Arab Commercial Bank PLC (BACB) is an international wholesale bank...","emb":{"values":"3aeabd253b963d1a3b833d8f3d8bbb16bc3e3b01..."}}}
|
|
205
|
+
{"put":"id:doc:doc::1","fields":{"_id":"20231101.en_13194570_1","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"BACB has a head office in London...","emb":{"values":"3baabcd7bc3c3d623cc13d853d94ba8dbb45bcb5..."}}}
|
|
124
206
|
```
|
|
125
207
|
|
|
126
|
-
The
|
|
208
|
+
The `emb` field is now hex-encoded bfloat16, reducing storage size by 50% compared to float32.
|
|
127
209
|
|
|
128
210
|
## YAML Configuration
|
|
129
211
|
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
hf2vespa/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
2
2
|
hf2vespa/__main__.py,sha256=8swxmM2GAunJQ2Qs91RDMvu28RxSGTiDwuMpgKy4plQ,67
|
|
3
|
-
hf2vespa/cli.py,sha256=
|
|
3
|
+
hf2vespa/cli.py,sha256=wtyTdYlxn8UyZ7OufB6wJ3tyW73fBDVGecNxat1WhKc,17018
|
|
4
4
|
hf2vespa/config.py,sha256=JmUxnQcEm_XGg4llbAkQnt28HYgWeh2ldyEsk7ZVgMU,3686
|
|
5
5
|
hf2vespa/converters.py,sha256=sLzOQolvUez2ZymOGS3asiGFDDgqns8chpcoIzMkfME,20685
|
|
6
6
|
hf2vespa/init.py,sha256=CF4p9LMLCbwV_OehTsu036p8vjtWK8TngXU9fd_v7SM,10866
|
|
7
7
|
hf2vespa/pipeline.py,sha256=7q9NIF6GhbgcBXx2Jckxh0tcXi7rMJmjcwkXFiPi_tQ,7145
|
|
8
8
|
hf2vespa/stats.py,sha256=1Os61QpIpDJKthXWE5oWmK_SHx4bZUkcgVIK6t16ppk,1944
|
|
9
9
|
hf2vespa/utils.py,sha256=KGV-YwKaO6IPtEpb9NnRrHMxfBOMfjZzrtYJJSUCe14,1706
|
|
10
|
-
hf2vespa-0.1.
|
|
11
|
-
hf2vespa-0.1.
|
|
12
|
-
hf2vespa-0.1.
|
|
13
|
-
hf2vespa-0.1.
|
|
14
|
-
hf2vespa-0.1.
|
|
10
|
+
hf2vespa-0.1.2.dist-info/METADATA,sha256=VvjQOSoMorGuIV5gEtob_syvkLfIeBYGWxCauNMsWK4,23169
|
|
11
|
+
hf2vespa-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
12
|
+
hf2vespa-0.1.2.dist-info/entry_points.txt,sha256=R-1FE95nsxKVDqMWPPcXIQT0FL6J4ZWE-Sri68retXE,46
|
|
13
|
+
hf2vespa-0.1.2.dist-info/top_level.txt,sha256=Xul9tbYYe1Qw2uYuf-tQiPaPdWRPYsH6K3F2LO6X_lI,9
|
|
14
|
+
hf2vespa-0.1.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|