hf2vespa 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hf2vespa/cli.py CHANGED
@@ -1,6 +1,8 @@
1
1
  # Suppress warnings that occur during cleanup. These are harmless but confusing to users.
2
2
  # 1. HuggingFace HTTP retry warnings: https://github.com/apache/arrow/issues/45214
3
3
  # 2. Multiprocessing resource tracker warnings (leaked semaphores from HF datasets)
4
+ # 3. Python 3.14+ SyntaxWarning for 'return' in 'finally' blocks (raised by the third-party 'multiprocess' package, not the stdlib 'multiprocessing' module)
5
+ # See: https://peps.python.org/pep-0765/
4
6
  #
5
7
  # IMPORTANT: Set PYTHONWARNINGS env var BEFORE any imports that might trigger multiprocessing.
6
8
  # The resource_tracker runs as a separate daemon process and inherits the env at spawn time.
@@ -8,18 +10,23 @@
8
10
  import os as _os
9
11
 
10
12
  _existing_warnings = _os.environ.get("PYTHONWARNINGS", "")
11
- _new_filter = "ignore::UserWarning:multiprocessing.resource_tracker"
13
+ _new_filters = [
14
+ "ignore::UserWarning:multiprocessing.resource_tracker",
15
+ "ignore::SyntaxWarning:multiprocess",
16
+ ]
17
+ _combined = ",".join(_new_filters)
12
18
  if _existing_warnings:
13
- _os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_new_filter}"
19
+ _os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_combined}"
14
20
  else:
15
- _os.environ["PYTHONWARNINGS"] = _new_filter
16
- del _os, _existing_warnings, _new_filter
21
+ _os.environ["PYTHONWARNINGS"] = _combined
22
+ del _os, _existing_warnings, _new_filters, _combined
17
23
 
18
24
  import logging as _logging
19
25
  import warnings as _warnings
20
26
 
21
27
  _logging.getLogger("huggingface_hub.utils._http").setLevel(_logging.CRITICAL)
22
28
  _warnings.filterwarnings("ignore", message="resource_tracker:", category=UserWarning)
29
+ _warnings.filterwarnings("ignore", category=SyntaxWarning, module="multiprocess")
23
30
  del _logging, _warnings
24
31
 
25
32
  """CLI for streaming HuggingFace datasets to Vespa JSON format."""
@@ -280,20 +287,26 @@ def feed(
280
287
 
281
288
  Examples:
282
289
 
283
- # Basic usage
284
- $ hf2vespa feed glue --split test --config ax
290
+ # Basic streaming
291
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --limit 10
285
292
 
286
- # Filter columns
287
- $ hf2vespa feed glue --split test --config ax --include premise --include hypothesis
293
+ # Rename columns
294
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
295
+
296
+ # Filter specific columns
297
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --include title --include text --limit 5
288
298
 
289
299
  # Custom namespace and doctype
290
- $ hf2vespa feed squad --namespace wiki --doctype article
300
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --namespace search --doctype passage --limit 10
301
+
302
+ # Use config file for complex mappings
303
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --config-file vespa-config.yaml --limit 10
291
304
 
292
- # Use config file
293
- $ hf2vespa feed glue --config ax --config-file mappings.yaml
305
+ # Skip errors instead of failing
306
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --on-error skip --limit 10
294
307
 
295
- # Preview first 10 records
296
- $ hf2vespa feed squad --limit 10
308
+ # Pipe directly to Vespa
309
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --limit 1000 | vespa feed -
297
310
  """
298
311
  feed_impl(
299
312
  dataset=dataset,
@@ -336,14 +349,14 @@ def init(
336
349
 
337
350
  Examples:
338
351
 
339
- # Generate config for a dataset
340
- $ hf2vespa init glue --config ax
352
+ # Generate config for MS MARCO corpus
353
+ hf2vespa init mteb/msmarco-v2 --config corpus -o msmarco-config.yaml
341
354
 
342
355
  # Specify output file
343
- $ hf2vespa init squad --output my-config.yaml
356
+ hf2vespa init mteb/msmarco-v2 --config corpus --output my-config.yaml
344
357
 
345
- # Inspect a specific split
346
- $ hf2vespa init my-dataset --split validation
358
+ # Generate config for Cohere embeddings dataset
359
+ hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
347
360
  """
348
361
  from hf2vespa.init import init_command
349
362
 
@@ -363,11 +376,18 @@ def install_completion(
363
376
 
364
377
  Detects your shell automatically, or specify explicitly.
365
378
 
379
+ After installation, restart your shell or source your shell config file
380
+ (e.g., source ~/.bashrc).
381
+
366
382
  Examples:
367
383
 
368
- hf2vespa install-completion # Auto-detect shell
369
- hf2vespa install-completion bash # Explicit bash
370
- hf2vespa install-completion zsh # Explicit zsh
384
+ # Auto-detect shell
385
+ hf2vespa install-completion
386
+
387
+ # Explicit shell
388
+ hf2vespa install-completion bash
389
+ hf2vespa install-completion zsh
390
+ hf2vespa install-completion fish
371
391
  """
372
392
  from typer._completion_shared import Shells, install
373
393
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hf2vespa
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Stream HuggingFace datasets to Vespa JSON format
5
5
  Author-email: Thomas Thoresen <thomas.h.thoresen@gmail.com>
6
6
  License: Apache-2.0
@@ -28,7 +28,7 @@ Requires-Dist: ruamel.yaml>=0.19.0
28
28
 
29
29
  Stream HuggingFace datasets to Vespa JSON format
30
30
 
31
- [![asciicast](https://asciinema.org/a/kdD2bsVNFUL51Era.svg)](https://asciinema.org/a/kdD2bsVNFUL51Era)
31
+ [![asciicast](https://asciinema.org/a/VGODiMSiXua4FX7w.svg)](https://asciinema.org/a/VGODiMSiXua4FX7w?speed=2)
32
32
 
33
33
  ## Description
34
34
 
@@ -73,23 +73,23 @@ uv tool install .
73
73
  Stream a HuggingFace dataset to Vespa JSON format:
74
74
 
75
75
  ```bash
76
- hf2vespa feed glue --config ax --split test --limit 5
76
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 3
77
77
  ```
78
78
 
79
79
  **Output:**
80
80
  ```json
81
- {"put":"id:doc:doc::0","fields":{"premise":"The cat sat on the mat.","hypothesis":"The cat did not sit on the mat.","label":-1,"idx":0}}
82
- {"put":"id:doc:doc::1","fields":{"premise":"The cat did not sit on the mat.","hypothesis":"The cat sat on the mat.","label":-1,"idx":1}}
83
- {"put":"id:doc:doc::2","fields":{"premise":"When you've got no snow...","hypothesis":"When you've got snow...","label":-1,"idx":2}}
81
+ {"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews."}}
82
+ {"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"This allow for a more accurate measure, as does running the test first in one direction and then in the exact opposite direction..."}}
83
+ {"put":"id:doc:doc::2","fields":{"id":"00_587","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"Instead, some believe the measure should include a range of times rather than one finite mark..."}}
84
84
  ```
85
85
 
86
86
  ```
87
87
  --- Completion Statistics ---
88
- Total records processed: 5
89
- Successful: 5
88
+ Total records processed: 3
89
+ Successful: 3
90
90
  Errors: 0
91
- Throughput: 2.1 records/sec
92
- Elapsed time: 2.38s
91
+ Throughput: 4.5 records/sec
92
+ Elapsed time: 0.67s
93
93
  ```
94
94
 
95
95
  ### Preview Dataset Schema
@@ -97,33 +97,115 @@ Elapsed time: 2.38s
97
97
  Inspect a dataset and generate a YAML configuration template:
98
98
 
99
99
  ```bash
100
- hf2vespa init glue --config ax --split test --output config.yaml
100
+ hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
101
101
  ```
102
102
 
103
- **Generated config.yaml:**
103
+ **Generated cohere-config.yaml:**
104
104
  ```yaml
105
105
  namespace: doc
106
106
  doctype: doc
107
- id_column: # null = auto-increment
107
+ id_column:
108
108
 
109
109
  mappings:
110
- - source: premise
111
- target: premise
110
+ - source: _id
111
+ target: _id
112
112
  type: # string
113
- - source: hypothesis
114
- target: hypothesis
113
+ - source: url
114
+ target: url
115
+ type: # string
116
+ - source: title
117
+ target: title
118
+ type: # string
119
+ - source: text
120
+ target: text
115
121
  type: # string
122
+ - source: emb
123
+ target: emb
124
+ type: tensor # Sequence[float32] -> suggested: tensor
116
125
  ```
117
126
 
118
127
  ### Use Config File
119
128
 
120
- Apply the generated configuration:
129
+ Edit the config to customize type conversions, then apply it:
130
+
131
+ ```bash
132
+ hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 5
133
+ ```
134
+
135
+ See [Two Modes of Operation](#two-modes-of-operation) below for a complete example with bfloat16 hex encoding.
136
+
137
+ ## Two Modes of Operation
138
+
139
+ hf2vespa supports two modes depending on your needs:
140
+
141
+ ### CLI Mode (Quick & Simple)
142
+
143
+ Use CLI arguments when you need:
144
+ - Column renaming (`--rename old:new`)
145
+ - Column filtering (`--include col1 --include col2`)
146
+ - Custom namespace/doctype (`--namespace`, `--doctype`)
147
+ - A quick preview of the data structure
148
+
149
+ **Example:** Rename columns and stream MS MARCO corpus:
150
+
151
+ ```bash
152
+ hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
153
+ ```
154
+
155
+ **Output:**
156
+ ```json
157
+ {"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times...","text":"0-60 Times - 0-60 | 0 to 60 Times..."}}
158
+ {"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times...","text":"This allow for a more accurate measure..."}}
159
+ ```
160
+
161
+ ### Config File Mode (Advanced)
162
+
163
+ Use `hf2vespa init` + YAML config when you need:
164
+ - Type conversions (tensor, hex-encoded formats)
165
+ - bfloat16/int8 quantized embeddings
166
+ - Sparse or mixed tensors
167
+ - Complex multi-field transformations
168
+
169
+ **Example:** Convert Cohere embeddings to hex-encoded bfloat16:
170
+
171
+ 1. Generate a config template:
172
+ ```bash
173
+ hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --output cohere-config.yaml
174
+ ```
175
+
176
+ 2. Edit the config to use `tensor_bfloat16_hex` for the embedding field:
177
+ ```yaml
178
+ # cohere-config.yaml
179
+ namespace: doc
180
+ doctype: doc
181
+ id_column:
182
+
183
+ mappings:
184
+ - source: _id
185
+ target: _id
186
+ - source: url
187
+ target: url
188
+ - source: title
189
+ target: title
190
+ - source: text
191
+ target: text
192
+ - source: emb
193
+ target: emb
194
+ type: tensor_bfloat16_hex # Convert to hex-encoded bfloat16
195
+ ```
121
196
 
197
+ 3. Stream with the config file:
122
198
  ```bash
123
- hf2vespa feed glue --config ax --split test --config-file config.yaml --limit 5
199
+ hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 2
200
+ ```
201
+
202
+ **Output:**
203
+ ```json
204
+ {"put":"id:doc:doc::0","fields":{"_id":"20231101.en_13194570_0","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"The British Arab Commercial Bank PLC (BACB) is an international wholesale bank...","emb":{"values":"3aeabd253b963d1a3b833d8f3d8bbb16bc3e3b01..."}}}
205
+ {"put":"id:doc:doc::1","fields":{"_id":"20231101.en_13194570_1","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"BACB has a head office in London...","emb":{"values":"3baabcd7bc3c3d623cc13d853d94ba8dbb45bcb5..."}}}
124
206
  ```
125
207
 
126
- The config file defines field mappings, document IDs, and type conversions (e.g., converting lists to Vespa tensor format).
208
+ The `emb` field is now hex-encoded bfloat16, reducing storage size by 50% compared to float32.
127
209
 
128
210
  ## YAML Configuration
129
211
 
@@ -1,14 +1,14 @@
1
1
  hf2vespa/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
2
2
  hf2vespa/__main__.py,sha256=8swxmM2GAunJQ2Qs91RDMvu28RxSGTiDwuMpgKy4plQ,67
3
- hf2vespa/cli.py,sha256=NKeddkKC2N58gnOFH_XFsWXGTvRMC80SWsztTTyH2o4,15949
3
+ hf2vespa/cli.py,sha256=wtyTdYlxn8UyZ7OufB6wJ3tyW73fBDVGecNxat1WhKc,17018
4
4
  hf2vespa/config.py,sha256=JmUxnQcEm_XGg4llbAkQnt28HYgWeh2ldyEsk7ZVgMU,3686
5
5
  hf2vespa/converters.py,sha256=sLzOQolvUez2ZymOGS3asiGFDDgqns8chpcoIzMkfME,20685
6
6
  hf2vespa/init.py,sha256=CF4p9LMLCbwV_OehTsu036p8vjtWK8TngXU9fd_v7SM,10866
7
7
  hf2vespa/pipeline.py,sha256=7q9NIF6GhbgcBXx2Jckxh0tcXi7rMJmjcwkXFiPi_tQ,7145
8
8
  hf2vespa/stats.py,sha256=1Os61QpIpDJKthXWE5oWmK_SHx4bZUkcgVIK6t16ppk,1944
9
9
  hf2vespa/utils.py,sha256=KGV-YwKaO6IPtEpb9NnRrHMxfBOMfjZzrtYJJSUCe14,1706
10
- hf2vespa-0.1.0.dist-info/METADATA,sha256=pViQBDeYHOA6n1cB4iksr6ViKuv-Z8cTCt0prxNWK3A,20060
11
- hf2vespa-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
12
- hf2vespa-0.1.0.dist-info/entry_points.txt,sha256=R-1FE95nsxKVDqMWPPcXIQT0FL6J4ZWE-Sri68retXE,46
13
- hf2vespa-0.1.0.dist-info/top_level.txt,sha256=Xul9tbYYe1Qw2uYuf-tQiPaPdWRPYsH6K3F2LO6X_lI,9
14
- hf2vespa-0.1.0.dist-info/RECORD,,
10
+ hf2vespa-0.1.2.dist-info/METADATA,sha256=VvjQOSoMorGuIV5gEtob_syvkLfIeBYGWxCauNMsWK4,23169
11
+ hf2vespa-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
12
+ hf2vespa-0.1.2.dist-info/entry_points.txt,sha256=R-1FE95nsxKVDqMWPPcXIQT0FL6J4ZWE-Sri68retXE,46
13
+ hf2vespa-0.1.2.dist-info/top_level.txt,sha256=Xul9tbYYe1Qw2uYuf-tQiPaPdWRPYsH6K3F2LO6X_lI,9
14
+ hf2vespa-0.1.2.dist-info/RECORD,,