PyPI - hf2vespa - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

hf2vespa 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

hf2vespa/cli.py CHANGED Viewed

@@ -1,6 +1,8 @@
 # Suppress warnings that occur during cleanup. These are harmless but confusing to users.
 # 1. HuggingFace HTTP retry warnings: https://github.com/apache/arrow/issues/45214
 # 2. Multiprocessing resource tracker warnings (leaked semaphores from HF datasets)
+# 3. Python 3.14+ SyntaxWarning for 'return' in 'finally' blocks (multiprocess package)
+#    See: https://peps.python.org/pep-0765/
 #
 # IMPORTANT: Set PYTHONWARNINGS env var BEFORE any imports that might trigger multiprocessing.
 # The resource_tracker runs as a separate daemon process and inherits the env at spawn time.
@@ -8,18 +10,23 @@
 import os as _os
 _existing_warnings = _os.environ.get("PYTHONWARNINGS", "")
-_new_filter = "ignore::UserWarning:multiprocessing.resource_tracker"
+_new_filters = [
+    "ignore::UserWarning:multiprocessing.resource_tracker",
+    "ignore::SyntaxWarning:multiprocess",
+]
+_combined = ",".join(_new_filters)
 if _existing_warnings:
-    _os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_new_filter}"
+    _os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_combined}"
 else:
-    _os.environ["PYTHONWARNINGS"] = _new_filter
-del _os, _existing_warnings, _new_filter
+    _os.environ["PYTHONWARNINGS"] = _combined
+del _os, _existing_warnings, _new_filters, _combined
 import logging as _logging
 import warnings as _warnings
 _logging.getLogger("huggingface_hub.utils._http").setLevel(_logging.CRITICAL)
 _warnings.filterwarnings("ignore", message="resource_tracker:", category=UserWarning)
+_warnings.filterwarnings("ignore", category=SyntaxWarning, module="multiprocess")
 del _logging, _warnings
 """CLI for streaming HuggingFace datasets to Vespa JSON format."""
@@ -280,20 +287,26 @@ def feed(
     Examples:
-        # Basic usage
-        $ hf2vespa feed glue --split test --config ax
+        # Basic streaming
+        hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --limit 10
-        # Filter columns
-        $ hf2vespa feed glue --split test --config ax --include premise --include hypothesis
+        # Rename columns
+        hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
+        # Filter specific columns
+        hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --include title --include text --limit 5
         # Custom namespace and doctype
-        $ hf2vespa feed squad --namespace wiki --doctype article
+        hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --namespace search --doctype passage --limit 10
+        # Use config file for complex mappings
+        hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --config-file vespa-config.yaml --limit 10
-        # Use config file
-        $ hf2vespa feed glue --config ax --config-file mappings.yaml
+        # Skip errors instead of failing
+        hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --on-error skip --limit 10
-        # Preview first 10 records
-        $ hf2vespa feed squad --limit 10
+        # Pipe directly to Vespa
+        hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --limit 1000 | vespa feed -
     """
     feed_impl(
         dataset=dataset,
@@ -336,14 +349,14 @@ def init(
     Examples:
-        # Generate config for a dataset
-        $ hf2vespa init glue --config ax
+        # Generate config for MS MARCO corpus
+        hf2vespa init mteb/msmarco-v2 --config corpus -o msmarco-config.yaml
         # Specify output file
-        $ hf2vespa init squad --output my-config.yaml
+        hf2vespa init mteb/msmarco-v2 --config corpus --output my-config.yaml
-        # Inspect a specific split
-        $ hf2vespa init my-dataset --split validation
+        # Generate config for Cohere embeddings dataset
+        hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
     """
     from hf2vespa.init import init_command
@@ -363,11 +376,18 @@ def install_completion(
     Detects your shell automatically, or specify explicitly.
+    After installation, restart your shell or source your shell config file
+    (e.g., source ~/.bashrc).
     Examples:
-        hf2vespa install-completion        # Auto-detect shell
-        hf2vespa install-completion bash   # Explicit bash
-        hf2vespa install-completion zsh    # Explicit zsh
+        # Auto-detect shell
+        hf2vespa install-completion
+        # Explicit shell
+        hf2vespa install-completion bash
+        hf2vespa install-completion zsh
+        hf2vespa install-completion fish
     """
     from typer._completion_shared import Shells, install

{hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hf2vespa
-Version: 0.1.0
+Version: 0.1.2
 Summary: Stream HuggingFace datasets to Vespa JSON format
 Author-email: Thomas Thoresen <thomas.h.thoresen@gmail.com>
 License: Apache-2.0
@@ -28,7 +28,7 @@ Requires-Dist: ruamel.yaml>=0.19.0
 Stream HuggingFace datasets to Vespa JSON format
-[![asciicast](https://asciinema.org/a/kdD2bsVNFUL51Era.svg)](https://asciinema.org/a/kdD2bsVNFUL51Era)
+[![asciicast](https://asciinema.org/a/VGODiMSiXua4FX7w.svg)](https://asciinema.org/a/VGODiMSiXua4FX7w?speed=2)
 ## Description
@@ -73,23 +73,23 @@ uv tool install .
 Stream a HuggingFace dataset to Vespa JSON format:
 ```bash
-hf2vespa feed glue --config ax --split test --limit 5
+hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 3
 ```
 **Output:**
 ```json
-{"put":"id:doc:doc::0","fields":{"premise":"The cat sat on the mat.","hypothesis":"The cat did not sit on the mat.","label":-1,"idx":0}}
-{"put":"id:doc:doc::1","fields":{"premise":"The cat did not sit on the mat.","hypothesis":"The cat sat on the mat.","label":-1,"idx":1}}
-{"put":"id:doc:doc::2","fields":{"premise":"When you've got no snow...","hypothesis":"When you've got snow...","label":-1,"idx":2}}
+{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews."}}
+{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"This allow for a more accurate measure, as does running the test first in one direction and then in the exact opposite direction..."}}
+{"put":"id:doc:doc::2","fields":{"id":"00_587","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"Instead, some believe the measure should include a range of times rather than one finite mark..."}}
 ```
 ```
 --- Completion Statistics ---
-Total records processed: 5
-Successful: 5
+Total records processed: 3
+Successful: 3
 Errors: 0
-Throughput: 2.1 records/sec
-Elapsed time: 2.38s
+Throughput: 4.5 records/sec
+Elapsed time: 0.67s
 ```
 ### Preview Dataset Schema
@@ -97,33 +97,115 @@ Elapsed time: 2.38s
 Inspect a dataset and generate a YAML configuration template:
 ```bash
-hf2vespa init glue --config ax --split test --output config.yaml
+hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
 ```
-**Generated config.yaml:**
+**Generated cohere-config.yaml:**
 ```yaml
 namespace: doc
 doctype: doc
-id_column: # null = auto-increment
+id_column:
 mappings:
-  - source: premise
-    target: premise
+  - source: _id
+    target: _id
     type:  # string
-  - source: hypothesis
-    target: hypothesis
+  - source: url
+    target: url
+    type:  # string
+  - source: title
+    target: title
+    type:  # string
+  - source: text
+    target: text
     type:  # string
+  - source: emb
+    target: emb
+    type: tensor  # Sequence[float32] -> suggested: tensor
 ```
 ### Use Config File
-Apply the generated configuration:
+Edit the config to customize type conversions, then apply it:
+```bash
+hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 5
+```
+See [Two Modes of Operation](#two-modes-of-operation) below for a complete example with bfloat16 hex encoding.
+## Two Modes of Operation
+hf2vespa supports two modes depending on your needs:
+### CLI Mode (Quick & Simple)
+Use CLI arguments when you need:
+- Column renaming (`--rename old:new`)
+- Column filtering (`--include col1 --include col2`)
+- Custom namespace/doctype (`--namespace`, `--doctype`)
+- Preview data structure
+**Example:** Rename columns and stream MS MARCO corpus:
+```bash
+hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
+```
+**Output:**
+```json
+{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times...","text":"0-60 Times - 0-60 | 0 to 60 Times..."}}
+{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times...","text":"This allow for a more accurate measure..."}}
+```
+### Config File Mode (Advanced)
+Use `hf2vespa init` + YAML config when you need:
+- Type conversions (tensor, hex-encoded formats)
+- bfloat16/int8 quantized embeddings
+- Sparse or mixed tensors
+- Complex multi-field transformations
+**Example:** Convert Cohere embeddings to hex-encoded bfloat16:
+1. Generate a config template:
+```bash
+hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --output cohere-config.yaml
+```
+2. Edit the config to use `tensor_bfloat16_hex` for the embedding field:
+```yaml
+# cohere-config.yaml
+namespace: doc
+doctype: doc
+id_column:
+mappings:
+  - source: _id
+    target: _id
+  - source: url
+    target: url
+  - source: title
+    target: title
+  - source: text
+    target: text
+  - source: emb
+    target: emb
+    type: tensor_bfloat16_hex  # Convert to hex-encoded bfloat16
+```
+3. Stream with the config file:
 ```bash
-hf2vespa feed glue --config ax --split test --config-file config.yaml --limit 5
+hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 2
+```
+**Output:**
+```json
+{"put":"id:doc:doc::0","fields":{"_id":"20231101.en_13194570_0","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"The British Arab Commercial Bank PLC (BACB) is an international wholesale bank...","emb":{"values":"3aeabd253b963d1a3b833d8f3d8bbb16bc3e3b01..."}}}
+{"put":"id:doc:doc::1","fields":{"_id":"20231101.en_13194570_1","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"BACB has a head office in London...","emb":{"values":"3baabcd7bc3c3d623cc13d853d94ba8dbb45bcb5..."}}}
 ```
-The config file defines field mappings, document IDs, and type conversions (e.g., converting lists to Vespa tensor format).
+The `emb` field is now hex-encoded bfloat16, reducing storage size by 50% compared to float32.
 ## YAML Configuration

{hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,14 @@
 hf2vespa/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
 hf2vespa/__main__.py,sha256=8swxmM2GAunJQ2Qs91RDMvu28RxSGTiDwuMpgKy4plQ,67
-hf2vespa/cli.py,sha256=NKeddkKC2N58gnOFH_XFsWXGTvRMC80SWsztTTyH2o4,15949
+hf2vespa/cli.py,sha256=wtyTdYlxn8UyZ7OufB6wJ3tyW73fBDVGecNxat1WhKc,17018
 hf2vespa/config.py,sha256=JmUxnQcEm_XGg4llbAkQnt28HYgWeh2ldyEsk7ZVgMU,3686
 hf2vespa/converters.py,sha256=sLzOQolvUez2ZymOGS3asiGFDDgqns8chpcoIzMkfME,20685
 hf2vespa/init.py,sha256=CF4p9LMLCbwV_OehTsu036p8vjtWK8TngXU9fd_v7SM,10866
 hf2vespa/pipeline.py,sha256=7q9NIF6GhbgcBXx2Jckxh0tcXi7rMJmjcwkXFiPi_tQ,7145
 hf2vespa/stats.py,sha256=1Os61QpIpDJKthXWE5oWmK_SHx4bZUkcgVIK6t16ppk,1944
 hf2vespa/utils.py,sha256=KGV-YwKaO6IPtEpb9NnRrHMxfBOMfjZzrtYJJSUCe14,1706
-hf2vespa-0.1.0.dist-info/METADATA,sha256=pViQBDeYHOA6n1cB4iksr6ViKuv-Z8cTCt0prxNWK3A,20060
-hf2vespa-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-hf2vespa-0.1.0.dist-info/entry_points.txt,sha256=R-1FE95nsxKVDqMWPPcXIQT0FL6J4ZWE-Sri68retXE,46
-hf2vespa-0.1.0.dist-info/top_level.txt,sha256=Xul9tbYYe1Qw2uYuf-tQiPaPdWRPYsH6K3F2LO6X_lI,9
-hf2vespa-0.1.0.dist-info/RECORD,,
+hf2vespa-0.1.2.dist-info/METADATA,sha256=VvjQOSoMorGuIV5gEtob_syvkLfIeBYGWxCauNMsWK4,23169
+hf2vespa-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+hf2vespa-0.1.2.dist-info/entry_points.txt,sha256=R-1FE95nsxKVDqMWPPcXIQT0FL6J4ZWE-Sri68retXE,46
+hf2vespa-0.1.2.dist-info/top_level.txt,sha256=Xul9tbYYe1Qw2uYuf-tQiPaPdWRPYsH6K3F2LO6X_lI,9
+hf2vespa-0.1.2.dist-info/RECORD,,

{hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{hf2vespa-0.1.0.dist-info → hf2vespa-0.1.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

hf2vespa 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

hf2vespa 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl