hf2vespa 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.github/workflows/benchmark.yml +7 -9
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/PKG-INFO +102 -20
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/README.md +101 -19
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/cli.py +11 -4
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa.egg-info/PKG-INFO +102 -20
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.github/workflows/release.yml +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.github/workflows/test.yml +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.gitignore +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/MILESTONES.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/PROJECT.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/REQUIREMENTS.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/ROADMAP.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/STATE.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/config.json +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v1.0-MILESTONE-AUDIT.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v1.0-REQUIREMENTS.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v1.0-ROADMAP.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v1.1-MILESTONE-AUDIT.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v1.1-REQUIREMENTS.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v1.1-ROADMAP.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v2.0-MILESTONE-AUDIT.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v2.0-REQUIREMENTS.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/milestones/v2.0-ROADMAP.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/03-production-hardening/03-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/03-production-hardening/03-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/03-production-hardening/03-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/03-production-hardening/03-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/03-production-hardening/03-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/03-production-hardening/03-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/04-init-command/04-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/04-init-command/04-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/04-init-command/04-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/04-init-command/04-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/05-shell-completion/05-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/05-shell-completion/05-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/05-shell-completion/05-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/05-shell-completion/05-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-03-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-03-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-documentation-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/07-performance/07-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/07-performance/07-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/07-performance/07-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/07-performance/07-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/07-performance/07-CONTEXT.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/07-performance/07-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/07-performance/07-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/08-scalar-types/08-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/08-scalar-types/08-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/08-scalar-types/08-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/08-scalar-types/08-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/09-dense-hex-encoding/09-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/09-dense-hex-encoding/09-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/09-dense-hex-encoding/09-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/09-dense-hex-encoding/09-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/09-dense-hex-encoding/09-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/09-dense-hex-encoding/09-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-02-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-02-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-03-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-03-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-04-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-04-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-CONTEXT.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-RESEARCH.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/12-clean-exit/12-01-PLAN.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/12-clean-exit/12-01-SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/12-clean-exit/12-01-VERIFICATION.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/research/ARCHITECTURE.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/research/FEATURES.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/research/PITFALLS.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/research/STACK.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/research/SUMMARY.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/v2.1-MILESTONE-AUDIT.md +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/pyproject.toml +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/setup.cfg +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/__init__.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/__main__.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/config.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/converters.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/init.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/pipeline.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/stats.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa/utils.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa.egg-info/SOURCES.txt +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa.egg-info/dependency_links.txt +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa.egg-info/entry_points.txt +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa.egg-info/requires.txt +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/src/hf2vespa.egg-info/top_level.txt +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/__init__.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/benchmarks/__init__.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/benchmarks/conftest.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/benchmarks/test_converter_benchmarks.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/benchmarks/test_pipeline_benchmarks.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/benchmarks/test_realworld_benchmarks.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/fixtures/__init__.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/fixtures/vespa_doc_examples.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/smoke_test.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/test_cli.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/test_converters.py +0 -0
- {hf2vespa-0.1.0 → hf2vespa-0.1.1}/tests/test_pipeline.py +0 -0
|
@@ -40,20 +40,18 @@ jobs:
|
|
|
40
40
|
- name: Checkout repository
|
|
41
41
|
uses: actions/checkout@v4
|
|
42
42
|
|
|
43
|
-
- name:
|
|
44
|
-
uses:
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
43
|
+
- name: Install uv
|
|
44
|
+
uses: astral-sh/setup-uv@v6
|
|
45
|
+
|
|
46
|
+
- name: Install Python
|
|
47
|
+
run: uv python install 3.12
|
|
48
48
|
|
|
49
49
|
- name: Install dependencies
|
|
50
|
-
run:
|
|
51
|
-
python -m pip install --upgrade pip
|
|
52
|
-
pip install -e ".[dev]"
|
|
50
|
+
run: uv sync --dev
|
|
53
51
|
|
|
54
52
|
- name: Run benchmarks
|
|
55
53
|
run: |
|
|
56
|
-
pytest tests/benchmarks/ \
|
|
54
|
+
uv run pytest tests/benchmarks/ \
|
|
57
55
|
--benchmark-enable \
|
|
58
56
|
--benchmark-only \
|
|
59
57
|
--benchmark-json=benchmark-results.json \
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hf2vespa
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Stream HuggingFace datasets to Vespa JSON format
|
|
5
5
|
Author-email: Thomas Thoresen <thomas.h.thoresen@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -28,7 +28,7 @@ Requires-Dist: ruamel.yaml>=0.19.0
|
|
|
28
28
|
|
|
29
29
|
Stream HuggingFace datasets to Vespa JSON format
|
|
30
30
|
|
|
31
|
-
[](https://asciinema.org/a/VGODiMSiXua4FX7w?speed=2)
|
|
32
32
|
|
|
33
33
|
## Description
|
|
34
34
|
|
|
@@ -73,23 +73,23 @@ uv tool install .
|
|
|
73
73
|
Stream a HuggingFace dataset to Vespa JSON format:
|
|
74
74
|
|
|
75
75
|
```bash
|
|
76
|
-
hf2vespa feed
|
|
76
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 3
|
|
77
77
|
```
|
|
78
78
|
|
|
79
79
|
**Output:**
|
|
80
80
|
```json
|
|
81
|
-
{"put":"id:doc:doc::0","fields":{"
|
|
82
|
-
{"put":"id:doc:doc::1","fields":{"
|
|
83
|
-
{"put":"id:doc:doc::2","fields":{"
|
|
81
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews."}}
|
|
82
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"This allow for a more accurate measure, as does running the test first in one direction and then in the exact opposite direction..."}}
|
|
83
|
+
{"put":"id:doc:doc::2","fields":{"id":"00_587","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"Instead, some believe the measure should include a range of times rather than one finite mark..."}}
|
|
84
84
|
```
|
|
85
85
|
|
|
86
86
|
```
|
|
87
87
|
--- Completion Statistics ---
|
|
88
|
-
Total records processed:
|
|
89
|
-
Successful:
|
|
88
|
+
Total records processed: 3
|
|
89
|
+
Successful: 3
|
|
90
90
|
Errors: 0
|
|
91
|
-
Throughput:
|
|
92
|
-
Elapsed time:
|
|
91
|
+
Throughput: 4.5 records/sec
|
|
92
|
+
Elapsed time: 0.67s
|
|
93
93
|
```
|
|
94
94
|
|
|
95
95
|
### Preview Dataset Schema
|
|
@@ -97,33 +97,115 @@ Elapsed time: 2.38s
|
|
|
97
97
|
Inspect a dataset and generate a YAML configuration template:
|
|
98
98
|
|
|
99
99
|
```bash
|
|
100
|
-
hf2vespa init
|
|
100
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
|
|
101
101
|
```
|
|
102
102
|
|
|
103
|
-
**Generated config.yaml:**
|
|
103
|
+
**Generated cohere-config.yaml:**
|
|
104
104
|
```yaml
|
|
105
105
|
namespace: doc
|
|
106
106
|
doctype: doc
|
|
107
|
-
id_column:
|
|
107
|
+
id_column:
|
|
108
108
|
|
|
109
109
|
mappings:
|
|
110
|
-
- source:
|
|
111
|
-
target:
|
|
110
|
+
- source: _id
|
|
111
|
+
target: _id
|
|
112
112
|
type: # string
|
|
113
|
-
- source:
|
|
114
|
-
target:
|
|
113
|
+
- source: url
|
|
114
|
+
target: url
|
|
115
|
+
type: # string
|
|
116
|
+
- source: title
|
|
117
|
+
target: title
|
|
118
|
+
type: # string
|
|
119
|
+
- source: text
|
|
120
|
+
target: text
|
|
115
121
|
type: # string
|
|
122
|
+
- source: emb
|
|
123
|
+
target: emb
|
|
124
|
+
type: tensor # Sequence[float32] -> suggested: tensor
|
|
116
125
|
```
|
|
117
126
|
|
|
118
127
|
### Use Config File
|
|
119
128
|
|
|
120
|
-
|
|
129
|
+
Edit the config to customize type conversions, then apply it:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 5
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
See [Two Modes of Operation](#two-modes-of-operation) below for a complete example with bfloat16 hex encoding.
|
|
136
|
+
|
|
137
|
+
## Two Modes of Operation
|
|
138
|
+
|
|
139
|
+
hf2vespa supports two modes depending on your needs:
|
|
140
|
+
|
|
141
|
+
### CLI Mode (Quick & Simple)
|
|
142
|
+
|
|
143
|
+
Use CLI arguments when you need:
|
|
144
|
+
- Column renaming (`--rename old:new`)
|
|
145
|
+
- Column filtering (`--include col1 --include col2`)
|
|
146
|
+
- Custom namespace/doctype (`--namespace`, `--doctype`)
|
|
147
|
+
- Preview data structure
|
|
148
|
+
|
|
149
|
+
**Example:** Rename columns and stream MS MARCO corpus:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**Output:**
|
|
156
|
+
```json
|
|
157
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times...","text":"0-60 Times - 0-60 | 0 to 60 Times..."}}
|
|
158
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times...","text":"This allow for a more accurate measure..."}}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Config File Mode (Advanced)
|
|
162
|
+
|
|
163
|
+
Use `hf2vespa init` + YAML config when you need:
|
|
164
|
+
- Type conversions (tensor, hex-encoded formats)
|
|
165
|
+
- bfloat16/int8 quantized embeddings
|
|
166
|
+
- Sparse or mixed tensors
|
|
167
|
+
- Complex multi-field transformations
|
|
168
|
+
|
|
169
|
+
**Example:** Convert Cohere embeddings to hex-encoded bfloat16:
|
|
170
|
+
|
|
171
|
+
1. Generate a config template:
|
|
172
|
+
```bash
|
|
173
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --output cohere-config.yaml
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
2. Edit the config to use `tensor_bfloat16_hex` for the embedding field:
|
|
177
|
+
```yaml
|
|
178
|
+
# cohere-config.yaml
|
|
179
|
+
namespace: doc
|
|
180
|
+
doctype: doc
|
|
181
|
+
id_column:
|
|
182
|
+
|
|
183
|
+
mappings:
|
|
184
|
+
- source: _id
|
|
185
|
+
target: _id
|
|
186
|
+
- source: url
|
|
187
|
+
target: url
|
|
188
|
+
- source: title
|
|
189
|
+
target: title
|
|
190
|
+
- source: text
|
|
191
|
+
target: text
|
|
192
|
+
- source: emb
|
|
193
|
+
target: emb
|
|
194
|
+
type: tensor_bfloat16_hex # Convert to hex-encoded bfloat16
|
|
195
|
+
```
|
|
121
196
|
|
|
197
|
+
3. Stream with the config file:
|
|
122
198
|
```bash
|
|
123
|
-
hf2vespa feed
|
|
199
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 2
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
**Output:**
|
|
203
|
+
```json
|
|
204
|
+
{"put":"id:doc:doc::0","fields":{"_id":"20231101.en_13194570_0","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"The British Arab Commercial Bank PLC (BACB) is an international wholesale bank...","emb":{"values":"3aeabd253b963d1a3b833d8f3d8bbb16bc3e3b01..."}}}
|
|
205
|
+
{"put":"id:doc:doc::1","fields":{"_id":"20231101.en_13194570_1","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"BACB has a head office in London...","emb":{"values":"3baabcd7bc3c3d623cc13d853d94ba8dbb45bcb5..."}}}
|
|
124
206
|
```
|
|
125
207
|
|
|
126
|
-
The
|
|
208
|
+
The `emb` field is now hex-encoded bfloat16, reducing storage size by 50% compared to float32.
|
|
127
209
|
|
|
128
210
|
## YAML Configuration
|
|
129
211
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Stream HuggingFace datasets to Vespa JSON format
|
|
4
4
|
|
|
5
|
-
[](https://asciinema.org/a/VGODiMSiXua4FX7w?speed=2)
|
|
6
6
|
|
|
7
7
|
## Description
|
|
8
8
|
|
|
@@ -47,23 +47,23 @@ uv tool install .
|
|
|
47
47
|
Stream a HuggingFace dataset to Vespa JSON format:
|
|
48
48
|
|
|
49
49
|
```bash
|
|
50
|
-
hf2vespa feed
|
|
50
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 3
|
|
51
51
|
```
|
|
52
52
|
|
|
53
53
|
**Output:**
|
|
54
54
|
```json
|
|
55
|
-
{"put":"id:doc:doc::0","fields":{"
|
|
56
|
-
{"put":"id:doc:doc::1","fields":{"
|
|
57
|
-
{"put":"id:doc:doc::2","fields":{"
|
|
55
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews."}}
|
|
56
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"This allow for a more accurate measure, as does running the test first in one direction and then in the exact opposite direction..."}}
|
|
57
|
+
{"put":"id:doc:doc::2","fields":{"id":"00_587","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"Instead, some believe the measure should include a range of times rather than one finite mark..."}}
|
|
58
58
|
```
|
|
59
59
|
|
|
60
60
|
```
|
|
61
61
|
--- Completion Statistics ---
|
|
62
|
-
Total records processed:
|
|
63
|
-
Successful:
|
|
62
|
+
Total records processed: 3
|
|
63
|
+
Successful: 3
|
|
64
64
|
Errors: 0
|
|
65
|
-
Throughput:
|
|
66
|
-
Elapsed time:
|
|
65
|
+
Throughput: 4.5 records/sec
|
|
66
|
+
Elapsed time: 0.67s
|
|
67
67
|
```
|
|
68
68
|
|
|
69
69
|
### Preview Dataset Schema
|
|
@@ -71,33 +71,115 @@ Elapsed time: 2.38s
|
|
|
71
71
|
Inspect a dataset and generate a YAML configuration template:
|
|
72
72
|
|
|
73
73
|
```bash
|
|
74
|
-
hf2vespa init
|
|
74
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
|
|
75
75
|
```
|
|
76
76
|
|
|
77
|
-
**Generated config.yaml:**
|
|
77
|
+
**Generated cohere-config.yaml:**
|
|
78
78
|
```yaml
|
|
79
79
|
namespace: doc
|
|
80
80
|
doctype: doc
|
|
81
|
-
id_column:
|
|
81
|
+
id_column:
|
|
82
82
|
|
|
83
83
|
mappings:
|
|
84
|
-
- source:
|
|
85
|
-
target:
|
|
84
|
+
- source: _id
|
|
85
|
+
target: _id
|
|
86
86
|
type: # string
|
|
87
|
-
- source:
|
|
88
|
-
target:
|
|
87
|
+
- source: url
|
|
88
|
+
target: url
|
|
89
|
+
type: # string
|
|
90
|
+
- source: title
|
|
91
|
+
target: title
|
|
92
|
+
type: # string
|
|
93
|
+
- source: text
|
|
94
|
+
target: text
|
|
89
95
|
type: # string
|
|
96
|
+
- source: emb
|
|
97
|
+
target: emb
|
|
98
|
+
type: tensor # Sequence[float32] -> suggested: tensor
|
|
90
99
|
```
|
|
91
100
|
|
|
92
101
|
### Use Config File
|
|
93
102
|
|
|
94
|
-
|
|
103
|
+
Edit the config to customize type conversions, then apply it:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 5
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
See [Two Modes of Operation](#two-modes-of-operation) below for a complete example with bfloat16 hex encoding.
|
|
110
|
+
|
|
111
|
+
## Two Modes of Operation
|
|
112
|
+
|
|
113
|
+
hf2vespa supports two modes depending on your needs:
|
|
114
|
+
|
|
115
|
+
### CLI Mode (Quick & Simple)
|
|
116
|
+
|
|
117
|
+
Use CLI arguments when you need:
|
|
118
|
+
- Column renaming (`--rename old:new`)
|
|
119
|
+
- Column filtering (`--include col1 --include col2`)
|
|
120
|
+
- Custom namespace/doctype (`--namespace`, `--doctype`)
|
|
121
|
+
- Preview data structure
|
|
122
|
+
|
|
123
|
+
**Example:** Rename columns and stream MS MARCO corpus:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**Output:**
|
|
130
|
+
```json
|
|
131
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times...","text":"0-60 Times - 0-60 | 0 to 60 Times..."}}
|
|
132
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times...","text":"This allow for a more accurate measure..."}}
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Config File Mode (Advanced)
|
|
136
|
+
|
|
137
|
+
Use `hf2vespa init` + YAML config when you need:
|
|
138
|
+
- Type conversions (tensor, hex-encoded formats)
|
|
139
|
+
- bfloat16/int8 quantized embeddings
|
|
140
|
+
- Sparse or mixed tensors
|
|
141
|
+
- Complex multi-field transformations
|
|
142
|
+
|
|
143
|
+
**Example:** Convert Cohere embeddings to hex-encoded bfloat16:
|
|
144
|
+
|
|
145
|
+
1. Generate a config template:
|
|
146
|
+
```bash
|
|
147
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --output cohere-config.yaml
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
2. Edit the config to use `tensor_bfloat16_hex` for the embedding field:
|
|
151
|
+
```yaml
|
|
152
|
+
# cohere-config.yaml
|
|
153
|
+
namespace: doc
|
|
154
|
+
doctype: doc
|
|
155
|
+
id_column:
|
|
156
|
+
|
|
157
|
+
mappings:
|
|
158
|
+
- source: _id
|
|
159
|
+
target: _id
|
|
160
|
+
- source: url
|
|
161
|
+
target: url
|
|
162
|
+
- source: title
|
|
163
|
+
target: title
|
|
164
|
+
- source: text
|
|
165
|
+
target: text
|
|
166
|
+
- source: emb
|
|
167
|
+
target: emb
|
|
168
|
+
type: tensor_bfloat16_hex # Convert to hex-encoded bfloat16
|
|
169
|
+
```
|
|
95
170
|
|
|
171
|
+
3. Stream with the config file:
|
|
96
172
|
```bash
|
|
97
|
-
hf2vespa feed
|
|
173
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 2
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**Output:**
|
|
177
|
+
```json
|
|
178
|
+
{"put":"id:doc:doc::0","fields":{"_id":"20231101.en_13194570_0","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"The British Arab Commercial Bank PLC (BACB) is an international wholesale bank...","emb":{"values":"3aeabd253b963d1a3b833d8f3d8bbb16bc3e3b01..."}}}
|
|
179
|
+
{"put":"id:doc:doc::1","fields":{"_id":"20231101.en_13194570_1","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"BACB has a head office in London...","emb":{"values":"3baabcd7bc3c3d623cc13d853d94ba8dbb45bcb5..."}}}
|
|
98
180
|
```
|
|
99
181
|
|
|
100
|
-
The
|
|
182
|
+
The `emb` field is now hex-encoded bfloat16, reducing storage size by 50% compared to float32.
|
|
101
183
|
|
|
102
184
|
## YAML Configuration
|
|
103
185
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# Suppress warnings that occur during cleanup. These are harmless but confusing to users.
|
|
2
2
|
# 1. HuggingFace HTTP retry warnings: https://github.com/apache/arrow/issues/45214
|
|
3
3
|
# 2. Multiprocessing resource tracker warnings (leaked semaphores from HF datasets)
|
|
4
|
+
# 3. Python 3.14+ SyntaxWarning for 'return' in 'finally' blocks (multiprocess package)
|
|
5
|
+
# See: https://peps.python.org/pep-0765/
|
|
4
6
|
#
|
|
5
7
|
# IMPORTANT: Set PYTHONWARNINGS env var BEFORE any imports that might trigger multiprocessing.
|
|
6
8
|
# The resource_tracker runs as a separate daemon process and inherits the env at spawn time.
|
|
@@ -8,18 +10,23 @@
|
|
|
8
10
|
import os as _os
|
|
9
11
|
|
|
10
12
|
_existing_warnings = _os.environ.get("PYTHONWARNINGS", "")
|
|
11
|
-
|
|
13
|
+
_new_filters = [
|
|
14
|
+
"ignore::UserWarning:multiprocessing.resource_tracker",
|
|
15
|
+
"ignore::SyntaxWarning:multiprocess",
|
|
16
|
+
]
|
|
17
|
+
_combined = ",".join(_new_filters)
|
|
12
18
|
if _existing_warnings:
|
|
13
|
-
_os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{
|
|
19
|
+
_os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_combined}"
|
|
14
20
|
else:
|
|
15
|
-
_os.environ["PYTHONWARNINGS"] =
|
|
16
|
-
del _os, _existing_warnings,
|
|
21
|
+
_os.environ["PYTHONWARNINGS"] = _combined
|
|
22
|
+
del _os, _existing_warnings, _new_filters, _combined
|
|
17
23
|
|
|
18
24
|
import logging as _logging
|
|
19
25
|
import warnings as _warnings
|
|
20
26
|
|
|
21
27
|
_logging.getLogger("huggingface_hub.utils._http").setLevel(_logging.CRITICAL)
|
|
22
28
|
_warnings.filterwarnings("ignore", message="resource_tracker:", category=UserWarning)
|
|
29
|
+
_warnings.filterwarnings("ignore", category=SyntaxWarning, module="multiprocess")
|
|
23
30
|
del _logging, _warnings
|
|
24
31
|
|
|
25
32
|
"""CLI for streaming HuggingFace datasets to Vespa JSON format."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hf2vespa
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Stream HuggingFace datasets to Vespa JSON format
|
|
5
5
|
Author-email: Thomas Thoresen <thomas.h.thoresen@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -28,7 +28,7 @@ Requires-Dist: ruamel.yaml>=0.19.0
|
|
|
28
28
|
|
|
29
29
|
Stream HuggingFace datasets to Vespa JSON format
|
|
30
30
|
|
|
31
|
-
[](https://asciinema.org/a/VGODiMSiXua4FX7w?speed=2)
|
|
32
32
|
|
|
33
33
|
## Description
|
|
34
34
|
|
|
@@ -73,23 +73,23 @@ uv tool install .
|
|
|
73
73
|
Stream a HuggingFace dataset to Vespa JSON format:
|
|
74
74
|
|
|
75
75
|
```bash
|
|
76
|
-
hf2vespa feed
|
|
76
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 3
|
|
77
77
|
```
|
|
78
78
|
|
|
79
79
|
**Output:**
|
|
80
80
|
```json
|
|
81
|
-
{"put":"id:doc:doc::0","fields":{"
|
|
82
|
-
{"put":"id:doc:doc::1","fields":{"
|
|
83
|
-
{"put":"id:doc:doc::2","fields":{"
|
|
81
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews."}}
|
|
82
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"This allow for a more accurate measure, as does running the test first in one direction and then in the exact opposite direction..."}}
|
|
83
|
+
{"put":"id:doc:doc::2","fields":{"id":"00_587","title":"0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews","text":"Instead, some believe the measure should include a range of times rather than one finite mark..."}}
|
|
84
84
|
```
|
|
85
85
|
|
|
86
86
|
```
|
|
87
87
|
--- Completion Statistics ---
|
|
88
|
-
Total records processed:
|
|
89
|
-
Successful:
|
|
88
|
+
Total records processed: 3
|
|
89
|
+
Successful: 3
|
|
90
90
|
Errors: 0
|
|
91
|
-
Throughput:
|
|
92
|
-
Elapsed time:
|
|
91
|
+
Throughput: 4.5 records/sec
|
|
92
|
+
Elapsed time: 0.67s
|
|
93
93
|
```
|
|
94
94
|
|
|
95
95
|
### Preview Dataset Schema
|
|
@@ -97,33 +97,115 @@ Elapsed time: 2.38s
|
|
|
97
97
|
Inspect a dataset and generate a YAML configuration template:
|
|
98
98
|
|
|
99
99
|
```bash
|
|
100
|
-
hf2vespa init
|
|
100
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en -o cohere-config.yaml
|
|
101
101
|
```
|
|
102
102
|
|
|
103
|
-
**Generated config.yaml:**
|
|
103
|
+
**Generated cohere-config.yaml:**
|
|
104
104
|
```yaml
|
|
105
105
|
namespace: doc
|
|
106
106
|
doctype: doc
|
|
107
|
-
id_column:
|
|
107
|
+
id_column:
|
|
108
108
|
|
|
109
109
|
mappings:
|
|
110
|
-
- source:
|
|
111
|
-
target:
|
|
110
|
+
- source: _id
|
|
111
|
+
target: _id
|
|
112
112
|
type: # string
|
|
113
|
-
- source:
|
|
114
|
-
target:
|
|
113
|
+
- source: url
|
|
114
|
+
target: url
|
|
115
|
+
type: # string
|
|
116
|
+
- source: title
|
|
117
|
+
target: title
|
|
118
|
+
type: # string
|
|
119
|
+
- source: text
|
|
120
|
+
target: text
|
|
115
121
|
type: # string
|
|
122
|
+
- source: emb
|
|
123
|
+
target: emb
|
|
124
|
+
type: tensor # Sequence[float32] -> suggested: tensor
|
|
116
125
|
```
|
|
117
126
|
|
|
118
127
|
### Use Config File
|
|
119
128
|
|
|
120
|
-
|
|
129
|
+
Edit the config to customize type conversions, then apply it:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 5
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
See [Two Modes of Operation](#two-modes-of-operation) below for a complete example with bfloat16 hex encoding.
|
|
136
|
+
|
|
137
|
+
## Two Modes of Operation
|
|
138
|
+
|
|
139
|
+
hf2vespa supports two modes depending on your needs:
|
|
140
|
+
|
|
141
|
+
### CLI Mode (Quick & Simple)
|
|
142
|
+
|
|
143
|
+
Use CLI arguments when you need:
|
|
144
|
+
- Column renaming (`--rename old:new`)
|
|
145
|
+
- Column filtering (`--include col1 --include col2`)
|
|
146
|
+
- Custom namespace/doctype (`--namespace`, `--doctype`)
|
|
147
|
+
- A quick preview of the data structure
|
|
148
|
+
|
|
149
|
+
**Example:** Rename columns and stream MS MARCO corpus:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
hf2vespa feed mteb/msmarco-v2 --config corpus --split corpus --rename _id:id --limit 5
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**Output (first two of five records shown, text truncated):**
|
|
156
|
+
```json
|
|
157
|
+
{"put":"id:doc:doc::0","fields":{"id":"00_0","title":"0-60 Times - 0-60 | 0 to 60 Times...","text":"0-60 Times - 0-60 | 0 to 60 Times..."}}
|
|
158
|
+
{"put":"id:doc:doc::1","fields":{"id":"00_172","title":"0-60 Times...","text":"This allow for a more accurate measure..."}}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Config File Mode (Advanced)
|
|
162
|
+
|
|
163
|
+
Use `hf2vespa init` + YAML config when you need:
|
|
164
|
+
- Type conversions (tensor, hex-encoded formats)
|
|
165
|
+
- bfloat16/int8 quantized embeddings
|
|
166
|
+
- Sparse or mixed tensors
|
|
167
|
+
- Complex multi-field transformations
|
|
168
|
+
|
|
169
|
+
**Example:** Convert Cohere embeddings to hex-encoded bfloat16:
|
|
170
|
+
|
|
171
|
+
1. Generate a config template:
|
|
172
|
+
```bash
|
|
173
|
+
hf2vespa init Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --output cohere-config.yaml
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
2. Edit the config to use `tensor_bfloat16_hex` for the embedding field:
|
|
177
|
+
```yaml
|
|
178
|
+
# cohere-config.yaml
|
|
179
|
+
namespace: doc
|
|
180
|
+
doctype: doc
|
|
181
|
+
id_column:
|
|
182
|
+
|
|
183
|
+
mappings:
|
|
184
|
+
- source: _id
|
|
185
|
+
target: _id
|
|
186
|
+
- source: url
|
|
187
|
+
target: url
|
|
188
|
+
- source: title
|
|
189
|
+
target: title
|
|
190
|
+
- source: text
|
|
191
|
+
target: text
|
|
192
|
+
- source: emb
|
|
193
|
+
target: emb
|
|
194
|
+
type: tensor_bfloat16_hex # Convert to hex-encoded bfloat16
|
|
195
|
+
```
|
|
121
196
|
|
|
197
|
+
3. Stream with the config file:
|
|
122
198
|
```bash
|
|
123
|
-
hf2vespa feed
|
|
199
|
+
hf2vespa feed Cohere/wikipedia-2023-11-embed-multilingual-v3 --config en --config-file cohere-config.yaml --limit 2
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
**Output:**
|
|
203
|
+
```json
|
|
204
|
+
{"put":"id:doc:doc::0","fields":{"_id":"20231101.en_13194570_0","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"The British Arab Commercial Bank PLC (BACB) is an international wholesale bank...","emb":{"values":"3aeabd253b963d1a3b833d8f3d8bbb16bc3e3b01..."}}}
|
|
205
|
+
{"put":"id:doc:doc::1","fields":{"_id":"20231101.en_13194570_1","url":"https://en.wikipedia.org/wiki/British%20Arab%20Commercial%20Bank","title":"British Arab Commercial Bank","text":"BACB has a head office in London...","emb":{"values":"3baabcd7bc3c3d623cc13d853d94ba8dbb45bcb5..."}}}
|
|
124
206
|
```
|
|
125
207
|
|
|
126
|
-
The
|
|
208
|
+
The `emb` field is now hex-encoded bfloat16, reducing storage size by 50% compared to float32.
|
|
127
209
|
|
|
128
210
|
## YAML Configuration
|
|
129
211
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-01-SUMMARY.md
RENAMED
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-02-SUMMARY.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-RESEARCH.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/01-core-streaming-pipeline/01-VERIFICATION.md
RENAMED
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-01-SUMMARY.md
RENAMED
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-02-SUMMARY.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-RESEARCH.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/02-config-advanced-mapping/02-VERIFICATION.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/03-production-hardening/03-VERIFICATION.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/06-documentation/06-documentation-VERIFICATION.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-01-PLAN.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-01-SUMMARY.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-02-PLAN.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-02-SUMMARY.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-RESEARCH.md
RENAMED
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/10-sparse-and-mixed-tensors/10-VERIFICATION.md
RENAMED
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-01-SUMMARY.md
RENAMED
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-02-SUMMARY.md
RENAMED
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-03-SUMMARY.md
RENAMED
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-04-SUMMARY.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hf2vespa-0.1.0 → hf2vespa-0.1.1}/.planning/phases/11-testing-and-benchmarks/11-VERIFICATION.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|