hf2vespa 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. hf2vespa-0.1.0/.github/workflows/benchmark.yml +79 -0
  2. hf2vespa-0.1.0/.github/workflows/release.yml +35 -0
  3. hf2vespa-0.1.0/.github/workflows/test.yml +31 -0
  4. hf2vespa-0.1.0/.gitignore +51 -0
  5. hf2vespa-0.1.0/.planning/MILESTONES.md +86 -0
  6. hf2vespa-0.1.0/.planning/PROJECT.md +134 -0
  7. hf2vespa-0.1.0/.planning/REQUIREMENTS.md +47 -0
  8. hf2vespa-0.1.0/.planning/ROADMAP.md +91 -0
  9. hf2vespa-0.1.0/.planning/STATE.md +66 -0
  10. hf2vespa-0.1.0/.planning/config.json +12 -0
  11. hf2vespa-0.1.0/.planning/milestones/v1.0-MILESTONE-AUDIT.md +169 -0
  12. hf2vespa-0.1.0/.planning/milestones/v1.0-REQUIREMENTS.md +114 -0
  13. hf2vespa-0.1.0/.planning/milestones/v1.0-ROADMAP.md +127 -0
  14. hf2vespa-0.1.0/.planning/milestones/v1.1-MILESTONE-AUDIT.md +130 -0
  15. hf2vespa-0.1.0/.planning/milestones/v1.1-REQUIREMENTS.md +110 -0
  16. hf2vespa-0.1.0/.planning/milestones/v1.1-ROADMAP.md +111 -0
  17. hf2vespa-0.1.0/.planning/milestones/v2.0-MILESTONE-AUDIT.md +155 -0
  18. hf2vespa-0.1.0/.planning/milestones/v2.0-REQUIREMENTS.md +85 -0
  19. hf2vespa-0.1.0/.planning/milestones/v2.0-ROADMAP.md +108 -0
  20. hf2vespa-0.1.0/.planning/phases/01-core-streaming-pipeline/01-01-PLAN.md +164 -0
  21. hf2vespa-0.1.0/.planning/phases/01-core-streaming-pipeline/01-01-SUMMARY.md +110 -0
  22. hf2vespa-0.1.0/.planning/phases/01-core-streaming-pipeline/01-02-PLAN.md +236 -0
  23. hf2vespa-0.1.0/.planning/phases/01-core-streaming-pipeline/01-02-SUMMARY.md +151 -0
  24. hf2vespa-0.1.0/.planning/phases/01-core-streaming-pipeline/01-RESEARCH.md +652 -0
  25. hf2vespa-0.1.0/.planning/phases/01-core-streaming-pipeline/01-VERIFICATION.md +187 -0
  26. hf2vespa-0.1.0/.planning/phases/02-config-advanced-mapping/02-01-PLAN.md +219 -0
  27. hf2vespa-0.1.0/.planning/phases/02-config-advanced-mapping/02-01-SUMMARY.md +125 -0
  28. hf2vespa-0.1.0/.planning/phases/02-config-advanced-mapping/02-02-PLAN.md +293 -0
  29. hf2vespa-0.1.0/.planning/phases/02-config-advanced-mapping/02-02-SUMMARY.md +114 -0
  30. hf2vespa-0.1.0/.planning/phases/02-config-advanced-mapping/02-RESEARCH.md +435 -0
  31. hf2vespa-0.1.0/.planning/phases/02-config-advanced-mapping/02-VERIFICATION.md +199 -0
  32. hf2vespa-0.1.0/.planning/phases/03-production-hardening/03-01-PLAN.md +147 -0
  33. hf2vespa-0.1.0/.planning/phases/03-production-hardening/03-01-SUMMARY.md +96 -0
  34. hf2vespa-0.1.0/.planning/phases/03-production-hardening/03-02-PLAN.md +268 -0
  35. hf2vespa-0.1.0/.planning/phases/03-production-hardening/03-02-SUMMARY.md +102 -0
  36. hf2vespa-0.1.0/.planning/phases/03-production-hardening/03-RESEARCH.md +453 -0
  37. hf2vespa-0.1.0/.planning/phases/03-production-hardening/03-VERIFICATION.md +129 -0
  38. hf2vespa-0.1.0/.planning/phases/04-init-command/04-01-PLAN.md +292 -0
  39. hf2vespa-0.1.0/.planning/phases/04-init-command/04-01-SUMMARY.md +126 -0
  40. hf2vespa-0.1.0/.planning/phases/04-init-command/04-RESEARCH.md +725 -0
  41. hf2vespa-0.1.0/.planning/phases/04-init-command/04-VERIFICATION.md +104 -0
  42. hf2vespa-0.1.0/.planning/phases/05-shell-completion/05-01-PLAN.md +245 -0
  43. hf2vespa-0.1.0/.planning/phases/05-shell-completion/05-01-SUMMARY.md +98 -0
  44. hf2vespa-0.1.0/.planning/phases/05-shell-completion/05-RESEARCH.md +362 -0
  45. hf2vespa-0.1.0/.planning/phases/05-shell-completion/05-VERIFICATION.md +179 -0
  46. hf2vespa-0.1.0/.planning/phases/06-documentation/06-01-PLAN.md +173 -0
  47. hf2vespa-0.1.0/.planning/phases/06-documentation/06-01-SUMMARY.md +113 -0
  48. hf2vespa-0.1.0/.planning/phases/06-documentation/06-02-PLAN.md +262 -0
  49. hf2vespa-0.1.0/.planning/phases/06-documentation/06-02-SUMMARY.md +112 -0
  50. hf2vespa-0.1.0/.planning/phases/06-documentation/06-03-PLAN.md +333 -0
  51. hf2vespa-0.1.0/.planning/phases/06-documentation/06-03-SUMMARY.md +128 -0
  52. hf2vespa-0.1.0/.planning/phases/06-documentation/06-RESEARCH.md +577 -0
  53. hf2vespa-0.1.0/.planning/phases/06-documentation/06-documentation-VERIFICATION.md +164 -0
  54. hf2vespa-0.1.0/.planning/phases/07-performance/07-01-PLAN.md +218 -0
  55. hf2vespa-0.1.0/.planning/phases/07-performance/07-01-SUMMARY.md +120 -0
  56. hf2vespa-0.1.0/.planning/phases/07-performance/07-02-PLAN.md +230 -0
  57. hf2vespa-0.1.0/.planning/phases/07-performance/07-02-SUMMARY.md +121 -0
  58. hf2vespa-0.1.0/.planning/phases/07-performance/07-CONTEXT.md +54 -0
  59. hf2vespa-0.1.0/.planning/phases/07-performance/07-RESEARCH.md +492 -0
  60. hf2vespa-0.1.0/.planning/phases/07-performance/07-VERIFICATION.md +223 -0
  61. hf2vespa-0.1.0/.planning/phases/08-scalar-types/08-01-PLAN.md +203 -0
  62. hf2vespa-0.1.0/.planning/phases/08-scalar-types/08-01-SUMMARY.md +106 -0
  63. hf2vespa-0.1.0/.planning/phases/08-scalar-types/08-RESEARCH.md +420 -0
  64. hf2vespa-0.1.0/.planning/phases/08-scalar-types/08-VERIFICATION.md +117 -0
  65. hf2vespa-0.1.0/.planning/phases/09-dense-hex-encoding/09-01-PLAN.md +118 -0
  66. hf2vespa-0.1.0/.planning/phases/09-dense-hex-encoding/09-01-SUMMARY.md +104 -0
  67. hf2vespa-0.1.0/.planning/phases/09-dense-hex-encoding/09-02-PLAN.md +139 -0
  68. hf2vespa-0.1.0/.planning/phases/09-dense-hex-encoding/09-02-SUMMARY.md +105 -0
  69. hf2vespa-0.1.0/.planning/phases/09-dense-hex-encoding/09-RESEARCH.md +545 -0
  70. hf2vespa-0.1.0/.planning/phases/09-dense-hex-encoding/09-VERIFICATION.md +91 -0
  71. hf2vespa-0.1.0/.planning/phases/10-sparse-and-mixed-tensors/10-01-PLAN.md +259 -0
  72. hf2vespa-0.1.0/.planning/phases/10-sparse-and-mixed-tensors/10-01-SUMMARY.md +191 -0
  73. hf2vespa-0.1.0/.planning/phases/10-sparse-and-mixed-tensors/10-02-PLAN.md +418 -0
  74. hf2vespa-0.1.0/.planning/phases/10-sparse-and-mixed-tensors/10-02-SUMMARY.md +121 -0
  75. hf2vespa-0.1.0/.planning/phases/10-sparse-and-mixed-tensors/10-RESEARCH.md +647 -0
  76. hf2vespa-0.1.0/.planning/phases/10-sparse-and-mixed-tensors/10-VERIFICATION.md +185 -0
  77. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-01-PLAN.md +394 -0
  78. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-01-SUMMARY.md +128 -0
  79. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-02-PLAN.md +554 -0
  80. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-02-SUMMARY.md +131 -0
  81. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-03-PLAN.md +210 -0
  82. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-03-SUMMARY.md +111 -0
  83. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-04-PLAN.md +340 -0
  84. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-04-SUMMARY.md +96 -0
  85. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-CONTEXT.md +60 -0
  86. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-RESEARCH.md +441 -0
  87. hf2vespa-0.1.0/.planning/phases/11-testing-and-benchmarks/11-VERIFICATION.md +121 -0
  88. hf2vespa-0.1.0/.planning/phases/12-clean-exit/12-01-PLAN.md +275 -0
  89. hf2vespa-0.1.0/.planning/phases/12-clean-exit/12-01-SUMMARY.md +151 -0
  90. hf2vespa-0.1.0/.planning/phases/12-clean-exit/12-01-VERIFICATION.md +118 -0
  91. hf2vespa-0.1.0/.planning/research/ARCHITECTURE.md +917 -0
  92. hf2vespa-0.1.0/.planning/research/FEATURES.md +330 -0
  93. hf2vespa-0.1.0/.planning/research/PITFALLS.md +625 -0
  94. hf2vespa-0.1.0/.planning/research/STACK.md +639 -0
  95. hf2vespa-0.1.0/.planning/research/SUMMARY.md +193 -0
  96. hf2vespa-0.1.0/.planning/v2.1-MILESTONE-AUDIT.md +102 -0
  97. hf2vespa-0.1.0/PKG-INFO +820 -0
  98. hf2vespa-0.1.0/README.md +794 -0
  99. hf2vespa-0.1.0/pyproject.toml +62 -0
  100. hf2vespa-0.1.0/setup.cfg +4 -0
  101. hf2vespa-0.1.0/src/hf2vespa/__init__.py +1 -0
  102. hf2vespa-0.1.0/src/hf2vespa/__main__.py +4 -0
  103. hf2vespa-0.1.0/src/hf2vespa/cli.py +465 -0
  104. hf2vespa-0.1.0/src/hf2vespa/config.py +131 -0
  105. hf2vespa-0.1.0/src/hf2vespa/converters.py +648 -0
  106. hf2vespa-0.1.0/src/hf2vespa/init.py +351 -0
  107. hf2vespa-0.1.0/src/hf2vespa/pipeline.py +198 -0
  108. hf2vespa-0.1.0/src/hf2vespa/stats.py +76 -0
  109. hf2vespa-0.1.0/src/hf2vespa/utils.py +57 -0
  110. hf2vespa-0.1.0/src/hf2vespa.egg-info/PKG-INFO +820 -0
  111. hf2vespa-0.1.0/src/hf2vespa.egg-info/SOURCES.txt +125 -0
  112. hf2vespa-0.1.0/src/hf2vespa.egg-info/dependency_links.txt +1 -0
  113. hf2vespa-0.1.0/src/hf2vespa.egg-info/entry_points.txt +2 -0
  114. hf2vespa-0.1.0/src/hf2vespa.egg-info/requires.txt +7 -0
  115. hf2vespa-0.1.0/src/hf2vespa.egg-info/top_level.txt +1 -0
  116. hf2vespa-0.1.0/tests/__init__.py +1 -0
  117. hf2vespa-0.1.0/tests/benchmarks/__init__.py +1 -0
  118. hf2vespa-0.1.0/tests/benchmarks/conftest.py +82 -0
  119. hf2vespa-0.1.0/tests/benchmarks/test_converter_benchmarks.py +183 -0
  120. hf2vespa-0.1.0/tests/benchmarks/test_pipeline_benchmarks.py +171 -0
  121. hf2vespa-0.1.0/tests/benchmarks/test_realworld_benchmarks.py +239 -0
  122. hf2vespa-0.1.0/tests/fixtures/__init__.py +1 -0
  123. hf2vespa-0.1.0/tests/fixtures/vespa_doc_examples.py +57 -0
  124. hf2vespa-0.1.0/tests/smoke_test.py +9 -0
  125. hf2vespa-0.1.0/tests/test_cli.py +90 -0
  126. hf2vespa-0.1.0/tests/test_converters.py +1193 -0
  127. hf2vespa-0.1.0/tests/test_pipeline.py +18 -0
@@ -0,0 +1,79 @@
1
+ # Benchmark workflow for hf-vespa-feed
2
+ # Runs on version tags, publishes results to GitHub Pages, fails on 10% regression
3
+ #
4
+ # SETUP REQUIRED (one-time):
5
+ # 1. Create orphan gh-pages branch:
6
+ # git checkout --orphan gh-pages
7
+ # git reset --hard
8
+ # git commit --allow-empty -m "Initialize gh-pages"
9
+ # git push origin gh-pages
10
+ # git checkout main
11
+ #
12
+ # 2. Enable GitHub Pages in repo settings:
13
+ # Settings -> Pages -> Source: "Deploy from a branch"
14
+ # Branch: gh-pages, folder: / (root)
15
+ #
16
+ # 3. Ensure GITHUB_TOKEN has write permissions:
17
+ # Settings -> Actions -> General -> Workflow permissions
18
+ # Select "Read and write permissions"
19
+ #
20
+ # Results will be visible at: https://<username>.github.io/<repo>/dev/bench/
21
+
22
+ name: Benchmark
23
+
24
+ on:
25
+ push:
26
+ tags:
27
+ - 'v*' # Run on version tags only (e.g., v0.2.0, v1.0.0)
28
+ workflow_dispatch: # Allow manual trigger for testing
29
+
30
+ permissions:
31
+ contents: write
32
+ deployments: write
33
+
34
+ jobs:
35
+ benchmark:
36
+ name: Run Benchmarks
37
+ runs-on: ubuntu-latest
38
+
39
+ steps:
40
+ - name: Checkout repository
41
+ uses: actions/checkout@v4
42
+
43
+ - name: Set up Python
44
+ uses: actions/setup-python@v5
45
+ with:
46
+ python-version: '3.12'
47
+ cache: 'pip'
48
+
49
+ - name: Install dependencies
50
+ run: |
51
+ python -m pip install --upgrade pip
52
+ pip install -e ".[dev]"
53
+
54
+ - name: Run benchmarks
55
+ run: |
56
+ pytest tests/benchmarks/ \
57
+ --benchmark-enable \
58
+ --benchmark-only \
59
+ --benchmark-json=benchmark-results.json \
60
+ --benchmark-min-rounds=5 \
61
+ --benchmark-warmup=on \
62
+ -v
63
+
64
+ - name: Store benchmark result
65
+ uses: benchmark-action/github-action-benchmark@v1
66
+ with:
67
+ name: 'Converter Benchmarks'
68
+ tool: 'pytest'
69
+ output-file-path: benchmark-results.json
70
+ github-token: ${{ secrets.GITHUB_TOKEN }}
71
+ auto-push: true
72
+ # Alert threshold: '110%' raises an alert when a benchmark is more than 10% slower than the previous result
73
+ alert-threshold: '110%'
74
+ # Fail the workflow if benchmark regresses by more than 10%
75
+ fail-on-alert: true
76
+ # Comment on commits that cause alerts
77
+ comment-on-alert: true
78
+ # Summary comment shows comparison with previous
79
+ summary-always: true
@@ -0,0 +1,35 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ # Publish on any tag starting with a `v`, e.g., v1.2.3
7
+ - v*
8
+
9
+ jobs:
10
+ pypi:
11
+ name: Publish to PyPI
12
+ runs-on: ubuntu-latest
13
+ # Environment and permissions for trusted publishing.
14
+ environment:
15
+ # Create this environment in the GitHub repository under Settings -> Environments
16
+ name: pypi
17
+ permissions:
18
+ id-token: write
19
+ contents: read
20
+ steps:
21
+ - name: Checkout
22
+ uses: actions/checkout@v5
23
+ - name: Install uv
24
+ uses: astral-sh/setup-uv@v6
25
+ - name: Install Python 3.13
26
+ run: uv python install 3.13
27
+ - name: Build
28
+ run: uv build
29
+ # Check that basic features work and that we didn't forget to include crucial files
30
+ - name: Smoke test (wheel)
31
+ run: uv run --isolated --no-project --with dist/*.whl python tests/smoke_test.py
32
+ - name: Smoke test (source distribution)
33
+ run: uv run --isolated --no-project --with dist/*.tar.gz python tests/smoke_test.py
34
+ - name: Publish
35
+ run: uv publish
@@ -0,0 +1,31 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ test:
10
+ name: Test Python ${{ matrix.python-version }}
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
16
+
17
+ steps:
18
+ - name: Checkout
19
+ uses: actions/checkout@v5
20
+
21
+ - name: Install uv
22
+ uses: astral-sh/setup-uv@v6
23
+
24
+ - name: Install Python ${{ matrix.python-version }}
25
+ run: uv python install ${{ matrix.python-version }}
26
+
27
+ - name: Install dependencies
28
+ run: uv sync --dev
29
+
30
+ - name: Run tests
31
+ run: uv run pytest
@@ -0,0 +1,51 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .venv/
25
+ venv/
26
+ ENV/
27
+
28
+ # IDE
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+
34
+ # Testing
35
+ .pytest_cache/
36
+ .coverage
37
+ htmlcov/
38
+ .tox/
39
+ .nox/
40
+
41
+ # Package managers
42
+ uv.lock
43
+ pip-log.txt
44
+
45
+ # Local test files
46
+ vespa-config.yaml
47
+ vespa-llms.txt
48
+
49
+ # OS
50
+ .DS_Store
51
+ Thumbs.db
@@ -0,0 +1,86 @@
1
+ # Project Milestones: hf-vespa-feed
2
+
3
+ ## v2.0 Vespa Types (Shipped: 2026-02-03)
4
+
5
+ **Delivered:** Complete Vespa type coverage with hex tensor encoding, sparse/mixed tensors, scalar types, and comprehensive benchmarking infrastructure.
6
+
7
+ **Phases completed:** 8-11 (9 plans total)
8
+
9
+ **Key accomplishments:**
10
+
11
+ - Hex-encoded tensors (int8, bfloat16, float32, float64) with IEEE 754 compliance
12
+ - Sparse tensor support (Vespa cells notation) for term weights
13
+ - Mixed tensor support (blocks notation) for ColBERT-style multi-vector embeddings
14
+ - Position, weighted set, and map scalar type converters
15
+ - 165 tests including Vespa doc-verified correctness and round-trip validation
16
+ - Benchmark infrastructure with 52 tests and GitHub Actions CI with 10% regression detection
17
+ - Comprehensive type reference documentation with 8 cookbook examples
18
+
19
+ **Stats:**
20
+
21
+ - 1,788 lines of Python source
22
+ - 1,899 lines of Python tests
23
+ - 778-line README with full type documentation
24
+ - 4 phases, 9 plans
25
+ - Same-day completion (2026-02-03)
26
+
27
+ **Git range:** `feat(08-01)` → `docs(v2.0)`
28
+
29
+ **What's next:** v3 features (field concatenation, row filtering, DuckDB SQL integration) based on user feedback.
30
+
31
+ ---
32
+
33
+ ## v1.1 User Experience (Shipped: 2026-02-03)
34
+
35
+ **Delivered:** Config bootstrapping, shell completion, comprehensive documentation, and performance optimizations for better developer experience.
36
+
37
+ **Phases completed:** 4-7 (7 plans total)
38
+
39
+ **Key accomplishments:**
40
+
41
+ - `hf-vespa-feed init <dataset>` generates YAML config from HuggingFace schema with type suggestions
42
+ - `hf-vespa-feed install-completion` adds shell tab completion (bash/zsh/fish)
43
+ - 527-line README with installation guide, CLI reference, and 5-example cookbook
44
+ - Troubleshooting guide for common issues (auth, memory, type errors)
45
+ - `--num-workers` CLI flag for parallel dataset loading (default: CPU count)
46
+ - lru_cache on converter lookups for faster type conversions
47
+
48
+ **Stats:**
49
+
50
+ - 1,332 lines of Python
51
+ - 4 phases, 7 plans
52
+ - 2 days from v1.0 to v1.1 ship
53
+
54
+ **Git range:** `feat(04-01)` → `docs(07)`
55
+
56
+ **What's next:** Consider v2 features (field concatenation, row filtering, DuckDB integration) based on user feedback.
57
+
58
+ ---
59
+
60
+ ## v1.0 MVP (Shipped: 2026-02-02)
61
+
62
+ **Delivered:** Production-ready CLI tool for streaming HuggingFace datasets to Vespa JSON feed format with YAML config, type conversions, and error handling.
63
+
64
+ **Phases completed:** 1-3 (6 plans total)
65
+
66
+ **Key accomplishments:**
67
+
68
+ - Streaming CLI tool (`hf-vespa-feed`) with O(1) memory usage
69
+ - Column filtering, renaming, and auto-increment document IDs
70
+ - YAML config support with type conversions (list → tensor)
71
+ - Custom document IDs from dataset columns
72
+ - Error handling modes (`--on-error=fail/skip`) with row-level diagnostics
73
+ - Progress bar and completion statistics with throughput
74
+
75
+ **Stats:**
76
+
77
+ - 38 files created/modified
78
+ - 782 lines of Python
79
+ - 3 phases, 6 plans
80
+ - 1 day from start to ship
81
+
82
+ **Git range:** `bf70cf1` → `7e3b276`
83
+
84
+ **What's next:** Consider v2 features (field concatenation, row filtering, DuckDB integration) based on user feedback.
85
+
86
+ ---
@@ -0,0 +1,134 @@
1
+ # hf-vespa-feed
2
+
3
+ ## What This Is
4
+
5
+ A CLI tool that streams Hugging Face datasets into Vespa's JSON feed format with comprehensive type support. Designed for processing millions of rows efficiently with support for hex-encoded tensors, sparse/mixed tensors, and scalar types. Users define field mappings via YAML config or CLI args, and pipe output directly to `vespa feed -`.
6
+
7
+ ## Core Value
8
+
9
+ Fast, memory-efficient streaming of HF datasets to Vespa without intermediate files or full dataset loading.
10
+
11
+ ## Current State
12
+
13
+ **Shipped:** v2.0 (2026-02-03)
14
+ **Source code:** 1,788 lines Python
15
+ **Tests:** 1,899 lines Python (165 tests)
16
+ **Documentation:** 778-line README with type reference
17
+ **Tech stack:** Python 3.10+, typer, datasets, orjson, pydantic, tqdm, ruamel.yaml, pytest-benchmark
18
+
19
+ ## v2.0 Delivered
20
+
21
+ **Hex Tensor Encoding:**
22
+ - int8, bfloat16, float32, float64 with IEEE 754 compliance
23
+ - Range validation with row-level error context
24
+
25
+ **Sparse and Mixed Tensors:**
26
+ - Sparse tensor (Vespa cells notation) for term weights
27
+ - Mixed tensor (blocks notation) for ColBERT-style multi-vectors
28
+ - Mixed tensor hex with configurable cell types
29
+
30
+ **Scalar Types:**
31
+ - Position (geo coordinates with range validation)
32
+ - Weighted set (key-weight pairs)
33
+ - Map (key-value pairs with key stringification)
34
+
35
+ **Testing Infrastructure:**
36
+ - Vespa doc-verified correctness tests
37
+ - Round-trip byte pattern validation
38
+ - 52 benchmark tests with GitHub Actions CI
39
+
40
+ ## Current Milestone: v2.1 Clean Exit
41
+
42
+ **Goal:** Fix the "Bad file descriptor" error that appears after successful processing.
43
+
44
+ **Bug:** After processing completes, users see ugly error messages from HuggingFace's HTTP retry logic even though the process exits successfully. This is confusing and unprofessional.
45
+
46
+ **Root cause:** The `_cleanup_hf_resources()` function calls `close_session()` which triggers cleanup of HTTP connections in a bad state, causing the error to be printed before `os._exit()` runs.
47
+
48
+ **Fix approach:**
49
+ 1. Suppress HuggingFace HTTP logger at module startup
50
+ 2. Remove counterproductive `close_session()` call from cleanup
51
+
52
+ ## Requirements
53
+
54
+ ### Validated
55
+
56
+ - ✓ Stream HF dataset to Vespa JSON feed format — v1.0
57
+ - ✓ Config-driven field mapping (YAML file or CLI args) — v1.0
58
+ - ✓ Column selection (include only specified columns) — v1.0
59
+ - ✓ Column renaming (HF column → Vespa field name) — v1.0
60
+ - ✓ Type conversion (list → tensor format) — v1.0
61
+ - ✓ Auto-increment document ID — v1.0
62
+ - ✓ Document ID from dataset column — v1.0
63
+ - ✓ Configurable namespace and doctype — v1.0
64
+ - ✓ `--limit N` flag for preview — v1.0
65
+ - ✓ Configurable error handling (fail/skip) — v1.0
66
+ - ✓ Progress indicators and completion statistics — v1.0
67
+ - ✓ Use HF cache (no re-download) — v1.0
68
+ - ✓ Graceful SIGPIPE handling — v1.0
69
+ - ✓ Config bootstrapping from dataset inspection — v1.1
70
+ - ✓ Shell completion installation command — v1.1
71
+ - ✓ Comprehensive README with usage guide and cookbook — v1.1
72
+ - ✓ --num-workers flag for parallel loading — v1.1
73
+ - ✓ Converter caching with lru_cache — v1.1
74
+ - ✓ Hex tensor encoding (int8, bfloat16, float32, float64) — v2.0
75
+ - ✓ Sparse tensor format support — v2.0
76
+ - ✓ Mixed tensor format support — v2.0
77
+ - ✓ Position (geo) type conversion — v2.0
78
+ - ✓ Weighted set type conversion — v2.0
79
+ - ✓ Map type conversion — v2.0
80
+ - ✓ Throughput benchmarking with pytest-benchmark — v2.0
81
+ - ✓ Correctness tests based on Vespa docs examples — v2.0
82
+
83
+ ### Active
84
+
85
+ - [ ] Clean exit without "Bad file descriptor" error message — v2.1
86
+ - [ ] No visible warnings from HF/PyArrow cleanup on exit — v2.1
87
+
88
+ ### Out of Scope
89
+
90
+ - Field concatenation — defer to v3
91
+ - Row filtering — defer to v3
92
+ - Default values for missing fields — defer to v3
93
+ - DuckDB integration — defer to v3
94
+ - GUI or web interface — CLI-only tool
95
+
96
+ ## Context
97
+
98
+ - Target users work with large HF datasets (millions of rows) and Vespa search infrastructure
99
+ - HF datasets library uses Arrow backend, enabling memory-mapped streaming
100
+ - Vespa feed CLI handles progress display, so this tool outputs clean JSON only
101
+ - Vespa document JSON format: `{"put": "id:ns:type::id", "fields": {...}}`
102
+ - HF datasets are cached locally by the library, avoiding redundant downloads
103
+
104
+ ## Constraints
105
+
106
+ - **Language**: Python — HF datasets library is Python-native, and streaming/Arrow integration is mature
107
+ - **Memory**: Must stream, not load entire dataset — millions of rows won't fit in memory
108
+ - **Output**: Valid JSONL that `vespa feed -` accepts without modification
109
+ - **Dependencies**: Minimize — HF datasets, PyYAML for config, standard library otherwise
110
+
111
+ ## Key Decisions
112
+
113
+ | Decision | Rationale | Outcome |
114
+ |----------|-----------|---------|
115
+ | Pure HF datasets (no DuckDB for v1) | Simpler architecture, HF handles column ops natively | ✓ Good |
116
+ | YAML config format | Readable, supports nested field definitions | ✓ Good |
117
+ | Default ID format `id:doc:doc::N` | Common pattern, namespace/doctype configurable | ✓ Good |
118
+ | Error handling as CLI flag | Different use cases need different behavior | ✓ Good |
119
+ | Source layout (src/) | Modern Python packaging best practice | ✓ Good |
120
+ | Registry pattern for type converters | Extensible design for future converters | ✓ Good |
121
+ | TTY-aware progress bar | Prevents progress bar from corrupting piped output | ✓ Good |
122
+ | 1-based row numbering in errors | More user-friendly for debugging | ✓ Good |
123
+ | ruamel.yaml for YAML generation | Only library supporting comments in output | ✓ Good |
124
+ | Explicit install-completion command | Cleaner UX than Typer's default flag | ✓ Good |
125
+ | Cache function lookups not results | Prevents memory exhaustion with diverse inputs | ✓ Good |
126
+ | Accept num_workers but document streaming limitation | HuggingFace limitation, reserved for future | ✓ Good |
127
+ | Int8 validates range before struct.pack | Clear error messages with index/row context | ✓ Good |
128
+ | Bfloat16 truncates (not rounds) precision | Matches IEEE 754 bfloat16 behavior | ✓ Good |
129
+ | Sparse tensor uses Vespa cells notation | Standard format for single mapped dimension | ✓ Good |
130
+ | Mixed tensor blocks reuse hex encoders | Consistent encoding, no code duplication | ✓ Good |
131
+ | 10% benchmark regression threshold | Catches regressions without false positives | ✓ Good |
132
+
133
+ ---
134
+ *Last updated: 2026-02-03 after v2.1 milestone started*
@@ -0,0 +1,47 @@
1
+ # Requirements: hf-vespa-feed
2
+
3
+ **Defined:** 2026-02-03
4
+ **Core Value:** Fast, memory-efficient streaming of HF datasets to Vespa without intermediate files or full dataset loading.
5
+
6
+ ## v2.1 Requirements
7
+
8
+ Bug fix release for clean exit behavior.
9
+
10
+ ### Clean Exit
11
+
12
+ - [x] **EXIT-01**: No "Bad file descriptor" error message after successful processing
13
+ - [x] **EXIT-02**: No visible warnings from HF/PyArrow cleanup on exit
14
+ - [x] **EXIT-03**: Process exits with code 0 after successful processing (unchanged)
15
+ - [x] **EXIT-04**: New test `test_feed_does_not_hang_on_exit` created and passes
16
+
17
+ ## Out of Scope
18
+
19
+ Explicitly excluded from v2.1. Documented to prevent scope creep.
20
+
21
+ | Feature | Reason |
22
+ |---------|--------|
23
+ | Field concatenation | Defer to v3 — feature work, not bug fix |
24
+ | Row filtering | Defer to v3 — feature work, not bug fix |
25
+ | Default values | Defer to v3 — feature work, not bug fix |
26
+ | DuckDB integration | Defer to v3 — feature work, not bug fix |
27
+ | Performance optimization | Not needed — exit cleanup is fast enough |
28
+
29
+ ## Traceability
30
+
31
+ Which phases cover which requirements.
32
+
33
+ | Requirement | Phase | Status |
34
+ |-------------|-------|--------|
35
+ | EXIT-01 | Phase 12 | Complete |
36
+ | EXIT-02 | Phase 12 | Complete |
37
+ | EXIT-03 | Phase 12 | Complete |
38
+ | EXIT-04 | Phase 12 | Complete |
39
+
40
+ **Coverage:**
41
+ - v2.1 requirements: 4 total
42
+ - Mapped to phases: 4
43
+ - Unmapped: 0 ✓
44
+
45
+ ---
46
+ *Requirements defined: 2026-02-03*
47
+ *Last updated: 2026-02-03 after phase 12 complete*
@@ -0,0 +1,91 @@
1
+ # Roadmap: hf-vespa-feed
2
+
3
+ ## Milestones
4
+
5
+ - **v1.0 MVP** — Phases 1-3 (shipped 2026-02-02)
6
+ - **v1.1 User Experience** — Phases 4-7 (shipped 2026-02-03)
7
+ - **v2.0 Vespa Types** — Phases 8-11 (shipped 2026-02-03)
8
+ - **v2.1 Clean Exit** — Phase 12 (completed 2026-02-03)
9
+
10
+ ## Phases
11
+
12
+ <details>
13
+ <summary>v1.0 MVP (Phases 1-3) — SHIPPED 2026-02-02</summary>
14
+
15
+ - [x] Phase 1: Foundation (2/2 plans) — completed 2026-02-02
16
+ - [x] Phase 2: Core Streaming (2/2 plans) — completed 2026-02-02
17
+ - [x] Phase 3: Configuration (2/2 plans) — completed 2026-02-02
18
+
19
+ See: `.planning/milestones/v1.0-ROADMAP.md`
20
+
21
+ </details>
22
+
23
+ <details>
24
+ <summary>v1.1 User Experience (Phases 4-7) — SHIPPED 2026-02-03</summary>
25
+
26
+ - [x] Phase 4: Init Command (1/1 plans) — completed 2026-02-02
27
+ - [x] Phase 5: Shell Completion (1/1 plans) — completed 2026-02-03
28
+ - [x] Phase 6: Documentation (3/3 plans) — completed 2026-02-03
29
+ - [x] Phase 7: Performance (2/2 plans) — completed 2026-02-03
30
+
31
+ See: `.planning/milestones/v1.1-ROADMAP.md`
32
+
33
+ </details>
34
+
35
+ <details>
36
+ <summary>v2.0 Vespa Types (Phases 8-11) — SHIPPED 2026-02-03</summary>
37
+
38
+ - [x] Phase 8: Scalar Types (1/1 plans) — completed 2026-02-03
39
+ - [x] Phase 9: Dense Hex Encoding (2/2 plans) — completed 2026-02-03
40
+ - [x] Phase 10: Sparse and Mixed Tensors (2/2 plans) — completed 2026-02-03
41
+ - [x] Phase 11: Testing and Benchmarks (4/4 plans) — completed 2026-02-03
42
+
43
+ See: `.planning/milestones/v2.0-ROADMAP.md`
44
+
45
+ </details>
46
+
47
+ ### v2.1 Clean Exit (Phase 12) — COMPLETE
48
+
49
+ - [x] Phase 12: Clean Exit (1/1 plans) — completed 2026-02-03
50
+
51
+ **Phase 12: Clean Exit**
52
+
53
+ **Goal:** Fix the "Bad file descriptor" error that appears after successful processing.
54
+
55
+ **Requirements:** EXIT-01, EXIT-02, EXIT-03, EXIT-04
56
+
57
+ **Success criteria:**
58
+ 1. Running `hf-vespa-feed ... --limit 1000 > /dev/null` shows no error messages on stderr (only progress bar and completion stats)
59
+ 2. Process exits with code 0
60
+ 3. No visible warnings from HF/PyArrow cleanup
61
+ 4. Existing test `test_feed_does_not_hang_on_exit` passes
62
+
63
+ **Implementation approach (from research):**
64
+ 1. Add `logging.getLogger('huggingface_hub.utils._http').setLevel(logging.CRITICAL)` at module top of cli.py
65
+ 2. Remove `close_session()` call from `_cleanup_hf_resources()` — it triggers the error and isn't needed since `os._exit()` follows
66
+ 3. Keep existing `os._exit()` — required to prevent hang from HF retry loop
67
+ 4. Update/fix the existing test to verify clean stderr output
68
+
69
+ ## Progress
70
+
71
+ **Execution Order:**
72
+ Phases execute in numeric order: 1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 -> 8 -> 9 -> 10 -> 11 -> 12
73
+
74
+ | Phase | Milestone | Plans Complete | Status | Completed |
75
+ |-------|-----------|----------------|--------|-----------|
76
+ | 1. Foundation | v1.0 | 2/2 | Complete | 2026-02-02 |
77
+ | 2. Core Streaming | v1.0 | 2/2 | Complete | 2026-02-02 |
78
+ | 3. Configuration | v1.0 | 2/2 | Complete | 2026-02-02 |
79
+ | 4. Init Command | v1.1 | 1/1 | Complete | 2026-02-02 |
80
+ | 5. Shell Completion | v1.1 | 1/1 | Complete | 2026-02-03 |
81
+ | 6. Documentation | v1.1 | 3/3 | Complete | 2026-02-03 |
82
+ | 7. Performance | v1.1 | 2/2 | Complete | 2026-02-03 |
83
+ | 8. Scalar Types | v2.0 | 1/1 | Complete | 2026-02-03 |
84
+ | 9. Dense Hex Encoding | v2.0 | 2/2 | Complete | 2026-02-03 |
85
+ | 10. Sparse and Mixed Tensors | v2.0 | 2/2 | Complete | 2026-02-03 |
86
+ | 11. Testing and Benchmarks | v2.0 | 4/4 | Complete | 2026-02-03 |
87
+ | 12. Clean Exit | v2.1 | 1/1 | Complete | 2026-02-03 |
88
+
89
+ ---
90
+ *Roadmap created: 2026-02-02*
91
+ *Last updated: 2026-02-03 after phase 12 complete*
@@ -0,0 +1,66 @@
1
+ # Project State: hf-vespa-feed
2
+
3
+ ## Project Reference
4
+
5
+ See: `.planning/PROJECT.md` (updated 2026-02-03)
6
+
7
+ **Core value:** Fast, memory-efficient streaming of HF datasets to Vespa without intermediate files or full dataset loading.
8
+
9
+ **Current focus:** v2.1 Clean Exit complete
10
+
11
+ ## Current Position
12
+
13
+ Phase: 12 of 12 (clean-exit)
14
+ Plan: 1 of 1
15
+ Status: Phase complete
16
+ Last activity: 2026-02-03 - Completed 12-01-PLAN.md
17
+
18
+ ```
19
+ v1.0 [====================] 100% Shipped
20
+ v1.1 [====================] 100% Shipped
21
+ v2.0 [====================] 100% Shipped
22
+ v2.1 [====================] 100% Complete
23
+ ```
24
+
25
+ ## Milestone History
26
+
27
+ | Version | Name | Phases | Status | Shipped |
28
+ |---------|------|--------|--------|---------|
29
+ | v1.0 | MVP | 1-3 | Shipped | 2026-02-02 |
30
+ | v1.1 | User Experience | 4-7 | Shipped | 2026-02-03 |
31
+ | v2.0 | Vespa Types | 8-11 | Shipped | 2026-02-03 |
32
+ | v2.1 | Clean Exit | 12 | Complete | 2026-02-03 |
33
+
34
+ ## v2.1 Results
35
+
36
+ **Bug fixed:** The "Bad file descriptor" error no longer appears after successful processing.
37
+
38
+ **Solution implemented:**
39
+ 1. Suppress HuggingFace HTTP logger at module startup
40
+ 2. Add `_cleanup_hf_resources()` function that sets `HF_HUB_OFFLINE` and forces GC
41
+ 3. Use `os._exit()` in `run()` to skip Python cleanup that triggers PyArrow bug
42
+
43
+ **Verification:**
44
+ - All 177 tests pass
45
+ - Manual test shows clean exit with no error messages
46
+ - Exit code is 0
47
+
48
+ ## Accumulated Decisions
49
+
50
+ | Phase | Decision | Rationale |
51
+ |-------|----------|-----------|
52
+ | 12-research | Remove close_session() call | Triggers the error, not needed since os._exit() follows |
53
+ | 12-research | Suppress HF HTTP logger at startup | Prevents error message from any cleanup path |
54
+ | 12-research | Keep os._exit() | Required to prevent hang from HF retry loop |
55
+ | 12-01 | Set HF_HUB_OFFLINE before cleanup | Prevents HTTP retry loops during garbage collection |
56
+
57
+ ## Session Continuity
58
+
59
+ Last session: 2026-02-03 13:52 UTC
60
+ Stopped at: Completed 12-01-PLAN.md (v2.1 milestone complete)
61
+ Resume file: None
62
+ Next action: None - v2.1 milestone complete
63
+
64
+ ---
65
+ *State initialized: 2026-02-01*
66
+ *Last updated: 2026-02-03 after completing v2.1 Clean Exit milestone*
@@ -0,0 +1,12 @@
1
+ {
2
+ "mode": "yolo",
3
+ "depth": "quick",
4
+ "parallelization": true,
5
+ "commit_docs": true,
6
+ "model_profile": "balanced",
7
+ "workflow": {
8
+ "research": true,
9
+ "plan_check": true,
10
+ "verifier": true
11
+ }
12
+ }