hf2vespa-0.1.0-py3-none-any.whl

hf2vespa/__init__.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.1.0"
hf2vespa/__main__.py ADDED
@@ -0,0 +1,4 @@
+ from hf2vespa.cli import app
+
+ if __name__ == "__main__":
+     app()
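Since `__main__.py` simply re-exports the Typer app, the CLI can be invoked as a module as well as through the installed console script. Note that this entry point calls `app()` directly, so the backward-compatible `hf2vespa <dataset>` shorthand handled by `run()` in cli.py below is only available via the console script. A minimal smoke test, assuming the wheel is installed:

    $ python -m hf2vespa feed squad --limit 10 > feed.jsonl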
hf2vespa/cli.py ADDED
@@ -0,0 +1,465 @@
+ # Suppress warnings that occur during cleanup. These are harmless but confusing to users.
+ # 1. HuggingFace HTTP retry warnings: https://github.com/apache/arrow/issues/45214
+ # 2. Multiprocessing resource tracker warnings (leaked semaphores from HF datasets)
+ #
+ # IMPORTANT: Set PYTHONWARNINGS env var BEFORE any imports that might trigger multiprocessing.
+ # The resource_tracker runs as a separate daemon process and inherits the env at spawn time.
+ # warnings.filterwarnings() alone doesn't work because it only affects the current process.
+ import os as _os
+
+ _existing_warnings = _os.environ.get("PYTHONWARNINGS", "")
+ _new_filter = "ignore::UserWarning:multiprocessing.resource_tracker"
+ if _existing_warnings:
+     _os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_new_filter}"
+ else:
+     _os.environ["PYTHONWARNINGS"] = _new_filter
+ del _os, _existing_warnings, _new_filter
+
+ import logging as _logging
+ import warnings as _warnings
+
+ _logging.getLogger("huggingface_hub.utils._http").setLevel(_logging.CRITICAL)
+ _warnings.filterwarnings("ignore", message="resource_tracker:", category=UserWarning)
+ del _logging, _warnings
+
+ """CLI for streaming HuggingFace datasets to Vespa JSON format."""
+ import gc
+ import itertools
+ import os
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Annotated
+
+ import orjson
+ import typer
+ from tqdm import tqdm
+
+ from hf2vespa.config import VespaConfig
+ from hf2vespa.pipeline import format_vespa_put, stream_dataset, validate_config
+ from hf2vespa.stats import ErrorMode, ProcessingStats
+ from hf2vespa.utils import handle_broken_pipe
+
+ # Create a Typer app that supports subcommands
+ app = typer.Typer(
+     help="Stream HuggingFace datasets to Vespa JSON format.",
+     no_args_is_help=False,
+     add_completion=False,  # We provide our own install-completion command
+ )
+
+
+ def report_completion(stats: ProcessingStats, elapsed_ns: int) -> None:
+     """Print completion statistics to stderr."""
+     elapsed_sec = elapsed_ns / 1_000_000_000
+     total = stats.total_processed
+     errors = stats.error_count
+     success = stats.success_count
+     throughput = total / elapsed_sec if elapsed_sec > 0 else 0
+
+     print("\n--- Completion Statistics ---", file=sys.stderr)
+     print(f"Total records processed: {total:,}", file=sys.stderr)
+     print(f"Successful: {success:,}", file=sys.stderr)
+     print(f"Errors: {errors:,}", file=sys.stderr)
+     print(f"Throughput: {throughput:.1f} records/sec", file=sys.stderr)
+     print(f"Elapsed time: {elapsed_sec:.2f}s", file=sys.stderr)
+
+
+ def _cleanup_hf_resources() -> None:
+     """Clean up HuggingFace Hub resources to prevent exit hangs.
+
+     This addresses a known PyArrow bug (https://github.com/apache/arrow/issues/45214)
+     that causes hangs when cleaning up streaming dataset iterators.
+     """
+     # Enable offline mode to prevent HTTP retry loops during cleanup
+     os.environ["HF_HUB_OFFLINE"] = "1"
+     # Force garbage collection to trigger finalizers
+     gc.collect()
+
+
+ def feed_impl(
+     dataset: str,
+     split: str = "train",
+     config: str | None = None,
+     include: list[str] | None = None,
+     rename: list[str] | None = None,
+     namespace: str = "doc",
+     doctype: str = "doc",
+     config_file: Path | None = None,
+     limit: int | None = None,
+     id_column: str | None = None,
+     on_error: ErrorMode = ErrorMode.fail,
+     num_workers: int | None = None,
+ ) -> None:
+     """Implementation of the feed command.
+
+     This is the actual implementation that both the callback (for backward
+     compatibility) and the explicit 'feed' command delegate to.
+     """
+     # Validate limit parameter
+     if limit is not None and limit <= 0:
+         typer.echo("Error: --limit must be positive", err=True)
+         raise typer.Exit(1)
+
+     # Auto-detect CPU cores if not specified
+     if num_workers is None:
+         num_workers = os.cpu_count()
+
+     # Validate num_workers
+     if num_workers is not None and num_workers <= 0:
+         typer.echo("Error: --num-workers must be positive", err=True)
+         raise typer.Exit(1)
+
+     # Load config from file or create default
+     if config_file is not None:
+         try:
+             vespa_config = VespaConfig.from_yaml(config_file)
+         except ValueError as e:
+             typer.echo(f"Error loading config file: {e}", err=True)
+             raise typer.Exit(1)
+
+         # CLI flags override config-file values only when they differ from the
+         # defaults. We can't tell if a default was passed explicitly, so config
+         # file values win unless a flag carries a non-default value.
+         if namespace != "doc":
+             vespa_config.namespace = namespace
+         if doctype != "doc":
+             vespa_config.doctype = doctype
+         if id_column is not None:
+             vespa_config.id_column = id_column
+     else:
+         # No config file - use CLI values directly
+         vespa_config = VespaConfig(
+             namespace=namespace,
+             doctype=doctype,
+             id_column=id_column,
+         )
+
+     # Parse rename list into dictionary
+     rename_dict = None
+     if rename:
+         rename_dict = {}
+         for pair in rename:
+             if ":" not in pair:
+                 typer.echo(
+                     f"Error: --rename must be in 'old:new' format, got: {pair}",
+                     err=True,
+                 )
+                 raise typer.Exit(1)
+             old, new = pair.split(":", 1)
+             rename_dict[old] = new
+
+     # Load dataset to validate config against schema
+     from datasets import load_dataset
+
+     try:
+         ds = load_dataset(dataset, config, split=split, streaming=True)
+         dataset_columns = set(ds.column_names)
+     except Exception as e:
+         typer.echo(f"Error loading dataset: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Validate config against dataset schema (fail-fast)
+     try:
+         validate_config(vespa_config, dataset_columns)
+     except ValueError as e:
+         typer.echo(f"Config validation failed: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Stream dataset with transformations
+     records = stream_dataset(
+         dataset_name=dataset,
+         split=split,
+         include=include,
+         rename=rename_dict,
+         config=config,
+         num_proc=num_workers,
+     )
+
+     # Format as Vespa PUT operations
+     vespa_docs = format_vespa_put(
+         records, vespa_config.namespace, vespa_config.doctype, config=vespa_config
+     )
+
+     # Apply limit if specified
+     if limit is not None:
+         vespa_docs = itertools.islice(vespa_docs, limit)
+
+     # Initialize statistics tracking
+     stats = ProcessingStats()
+     start_time = time.perf_counter_ns()
+     show_progress = sys.stderr.isatty()
+
+     # Write to stdout with SIGPIPE handling
+     with handle_broken_pipe():
+         # Wrap with progress bar (no-op if not TTY)
+         iterator = tqdm(
+             vespa_docs,
+             disable=not show_progress,
+             desc="Processing records",
+             unit="rec",
+             file=sys.stderr,
+         )
+
+         try:
+             for doc in iterator:
+                 try:
+                     # Serialize to JSON with orjson for performance
+                     json_bytes = orjson.dumps(doc, option=orjson.OPT_APPEND_NEWLINE)
+                     sys.stdout.buffer.write(json_bytes)
+                     stats.record_success()
+                 except Exception as e:
+                     # Error during processing this record
+                     if on_error == ErrorMode.fail:
+                         raise
+                     else:
+                         # Skip mode: warn and continue
+                         tqdm.write(f"Warning: {e}", file=sys.stderr)
+                         stats.record_error()
+                         continue
+
+                 # Flush periodically for streaming UX
+                 if stats.total_processed % 100 == 0:
+                     sys.stdout.flush()
+         finally:
+             # Always report stats, even on error
+             end_time = time.perf_counter_ns()
+             report_completion(stats, end_time - start_time)
+
+     # Clean up HuggingFace resources IMMEDIATELY after processing,
+     # before the iterator is garbage collected. This prevents
+     # "Bad file descriptor" errors from PyArrow/fsspec cleanup.
+     _cleanup_hf_resources()
+
+
+ @app.command("feed")
+ def feed(
+     dataset: Annotated[str, typer.Argument(help="HuggingFace dataset name")],
+     split: Annotated[str, typer.Option(help="Dataset split")] = "train",
+     config: Annotated[str | None, typer.Option(help="Dataset config name")] = None,
+     include: Annotated[
+         list[str] | None, typer.Option(help="Columns to include (repeatable)")
+     ] = None,
+     rename: Annotated[
+         list[str] | None,
+         typer.Option(help="Rename columns as 'old:new' (repeatable)"),
+     ] = None,
+     namespace: Annotated[
+         str, typer.Option(help="Vespa namespace for document IDs")
+     ] = "doc",
+     doctype: Annotated[
+         str, typer.Option(help="Vespa document type for document IDs")
+     ] = "doc",
+     config_file: Annotated[
+         Path | None,
+         typer.Option("--config-file", help="YAML config file for field mappings"),
+     ] = None,
+     limit: Annotated[
+         int | None, typer.Option(help="Process only first N records")
+     ] = None,
+     id_column: Annotated[
+         str | None,
+         typer.Option("--id-column", help="Dataset column to use as document ID"),
+     ] = None,
+     on_error: Annotated[
+         ErrorMode,
+         typer.Option(
+             "--on-error",
+             help="Error handling: fail (stop on error) or skip (warn and continue)",
+         ),
+     ] = ErrorMode.fail,
+     num_workers: Annotated[
+         int | None,
+         typer.Option(
+             "--num-workers",
+             help="Number of parallel workers. Not supported in streaming mode; reserved for future use with non-streaming datasets.",
+         ),
+     ] = None,
+ ) -> None:
+     """Stream a HuggingFace dataset to Vespa JSON format.
+
+     Examples:
+
+         # Basic usage
+         $ hf2vespa feed glue --split test --config ax
+
+         # Filter columns
+         $ hf2vespa feed glue --split test --config ax --include premise --include hypothesis
+
+         # Custom namespace and doctype
+         $ hf2vespa feed squad --namespace wiki --doctype article
+
+         # Use config file
+         $ hf2vespa feed glue --config ax --config-file mappings.yaml
+
+         # Preview first 10 records
+         $ hf2vespa feed squad --limit 10
+     """
+     feed_impl(
+         dataset=dataset,
+         split=split,
+         config=config,
+         include=include,
+         rename=rename,
+         namespace=namespace,
+         doctype=doctype,
+         config_file=config_file,
+         limit=limit,
+         id_column=id_column,
+         on_error=on_error,
+         num_workers=num_workers,
+     )
+
+
+ @app.command("init")
+ def init(
+     dataset: Annotated[str, typer.Argument(help="HuggingFace dataset name")],
+     output: Annotated[
+         Path, typer.Option("--output", "-o", help="Output YAML file path")
+     ] = Path("vespa-config.yaml"),
+     split: Annotated[
+         str, typer.Option("--split", "-s", help="Dataset split to inspect")
+     ] = "train",
+     config: Annotated[
+         str | None,
+         typer.Option(
+             "--config",
+             "-c",
+             help="Dataset config name (required for multi-config datasets)",
+         ),
+     ] = None,
+ ) -> None:
+     """Generate a YAML config by inspecting a HuggingFace dataset schema.
+
+     This command inspects the dataset schema (without downloading the full dataset)
+     and generates a YAML configuration file with sensible defaults and helpful comments.
+
+     Examples:
+
+         # Generate config for a dataset
+         $ hf2vespa init glue --config ax
+
+         # Specify output file
+         $ hf2vespa init squad --output my-config.yaml
+
+         # Inspect a specific split
+         $ hf2vespa init my-dataset --split validation
+     """
+     from hf2vespa.init import init_command
+
+     init_command(dataset, output, split, config)
+
+
+ @app.command("install-completion")
+ def install_completion(
+     shell: Annotated[
+         str | None,
+         typer.Argument(
+             help="Shell to install completion for (bash, zsh, fish). Auto-detected if omitted."
+         ),
+     ] = None,
+ ) -> None:
+     """Install shell tab-completion for hf2vespa.
+
+     Auto-detects your shell, or accepts an explicit shell name.
+
+     Examples:
+
+         hf2vespa install-completion        # Auto-detect shell
+         hf2vespa install-completion bash   # Explicit bash
+         hf2vespa install-completion zsh    # Explicit zsh
+     """
+     from typer._completion_shared import Shells, install
+
+     # Detect shell if not provided
+     if shell is None:
+         try:
+             import shellingham
+
+             detected_name, _ = shellingham.detect_shell()
+             shell = detected_name.lower()
+             typer.echo(f"Detected shell: {shell}")
+         except Exception:
+             typer.echo(
+                 "Could not auto-detect your shell.\n"
+                 "Please specify: hf2vespa install-completion [bash|zsh|fish]",
+                 err=True,
+             )
+             raise typer.Exit(1)
+
+     # Validate shell
+     shell = shell.lower()
+     supported = {"bash", "zsh", "fish"}
+     if shell not in supported:
+         typer.echo(f"Unsupported shell: {shell}", err=True)
+         typer.echo(f"Supported shells: {', '.join(sorted(supported))}", err=True)
+         raise typer.Exit(1)
+
+     # Install completion
+     try:
+         shell_enum = Shells(shell)
+         _, path = install(shell=shell_enum)
+
+         # Success message
+         typer.echo(f"\nShell completion installed for {shell}!")
+         typer.echo(f"Modified: {path}")
+         typer.echo("\nTo activate, either:")
+         typer.echo("  1. Restart your terminal, OR")
+         typer.echo(f"  2. Run: source {path}")
+
+     except Exception as e:
+         typer.echo(f"Failed to install completion: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ def run() -> None:
+     """Entry point with backward compatibility handling.
+
+     This function provides backward compatibility with the old CLI pattern
+     (`hf2vespa <dataset>`): if the first argument looks like a dataset name
+     rather than a subcommand, 'feed' is inserted before it.
+     """
+     # The known subcommands
+     subcommands = {"feed", "init", "install-completion"}
+
+     if len(sys.argv) > 1:
+         first_arg = sys.argv[1]
+         # If first arg is a known subcommand, proceed normally.
+         # If first arg is a flag (--help, -h, etc.), proceed normally.
+         # Otherwise, assume it's a dataset name and insert 'feed'.
+         if first_arg not in subcommands and not first_arg.startswith("-"):
+             sys.argv.insert(1, "feed")
+
+     exit_code = 0
+     try:
+         app()
+     except SystemExit as e:
+         # Preserve typer's exit code; SystemExit(None) means success (0).
+         exit_code = e.code if isinstance(e.code, int) else (0 if e.code is None else 1)
+
+     # Clean up HuggingFace resources before exit
+     _cleanup_hf_resources()
+
+     # Flush all output streams
+     sys.stdout.flush()
+     sys.stderr.flush()
+
+     # Redirect stderr to /dev/null before exit to suppress resource_tracker warnings.
+     # The warnings.filterwarnings() call at module level doesn't affect the resource_tracker
+     # subprocess that Python's multiprocessing spawns. These warnings are harmless but
+     # confusing to users. We've already flushed our output above.
+     try:
+         devnull = os.open(os.devnull, os.O_WRONLY)
+         os.dup2(devnull, sys.stderr.fileno())
+         os.close(devnull)
+     except OSError:
+         pass  # If we can't redirect, just proceed
+
+     # Exit immediately to avoid HuggingFace/PyArrow cleanup hangs; the hang is
+     # a known upstream bug: https://github.com/huggingface/datasets/issues/7467
+     # os._exit() is required as a safety net until the upstream bug is fixed.
+     os._exit(exit_code)
+
+
+ if __name__ == "__main__":
+     run()
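Taken together, `feed` emits newline-delimited JSON on stdout, one Vespa put operation per record (orjson's OPT_APPEND_NEWLINE supplies the delimiter). Since pipeline.py is not shown here, the document shape below is a sketch that assumes `format_vespa_put` follows Vespa's standard document JSON format, with illustrative field values:

    $ hf2vespa feed glue --split test --config ax --limit 1
    {"put": "id:doc:doc::0", "fields": {"premise": "...", "hypothesis": "...", "label": -1, "idx": 0}}

    # Pipe directly into a running Vespa application (the Vespa CLI reads
    # from stdin when the file argument is "-"):
    $ hf2vespa feed glue --split test --config ax | vespa feed -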
hf2vespa/config.py ADDED
@@ -0,0 +1,131 @@
+ """Configuration schema for Vespa feed generation."""
+
+ from pathlib import Path
+
+ import yaml
+ from pydantic import BaseModel, field_validator
+
+
+ class FieldMapping(BaseModel):
+     """
+     Field mapping configuration from dataset column to Vespa field.
+
+     Attributes:
+         source: Source column name in the dataset
+         target: Target field name in Vespa document
+         type: Type conversion to apply (e.g., "tensor", "string", "int", "float")
+
+     Examples:
+         >>> m = FieldMapping(source="embedding", target="vector", type="tensor")
+         >>> m.source
+         'embedding'
+         >>> m.type
+         'tensor'
+     """
+
+     source: str
+     target: str
+     type: str | None = None
+
+     @field_validator("type")
+     @classmethod
+     def validate_type(cls, v: str | None) -> str | None:
+         """
+         Validate that type is one of the known converter types.
+
+         Args:
+             v: Type string to validate
+
+         Returns:
+             Validated type string or None
+
+         Raises:
+             ValueError: If type is not one of the known types
+         """
+         if v is None:
+             return v
+
+         valid_types = {
+             # Basic types
+             "tensor",
+             "string",
+             "int",
+             "float",
+             # Scalar types (Phase 8)
+             "position",
+             "weightedset",
+             "map",
+             # Hex tensor types (Phase 9)
+             "tensor_int8_hex",
+             "tensor_bfloat16_hex",
+             "tensor_float32_hex",
+             "tensor_float64_hex",
+             # Sparse and mixed tensor types (Phase 10)
+             "sparse_tensor",
+             "mixed_tensor",
+             "mixed_tensor_hex",
+         }
+         if v not in valid_types:
+             raise ValueError(
+                 f"Unknown type converter '{v}'. Must be one of: {', '.join(sorted(valid_types))}"
+             )
+         return v
+
+
+ class VespaConfig(BaseModel):
+     """
+     Configuration for Vespa feed generation.
+
+     Attributes:
+         namespace: Vespa namespace for document IDs (default: "doc")
+         doctype: Vespa document type for document IDs (default: "doc")
+         id_column: Dataset column to use as document ID (default: None, uses sequential numbering)
+         mappings: List of field mappings from dataset to Vespa format
+
+     Examples:
+         >>> cfg = VespaConfig()
+         >>> cfg.namespace
+         'doc'
+         >>> cfg.mappings
+         []
+     """
+
+     namespace: str = "doc"
+     doctype: str = "doc"
+     id_column: str | None = None
+     mappings: list[FieldMapping] = []
+
+     @classmethod
+     def from_yaml(cls, path: str | Path) -> "VespaConfig":
+         """
+         Load configuration from a YAML file.
+
+         Args:
+             path: Path to YAML configuration file
+
+         Returns:
+             VespaConfig instance with validated configuration
+
+         Raises:
+             ValueError: If the file is not found, YAML parsing fails, or validation fails
+
+         Examples:
+             >>> # Assuming config.yaml exists with valid configuration
+             >>> cfg = VespaConfig.from_yaml("config.yaml")  # doctest: +SKIP
+             >>> isinstance(cfg, VespaConfig)  # doctest: +SKIP
+             True
+         """
+         path_obj = Path(path)
+
+         try:
+             with open(path_obj, "r") as f:
+                 data = yaml.safe_load(f)
+         except FileNotFoundError:
+             raise ValueError(f"Configuration file not found: {path}")
+         except yaml.YAMLError as e:
+             raise ValueError(f"Failed to parse YAML from {path}: {e}")
+
+         try:
+             return cls(**data)
+         except Exception as e:
+             raise ValueError(f"Failed to validate configuration from {path}: {e}")
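Since `from_yaml` passes the parsed YAML straight into the pydantic model, a config file mirrors the attribute names one-to-one. A sketch of a file this schema would accept (the column and field names are illustrative; `type` values must come from the validator's allowed set above):

    # vespa-config.yaml
    namespace: wiki
    doctype: article
    id_column: id
    mappings:
      - source: embedding
        target: vector
        type: tensor
      - source: title
        target: title
        type: string

Fed back in via `hf2vespa feed squad --config-file vespa-config.yaml`.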