hf2vespa-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hf2vespa/init.py ADDED
@@ -0,0 +1,351 @@
+ """Schema inspection and YAML config generation for HuggingFace datasets."""
+
+ from pathlib import Path
+ from typing import Any
+
+ import typer
+ from datasets import Sequence, Value, load_dataset_builder, get_dataset_config_names
+ from datasets.features import List, LargeList
+ from ruamel.yaml import YAML
+ from ruamel.yaml.comments import CommentedMap, CommentedSeq
+
+
+ def is_list_feature(feature: Any) -> bool:
+     """
+     Check if feature is a list/sequence type suitable for tensor conversion.
+
+     List columns containing numeric values are good candidates for
+     Vespa tensor conversion (embeddings, vectors, etc.)
+
+     Args:
+         feature: A HuggingFace datasets Feature object
+
+     Returns:
+         True if feature is a list/sequence type
+
+     Examples:
+         >>> from datasets import Sequence, Value
+         >>> is_list_feature(Sequence(feature=Value("float32")))
+         True
+         >>> is_list_feature(Value("string"))
+         False
+     """
+     # Check for datasets library sequence types
+     if isinstance(feature, (Sequence, List, LargeList)):
+         return True
+
+     # Check for Python list with feature type inside
+     if isinstance(feature, list) and len(feature) == 1:
+         return True
+
+     return False
+
+
+ def get_value_dtype(feature: Any) -> str:
+     """
+     Get the dtype string for display in YAML comments.
+
+     Args:
+         feature: A HuggingFace datasets Feature object
+
+     Returns:
+         Human-readable dtype string (e.g., "string", "Sequence[float32]")
+
+     Examples:
+         >>> from datasets import Sequence, Value
+         >>> get_value_dtype(Value("string"))
+         'string'
+         >>> get_value_dtype(Sequence(feature=Value("float32")))
+         'Sequence[float32]'
+     """
+     if isinstance(feature, Value):
+         return feature.dtype
+     elif isinstance(feature, Sequence):
+         inner = feature.feature
+         if isinstance(inner, Value):
+             return f"Sequence[{inner.dtype}]"
+         return "Sequence[complex]"
+     elif isinstance(feature, (List, LargeList)):
+         inner = feature.feature
+         if isinstance(inner, Value):
+             return f"List[{inner.dtype}]"
+         return "List[complex]"
+     elif isinstance(feature, list) and len(feature) == 1:
+         inner = feature[0]
+         if isinstance(inner, Value):
+             return f"list[{inner.dtype}]"
+         return "list[complex]"
+     elif isinstance(feature, dict):
+         return "dict"
+     else:
+         return str(type(feature).__name__)
+
+
+ def suggest_type(col_name: str, feature: Any) -> str | None:
+     """
+     Suggest type conversion based on feature type.
+
+     For list columns with numeric inner types, suggests "tensor" conversion.
+     For other columns, returns None (no conversion needed).
+
+     Args:
+         col_name: Column name (not currently used but available for name-based heuristics)
+         feature: A HuggingFace datasets Feature object
+
+     Returns:
+         "tensor" for numeric list columns, None otherwise
+
+     Examples:
+         >>> from datasets import Sequence, Value
+         >>> suggest_type("embedding", Sequence(feature=Value("float32")))
+         'tensor'
+         >>> suggest_type("name", Value("string"))
+
+     """
+     if is_list_feature(feature):
+         # Get inner type
+         inner = None
+         if isinstance(feature, Sequence):
+             inner = feature.feature
+         elif isinstance(feature, (List, LargeList)):
+             inner = feature.feature
+         elif isinstance(feature, list) and len(feature) == 1:
+             inner = feature[0]
+
+         # Check if inner type is numeric
+         if isinstance(inner, Value):
+             numeric_types = (
+                 "float32",
+                 "float64",
+                 "float",
+                 "double",
+                 "int32",
+                 "int64",
+                 "int",
+                 "int8",
+                 "int16",
+             )
+             if inner.dtype in numeric_types:
+                 return "tensor"
+     return None
+
+
+ def inspect_dataset_schema(
+     dataset_name: str,
+     config: str | None = None,
+ ) -> dict[str, Any]:
+     """
+     Get dataset schema without downloading data files.
+
+     Uses load_dataset_builder() to access metadata without downloading
+     the actual data files.
+
+     Args:
+         dataset_name: HuggingFace dataset name (e.g., "glue", "squad")
+         config: Dataset configuration name (required for multi-config datasets)
+
+     Returns:
+         Dict containing:
+         - columns: {name: feature_type} mapping
+         - list_columns: List of column names that are list/sequence types
+         - available_splits: List of available split names
+
+     Raises:
+         ValueError: If dataset cannot be loaded or config is required but not provided
+
+     Examples:
+         >>> schema = inspect_dataset_schema("glue", config="ax")  # doctest: +SKIP
+         >>> "premise" in schema["columns"]  # doctest: +SKIP
+         True
+     """
+     # Get builder for dataset (no download)
+     builder = load_dataset_builder(dataset_name, config)
+     features = builder.info.features
+
+     columns = {}
+     list_columns = []
+
+     for col_name, col_type in features.items():
+         columns[col_name] = col_type
+         # Detect list/sequence columns for tensor suggestion
+         if is_list_feature(col_type):
+             list_columns.append(col_name)
+
+     # Get available splits
+     available_splits = (
+         list(builder.info.splits.keys()) if builder.info.splits else ["train"]
+     )
+
+     return {
+         "columns": columns,
+         "list_columns": list_columns,
+         "available_splits": available_splits,
+     }
+
+
+ def generate_config_yaml(
+     features: dict[str, Any],
+     output_path: Path,
+     dataset_name: str,
+     config: str | None = None,
+     split: str = "train",
+     namespace: str = "doc",
+     doctype: str = "doc",
+ ) -> None:
+     """
+     Generate a YAML config file with comments for user guidance.
+
+     Uses ruamel.yaml's CommentedMap to include helpful comments
+     explaining each configuration option.
+
+     Args:
+         features: Dict mapping column names to feature types
+         output_path: Path to write the YAML file
+         dataset_name: Name of the dataset (for header comment)
+         config: Dataset config name (for header comment)
+         split: Dataset split name (for header comment)
+         namespace: Default Vespa namespace
+         doctype: Default Vespa document type
+
+     Examples:
+         >>> from pathlib import Path
+         >>> features = {"id": Value("string"), "text": Value("string")}  # doctest: +SKIP
+         >>> generate_config_yaml(features, Path("/tmp/test.yaml"), "test-dataset")  # doctest: +SKIP
+     """
+     yaml = YAML()
+     yaml.default_flow_style = False
+     yaml.indent(mapping=2, sequence=4, offset=2)
+
+     # Create root config with CommentedMap for comment support
+     root = CommentedMap()
+
+     # Header comment
+     root.yaml_set_start_comment(
+         f"Generated config for dataset: {dataset_name}\n"
+         f"Config: {config or 'default'}, Split: {split}\n"
+         f"Edit this file to customize field mappings.\n"
+     )
+
+     # Basic config with comments
+     root["namespace"] = namespace
+     root.yaml_add_eol_comment("Vespa namespace for document IDs", "namespace")
+
+     root["doctype"] = doctype
+     root.yaml_add_eol_comment("Vespa document type", "doctype")
+
+     root["id_column"] = None
+     root.yaml_add_eol_comment(
+         "Column to use as document ID (null = auto-increment)", "id_column"
+     )
+
+     # Mappings section
+     mappings = CommentedSeq()
+     root["mappings"] = mappings
+     root.yaml_set_comment_before_after_key(
+         "mappings", before="\nField mappings: source (dataset) -> target (Vespa)"
+     )
+
+     for col_name, col_type in features.items():
+         mapping = CommentedMap()
+         mapping["source"] = col_name
+         mapping["target"] = col_name  # Default: same name
+
+         # Add type suggestion for list columns
+         suggested = suggest_type(col_name, col_type)
+         dtype_str = get_value_dtype(col_type)
+
+         if suggested:
+             mapping["type"] = suggested
+             mapping.yaml_add_eol_comment(
+                 f"{dtype_str} -> suggested: {suggested}", "type"
+             )
+         else:
+             mapping["type"] = None
+             mapping.yaml_add_eol_comment(dtype_str, "type")
+
+         mappings.append(mapping)
+
+     # Write to file
+     with open(output_path, "w") as f:
+         yaml.dump(root, f)
+
+
+ def init_command(
+     dataset: str,
+     output: Path,
+     split: str = "train",
+     config: str | None = None,
+ ) -> None:
+     """
+     Generate a YAML config by inspecting a HuggingFace dataset schema.
+
+     Main entry point for the init command. Inspects the dataset schema
+     without downloading data and generates a commented YAML config file.
+
+     Args:
+         dataset: HuggingFace dataset name
+         output: Output YAML file path
+         split: Dataset split to inspect (default: "train")
+         config: Dataset config name (required for multi-config datasets)
+
+     Raises:
+         typer.Exit: On error (with helpful message printed to stderr)
+     """
+     # Check for multi-config datasets. The check lives outside the try/except
+     # so that the typer.Exit raised here is not swallowed by it.
+     try:
+         configs = get_dataset_config_names(dataset)
+     except Exception:
+         # Some datasets don't expose configs - that's fine
+         configs = []
+     if len(configs) > 1 and config is None:
+         typer.echo(
+             f"Dataset '{dataset}' has multiple configs: {', '.join(configs)}",
+             err=True,
+         )
+         typer.echo("Please specify one with --config", err=True)
+         raise typer.Exit(1)
+
+
+     # Get dataset builder (no download)
+     try:
+         builder = load_dataset_builder(dataset, config)
+     except Exception as e:
+         typer.echo(f"Error loading dataset: {e}", err=True)
+         raise typer.Exit(1)
+
+     features = builder.info.features
+
+     # Check split exists
+     available_splits = (
+         list(builder.info.splits.keys()) if builder.info.splits else ["train"]
+     )
+     if split not in available_splits:
+         typer.echo(
+             f"Split '{split}' not found. Available: {', '.join(available_splits)}",
+             err=True,
+         )
+         raise typer.Exit(1)
+
+     # Generate YAML config
+     try:
+         generate_config_yaml(
+             features=features,
+             output_path=output,
+             dataset_name=dataset,
+             config=config,
+             split=split,
+         )
+     except Exception as e:
+         typer.echo(f"Error writing file: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Print summary
+     typer.echo(f"Generated config: {output}", err=True)
+     typer.echo(f"  {len(features)} columns mapped", err=True)
+
+     list_cols = [n for n, t in features.items() if is_list_feature(t)]
+     if list_cols:
+         typer.echo(
+             f"  {len(list_cols)} list columns suggested for tensor conversion",
+             err=True,
+         )
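
For orientation, a sketch of driving the generator above directly from Python, together with the rough shape of the YAML it writes. The column names are made up, the import path assumes the module name shown in the file header (hf2vespa/init.py), and the exact comment placement depends on how ruamel.yaml renders the commented map:

    from pathlib import Path

    from datasets import Sequence, Value

    from hf2vespa.init import generate_config_yaml

    features = {"id": Value("string"), "embedding": Sequence(feature=Value("float32"))}
    generate_config_yaml(features, Path("hf2vespa.yaml"), dataset_name="my-dataset")

    # Roughly produces (EOL comments come from yaml_add_eol_comment above):
    #
    # namespace: doc  # Vespa namespace for document IDs
    # doctype: doc  # Vespa document type
    # id_column:  # Column to use as document ID (null = auto-increment)
    #
    # # Field mappings: source (dataset) -> target (Vespa)
    # mappings:
    #   - source: id
    #     target: id
    #     type:  # string
    #   - source: embedding
    #     target: embedding
    #     type: tensor  # Sequence[float32] -> suggested: tensor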
hf2vespa/pipeline.py ADDED
@@ -0,0 +1,198 @@
+ """Streaming pipeline for HuggingFace datasets to Vespa format."""
+
+ from typing import Generator
+
+ from datasets import load_dataset
+
+ from .config import FieldMapping, VespaConfig
+ from .converters import converters
+ from .utils import generate_vespa_id
+
+
+ def stream_dataset(
+     dataset_name: str,
+     split: str,
+     include: list[str] | None = None,
+     rename: dict[str, str] | None = None,
+     config: str | None = None,
+     num_proc: int | None = None,
+ ) -> Generator[dict, None, None]:
+     """
+     Stream a HuggingFace dataset with optional column filtering and renaming.
+
+     Args:
+         dataset_name: HuggingFace dataset identifier (e.g., "glue", "squad")
+         split: Dataset split to stream (e.g., "train", "test", "validation")
+         include: List of column names to include. If None, include all columns.
+         rename: Dictionary mapping old column names to new names. Applied after filtering.
+         config: Dataset configuration name (e.g., "ax" for glue dataset)
+         num_proc: Number of parallel workers for dataset operations. Note: HuggingFace
+             datasets library does not support num_proc with streaming=True. This parameter
+             is accepted for API consistency but is not passed to load_dataset. For parallel
+             processing with streaming datasets, wrap the dataset with PyTorch DataLoader
+             using num_workers > 1 instead.
+
+     Yields:
+         Dictionary records from the dataset
+
+     Examples:
+         >>> records = stream_dataset("glue", "test", include=["premise", "hypothesis"], config="ax")  # doctest: +SKIP
+         >>> first = next(records)  # doctest: +SKIP
+         >>> "premise" in first and "hypothesis" in first  # doctest: +SKIP
+         True
+     """
+     # Load dataset in streaming mode for O(1) memory usage
+     # Note: num_proc is not supported with streaming=True by HuggingFace datasets
+     # library. Parameter is accepted but not used. For parallel streaming, users
+     # should wrap the output with PyTorch DataLoader.
+     dataset = load_dataset(
+         dataset_name,
+         config,
+         split=split,
+         streaming=True,
+     )
+
+     # Apply column filtering if specified
+     if include is not None:
+         dataset = dataset.select_columns(include)
+
+     # Apply column renaming if specified
+     if rename is not None:
+         dataset = dataset.rename_columns(rename)
+
+     # Yield records one at a time
+     yield from dataset
+
+
+ def validate_config(config: VespaConfig, dataset_columns: set[str]) -> None:
+     """
+     Validate that config references only columns that exist in the dataset.
+
+     Args:
+         config: VespaConfig to validate
+         dataset_columns: Set of column names available in the dataset
+
+     Raises:
+         ValueError: If id_column or any mapping source references a non-existent column
+
+     Examples:
+         >>> cfg = VespaConfig(id_column="idx", mappings=[FieldMapping(source="text", target="content")])
+         >>> validate_config(cfg, {"idx", "text", "label"})  # No error
+         >>> validate_config(VespaConfig(id_column="missing"), {"idx", "text"})  # doctest: +SKIP
+         Traceback (most recent call last):
+         ...
+         ValueError: id_column 'missing' not found in dataset. Available columns: idx, text
+     """
+     # Validate id_column exists if set
+     if config.id_column is not None and config.id_column not in dataset_columns:
+         available = ", ".join(sorted(dataset_columns))
+         raise ValueError(
+             f"id_column '{config.id_column}' not found in dataset. Available columns: {available}"
+         )
+
+     # Validate each mapping source exists
+     for mapping in config.mappings:
+         if mapping.source not in dataset_columns:
+             available = ", ".join(sorted(dataset_columns))
+             raise ValueError(
+                 f"Mapping source '{mapping.source}' not found in dataset. Available columns: {available}"
+             )
+
+
+ def apply_mappings(
+     record: dict, mappings: list[FieldMapping], row_num: int = 0
+ ) -> dict:
+     """
+     Apply field mappings and type conversions to a record.
+
+     Args:
+         record: Source record from dataset
+         mappings: List of field mappings to apply
+         row_num: Row number for error context (1-based, default 0 for unspecified)
+
+     Returns:
+         New dictionary with mapped and converted fields
+
+     Raises:
+         ValueError: If field is missing or type conversion fails.
+             Error message includes row number and field name for debugging.
+
+     Examples:
+         >>> mappings = [
+         ...     FieldMapping(source="vec", target="embedding", type="tensor"),
+         ...     FieldMapping(source="name", target="title")
+         ... ]
+         >>> record = {"vec": [1.0, 2.0, 3.0], "name": "test"}
+         >>> result = apply_mappings(record, mappings, row_num=1)
+         >>> result
+         {'embedding': {'values': [1.0, 2.0, 3.0]}, 'title': 'test'}
+
+         >>> # Missing field error includes row and field context
+         >>> apply_mappings({"x": 1}, [FieldMapping(source="missing", target="out")], row_num=42)
+         Traceback (most recent call last):
+         ...
+         ValueError: Row 42: Missing field 'missing'
+     """
+     result = {}
+     for mapping in mappings:
+         try:
+             value = record[mapping.source]
+         except KeyError:
+             raise ValueError(f"Row {row_num}: Missing field '{mapping.source}'")
+
+         # Apply type conversion if specified
+         if mapping.type is not None:
+             try:
+                 value = converters.convert(value, mapping.type)
+             except Exception as e:
+                 raise ValueError(f"Row {row_num}, field '{mapping.source}': {e}")
+
+         result[mapping.target] = value
+
+     return result
+
+
+ def format_vespa_put(
+     records: Generator[dict, None, None],
+     namespace: str,
+     doctype: str,
+     config: VespaConfig | None = None,
+ ) -> Generator[dict, None, None]:
+     """
+     Format dataset records as Vespa PUT operations.
+
+     Args:
+         records: Generator of dataset records
+         namespace: Vespa namespace for document IDs
+         doctype: Vespa document type for document IDs
+         config: Optional VespaConfig for field mappings and custom ID column
+
+     Yields:
+         Vespa PUT operation dictionaries with structure:
+         {"put": "id:namespace:doctype::N", "fields": {...}}
+
+     Examples:
+         >>> records = iter([{"text": "hello"}, {"text": "world"}])
+         >>> vespa = format_vespa_put(records, "test", "doc")
+         >>> doc = next(vespa)
+         >>> doc["put"]
+         'id:test:doc::0'
+         >>> doc["fields"]["text"]
+         'hello'
+     """
+     for idx, record in enumerate(records, start=1):
+         # Determine document ID
+         if config is not None and config.id_column is not None:
+             doc_id = str(record[config.id_column])
+         else:
+             doc_id = str(idx - 1)  # Keep 0-based IDs for compatibility
+
+         vespa_id = generate_vespa_id(namespace, doctype, doc_id)
+
+         # Apply field mappings if configured
+         if config is not None and config.mappings:
+             fields = apply_mappings(record, config.mappings, row_num=idx)
+         else:
+             fields = record
+
+         yield {"put": vespa_id, "fields": fields}
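
Putting the pieces above together, a minimal sketch of a streaming run; the dataset and columns mirror the doctest, while the output file name, namespace, and doctype are illustrative:

    import json

    from hf2vespa.pipeline import format_vespa_put, stream_dataset

    records = stream_dataset("glue", "test", include=["premise", "hypothesis"], config="ax")
    with open("feed.jsonl", "w") as f:
        for op in format_vespa_put(records, namespace="glue", doctype="pair"):
            # One {"put": ..., "fields": ...} operation per line
            f.write(json.dumps(op) + "\n")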
hf2vespa/stats.py ADDED
@@ -0,0 +1,76 @@
+ """Statistics tracking for pipeline processing."""
+
+ from collections import Counter
+ from dataclasses import dataclass, field
+ from enum import Enum
+
+
+ class ErrorMode(str, Enum):
+     """
+     Error handling mode for pipeline processing.
+
+     Values:
+         fail: Stop processing on first error (default)
+         skip: Skip erroring records and continue processing
+
+     Examples:
+         >>> ErrorMode.fail.value
+         'fail'
+         >>> ErrorMode.skip.value
+         'skip'
+     """
+
+     fail = "fail"
+     skip = "skip"
+
+
+ @dataclass
+ class ProcessingStats:
+     """
+     Statistics accumulator for tracking processing success and errors.
+
+     Attributes:
+         counter: Counter tracking success/error counts by type
+
+     Examples:
+         >>> stats = ProcessingStats()
+         >>> stats.record_success()
+         >>> stats.record_success()
+         >>> stats.record_error()
+         >>> stats.total_processed
+         3
+         >>> stats.success_count
+         2
+         >>> stats.error_count
+         1
+     """
+
+     counter: Counter = field(default_factory=Counter)
+
+     def record_success(self) -> None:
+         """Increment the success counter."""
+         self.counter["success"] += 1
+
+     def record_error(self, error_type: str = "error") -> None:
+         """
+         Increment an error counter.
+
+         Args:
+             error_type: Type of error to record (default: "error")
+         """
+         self.counter[error_type] += 1
+
+     @property
+     def total_processed(self) -> int:
+         """Return total number of records processed (success + errors)."""
+         return self.counter.total()
+
+     @property
+     def success_count(self) -> int:
+         """Return number of successfully processed records."""
+         return self.counter["success"]
+
+     @property
+     def error_count(self) -> int:
+         """Return total number of errors (all non-success counts)."""
+         return sum(count for key, count in self.counter.items() if key != "success")
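
The module itself does not show how these counters are driven; a plausible sketch of pairing ProcessingStats and ErrorMode with apply_mappings from pipeline.py (the wrapper function here is illustrative, not the package's actual CLI loop):

    from hf2vespa.pipeline import apply_mappings
    from hf2vespa.stats import ErrorMode, ProcessingStats

    def convert_records(records, mappings, mode=ErrorMode.fail):
        """Apply mappings to each record, skipping or failing on errors per `mode`."""
        stats = ProcessingStats()
        converted = []
        for row_num, record in enumerate(records, start=1):
            try:
                converted.append(apply_mappings(record, mappings, row_num=row_num))
                stats.record_success()
            except ValueError:
                if mode is ErrorMode.fail:
                    raise
                stats.record_error("conversion_error")
        return converted, stats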
hf2vespa/utils.py ADDED
@@ -0,0 +1,57 @@
+ """Utility functions for Vespa feed generation."""
+
+ import os
+ import sys
+ from contextlib import contextmanager
+
+
+ @contextmanager
+ def handle_broken_pipe():
+     """
+     Context manager to handle SIGPIPE/BrokenPipeError gracefully.
+
+     When piped to tools like `head`, the pipe may close before all output is written.
+     This prevents ugly tracebacks by redirecting remaining output to /dev/null.
+
+     Usage:
+         with handle_broken_pipe():
+             # Write to stdout
+             sys.stdout.buffer.write(data)
+     """
+     try:
+         yield
+         sys.stdout.flush()
+     except BrokenPipeError:
+         # Redirect stdout to /dev/null to suppress further errors
+         devnull = os.open(os.devnull, os.O_WRONLY)
+         os.dup2(devnull, sys.stdout.fileno())
+         sys.exit(0)
+
+
+ def generate_vespa_id(namespace: str, doctype: str, key: str | int) -> str:
+     """
+     Generate a Vespa document ID in the format: id:namespace:doctype::key
+
+     Args:
+         namespace: Vespa namespace (must not contain ':')
+         doctype: Vespa document type (must not contain ':')
+         key: Unique identifier (typically an integer)
+
+     Returns:
+         Formatted Vespa document ID
+
+     Raises:
+         ValueError: If namespace or doctype contains invalid characters
+
+     Examples:
+         >>> generate_vespa_id("myns", "doc", 42)
+         'id:myns:doc::42'
+         >>> generate_vespa_id("test", "article", "abc123")
+         'id:test:article::abc123'
+     """
+     if ":" in namespace:
+         raise ValueError(f"Namespace cannot contain ':' character: {namespace}")
+     if ":" in doctype:
+         raise ValueError(f"Doctype cannot contain ':' character: {doctype}")
+
+     return f"id:{namespace}:{doctype}::{key}"