hf2vespa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hf2vespa/converters.py ADDED
@@ -0,0 +1,648 @@
1
+ """Type converter registry for field transformations."""
2
+
3
+ import struct
4
+ from functools import lru_cache
5
+ from typing import Any, Callable
6
+
7
+
8
class ConverterRegistry:
    """
    Registry for type converters used in field mapping.

    Converters are functions that transform values from dataset format
    to Vespa format (e.g., list to tensor).

    Lookups are memoized per registry instance, and the memo is cleared
    whenever a converter is (re-)registered so a replaced converter takes
    effect immediately.

    Examples:
        >>> registry = ConverterRegistry()
        >>> registry.register("double", lambda x: float(x) * 2)
        >>> registry.convert(21, "double")
        42.0
        >>> registry.has("double")
        True
    """

    def __init__(self) -> None:
        """Initialize an empty converter registry."""
        self._converters: dict[str, Callable[[Any], Any]] = {}
        # Build the cache per instance instead of decorating the method:
        # a class-level lru_cache would be shared by every registry, keyed
        # on ``self`` and would keep each instance alive for the lifetime
        # of the cache. maxsize=16 matches the small number of converters.
        self._get_converter = lru_cache(maxsize=16)(self._lookup_converter)

    def _lookup_converter(self, type_name: str) -> Callable[[Any], Any]:
        """
        Uncached lookup of a converter function by type name.

        Args:
            type_name: Name of the converter to look up

        Returns:
            Converter function

        Raises:
            ValueError: If converter is not registered
        """
        try:
            return self._converters[type_name]
        except KeyError:
            raise ValueError(
                f"Unknown type converter '{type_name}'. Available converters: {', '.join(sorted(self._converters.keys()))}"
            ) from None

    def register(self, name: str, func: Callable[[Any], Any]) -> None:
        """
        Register a type converter function.

        Registering under an existing name replaces the previous converter.

        Args:
            name: Name of the converter (e.g., "tensor", "string")
            func: Converter function that takes a value and returns transformed value

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.register("uppercase", str.upper)
            >>> registry.convert("hello", "uppercase")
            'HELLO'
        """
        self._converters[name] = func
        # Drop memoized lookups so a replaced converter is not served stale.
        self._get_converter.cache_clear()

    def convert(self, value: Any, type_name: str) -> Any:
        """
        Apply a registered converter to a value.

        Converter function lookup is cached for performance.

        Args:
            value: Value to convert
            type_name: Name of the converter to apply

        Returns:
            Converted value

        Raises:
            ValueError: If converter is not registered

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.register("str", str)
            >>> registry.convert(42, "str")
            '42'
        """
        converter_fn = self._get_converter(type_name)
        return converter_fn(value)

    def has(self, type_name: str) -> bool:
        """
        Check if a converter is registered.

        Args:
            type_name: Name of the converter to check

        Returns:
            True if converter is registered, False otherwise

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.has("nonexistent")
            False
            >>> registry.register("exists", str)
            >>> registry.has("exists")
            True
        """
        return type_name in self._converters

    def cache_info(self):
        """
        Return cache statistics for this registry's converter lookups.

        Statistics are reset whenever ``register`` is called, because the
        lookup cache is cleared at that point.

        Returns:
            CacheInfo named tuple with hits, misses, maxsize, currsize

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.register("str", str)
            >>> _ = registry.convert(1, "str")
            >>> info = registry.cache_info()
            >>> info.misses >= 1
            True
        """
        return self._get_converter.cache_info()
128
+
129
+
130
def list_to_tensor(value: Any) -> dict[str, list]:
    """
    Convert a Python list to Vespa indexed tensor format.

    The input list is validated and returned as-is (not copied) under the
    "values" key.

    Args:
        value: List of numeric values (int or float)

    Returns:
        Dictionary with "values" key containing the list

    Raises:
        ValueError: If input is not a list, is empty, or contains non-numeric
            values (booleans are treated as non-numeric)

    Examples:
        >>> list_to_tensor([1.0, 2.0, 3.0])
        {'values': [1.0, 2.0, 3.0]}
        >>> list_to_tensor([1, 2, 3])
        {'values': [1, 2, 3]}
    """
    if not isinstance(value, list):
        raise ValueError(
            f"Expected list for tensor conversion, got {type(value).__name__}"
        )

    if not value:
        raise ValueError("Cannot convert empty list to tensor")

    # Validate all values are numeric. bool is a subclass of int, so it must
    # be excluded explicitly or True/False would end up as tensor cells.
    for i, item in enumerate(value):
        if isinstance(item, bool) or not isinstance(item, (int, float)):
            raise ValueError(
                f"Tensor values must be numeric (int or float), got {type(item).__name__} at index {i}"
            )

    return {"values": value}
165
+
166
+
167
def dict_to_position(value: Any) -> dict[str, float]:
    """
    Convert a lat/lng dictionary to Vespa position format.

    Args:
        value: Dictionary with "lat" and "lng" keys containing numeric values

    Returns:
        Dictionary with "lat" and "lng" as floats

    Raises:
        ValueError: If input is not a dict, missing keys, non-numeric values,
            or coordinates out of range (NaN counts as out of range)

    Examples:
        >>> dict_to_position({"lat": 37.4, "lng": -122.0})
        {'lat': 37.4, 'lng': -122.0}
        >>> dict_to_position({"lat": 90, "lng": 180})
        {'lat': 90.0, 'lng': 180.0}
    """
    if not isinstance(value, dict):
        raise ValueError(
            f"Expected dict for position conversion, got {type(value).__name__}"
        )

    if "lat" not in value:
        raise ValueError("Position dict must contain 'lat' key")
    if "lng" not in value:
        raise ValueError("Position dict must contain 'lng' key")

    lat = value["lat"]
    lng = value["lng"]

    if not isinstance(lat, (int, float)):
        raise ValueError(f"Latitude must be numeric, got {type(lat).__name__}")
    if not isinstance(lng, (int, float)):
        raise ValueError(f"Longitude must be numeric, got {type(lng).__name__}")

    # Use "not (lo <= x <= hi)" rather than "x < lo or x > hi": every
    # comparison with NaN is False, so the latter form lets NaN through.
    if not -90 <= lat <= 90:
        raise ValueError(f"Latitude must be between -90 and 90, got {lat}")
    if not -180 <= lng <= 180:
        raise ValueError(f"Longitude must be between -180 and 180, got {lng}")

    return {"lat": float(lat), "lng": float(lng)}
211
+
212
+
213
def dict_to_weightedset(value: Any) -> dict[str, int]:
    """
    Convert a key-weight dictionary to Vespa weighted set format.

    Keys are stringified and weights are converted with ``int()``, so a
    fractional weight is truncated toward zero.

    Args:
        value: Dictionary with keys and numeric weights

    Returns:
        Dictionary with stringified keys and integer weights

    Raises:
        ValueError: If input is not a dict, or a weight is not numeric or
            is not finite (NaN / infinity)

    Examples:
        >>> dict_to_weightedset({"tag1": 10, "tag2": 5})
        {'tag1': 10, 'tag2': 5}
        >>> dict_to_weightedset({1: 100, 2: 50})
        {'1': 100, '2': 50}
    """
    if not isinstance(value, dict):
        raise ValueError(
            f"Expected dict for weightedset conversion, got {type(value).__name__}"
        )

    result: dict[str, int] = {}
    for key, weight in value.items():
        if not isinstance(weight, (int, float)):
            raise ValueError(
                f"Weight for key '{key}' must be numeric, got {type(weight).__name__}"
            )
        try:
            int_weight = int(weight)
        except (OverflowError, ValueError):
            # int(inf) raises OverflowError and int(nan) raises ValueError;
            # report both uniformly as ValueError with the offending key.
            raise ValueError(
                f"Weight for key '{key}' must be a finite number, got {weight}"
            ) from None
        result[str(key)] = int_weight

    return result
246
+
247
+
248
def dict_to_map(value: Any) -> dict[str, Any]:
    """
    Build a Vespa map from a dictionary by turning every key into a string.

    Values are passed through untouched; only the keys are converted
    with ``str()``.

    Args:
        value: Dictionary with any keys and values

    Returns:
        New dictionary holding the same values under stringified keys

    Raises:
        ValueError: If input is not a dict

    Examples:
        >>> dict_to_map({"key": "value"})
        {'key': 'value'}
        >>> dict_to_map({1: "one", 2: "two"})
        {'1': 'one', '2': 'two'}
    """
    if isinstance(value, dict):
        mapped: dict[str, Any] = {}
        for raw_key, entry in value.items():
            mapped[str(raw_key)] = entry
        return mapped

    raise ValueError(
        f"Expected dict for map conversion, got {type(value).__name__}"
    )
273
+
274
+
275
+ def list_to_tensor_int8_hex(value: Any, row_num: int | None = None) -> dict[str, str]:
276
+ """
277
+ Convert a list of integers to Vespa hex-encoded int8 tensor format.
278
+
279
+ Each int8 value is encoded as 2 hex characters (big-endian).
280
+
281
+ Args:
282
+ value: List of integers in range -128 to 127
283
+ row_num: Optional row number for error context
284
+
285
+ Returns:
286
+ Dictionary with "values" key containing hex string
287
+
288
+ Raises:
289
+ ValueError: If input is not a list, is empty, or contains out-of-range values
290
+
291
+ Examples:
292
+ >>> list_to_tensor_int8_hex([11, 34, 3])
293
+ {'values': '0b2203'}
294
+ >>> list_to_tensor_int8_hex([0, 127, -128])
295
+ {'values': '007f80'}
296
+ """
297
+ if not isinstance(value, list):
298
+ raise ValueError(
299
+ f"Expected list for tensor conversion, got {type(value).__name__}"
300
+ )
301
+
302
+ if len(value) == 0:
303
+ raise ValueError("Cannot convert empty list to tensor")
304
+
305
+ hex_parts = []
306
+ for i, v in enumerate(value):
307
+ if not isinstance(v, int) or v < -128 or v > 127:
308
+ row_context = f" (row {row_num})" if row_num is not None else ""
309
+ raise ValueError(
310
+ f"Value {v} at index {i}{row_context} outside int8 range (-128 to 127)"
311
+ )
312
+ # Pack as signed byte and convert to hex
313
+ hex_parts.append(struct.pack(">b", v).hex())
314
+
315
+ return {"values": "".join(hex_parts)}
316
+
317
+
318
+ def list_to_tensor_bfloat16_hex(
319
+ value: Any, row_num: int | None = None
320
+ ) -> dict[str, str]:
321
+ """
322
+ Convert a list of floats to Vespa hex-encoded bfloat16 tensor format.
323
+
324
+ Each bfloat16 value is encoded as 4 hex characters (upper 2 bytes of float32).
325
+ Precision is truncated, not rounded.
326
+
327
+ Args:
328
+ value: List of numeric values (int or float)
329
+ row_num: Optional row number for error context (unused, for API consistency)
330
+
331
+ Returns:
332
+ Dictionary with "values" key containing hex string
333
+
334
+ Raises:
335
+ ValueError: If input is not a list or is empty
336
+
337
+ Examples:
338
+ >>> list_to_tensor_bfloat16_hex([1.0, -1.0, 0.0])
339
+ {'values': '3f80bf800000'}
340
+ >>> list_to_tensor_bfloat16_hex([3.14159])
341
+ {'values': '4049'}
342
+ """
343
+ if not isinstance(value, list):
344
+ raise ValueError(
345
+ f"Expected list for tensor conversion, got {type(value).__name__}"
346
+ )
347
+
348
+ if len(value) == 0:
349
+ raise ValueError("Cannot convert empty list to tensor")
350
+
351
+ hex_parts = []
352
+ for v in value:
353
+ # Pack as float32, take upper 2 bytes for bfloat16
354
+ packed = struct.pack(">f", float(v))
355
+ hex_parts.append(packed[:2].hex())
356
+
357
+ return {"values": "".join(hex_parts)}
358
+
359
+
360
+ def list_to_tensor_float32_hex(
361
+ value: Any, row_num: int | None = None
362
+ ) -> dict[str, str]:
363
+ """
364
+ Convert a list of floats to Vespa hex-encoded float32 tensor format.
365
+
366
+ Each float32 value is encoded as 8 hex characters (big-endian IEEE 754).
367
+
368
+ Args:
369
+ value: List of numeric values (int or float)
370
+ row_num: Optional row number for error context (unused, for API consistency)
371
+
372
+ Returns:
373
+ Dictionary with "values" key containing hex string
374
+
375
+ Raises:
376
+ ValueError: If input is not a list or is empty
377
+
378
+ Examples:
379
+ >>> list_to_tensor_float32_hex([1.0, 2.0, 3.0])
380
+ {'values': '3f8000004000000040400000'}
381
+ >>> list_to_tensor_float32_hex([3.14159])
382
+ {'values': '40490fdb'}
383
+ """
384
+ if not isinstance(value, list):
385
+ raise ValueError(
386
+ f"Expected list for tensor conversion, got {type(value).__name__}"
387
+ )
388
+
389
+ if len(value) == 0:
390
+ raise ValueError("Cannot convert empty list to tensor")
391
+
392
+ hex_parts = []
393
+ for v in value:
394
+ # Pack as float32 (big-endian) = 4 bytes = 8 hex chars
395
+ hex_parts.append(struct.pack(">f", float(v)).hex())
396
+
397
+ return {"values": "".join(hex_parts)}
398
+
399
+
400
+ def list_to_tensor_float64_hex(
401
+ value: Any, row_num: int | None = None
402
+ ) -> dict[str, str]:
403
+ """
404
+ Convert a list of floats to Vespa hex-encoded float64 tensor format.
405
+
406
+ Each float64 value is encoded as 16 hex characters (big-endian IEEE 754 double).
407
+
408
+ Args:
409
+ value: List of numeric values (int or float)
410
+ row_num: Optional row number for error context (unused, for API consistency)
411
+
412
+ Returns:
413
+ Dictionary with "values" key containing hex string
414
+
415
+ Raises:
416
+ ValueError: If input is not a list or is empty
417
+
418
+ Examples:
419
+ >>> list_to_tensor_float64_hex([1.0, 2.0])
420
+ {'values': '3ff00000000000004000000000000000'}
421
+ >>> list_to_tensor_float64_hex([3.141592653589793])
422
+ {'values': '400921fb54442d18'}
423
+ """
424
+ if not isinstance(value, list):
425
+ raise ValueError(
426
+ f"Expected list for tensor conversion, got {type(value).__name__}"
427
+ )
428
+
429
+ if len(value) == 0:
430
+ raise ValueError("Cannot convert empty list to tensor")
431
+
432
+ hex_parts = []
433
+ for v in value:
434
+ # Pack as float64 (big-endian) = 8 bytes = 16 hex chars
435
+ hex_parts.append(struct.pack(">d", float(v)).hex())
436
+
437
+ return {"values": "".join(hex_parts)}
438
+
439
+
440
+ def dict_to_sparse_tensor(value: Any, row_num: int | None = None) -> dict[str, list]:
441
+ """
442
+ Convert dictionary to Vespa sparse tensor format (cells notation).
443
+
444
+ For tensors with single mapped dimension: tensor<float>(key{})
445
+ Output format: {"cells": [{"address": {"key": "..."}, "value": ...}, ...]}
446
+
447
+ Args:
448
+ value: Dict with string keys and numeric values
449
+ row_num: Optional row number for error context
450
+
451
+ Returns:
452
+ Dict with "cells" key containing list of address-value pairs
453
+
454
+ Raises:
455
+ ValueError: If input is not a dict, is empty, or values are non-numeric
456
+
457
+ Examples:
458
+ >>> dict_to_sparse_tensor({"word1": 0.5, "word2": 0.3})
459
+ {'cells': [{'address': {'key': 'word1'}, 'value': 0.5},
460
+ {'address': {'key': 'word2'}, 'value': 0.3}]}
461
+ """
462
+ if not isinstance(value, dict):
463
+ raise ValueError(
464
+ f"Expected dict for sparse tensor conversion, got {type(value).__name__}"
465
+ )
466
+
467
+ if len(value) == 0:
468
+ raise ValueError("Cannot convert empty dict to sparse tensor")
469
+
470
+ cells = []
471
+ for key, val in value.items():
472
+ if not isinstance(val, (int, float)):
473
+ row_context = f" (row {row_num})" if row_num is not None else ""
474
+ raise ValueError(
475
+ f"Value for key '{key}'{row_context} must be numeric, got {type(val).__name__}"
476
+ )
477
+ cells.append({"address": {"key": str(key)}, "value": float(val)})
478
+
479
+ return {"cells": cells}
480
+
481
+
482
+ def dict_to_mixed_tensor(value: Any, row_num: int | None = None) -> dict[str, dict]:
483
+ """
484
+ Convert nested dict to Vespa mixed tensor format (blocks notation).
485
+
486
+ For tensors with one mapped and indexed dimensions: tensor<float>(word{},x[N])
487
+ Output format: {"blocks": {"key1": [...], "key2": [...], ...}}
488
+
489
+ Args:
490
+ value: Dict with string keys and list values (all lists must be same length)
491
+ row_num: Optional row number for error context
492
+
493
+ Returns:
494
+ Dict with "blocks" key containing dict of key->array mappings
495
+
496
+ Raises:
497
+ ValueError: If input invalid or dimension lengths inconsistent
498
+
499
+ Examples:
500
+ >>> dict_to_mixed_tensor({"word1": [1.0, 2.0], "word2": [3.0, 4.0]})
501
+ {'blocks': {'word1': [1.0, 2.0], 'word2': [3.0, 4.0]}}
502
+ """
503
+ if not isinstance(value, dict):
504
+ raise ValueError(
505
+ f"Expected dict for mixed tensor conversion, got {type(value).__name__}"
506
+ )
507
+
508
+ if len(value) == 0:
509
+ raise ValueError("Cannot convert empty dict to mixed tensor")
510
+
511
+ blocks = {}
512
+ first_key = None
513
+ first_shape = None
514
+
515
+ for key, val in value.items():
516
+ if not isinstance(val, list):
517
+ raise ValueError(
518
+ f"Value for key '{key}' must be list, got {type(val).__name__}"
519
+ )
520
+
521
+ if len(val) == 0:
522
+ raise ValueError(f"Value for key '{key}' cannot be empty list")
523
+
524
+ # Track first key shape for consistency checking
525
+ if first_key is None:
526
+ first_key = key
527
+ first_shape = len(val)
528
+ else:
529
+ current_shape = len(val)
530
+ if current_shape != first_shape:
531
+ row_ctx = f" (row {row_num})" if row_num is not None else ""
532
+ raise ValueError(
533
+ f"Mixed tensor dimension mismatch{row_ctx}: key '{first_key}' has shape [{first_shape}], "
534
+ f"but key '{key}' has shape [{current_shape}]"
535
+ )
536
+
537
+ # Validate all items in list are numeric
538
+ for i, item in enumerate(val):
539
+ if not isinstance(item, (int, float)):
540
+ raise ValueError(
541
+ f"Value at index {i} for key '{key}' must be numeric, got {type(item).__name__}"
542
+ )
543
+
544
+ blocks[str(key)] = val
545
+
546
+ return {"blocks": blocks}
547
+
548
+
549
+ def dict_to_mixed_tensor_hex(
550
+ value: Any, cell_type: str = "float32", row_num: int | None = None
551
+ ) -> dict[str, dict]:
552
+ """
553
+ Convert nested dict to Vespa mixed tensor format with hex-encoded blocks.
554
+
555
+ Combines sparse mapped dimension with dense hex-encoded indexed dimensions.
556
+ Uses Phase 9 hex encoders for the dense subspace.
557
+
558
+ Args:
559
+ value: Dict with string keys and list values (all lists must be same length)
560
+ cell_type: Cell type for hex encoding (int8, bfloat16, float32, float64)
561
+ row_num: Optional row number for error context
562
+
563
+ Returns:
564
+ Dict with "blocks" key containing dict of key->hex-string mappings
565
+
566
+ Raises:
567
+ ValueError: If input invalid, dimensions inconsistent, or values out of range
568
+
569
+ Examples:
570
+ >>> dict_to_mixed_tensor_hex({"w1": [11, 34, 3], "w2": [-124, 5, -1]}, cell_type="int8")
571
+ {'blocks': {'w1': '0b2203', 'w2': '8405ff'}}
572
+ """
573
+ if not isinstance(value, dict):
574
+ raise ValueError(
575
+ f"Expected dict for mixed tensor conversion, got {type(value).__name__}"
576
+ )
577
+
578
+ if len(value) == 0:
579
+ raise ValueError("Cannot convert empty dict to mixed tensor")
580
+
581
+ # Create encoder lookup
582
+ hex_encoders = {
583
+ "int8": list_to_tensor_int8_hex,
584
+ "bfloat16": list_to_tensor_bfloat16_hex,
585
+ "float32": list_to_tensor_float32_hex,
586
+ "float64": list_to_tensor_float64_hex,
587
+ }
588
+
589
+ if cell_type not in hex_encoders:
590
+ raise ValueError(
591
+ f"Unknown cell type '{cell_type}'. Must be one of: {', '.join(sorted(hex_encoders.keys()))}"
592
+ )
593
+
594
+ encoder = hex_encoders[cell_type]
595
+ blocks = {}
596
+ first_key = None
597
+ first_shape = None
598
+
599
+ for key, val in value.items():
600
+ if not isinstance(val, list):
601
+ raise ValueError(
602
+ f"Value for key '{key}' must be list, got {type(val).__name__}"
603
+ )
604
+
605
+ if len(val) == 0:
606
+ raise ValueError(f"Value for key '{key}' cannot be empty list")
607
+
608
+ # Track first key shape for consistency checking
609
+ if first_key is None:
610
+ first_key = key
611
+ first_shape = len(val)
612
+ else:
613
+ current_shape = len(val)
614
+ if current_shape != first_shape:
615
+ row_ctx = f" (row {row_num})" if row_num is not None else ""
616
+ raise ValueError(
617
+ f"Mixed tensor dimension mismatch{row_ctx}: key '{first_key}' has shape [{first_shape}], "
618
+ f"but key '{key}' has shape [{current_shape}]"
619
+ )
620
+
621
+ # Encode the block using Phase 9 hex encoder
622
+ try:
623
+ encoded = encoder(val, row_num=row_num)
624
+ blocks[str(key)] = encoded["values"]
625
+ except ValueError as e:
626
+ raise ValueError(f"Error encoding key '{key}': {e}") from e
627
+
628
+ return {"blocks": blocks}
629
+
630
+
631
# Shared registry instance used by the rest of the package.
converters = ConverterRegistry()

# Built-in converters, registered in this order under their public names.
for _converter_name, _converter_fn in (
    ("tensor", list_to_tensor),
    ("string", str),
    ("int", int),
    ("float", float),
    ("position", dict_to_position),
    ("weightedset", dict_to_weightedset),
    ("map", dict_to_map),
    ("tensor_int8_hex", list_to_tensor_int8_hex),
    ("tensor_bfloat16_hex", list_to_tensor_bfloat16_hex),
    ("tensor_float32_hex", list_to_tensor_float32_hex),
    ("tensor_float64_hex", list_to_tensor_float64_hex),
    ("sparse_tensor", dict_to_sparse_tensor),
    ("mixed_tensor", dict_to_mixed_tensor),
    ("mixed_tensor_hex", dict_to_mixed_tensor_hex),
):
    converters.register(_converter_name, _converter_fn)

# Keep the module namespace free of the loop temporaries.
del _converter_name, _converter_fn