rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,628 @@
1
+ """
2
+ Term Dictionary with Integer ID Encoding.
3
+
4
+ Implements dictionary-encoded RDF terms for high-performance columnar storage.
5
+ All terms (IRIs, literals, blank nodes, quoted triples) are mapped to u64 TermIds.
6
+
7
+ Key design decisions (from storage-spec.md):
8
+ - Tagged ID space: high bits encode term kind for O(1) kind detection
9
+ - Hash-based interning: 128-bit hashes for fast bulk dedupe
10
+ - Batch-first: bulk get_or_create operations for ingestion performance
11
+ - Persistence: Parquet-backed for restart-safe term catalogs
12
+ """
13
+
14
+ from dataclasses import dataclass, field
15
+ from enum import IntEnum
16
+ from typing import Optional, Any, Union
17
+ from pathlib import Path
18
+ import hashlib
19
+ import struct
20
+
21
+ import polars as pl
22
+
23
+
24
+ # =============================================================================
25
+ # Term Identity and Encoding
26
+ # =============================================================================
27
+
28
class TermKind(IntEnum):
    """
    The four kinds of RDF term this module distinguishes.

    Each member's value fits in two bits; that value is what gets stored
    in the top two bits of a TermId, so the kind of a term can be read
    straight off its ID.
    """
    IRI = 0b00
    LITERAL = 0b01
    BNODE = 0b10
    QUOTED_TRIPLE = 0b11
38
+
39
+
40
# A TermId is a plain Python int that is treated as an unsigned 64-bit value.
TermId = int

# Layout of a TermId: [ 2-bit kind | 62-bit payload ]
KIND_SHIFT = 62       # number of payload bits below the kind tag
KIND_MASK = 0b11      # selects the 2 kind bits once shifted down
# Every bit of a 64-bit word except the two kind bits.
PAYLOAD_MASK = ~(KIND_MASK << KIND_SHIFT) & ((1 << 64) - 1)
47
+
48
+
49
def make_term_id(kind: TermKind, payload: int) -> TermId:
    """
    Pack a term kind and a payload into one tagged TermId.

    The kind occupies the bits above KIND_SHIFT; the payload is masked
    with PAYLOAD_MASK, so any payload bits outside the mask are dropped.
    """
    kind_bits = kind << KIND_SHIFT
    payload_bits = payload & PAYLOAD_MASK
    return kind_bits | payload_bits
52
+
53
+
54
def get_term_kind(term_id: TermId) -> TermKind:
    """Return the TermKind stored in the kind bits of ``term_id``."""
    kind_bits = (term_id >> KIND_SHIFT) & KIND_MASK
    return TermKind(kind_bits)
57
+
58
+
59
def get_term_payload(term_id: TermId) -> int:
    """Return ``term_id`` with its kind bits cleared (the payload part)."""
    return PAYLOAD_MASK & term_id
62
+
63
+
64
def is_quoted_triple(term_id: TermId) -> bool:
    """Return True when the kind bits of ``term_id`` say QUOTED_TRIPLE."""
    kind_bits = (term_id >> KIND_SHIFT) & KIND_MASK
    return kind_bits == TermKind.QUOTED_TRIPLE
67
+
68
+
69
+ # =============================================================================
70
+ # Term Representation
71
+ # =============================================================================
72
+
73
@dataclass(frozen=True, slots=True)
class Term:
    """
    Internal representation of an RDF term.

    Instances are immutable and hashable, so they can be used as dict keys
    and compared with ``==`` (field-wise equality generated by dataclass).

    Attributes:
        kind: The type of term (IRI, LITERAL, BNODE, QUOTED_TRIPLE)
        lex: Lexical form (IRI string, literal value, bnode label)
        datatype_id: TermId of datatype IRI (for typed literals)
        lang: Language tag (for language-tagged literals)
    """
    kind: TermKind
    lex: str
    datatype_id: Optional[TermId] = None
    lang: Optional[str] = None

    def __hash__(self) -> int:
        # Hash over exactly the fields that take part in equality.
        return hash((self.kind, self.lex, self.datatype_id, self.lang))

    def canonical_bytes(self) -> bytes:
        """
        Generate canonical byte representation for hashing.

        Layout: NUL-joined sequence of
          - one byte holding the kind tag,
          - the UTF-8 lexical form,
          - the datatype TermId as 8 big-endian bytes (only when set),
          - ``b'@'`` followed by the UTF-8 language tag (only when set).

        NOTE: compute_hash() values derived from these bytes are written
        to disk by TermDict, so this layout must not change without a
        migration of existing catalogs.
        NOTE(review): a lexical form that itself contains NUL bytes can
        yield the same byte string as a different term; callers that
        compare by hash must still verify term equality.
        """
        parts = [
            struct.pack('B', self.kind),
            self.lex.encode('utf-8'),
        ]
        if self.datatype_id is not None:
            parts.append(struct.pack('>Q', self.datatype_id))
        if self.lang is not None:
            parts.append(b'@')
            parts.append(self.lang.encode('utf-8'))
        return b'\x00'.join(parts)

    def compute_hash(self) -> int:
        """
        Compute 128-bit hash for bulk deduplication.

        Returns:
            The MD5 digest of canonical_bytes() read as a big-endian
            unsigned integer (0 <= value < 2**128).
        """
        # MD5 is used purely as a fingerprint, not for security. Saying so
        # keeps this working on builds where MD5 is blocked for security
        # use (e.g. FIPS mode); the resulting digest is unchanged.
        digest = hashlib.md5(
            self.canonical_bytes(), usedforsecurity=False
        ).digest()
        return int.from_bytes(digest, 'big')

    @classmethod
    def iri(cls, value: str) -> "Term":
        """Create an IRI term whose lexical form is ``value``."""
        return cls(kind=TermKind.IRI, lex=value)

    @classmethod
    def literal(
        cls,
        value: str,
        datatype_id: Optional[TermId] = None,
        lang: Optional[str] = None
    ) -> "Term":
        """
        Create a literal term.

        Args:
            value: Lexical form of the literal.
            datatype_id: TermId of the datatype IRI, if any.
            lang: Language tag, if any.
        """
        return cls(
            kind=TermKind.LITERAL,
            lex=value,
            datatype_id=datatype_id,
            lang=lang
        )

    @classmethod
    def bnode(cls, label: str) -> "Term":
        """Create a blank node term with the given label."""
        return cls(kind=TermKind.BNODE, lex=label)
138
+
139
+
140
+ # =============================================================================
141
+ # Term Dictionary
142
+ # =============================================================================
143
+
144
class TermDict:
    """
    Dictionary-encoded term catalog.

    Maps RDF terms to integer TermIds with:
    - O(1) kind detection via tagged ID space
    - Hash-based bulk interning for fast ingestion
    - Parquet persistence for restart-safe catalogs

    Invariant maintained by every mutating method: a term has exactly one
    TermId, no matter whether it was interned through get_or_create() or
    through one of the intern_*() convenience methods. The string-keyed
    fast-path caches are always a subset view of the hash-based index.

    Instance attributes set at construction / load time:
        xsd_string_id, xsd_integer_id, xsd_decimal_id, xsd_double_id,
        xsd_boolean_id, xsd_datetime_id, rdf_langstring_id:
            TermIds of the pre-interned well-known datatype IRIs.

    Thread-safety: NOT thread-safe. Use external synchronization for concurrent access.
    """

    # Well-known datatype IRIs (pre-interned)
    XSD_STRING = "http://www.w3.org/2001/XMLSchema#string"
    XSD_INTEGER = "http://www.w3.org/2001/XMLSchema#integer"
    XSD_DECIMAL = "http://www.w3.org/2001/XMLSchema#decimal"
    XSD_DOUBLE = "http://www.w3.org/2001/XMLSchema#double"
    XSD_BOOLEAN = "http://www.w3.org/2001/XMLSchema#boolean"
    XSD_DATETIME = "http://www.w3.org/2001/XMLSchema#dateTime"
    RDF_LANGSTRING = "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString"

    # (instance attribute, IRI) pairs. The order is significant: a fresh
    # dictionary interns them in this order, which fixes their payloads.
    _WELL_KNOWN = (
        ("xsd_string_id", XSD_STRING),
        ("xsd_integer_id", XSD_INTEGER),
        ("xsd_decimal_id", XSD_DECIMAL),
        ("xsd_double_id", XSD_DOUBLE),
        ("xsd_boolean_id", XSD_BOOLEAN),
        ("xsd_datetime_id", XSD_DATETIME),
        ("rdf_langstring_id", RDF_LANGSTRING),
    )

    def __init__(self):
        """Create an empty dictionary and pre-intern the well-known IRIs."""
        self._init_state()
        self._init_well_known()

    def _init_state(self):
        """Reset every container to empty (shared by __init__ and load)."""
        # Per-kind sequence counters
        self._next_payload: dict[TermKind, int] = {kind: 0 for kind in TermKind}

        # Forward map: hash -> TermId (for interning)
        self._hash_to_id: dict[int, TermId] = {}

        # Additional TermIds whose term hashes to an already-used value.
        # Kept separate so the common case stays a single dict lookup.
        self._collision_overflow: dict[int, list[TermId]] = {}

        # Reverse map: TermId -> Term (for lookup)
        self._id_to_term: dict[TermId, Term] = {}

        # Fast-path caches: direct string -> TermId lookup that skips
        # building a Term and hashing it.
        self._iri_cache: dict[str, TermId] = {}            # IRI string -> TermId
        self._plain_literal_cache: dict[str, TermId] = {}  # xsd:string literal -> TermId
        self._bnode_cache: dict[str, TermId] = {}          # Blank node label -> TermId

        # Statistics
        self._collision_count = 0

    def _init_well_known(self):
        """Pre-intern well-known datatype IRIs and populate caches."""
        for attr, iri in self._WELL_KNOWN:
            setattr(self, attr, self.get_or_create(Term.iri(iri)))

    def _allocate_id(self, kind: TermKind) -> TermId:
        """Allocate the next TermId for a given kind."""
        payload = self._next_payload[kind]
        self._next_payload[kind] = payload + 1
        return make_term_id(kind, payload)

    def _find_id(self, term: Term, term_hash: int) -> Optional[TermId]:
        """
        Return the TermId already assigned to ``term``, or None.

        ``term_hash`` must be ``term.compute_hash()``. Equality of the
        stored term is always verified, so a hash collision can never
        return the ID of a different term.
        """
        primary = self._hash_to_id.get(term_hash)
        if primary is None:
            return None
        if self._id_to_term.get(primary) == term:
            return primary
        for candidate in self._collision_overflow.get(term_hash, ()):
            if self._id_to_term.get(candidate) == term:
                return candidate
        return None

    def _cache_term(self, term: Term, term_id: TermId) -> None:
        """Record ``term`` in the string-keyed fast-path cache it belongs to."""
        if term.kind == TermKind.IRI:
            self._iri_cache[term.lex] = term_id
        elif term.kind == TermKind.BNODE:
            self._bnode_cache[term.lex] = term_id
        elif term.kind == TermKind.LITERAL and term.lang is None:
            # Only xsd:string literals are "plain"; caching any other
            # datatype here would make intern_literal("42") return the ID
            # of e.g. "42"^^xsd:integer.
            xsd_string_id = getattr(self, "xsd_string_id", None)
            if xsd_string_id is not None and term.datatype_id == xsd_string_id:
                self._plain_literal_cache[term.lex] = term_id

    def get_or_create(self, term: Term) -> TermId:
        """
        Intern a term, returning its TermId.

        If the term already exists, returns the existing ID.
        Otherwise, allocates a new ID and stores the term.
        """
        term_hash = term.compute_hash()

        existing_id = self._find_id(term, term_hash)
        if existing_id is not None:
            return existing_id

        term_id = self._allocate_id(term.kind)
        if term_hash in self._hash_to_id:
            # Genuine hash collision: keep the first term's mapping intact
            # and remember this one in the overflow list.
            self._collision_count += 1
            self._collision_overflow.setdefault(term_hash, []).append(term_id)
        else:
            self._hash_to_id[term_hash] = term_id
        self._id_to_term[term_id] = term
        self._cache_term(term, term_id)

        return term_id

    def get_or_create_batch(self, terms: list[Term]) -> list[TermId]:
        """
        Bulk intern a batch of terms.

        Returns TermIds in the same order as ``terms``.
        """
        return [self.get_or_create(term) for term in terms]

    def lookup(self, term_id: TermId) -> Optional[Term]:
        """Look up a term by its ID; None when the ID is unknown."""
        return self._id_to_term.get(term_id)

    def lookup_batch(self, term_ids: list[TermId]) -> list[Optional[Term]]:
        """Bulk lookup terms by their IDs (None for unknown IDs)."""
        return [self._id_to_term.get(tid) for tid in term_ids]

    def contains(self, term: Term) -> bool:
        """Check if a term is already interned."""
        return self.get_id(term) is not None

    def get_id(self, term: Term) -> Optional[TermId]:
        """Get the TermId for a term if it exists, without creating it."""
        return self._find_id(term, term.compute_hash())

    def __len__(self) -> int:
        """Return the total number of interned terms."""
        return len(self._id_to_term)

    def count_by_kind(self) -> dict[TermKind, int]:
        """
        Return counts of terms by kind.

        The figures are the per-kind sequence counters, i.e. the number of
        IDs handed out (or, after load(), highest payload + 1) per kind.
        """
        return {kind: self._next_payload[kind] for kind in TermKind}

    @property
    def collision_count(self) -> int:
        """Return the number of hash collisions encountered."""
        return self._collision_count

    # =========================================================================
    # Persistence (Parquet)
    # =========================================================================

    def to_dataframe(self) -> pl.DataFrame:
        """
        Export the term dictionary to a Polars DataFrame.

        Schema matches storage-spec.md §3.1:
        - term_id: u64
        - kind: u8
        - lex: string
        - datatype_id: u64 (nullable)
        - lang: string (nullable)
        """
        ids = list(self._id_to_term)
        terms = list(self._id_to_term.values())

        # Every column gets an explicit dtype: TermIds of kind BNODE and
        # QUOTED_TRIPLE are >= 2**63 and do not fit the signed 64-bit type
        # that inference would pick, and an all-null column would
        # otherwise lose its declared type.
        return pl.DataFrame({
            "term_id": pl.Series("term_id", ids, dtype=pl.UInt64),
            "kind": pl.Series(
                "kind", [int(t.kind) for t in terms], dtype=pl.UInt8
            ),
            "lex": pl.Series("lex", [t.lex for t in terms], dtype=pl.Utf8),
            "datatype_id": pl.Series(
                "datatype_id", [t.datatype_id for t in terms], dtype=pl.UInt64
            ),
            "lang": pl.Series("lang", [t.lang for t in terms], dtype=pl.Utf8),
        })

    def to_hash_dataframe(self) -> pl.DataFrame:
        """
        Export the term hash table to a Polars DataFrame.

        Schema matches storage-spec.md §3.2:
        - term_hash: stored as two u64 columns (hash_high, hash_low)
        - term_id: u64

        Terms involved in a hash collision appear as several rows with the
        same hash; the primary mapping is always emitted first.
        """
        entries = list(self._hash_to_id.items())
        for term_hash, extra_ids in self._collision_overflow.items():
            entries.extend((term_hash, tid) for tid in extra_ids)

        low_mask = (1 << 64) - 1
        # Explicit dtypes: both hash halves span the full unsigned range.
        return pl.DataFrame({
            "hash_high": pl.Series(
                "hash_high", [h >> 64 for h, _ in entries], dtype=pl.UInt64
            ),
            "hash_low": pl.Series(
                "hash_low", [h & low_mask for h, _ in entries], dtype=pl.UInt64
            ),
            "term_id": pl.Series(
                "term_id", [tid for _, tid in entries], dtype=pl.UInt64
            ),
        })

    def save(self, path: Path):
        """
        Save the term dictionary to Parquet files.

        Creates:
        - {path}/term_dict.parquet
        - {path}/term_hash.parquet

        The directory (and any missing parents) is created if needed.
        """
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        self.to_dataframe().write_parquet(path / "term_dict.parquet")
        self.to_hash_dataframe().write_parquet(path / "term_hash.parquet")

    @classmethod
    def load(cls, path: Path) -> "TermDict":
        """
        Load a term dictionary from Parquet files.

        Expects:
        - {path}/term_dict.parquet
        - {path}/term_hash.parquet
        """
        path = Path(path)

        # Bypass __init__ so the well-known IRIs are not interned with
        # fresh IDs before the persisted ones are read.
        instance = cls.__new__(cls)
        instance._init_state()

        # Load term dictionary
        term_df = pl.read_parquet(path / "term_dict.parquet")
        for row in term_df.iter_rows(named=True):
            term_id = row["term_id"]
            instance._id_to_term[term_id] = Term(
                kind=TermKind(row["kind"]),
                lex=row["lex"],
                datatype_id=row["datatype_id"],
                lang=row["lang"],
            )

            # Keep each per-kind counter above every payload seen so far
            kind = get_term_kind(term_id)
            payload = get_term_payload(term_id)
            if payload >= instance._next_payload[kind]:
                instance._next_payload[kind] = payload + 1

        # Load hash table
        hash_df = pl.read_parquet(path / "term_hash.parquet")
        for row in hash_df.iter_rows(named=True):
            term_hash = (row["hash_high"] << 64) | row["hash_low"]
            term_id = row["term_id"]
            primary = instance._hash_to_id.setdefault(term_hash, term_id)
            if primary != term_id:
                # Second row for the same hash: a persisted collision.
                instance._collision_overflow.setdefault(
                    term_hash, []
                ).append(term_id)
                instance._collision_count += 1

        # Restore well-known IDs; this must precede cache population
        # because the plain-literal cache is keyed on xsd_string_id.
        instance._restore_well_known()

        # Populate fast-path caches
        for term_id, term in instance._id_to_term.items():
            instance._cache_term(term, term_id)

        return instance

    def _restore_well_known(self):
        """
        Restore well-known datatype ID references after loading.

        Any well-known IRI that the loaded catalog does not contain is
        interned, so the ``*_id`` attributes always exist afterwards.
        """
        attr_by_iri = {iri: attr for attr, iri in self._WELL_KNOWN}
        for term_id, term in self._id_to_term.items():
            if term.kind != TermKind.IRI:
                continue
            attr = attr_by_iri.get(term.lex)
            if attr is not None:
                setattr(self, attr, term_id)

        for attr, iri in self._WELL_KNOWN:
            if not hasattr(self, attr):
                setattr(self, attr, self.get_or_create(Term.iri(iri)))

    # =========================================================================
    # Convenience methods for common term types (OPTIMIZED)
    # =========================================================================

    def intern_iri(self, value: str) -> TermId:
        """Intern an IRI and return its TermId. Uses fast-path cache."""
        # Fast path: direct string lookup (no Term object, no MD5)
        cached = self._iri_cache.get(value)
        if cached is not None:
            return cached

        # Slow path: go through the hash index so an IRI that was interned
        # via get_or_create() keeps its ID; that call also fills the cache.
        return self.get_or_create(Term.iri(value))

    def intern_literal(
        self,
        value: Any,
        datatype: Optional[str] = None,
        lang: Optional[str] = None
    ) -> TermId:
        """
        Intern a literal and return its TermId.

        Automatically determines datatype from Python type if not specified.
        Uses fast-path cache for plain string literals (the common case).

        Args:
            value: The literal value; its lexical form is ``str(value)``.
            datatype: Datatype IRI; ignored when ``lang`` is given.
            lang: Language tag; forces the rdf:langString datatype.

        Datatype selection when neither ``lang`` nor ``datatype`` is given:
        bool -> xsd:boolean, int -> xsd:integer, float -> xsd:decimal,
        anything else -> xsd:string.

        NOTE(review): ``str(True)`` is "True" and ``str(1e20)`` is "1e+20";
        these do not look like canonical xsd:boolean / xsd:decimal lexical
        forms. Left unchanged because existing catalogs contain them.
        """
        lex = str(value)

        # FAST PATH: plain string literal with no lang tag (most common case)
        if lang is None and datatype is None and isinstance(value, str):
            cached = self._plain_literal_cache.get(lex)
            if cached is not None:
                return cached
            # Miss: consult the hash index before allocating, so the same
            # literal interned with an explicit xsd:string datatype is reused.
            return self.get_or_create(
                Term.literal(lex, self.xsd_string_id, None)
            )

        # SLOW PATH: typed literals or lang-tagged strings
        if lang is not None:
            datatype_id = self.rdf_langstring_id
        elif datatype is not None:
            datatype_id = self.intern_iri(datatype)
        elif isinstance(value, bool):
            # bool must be tested before int: bool is a subclass of int
            datatype_id = self.xsd_boolean_id
        elif isinstance(value, int):
            datatype_id = self.xsd_integer_id
        elif isinstance(value, float):
            datatype_id = self.xsd_decimal_id
        else:
            datatype_id = self.xsd_string_id

        return self.get_or_create(Term.literal(lex, datatype_id, lang))

    def intern_bnode(self, label: Optional[str] = None) -> TermId:
        """
        Intern a blank node and return its TermId. Uses fast-path cache.

        If no label is provided, generates a unique one.
        """
        if label is None:
            # Start from the counter and skip labels already taken (a
            # caller may have interned "b<n>" explicitly), so the generated
            # node is always new.
            seq = self._next_payload[TermKind.BNODE]
            label = f"b{seq}"
            while label in self._bnode_cache:
                seq += 1
                label = f"b{seq}"

        # Fast path: direct string lookup
        cached = self._bnode_cache.get(label)
        if cached is not None:
            return cached

        # Slow path: hash index lookup / allocation (also fills the cache)
        return self.get_or_create(Term.bnode(label))

    def get_lex(self, term_id: TermId) -> Optional[str]:
        """Get the lexical form of a term by its ID (None if unknown)."""
        term = self.lookup(term_id)
        return term.lex if term else None

    # =========================================================================
    # Lookup methods (read-only)
    # =========================================================================

    def lookup_iri(self, value: str) -> Optional[TermId]:
        """Look up an IRI's TermId without creating it."""
        return self.get_id(Term.iri(value))

    def lookup_literal(
        self,
        value: str,
        datatype: Optional[str] = None,
        lang: Optional[str] = None
    ) -> Optional[TermId]:
        """
        Look up a literal's TermId without creating it.

        The datatype is resolved as in intern_literal() for string values:
        rdf:langString when ``lang`` is given, otherwise ``datatype``,
        otherwise xsd:string.
        """
        if lang is not None:
            datatype_id = self.rdf_langstring_id
        elif datatype is not None:
            datatype_id = self.get_id(Term.iri(datatype))
            if datatype_id is None:
                return None  # Datatype not in dict means literal can't exist
        else:
            datatype_id = self.xsd_string_id

        return self.get_id(Term.literal(value, datatype_id, lang))

    def lookup_bnode(self, label: str) -> Optional[TermId]:
        """Look up a blank node's TermId without creating it."""
        return self.get_id(Term.bnode(label))

    def build_literal_to_float_map(self) -> dict[TermId, float]:
        """
        Build a mapping from literal term IDs to float values.

        Returns a dict for all literals whose lexical form ``float()``
        accepts, regardless of their datatype.
        """
        result = {}
        for term_id, term in self._id_to_term.items():
            if term.kind != TermKind.LITERAL:
                continue
            try:
                result[term_id] = float(term.lex)
            except (ValueError, TypeError):
                continue
        return result

    def get_lex_series(self, term_ids: pl.Series) -> pl.Series:
        """
        Vectorized lookup of lexical forms for a series of term IDs.

        Returns a Utf8 Series with lexical forms (null for missing IDs).
        """
        # Resolve each distinct ID once, then map the whole series.
        id_to_lex = {}
        for tid in term_ids.unique().to_list():
            if tid is None:
                continue
            term = self._id_to_term.get(tid)
            if term is not None:
                id_to_lex[tid] = term.lex

        return term_ids.map_elements(
            lambda x: id_to_lex.get(x) if x is not None else None,
            return_dtype=pl.Utf8
        )

    def stats(self) -> dict:
        """
        Return statistics about the term dictionary.

        Keys: "total_terms", "by_kind" (kind name -> count) and
        "hash_collisions".
        """
        return {
            "total_terms": len(self),
            "by_kind": {kind.name: count for kind, count in self.count_by_kind().items()},
            "hash_collisions": self._collision_count,
        }