opteryx-catalog 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1221 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ import uuid
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+ from typing import Iterable
9
+ from typing import Optional
10
+
11
+ from .metadata import DatasetMetadata
12
+ from .metadata import Snapshot
13
+ from .metastore import Dataset
14
+
15
+ # Stable node identifier for this process (hex-mac-hex-pid)
16
+ _NODE = f"{uuid.getnode():x}-{os.getpid():x}"
17
+
18
+
19
+ @dataclass
20
+ class Datafile:
21
+ """Wrapper for a manifest entry representing a data file."""
22
+
23
+ entry: dict
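+ # Expected keys include: file_path, file_format, record_count,
+ # file_size_in_bytes, min_k_hashes, histogram_counts, histogram_bins,
+ # min_values and max_values (see the properties below).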
24
+
25
+ @property
26
+ def file_path(self) -> Optional[str]:
27
+ return self.entry.get("file_path")
28
+
29
+ @property
30
+ def record_count(self) -> int:
31
+ return int(self.entry.get("record_count") or 0)
32
+
33
+ @property
34
+ def file_size_in_bytes(self) -> int:
35
+ return int(self.entry.get("file_size_in_bytes") or 0)
36
+
37
+ def to_dict(self) -> dict:
38
+ return dict(self.entry)
39
+
40
+ @property
41
+ def min_k_hashes(self) -> list:
42
+ return self.entry.get("min_k_hashes") or []
43
+
44
+ @property
45
+ def histogram_counts(self) -> list:
46
+ return self.entry.get("histogram_counts") or []
47
+
48
+ @property
49
+ def histogram_bins(self) -> int:
50
+ return int(self.entry.get("histogram_bins") or 0)
51
+
52
+ @property
53
+ def min_values(self) -> list:
54
+ return self.entry.get("min_values") or []
55
+
56
+ @property
57
+ def max_values(self) -> list:
58
+ return self.entry.get("max_values") or []
59
+
60
+
61
+ @dataclass
62
+ class SimpleDataset(Dataset):
63
+ identifier: str
64
+ _metadata: DatasetMetadata
65
+ io: Any = None
66
+ catalog: Any = None
67
+
68
+ @property
69
+ def metadata(self) -> DatasetMetadata:
70
+ return self._metadata
71
+
72
+ def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
73
+ """Return a Snapshot.
74
+
75
+ - If `snapshot_id` is None, return the in-memory current snapshot.
76
+ - If a `snapshot_id` is provided, prefer a Firestore lookup via the
77
+ attached `catalog` (O(1) document get). Fall back to the in-memory
78
+ `metadata.snapshots` list only when no catalog is attached or the
79
+ remote lookup fails.
80
+ """
81
+ # Current snapshot: keep in memory for fast access
82
+ if snapshot_id is None:
83
+ return self.metadata.current_snapshot()
84
+
85
+ # Try Firestore document lookup when catalog attached
86
+ if self.catalog:
87
+ try:
88
+ collection, dataset_name = self.identifier.split(".")
89
+ doc = (
90
+ self.catalog._dataset_doc_ref(collection, dataset_name)
91
+ .collection("snapshots")
92
+ .document(str(snapshot_id))
93
+ .get()
94
+ )
95
+ if doc.exists:
96
+ sd = doc.to_dict() or {}
97
+ snap = Snapshot(
98
+ snapshot_id=int(
99
+ sd.get("snapshot-id") or sd.get("snapshot_id") or snapshot_id
100
+ ),
101
+ timestamp_ms=int(sd.get("timestamp-ms") or sd.get("timestamp_ms") or 0),
102
+ author=sd.get("author"),
103
+ sequence_number=sd.get("sequence-number") or sd.get("sequence_number"),
104
+ user_created=sd.get("user-created") or sd.get("user_created"),
105
+ manifest_list=sd.get("manifest") or sd.get("manifest_list"),
106
+ schema_id=sd.get("schema-id") or sd.get("schema_id"),
107
+ summary=sd.get("summary", {}),
108
+ operation_type=sd.get("operation-type") or sd.get("operation_type"),
109
+ parent_snapshot_id=sd.get("parent-snapshot-id")
110
+ or sd.get("parent_snapshot_id"),
111
+ commit_message=sd.get("commit-message") or sd.get("commit_message"),
112
+ )
113
+ return snap
114
+ except Exception:
115
+ # Be conservative: fall through to in-memory fallback
116
+ pass
117
+
118
+ # Fallback: search in-memory snapshots (no catalog attached or remote lookup failed)
119
+ for s in self.metadata.snapshots:
120
+ if s.snapshot_id == snapshot_id:
121
+ return s
122
+
123
+ return None
124
+
125
+ def _get_node(self) -> str:
126
+ """Return the stable node identifier for this process.
127
+
128
+ Uses a module-level constant to avoid per-instance hashing/caching.
129
+ """
130
+ return _NODE
131
+
132
+ def snapshots(self) -> Iterable[Snapshot]:
133
+ return list(self.metadata.snapshots)
134
+
135
+ def schema(self, schema_id: Optional[str] = None) -> Optional[Any]:
136
+ """Return a stored schema description.
137
+
138
+ If `schema_id` is None, return the current schema (by
139
+ `metadata.current_schema_id` or last-known schema). If a
140
+ specific `schema_id` is provided, attempt to find it in the
141
+ in-memory `metadata.schemas` list and, failing that, fetch it
142
+ from the catalog's `schemas` subcollection when a catalog is
143
+ attached.
144
+
145
+ Returns an Orso `RelationSchema` built from the stored schema's `columns`
146
+ (or the raw stored schema when no schema id is known), or None if not found.
147
+ """
148
+ # Determine which schema id to use
149
+ sid = schema_id or self.metadata.current_schema_id
150
+
151
+ # If no sid and a raw schema is stored on the metadata, return it
152
+ if sid is None:
153
+ return getattr(self.metadata, "schema", None)
154
+
155
+ # Fast path: if this is the current schema id, prefer the cached
156
+ # current schema (99% case) rather than scanning the entire list.
157
+ sdict = None
158
+ if sid == self.metadata.current_schema_id:
159
+ if getattr(self.metadata, "schemas", None):
160
+ last = self.metadata.schemas[-1]
161
+ if last.get("schema_id") == sid:
162
+ sdict = last
163
+ else:
164
+ # If a raw schema is stored directly on metadata, use it.
165
+ raw = getattr(self.metadata, "schema", None)
166
+ if raw is not None:
167
+ sdict = {"schema_id": sid, "columns": raw}
168
+
169
+ # If not the current schema, or cached current not present,
170
+ # prefer to load the schema document from the backend (O(1) doc get).
171
+ if sdict is None and self.catalog:
172
+ try:
173
+ collection, dataset_name = self.identifier.split(".")
174
+ doc = (
175
+ self.catalog._dataset_doc_ref(collection, dataset_name)
176
+ .collection("schemas")
177
+ .document(sid)
178
+ .get()
179
+ )
180
+ sdict = doc.to_dict() or None
181
+ except Exception:
182
+ sdict = None
183
+
184
+ # As a last-resort when no catalog is attached, fall back to an
185
+ # in-memory search for compatibility (offline/unit-test mode).
186
+ if sdict is None and not self.catalog:
187
+ for s in self.metadata.schemas or []:
188
+ if s.get("schema_id") == sid:
189
+ sdict = s
190
+ break
191
+
192
+ if sdict is None:
193
+ return None
194
+
195
+ # Try to construct an Orso RelationSchema
196
+ from orso.schema import FlatColumn
197
+ from orso.schema import RelationSchema
198
+
199
+ # If metadata stored a raw schema
200
+ raw = sdict.get("columns")
201
+
202
+ columns = [
203
+ FlatColumn(
204
+ name=c.get("name"),
205
+ type=c.get("type"),
206
+ element_type=c.get("element-type"),
207
+ precision=c.get("precision"),
208
+ scale=c.get("scale"),
209
+ )
210
+ for c in raw
211
+ ]
212
+ orso_schema = RelationSchema(name=self.identifier, columns=columns)
213
+ return orso_schema
214
+
215
+ def append(self, table: Any, author: Optional[str] = None, commit_message: Optional[str] = None):
216
+ """Append a pyarrow.Table:
217
+
218
+ - write a Parquet data file via `self.io`
219
+ - create a simple Parquet manifest (one entry)
220
+ - persist manifest and snapshot metadata using the attached `catalog`
221
+ """
222
+ import pyarrow as pa
223
+ import pyarrow.parquet as pq
224
+
225
+ snapshot_id = int(time.time() * 1000)
226
+
227
+ if not hasattr(table, "schema"):
228
+ raise TypeError("append() expects a pyarrow.Table-like object")
229
+
230
+ # Write parquet file with collision-resistant name
231
+ fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
232
+ data_path = f"{self.metadata.location}/data/{fname}"
233
+ buf = pa.BufferOutputStream()
234
+ pq.write_table(table, buf, compression="zstd")
235
+ pdata = buf.getvalue().to_pybytes()
236
+
237
+ out = self.io.new_output(data_path).create()
238
+ out.write(pdata)
239
+ out.close()
240
+
241
+ # Prepare sketches/stats
242
+ K = 32
243
+ HBINS = 32
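+ # Per-column sketches: the K smallest 64-bit value hashes form a KMV
+ # (k-minimum-values) sample usable for distinct-count estimation, and
+ # HBINS equal-width buckets hold a value histogram.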
244
+ min_k_hashes: list[list[int]] = []
245
+ histograms: list[list[int]] = []
246
+ min_values: list[int] = []
247
+ max_values: list[int] = []
248
+
249
+ # Use draken for efficient hashing and compression when available.
250
+ import heapq
251
+
252
+ # canonical NULL flag for missing values
253
+ NULL_FLAG = -(1 << 63)
254
+
255
+ try:
256
+ import opteryx.draken as draken # type: ignore
257
+
258
+ num_rows = int(table.num_rows)
259
+
260
+ for col_idx, col in enumerate(table.columns):
261
+ # hash column values to 64-bit via draken (new cpdef API)
262
+ vec = draken.Vector.from_arrow(col)
263
+ hashes = list(vec.hash())
264
+
265
+ # Decide whether to compute min-k/histogram for this column based
266
+ # on field type and, for strings, average length of values.
267
+ field_type = table.schema.field(col_idx).type
268
+ compute_min_k = False
269
+ if (
270
+ pa.types.is_integer(field_type)
271
+ or pa.types.is_floating(field_type)
272
+ or pa.types.is_decimal(field_type)
273
+ ):
274
+ compute_min_k = True
275
+ elif (
276
+ pa.types.is_timestamp(field_type)
277
+ or pa.types.is_date(field_type)
278
+ or pa.types.is_time(field_type)
279
+ ):
280
+ compute_min_k = True
281
+ elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
282
+ # compute average length from non-null values; only allow
283
+ # min-k/histogram for short strings (avg <= 16)
284
+ col_py = None
285
+ try:
286
+ col_py = col.to_pylist()
287
+ except Exception:
288
+ col_py = None
289
+
290
+ if col_py is not None:
291
+ lens = [len(x) for x in col_py if x is not None]
292
+ if lens:
293
+ avg_len = sum(lens) / len(lens)
294
+ if avg_len <= 16:
295
+ compute_min_k = True
296
+
297
+ # KMV: take K smallest hashes when allowed; otherwise store an
298
+ # empty list for this column.
299
+ if compute_min_k:
300
+ smallest = heapq.nsmallest(K, hashes)
301
+ col_min_k = sorted(smallest)
302
+ else:
303
+ col_min_k = []
304
+
305
+ # For histogram decisions follow the same rule as min-k
306
+ compute_hist = compute_min_k
307
+
308
+ # Use draken.compress() to get canonical int64 per value
309
+ mapped = list(vec.compress())
310
+ non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
311
+ if non_nulls_mapped:
312
+ vmin = min(non_nulls_mapped)
313
+ vmax = max(non_nulls_mapped)
314
+ col_min = int(vmin)
315
+ col_max = int(vmax)
316
+ if compute_hist:
317
+ if vmin == vmax:
318
+ col_hist = [0] * HBINS
319
+ col_hist[-1] = len(non_nulls_mapped)
320
+ else:
321
+ col_hist = [0] * HBINS
322
+ span = float(vmax - vmin)
323
+ for m in non_nulls_mapped:
324
+ b = int(((float(m) - float(vmin)) / span) * (HBINS - 1))
325
+ if b < 0:
326
+ b = 0
327
+ if b >= HBINS:
328
+ b = HBINS - 1
329
+ col_hist[b] += 1
330
+ else:
331
+ col_hist = [0] * HBINS
332
+ else:
333
+ # no non-null values; histogram via hash buckets
334
+ col_min = NULL_FLAG
335
+ col_max = NULL_FLAG
336
+ if compute_hist:
337
+ col_hist = [0] * HBINS
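+ # All values are null: bucket the raw 64-bit hashes by their top
+ # five bits, mapping each hash to one of the 32 (HBINS) buckets.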
338
+ for h in hashes:
339
+ b = (h >> (64 - 5)) & 0x1F
340
+ col_hist[b] += 1
341
+ else:
342
+ col_hist = [0] * HBINS
343
+
344
+ min_k_hashes.append(col_min_k)
345
+ histograms.append(col_hist)
346
+ min_values.append(col_min)
347
+ max_values.append(col_max)
348
+ except Exception:
349
+ # If draken or its dependencies are unavailable, fall back to
350
+ # conservative defaults so we can still write the manifest and
351
+ # snapshot without failing the append operation.
352
+ num_cols = table.num_columns
353
+ min_k_hashes = [[] for _ in range(num_cols)]
354
+ HBINS = 32
355
+ histograms = [[0] * HBINS for _ in range(num_cols)]
356
+ min_values = [NULL_FLAG] * num_cols
357
+ max_values = [NULL_FLAG] * num_cols
358
+
359
+ entries = [
360
+ {
361
+ "file_path": data_path,
362
+ "file_format": "parquet",
363
+ "record_count": int(table.num_rows),
364
+ "file_size_in_bytes": len(pdata),
365
+ "min_k_hashes": min_k_hashes,
366
+ "histogram_counts": histograms,
367
+ "histogram_bins": HBINS,
368
+ "min_values": min_values,
369
+ "max_values": max_values,
370
+ }
371
+ ]
372
+
373
+ # persist manifest: for append, merge previous manifest entries
374
+ # with the new entries so the snapshot's manifest is cumulative.
375
+ manifest_path = None
376
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
377
+ merged_entries = list(entries)
378
+
379
+ # If there is a previous snapshot with a manifest, try to read
380
+ # it and prepend its entries. Any read error is non-fatal and we
381
+ # fall back to writing only the new entries.
382
+ prev_snap = self.snapshot(None)
383
+ if prev_snap and getattr(prev_snap, "manifest_list", None):
384
+ prev_manifest_path = prev_snap.manifest_list
385
+ try:
386
+ # Prefer FileIO when available
387
+ if self.io and hasattr(self.io, "new_input"):
388
+ inp = self.io.new_input(prev_manifest_path)
389
+ with inp.open() as f:
390
+ prev_data = f.read()
391
+ import pyarrow as pa
392
+ import pyarrow.parquet as pq
393
+
394
+ prev_table = pq.read_table(pa.BufferReader(prev_data))
395
+ prev_rows = prev_table.to_pylist()
396
+ merged_entries = prev_rows + merged_entries
397
+ else:
398
+ # Fall back to catalog storage client (GCS)
399
+ if (
400
+ self.catalog
401
+ and getattr(self.catalog, "_storage_client", None)
402
+ and getattr(self.catalog, "gcs_bucket", None)
403
+ ):
404
+ bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
405
+ parsed = prev_manifest_path
406
+ if parsed.startswith("gs://"):
407
+ parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
408
+ blob = bucket.blob(parsed)
409
+ prev_data = blob.download_as_bytes()
410
+ import pyarrow as pa
411
+ import pyarrow.parquet as pq
412
+
413
+ prev_table = pq.read_table(pa.BufferReader(prev_data))
414
+ prev_rows = prev_table.to_pylist()
415
+ merged_entries = prev_rows + merged_entries
416
+ except Exception:
417
+ # If we can't read the previous manifest, continue with
418
+ # just the new entries (don't fail the append).
419
+ pass
420
+
421
+ manifest_path = self.catalog.write_parquet_manifest(
422
+ snapshot_id, merged_entries, self.metadata.location
423
+ )
424
+
425
+ # snapshot metadata
426
+ if author is None:
427
+ raise ValueError("author must be provided when appending to a dataset")
428
+ # update metadata author/timestamp for this append
429
+ self.metadata.author = author
430
+ self.metadata.timestamp_ms = snapshot_id
431
+ # default commit message
432
+ if commit_message is None:
433
+ commit_message = f"commit by {author}"
434
+
435
+ recs = int(table.num_rows)
436
+ fsize = len(pdata)
437
+ added_data_files = 1
438
+ added_files_size = fsize
439
+ added_records = recs
440
+ deleted_data_files = 0
441
+ deleted_files_size = 0
442
+ deleted_records = 0
443
+
444
+ prev = self.snapshot()
445
+ if prev and prev.summary:
446
+ try:
447
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
448
+ except Exception:
449
+ prev_total_files = 0
450
+ try:
451
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
452
+ except Exception:
453
+ prev_total_size = 0
454
+ try:
455
+ prev_total_records = int(prev.summary.get("total-records", 0))
456
+ except Exception:
457
+ prev_total_records = 0
458
+ else:
459
+ prev_total_files = 0
460
+ prev_total_size = 0
461
+ prev_total_records = 0
462
+
463
+ total_data_files = prev_total_files + added_data_files - deleted_data_files
464
+ total_files_size = prev_total_size + added_files_size - deleted_files_size
465
+ total_records = prev_total_records + added_records - deleted_records
466
+
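+ # Snapshot summary: "added-*"/"deleted-*" are deltas for this commit,
+ # "total-*" are cumulative counts carried forward from the parent snapshot.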
467
+ summary = {
468
+ "added-data-files": added_data_files,
469
+ "added-files-size": added_files_size,
470
+ "added-records": added_records,
471
+ "deleted-data-files": deleted_data_files,
472
+ "deleted-files-size": deleted_files_size,
473
+ "deleted-records": deleted_records,
474
+ "total-data-files": total_data_files,
475
+ "total-files-size": total_files_size,
476
+ "total-records": total_records,
477
+ }
478
+
479
+ # sequence number
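+ # next sequence number = 1 + the highest sequence_number found in the
+ # in-memory snapshots; defaults to 1 when none can be parsed.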
480
+ try:
481
+ max_seq = 0
482
+ for s in self.metadata.snapshots:
483
+ seq = getattr(s, "sequence_number", None)
484
+ if seq is None:
485
+ continue
486
+ try:
487
+ ival = int(seq)
488
+ except Exception:
489
+ continue
490
+ if ival > max_seq:
491
+ max_seq = ival
492
+ next_seq = max_seq + 1
493
+ except Exception:
494
+ next_seq = 1
495
+
496
+ parent_id = self.metadata.current_snapshot_id
497
+
498
+ snap = Snapshot(
499
+ snapshot_id=snapshot_id,
500
+ timestamp_ms=snapshot_id,
501
+ author=author,
502
+ sequence_number=next_seq,
503
+ user_created=True,
504
+ operation_type="append",
505
+ parent_snapshot_id=parent_id,
506
+ manifest_list=manifest_path,
507
+ schema_id=self.metadata.current_schema_id,
508
+ commit_message=commit_message,
509
+ summary=summary,
510
+ )
511
+
512
+ self.metadata.snapshots.append(snap)
513
+ self.metadata.current_snapshot_id = snapshot_id
514
+
515
+ # persist metadata (let errors propagate)
516
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
517
+ self.catalog.save_snapshot(self.identifier, snap)
518
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
519
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
520
+
521
+ def add_files(self, files: list[str], author: Optional[str] = None, commit_message: Optional[str] = None):
522
+ """Add filenames to the dataset manifest without writing the files.
523
+
524
+ - `files` is a list of file paths (strings). Files are assumed to
525
+ already exist in storage; this method only updates the manifest.
526
+ - Does not add files that already appear in the current manifest
527
+ (deduplicates by `file_path`).
528
+ - Creates a cumulative manifest for the new snapshot (previous
529
+ entries + new unique entries).
530
+ """
531
+ if author is None:
532
+ raise ValueError("author must be provided when adding files to a dataset")
533
+
534
+ snapshot_id = int(time.time() * 1000)
535
+
536
+ # Gather previous summary and manifest entries
537
+ prev = self.snapshot(None)
538
+ prev_total_files = 0
539
+ prev_total_size = 0
540
+ prev_total_records = 0
541
+ prev_entries = []
542
+ if prev and prev.summary:
543
+ try:
544
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
545
+ except Exception:
546
+ prev_total_files = 0
547
+ try:
548
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
549
+ except Exception:
550
+ prev_total_size = 0
551
+ try:
552
+ prev_total_records = int(prev.summary.get("total-records", 0))
553
+ except Exception:
554
+ prev_total_records = 0
555
+
556
+ if prev and getattr(prev, "manifest_list", None):
557
+ # try to read prev manifest entries
558
+ try:
559
+ import pyarrow as pa
560
+ import pyarrow.parquet as pq
561
+
562
+ if self.io and hasattr(self.io, "new_input"):
563
+ inp = self.io.new_input(prev.manifest_list)
564
+ with inp.open() as f:
565
+ data = f.read()
566
+ table = pq.read_table(pa.BufferReader(data))
567
+ prev_entries = table.to_pylist()
568
+ else:
569
+ if (
570
+ self.catalog
571
+ and getattr(self.catalog, "_storage_client", None)
572
+ and getattr(self.catalog, "gcs_bucket", None)
573
+ ):
574
+ bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
575
+ parsed = prev.manifest_list
576
+ if parsed.startswith("gs://"):
577
+ parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
578
+ blob = bucket.blob(parsed)
579
+ data = blob.download_as_bytes()
580
+ table = pq.read_table(pa.BufferReader(data))
581
+ prev_entries = table.to_pylist()
582
+ except Exception:
583
+ prev_entries = []
584
+
585
+ existing = {
586
+ e.get("file_path") for e in prev_entries if isinstance(e, dict) and e.get("file_path")
587
+ }
588
+
589
+ # Build new entries for files that don't already exist. Only accept
590
+ # Parquet files and attempt to read lightweight metadata (bytes,
591
+ # row count, per-column min/max) from the Parquet footer when
592
+ # available.
593
+ new_entries = []
594
+ seen = set()
595
+ for fp in files:
596
+ if not fp or fp in existing or fp in seen:
597
+ continue
598
+ if not fp.lower().endswith(".parquet"):
599
+ # only accept parquet files
600
+ continue
601
+ seen.add(fp)
602
+
603
+ # Attempt to read file bytes and parquet metadata
604
+ file_size = 0
605
+ record_count = 0
606
+ min_values = []
607
+ max_values = []
608
+ try:
609
+ import pyarrow as pa
610
+ import pyarrow.parquet as pq
611
+
612
+ data = None
613
+ if self.io and hasattr(self.io, "new_input"):
614
+ inp = self.io.new_input(fp)
615
+ with inp.open() as f:
616
+ data = f.read()
617
+ else:
618
+ if (
619
+ self.catalog
620
+ and getattr(self.catalog, "_storage_client", None)
621
+ and getattr(self.catalog, "gcs_bucket", None)
622
+ ):
623
+ bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
624
+ parsed = fp
625
+ if parsed.startswith("gs://"):
626
+ parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
627
+ blob = bucket.blob(parsed)
628
+ data = blob.download_as_bytes()
629
+
630
+ if data:
631
+ file_size = len(data)
632
+ pf = pq.ParquetFile(pa.BufferReader(data))
633
+ record_count = int(pf.metadata.num_rows or 0)
634
+
635
+ # Prefer computing min/max via draken.compress() over
636
+ # relying on Parquet footer stats which may contain
637
+ # heterogeneous or non-numeric values. Fall back to
638
+ # footer stats only if draken is unavailable.
639
+ try:
640
+ import opteryx.draken as draken # type: ignore
641
+
642
+ table = pq.read_table(pa.BufferReader(data))
643
+ ncols = table.num_columns
644
+ mins = [None] * ncols
645
+ maxs = [None] * ncols
646
+
647
+ NULL_FLAG = -(1 << 63)
648
+
649
+ for ci in range(ncols):
650
+ try:
651
+ col = table.column(ci)
652
+ # combine chunks if needed
653
+ if hasattr(col, "combine_chunks"):
654
+ arr = col.combine_chunks()
655
+ else:
656
+ arr = col
657
+ vec = draken.Vector.from_arrow(arr)
658
+ mapped = list(vec.compress())
659
+ non_nulls = [m for m in mapped if m != NULL_FLAG]
660
+ if non_nulls:
661
+ mins[ci] = int(min(non_nulls))
662
+ maxs[ci] = int(max(non_nulls))
663
+ else:
664
+ mins[ci] = None
665
+ maxs[ci] = None
666
+ except Exception:
667
+ # per-column fallback: leave None
668
+ mins[ci] = None
669
+ maxs[ci] = None
670
+ except Exception:
671
+ # Draken not available; fall back to Parquet footer stats
672
+ ncols = pf.metadata.num_columns
673
+ mins = [None] * ncols
674
+ maxs = [None] * ncols
675
+ for rg in range(pf.num_row_groups):
676
+ for ci in range(ncols):
677
+ col_meta = pf.metadata.row_group(rg).column(ci)
678
+ stats = getattr(col_meta, "statistics", None)
679
+ if not stats:
680
+ continue
681
+ smin = getattr(stats, "min", None)
682
+ smax = getattr(stats, "max", None)
683
+ if smin is None and smax is None:
684
+ continue
685
+
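+ # Coerce footer statistics to comparable Python values
+ # (int, then float, then UTF-8 text for byte strings).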
686
+ def _to_py(v):
687
+ try:
688
+ return int(v)
689
+ except Exception:
690
+ try:
691
+ return float(v)
692
+ except Exception:
693
+ try:
694
+ if isinstance(v, (bytes, bytearray)):
695
+ return v.decode("utf-8", errors="ignore")
696
+ except Exception:
697
+ pass
698
+ return v
699
+
700
+ if smin is not None:
701
+ sval = _to_py(smin)
702
+ if mins[ci] is None:
703
+ mins[ci] = sval
704
+ else:
705
+ try:
706
+ if sval < mins[ci]:
707
+ mins[ci] = sval
708
+ except Exception:
709
+ pass
710
+ if smax is not None:
711
+ sval = _to_py(smax)
712
+ if maxs[ci] is None:
713
+ maxs[ci] = sval
714
+ else:
715
+ try:
716
+ if sval > maxs[ci]:
717
+ maxs[ci] = sval
718
+ except Exception:
719
+ pass
720
+
721
+ # normalize lists to empty lists when values missing
722
+ min_values = [m for m in mins if m is not None]
723
+ max_values = [m for m in maxs if m is not None]
724
+ except Exception:
725
+ # If metadata read fails, fall back to placeholders
726
+ file_size = 0
727
+ record_count = 0
728
+ min_values = []
729
+ max_values = []
730
+
731
+ new_entries.append(
732
+ {
733
+ "file_path": fp,
734
+ "file_format": "parquet",
735
+ "record_count": int(record_count),
736
+ "file_size_in_bytes": int(file_size),
737
+ "min_k_hashes": [],
738
+ "histogram_counts": [],
739
+ "histogram_bins": 0,
740
+ "min_values": min_values,
741
+ "max_values": max_values,
742
+ }
743
+ )
744
+
745
+ merged_entries = prev_entries + new_entries
746
+
747
+ # write cumulative manifest
748
+ manifest_path = None
749
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
750
+ manifest_path = self.catalog.write_parquet_manifest(
751
+ snapshot_id, merged_entries, self.metadata.location
752
+ )
753
+
754
+ # Build summary deltas
755
+ added_data_files = len(new_entries)
756
+ added_files_size = 0
757
+ added_records = 0
758
+ deleted_data_files = 0
759
+ deleted_files_size = 0
760
+ deleted_records = 0
761
+
762
+ total_data_files = prev_total_files + added_data_files - deleted_data_files
763
+ total_files_size = prev_total_size + added_files_size - deleted_files_size
764
+ total_records = prev_total_records + added_records - deleted_records
765
+
766
+ summary = {
767
+ "added-data-files": added_data_files,
768
+ "added-files-size": added_files_size,
769
+ "added-records": added_records,
770
+ "deleted-data-files": deleted_data_files,
771
+ "deleted-files-size": deleted_files_size,
772
+ "deleted-records": deleted_records,
773
+ "total-data-files": total_data_files,
774
+ "total-files-size": total_files_size,
775
+ "total-records": total_records,
776
+ }
777
+
778
+ # Sequence number
779
+ try:
780
+ max_seq = 0
781
+ for s in self.metadata.snapshots:
782
+ seq = getattr(s, "sequence_number", None)
783
+ if seq is None:
784
+ continue
785
+ try:
786
+ ival = int(seq)
787
+ except Exception:
788
+ continue
789
+ if ival > max_seq:
790
+ max_seq = ival
791
+ next_seq = max_seq + 1
792
+ except Exception:
793
+ next_seq = 1
794
+
795
+ parent_id = self.metadata.current_snapshot_id
796
+
797
+ if commit_message is None:
798
+ commit_message = f"add files by {author}"
799
+
800
+ snap = Snapshot(
801
+ snapshot_id=snapshot_id,
802
+ timestamp_ms=snapshot_id,
803
+ author=author,
804
+ sequence_number=next_seq,
805
+ user_created=True,
806
+ operation_type="add-files",
807
+ parent_snapshot_id=parent_id,
808
+ manifest_list=manifest_path,
809
+ schema_id=self.metadata.current_schema_id,
810
+ commit_message=commit_message,
811
+ summary=summary,
812
+ )
813
+
814
+ self.metadata.snapshots.append(snap)
815
+ self.metadata.current_snapshot_id = snapshot_id
816
+
817
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
818
+ self.catalog.save_snapshot(self.identifier, snap)
819
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
820
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
821
+
822
+ def truncate_and_add_files(
823
+ self, files: list[str], author: Optional[str] = None, commit_message: Optional[str] = None
824
+ ):
825
+ """Truncate dataset (logical) and set manifest to provided files.
826
+
827
+ - Writes a manifest that contains exactly the unique filenames provided.
828
+ - Does not delete objects from storage.
829
+ - Useful for replace/overwrite semantics.
830
+ """
831
+ if author is None:
832
+ raise ValueError("author must be provided when truncating/adding files")
833
+
834
+ snapshot_id = int(time.time() * 1000)
835
+
836
+ # Read previous summary for reporting deleted counts
837
+ prev = self.snapshot(None)
838
+ prev_total_files = 0
839
+ prev_total_size = 0
840
+ prev_total_records = 0
841
+ if prev and prev.summary:
842
+ try:
843
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
844
+ except Exception:
845
+ prev_total_files = 0
846
+ try:
847
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
848
+ except Exception:
849
+ prev_total_size = 0
850
+ try:
851
+ prev_total_records = int(prev.summary.get("total-records", 0))
852
+ except Exception:
853
+ prev_total_records = 0
854
+
855
+ # Build unique new entries (ignore duplicates in input). Only accept
856
+ # parquet files and try to read lightweight metadata from each file.
857
+ new_entries = []
858
+ seen = set()
859
+ for fp in files:
860
+ if not fp or fp in seen:
861
+ continue
862
+ if not fp.lower().endswith(".parquet"):
863
+ continue
864
+ seen.add(fp)
865
+
866
+ file_size = 0
867
+ record_count = 0
868
+ min_values = []
869
+ max_values = []
870
+ try:
871
+ import pyarrow as pa
872
+ import pyarrow.parquet as pq
873
+
874
+ data = None
875
+ if self.io and hasattr(self.io, "new_input"):
876
+ inp = self.io.new_input(fp)
877
+ with inp.open() as f:
878
+ data = f.read()
879
+ else:
880
+ if (
881
+ self.catalog
882
+ and getattr(self.catalog, "_storage_client", None)
883
+ and getattr(self.catalog, "gcs_bucket", None)
884
+ ):
885
+ bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
886
+ parsed = fp
887
+ if parsed.startswith("gs://"):
888
+ parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
889
+ blob = bucket.blob(parsed)
890
+ data = blob.download_as_bytes()
891
+
892
+ if data:
893
+ file_size = len(data)
894
+ pf = pq.ParquetFile(pa.BufferReader(data))
895
+ record_count = int(pf.metadata.num_rows or 0)
896
+
897
+ ncols = pf.metadata.num_columns
898
+ mins = [None] * ncols
899
+ maxs = [None] * ncols
900
+ for rg in range(pf.num_row_groups):
901
+ for ci in range(ncols):
902
+ col_meta = pf.metadata.row_group(rg).column(ci)
903
+ stats = getattr(col_meta, "statistics", None)
904
+ if not stats:
905
+ continue
906
+ smin = getattr(stats, "min", None)
907
+ smax = getattr(stats, "max", None)
908
+ if smin is None and smax is None:
909
+ continue
910
+
911
+ def _to_py(v):
912
+ try:
913
+ return int(v)
914
+ except Exception:
915
+ try:
916
+ return float(v)
917
+ except Exception:
918
+ try:
919
+ if isinstance(v, (bytes, bytearray)):
920
+ return v.decode("utf-8", errors="ignore")
921
+ except Exception:
922
+ pass
923
+ return v
924
+
925
+ if smin is not None:
926
+ sval = _to_py(smin)
927
+ if mins[ci] is None:
928
+ mins[ci] = sval
929
+ else:
930
+ try:
931
+ if sval < mins[ci]:
932
+ mins[ci] = sval
933
+ except Exception:
934
+ pass
935
+ if smax is not None:
936
+ sval = _to_py(smax)
937
+ if maxs[ci] is None:
938
+ maxs[ci] = sval
939
+ else:
940
+ try:
941
+ if sval > maxs[ci]:
942
+ maxs[ci] = sval
943
+ except Exception:
944
+ pass
945
+
946
+ min_values = [m for m in mins if m is not None]
947
+ max_values = [m for m in maxs if m is not None]
948
+ except Exception:
949
+ file_size = 0
950
+ record_count = 0
951
+ min_values = []
952
+ max_values = []
953
+
954
+ new_entries.append(
955
+ {
956
+ "file_path": fp,
957
+ "file_format": "parquet",
958
+ "record_count": int(record_count),
959
+ "file_size_in_bytes": int(file_size),
960
+ "min_k_hashes": [],
961
+ "histogram_counts": [],
962
+ "histogram_bins": 0,
963
+ "min_values": min_values,
964
+ "max_values": max_values,
965
+ }
966
+ )
967
+
968
+ manifest_path = None
969
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
970
+ manifest_path = self.catalog.write_parquet_manifest(
971
+ snapshot_id, new_entries, self.metadata.location
972
+ )
973
+
974
+ # Build summary: previous entries become deleted
975
+ deleted_data_files = prev_total_files
976
+ deleted_files_size = prev_total_size
977
+ deleted_records = prev_total_records
978
+
979
+ added_data_files = len(new_entries)
980
+ added_files_size = 0
981
+ added_records = 0
982
+
983
+ total_data_files = added_data_files
984
+ total_files_size = added_files_size
985
+ total_records = added_records
986
+
987
+ summary = {
988
+ "added-data-files": added_data_files,
989
+ "added-files-size": added_files_size,
990
+ "added-records": added_records,
991
+ "deleted-data-files": deleted_data_files,
992
+ "deleted-files-size": deleted_files_size,
993
+ "deleted-records": deleted_records,
994
+ "total-data-files": total_data_files,
995
+ "total-files-size": total_files_size,
996
+ "total-records": total_records,
997
+ }
998
+
999
+ # Sequence number
1000
+ try:
1001
+ max_seq = 0
1002
+ for s in self.metadata.snapshots:
1003
+ seq = getattr(s, "sequence_number", None)
1004
+ if seq is None:
1005
+ continue
1006
+ try:
1007
+ ival = int(seq)
1008
+ except Exception:
1009
+ continue
1010
+ if ival > max_seq:
1011
+ max_seq = ival
1012
+ next_seq = max_seq + 1
1013
+ except Exception:
1014
+ next_seq = 1
1015
+
1016
+ parent_id = self.metadata.current_snapshot_id
1017
+
1018
+ if commit_message is None:
1019
+ commit_message = f"truncate and add files by {author}"
1020
+
1021
+ snap = Snapshot(
1022
+ snapshot_id=snapshot_id,
1023
+ timestamp_ms=snapshot_id,
1024
+ author=author,
1025
+ sequence_number=next_seq,
1026
+ user_created=True,
1027
+ operation_type="truncate-and-add-files",
1028
+ parent_snapshot_id=parent_id,
1029
+ manifest_list=manifest_path,
1030
+ schema_id=self.metadata.current_schema_id,
1031
+ commit_message=commit_message,
1032
+ summary=summary,
1033
+ )
1034
+
1035
+ # Replace in-memory snapshots: append snapshot and update current id
1036
+ self.metadata.snapshots.append(snap)
1037
+ self.metadata.current_snapshot_id = snapshot_id
1038
+
1039
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
1040
+ self.catalog.save_snapshot(self.identifier, snap)
1041
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
1042
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
1043
+
1044
+ def scan(
1045
+ self, row_filter=None, row_limit=None, snapshot_id: Optional[int] = None
1046
+ ) -> Iterable[Datafile]:
1047
+ """Return Datafile objects for the given snapshot.
1048
+
1049
+ - If `snapshot_id` is None, use the current snapshot.
1050
+ - `row_filter` is ignored for now; all files listed in the snapshot's
1051
+ parquet manifest (if present) are yielded, up to `row_limit` cumulative records.
1052
+ """
1053
+ # Determine snapshot to read using the dataset-level helper which
1054
+ # prefers the in-memory current snapshot and otherwise performs a
1055
+ # backend lookup for the requested id.
1056
+ snap = self.snapshot(snapshot_id)
1057
+
1058
+ if snap is None or not getattr(snap, "manifest_list", None):
1059
+ return iter(())
1060
+
1061
+ manifest_path = snap.manifest_list
1062
+
1063
+ # Read manifest via FileIO if available
1064
+ try:
1065
+ import pyarrow as pa
1066
+ import pyarrow.parquet as pq
1067
+
1068
+ data = None
1069
+
1070
+ inp = self.io.new_input(manifest_path)
1071
+ with inp.open() as f:
1072
+ data = f.read()
1073
+
1074
+ if not data:
1075
+ return iter(())
1076
+
1077
+ table = pq.read_table(pa.BufferReader(data))
1078
+ rows = table.to_pylist()
1079
+ cum_rows = 0
1080
+ for r in rows:
1081
+ yield Datafile(entry=r)
1082
+ try:
1083
+ rc = int(r.get("record_count") or 0)
1084
+ except Exception:
1085
+ rc = 0
1086
+ cum_rows += rc
1087
+ if row_limit is not None and cum_rows >= row_limit:
1088
+ break
1089
+ except FileNotFoundError:
1090
+ return iter(())
1091
+ except Exception:
1092
+ return iter(())
1093
+
1094
+ def truncate(self, author: Optional[str] = None, commit_message: Optional[str] = None) -> None:
1095
+ """Delete all data files and manifests for this table.
1096
+
1097
+ This attempts to delete every data file referenced by existing
1098
+ Parquet manifests and then delete the manifest files themselves.
1099
+ Finally it clears the in-memory snapshot list and persists the
1100
+ empty snapshot set via the attached `catalog` (if available).
1101
+ """
1102
+ import pyarrow as pa
1103
+ import pyarrow.parquet as pq
1104
+
1105
+ io = self.io
1106
+ # Collect files referenced by existing manifests but do NOT delete
1107
+ # them from storage. A truncate snapshot (with no manifest) will record
1108
+ # these files as deleted.
1109
+ snaps = list(self.metadata.snapshots)
1110
+ removed_files = []
1111
+ removed_total_size = 0
1112
+
1113
+ for snap in snaps:
1114
+ manifest_path = getattr(snap, "manifest_list", None)
1115
+ if not manifest_path:
1116
+ continue
1117
+
1118
+ # Read manifest via FileIO if available
1119
+ rows = []
1120
+ try:
1121
+ if hasattr(io, "new_input"):
1122
+ inp = io.new_input(manifest_path)
1123
+ with inp.open() as f:
1124
+ data = f.read()
1125
+ table = pq.read_table(pa.BufferReader(data))
1126
+ rows = table.to_pylist()
1127
+ except Exception:
1128
+ rows = []
1129
+
1130
+ for r in rows:
1131
+ fp = None
1132
+ fsize = 0
1133
+ if isinstance(r, dict):
1134
+ fp = r.get("file_path")
1135
+ fsize = int(r.get("file_size_in_bytes") or 0)
1136
+ if not fp and "data_file" in r and isinstance(r["data_file"], dict):
1137
+ fp = r["data_file"].get("file_path") or r["data_file"].get("path")
1138
+ fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
1139
+
1140
+ if fp:
1141
+ removed_files.append(fp)
1142
+ removed_total_size += fsize
1143
+
1144
+ # Establish the id for the truncate snapshot; no manifest is written
1145
+ # and no objects are deleted from storage.
1146
+ snapshot_id = int(time.time() * 1000)
1147
+
1148
+ # Do NOT write an empty Parquet manifest when there are no entries.
1149
+ # Per policy, create the snapshot without a manifest so older
1150
+ # snapshots remain readable and we avoid creating empty manifest files.
1151
+ manifest_path = None
1152
+
1153
+ # Build summary reflecting deleted files (tracked, not removed)
1154
+ deleted_count = len(removed_files)
1155
+ deleted_size = removed_total_size
1156
+
1157
+ summary = {
1158
+ "added-data-files": 0,
1159
+ "added-files-size": 0,
1160
+ "added-records": 0,
1161
+ "deleted-data-files": deleted_count,
1162
+ "deleted-files-size": deleted_size,
1163
+ "deleted-records": 0,
1164
+ "total-data-files": 0,
1165
+ "total-files-size": 0,
1166
+ "total-records": 0,
1167
+ }
1168
+
1169
+ # Sequence number
1170
+ try:
1171
+ max_seq = 0
1172
+ for s in self.metadata.snapshots:
1173
+ seq = getattr(s, "sequence_number", None)
1174
+ if seq is None:
1175
+ continue
1176
+ try:
1177
+ ival = int(seq)
1178
+ except Exception:
1179
+ continue
1180
+ if ival > max_seq:
1181
+ max_seq = ival
1182
+ next_seq = max_seq + 1
1183
+ except Exception:
1184
+ next_seq = 1
1185
+
1186
+ if author is None:
1187
+ raise ValueError(
1188
+ "truncate() must be called with an explicit author; use truncate(author=...) in caller"
1189
+ )
1190
+ # update metadata author/timestamp for this truncate
1191
+ self.metadata.author = author
1192
+ self.metadata.timestamp_ms = snapshot_id
1193
+ # default commit message
1194
+ if commit_message is None:
1195
+ commit_message = f"commit by {author}"
1196
+
1197
+ parent_id = self.metadata.current_snapshot_id
1198
+
1199
+ snap = Snapshot(
1200
+ snapshot_id=snapshot_id,
1201
+ timestamp_ms=snapshot_id,
1202
+ author=author,
1203
+ sequence_number=next_seq,
1204
+ user_created=True,
1205
+ operation_type="truncate",
1206
+ parent_snapshot_id=parent_id,
1207
+ manifest_list=manifest_path,
1208
+ schema_id=self.metadata.current_schema_id,
1209
+ commit_message=commit_message,
1210
+ summary=summary,
1211
+ )
1212
+
1213
+ # Append new snapshot and update current snapshot id
1214
+ self.metadata.snapshots.append(snap)
1215
+ self.metadata.current_snapshot_id = snapshot_id
1216
+
1217
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
1218
+ try:
1219
+ self.catalog.save_snapshot(self.identifier, snap)
1220
+ except Exception:
1221
+ pass
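
A minimal usage sketch (editorial addition, not part of the released wheel). It assumes `metadata` (a DatasetMetadata), `file_io` (an object exposing `new_input`/`new_output`) and `catalog` (an object exposing `write_parquet_manifest`, `save_snapshot` and `save_dataset_metadata`) have been constructed elsewhere, and that `SimpleDataset` is importable from the package; the identifier follows the "<collection>.<dataset>" form the class expects.

import pyarrow as pa

# `SimpleDataset`, `metadata`, `file_io` and `catalog` are assumed to be
# provided by the surrounding opteryx-catalog code (placeholders here).
dataset = SimpleDataset(
    identifier="analytics.events",   # "<collection>.<dataset>"
    _metadata=metadata,
    io=file_io,
    catalog=catalog,
)

# Append a pyarrow table; `author` is mandatory.
table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})
dataset.append(table, author="pipeline@example.com", commit_message="initial load")

# Enumerate the data files recorded in the current snapshot's manifest.
for datafile in dataset.scan():
    print(datafile.file_path, datafile.record_count, datafile.file_size_in_bytes)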