opteryx-catalog 0.4.13__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their public registries; it is provided for informational purposes only and reflects the changes between those versions.

Potentially problematic release: this version of opteryx-catalog might be problematic.

@@ -0,0 +1,1201 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ import uuid
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+ from typing import Iterable
9
+ from typing import Optional
10
+
11
+ from .manifest import ParquetManifestEntry
12
+ from .manifest import build_parquet_manifest_entry
13
+ from .manifest import build_parquet_manifest_minmax_entry
14
+ from .metadata import DatasetMetadata
15
+ from .metadata import Snapshot
16
+ from .metastore import Dataset
17
+
18
+ # Stable node identifier for this process (hex-mac-hex-pid)
19
+ _NODE = f"{uuid.getnode():x}-{os.getpid():x}"
20
+
21
+
22
+ @dataclass
23
+ class Datafile:
24
+ """Wrapper for a manifest entry representing a data file."""
25
+
26
+ entry: dict
27
+
28
+ @property
29
+ def file_path(self) -> Optional[str]:
30
+ return self.entry.get("file_path")
31
+
32
+ @property
33
+ def record_count(self) -> int:
34
+ return int(self.entry.get("record_count") or 0)
35
+
36
+ @property
37
+ def file_size_in_bytes(self) -> int:
38
+ return int(self.entry.get("file_size_in_bytes") or 0)
39
+
40
+ def to_dict(self) -> dict:
41
+ return dict(self.entry)
42
+
43
+ @property
44
+ def min_k_hashes(self) -> list:
45
+ return self.entry.get("min_k_hashes") or []
46
+
47
+ @property
48
+ def histogram_counts(self) -> list:
49
+ return self.entry.get("histogram_counts") or []
50
+
51
+ @property
52
+ def histogram_bins(self) -> int:
53
+ return int(self.entry.get("histogram_bins") or 0)
54
+
55
+ @property
56
+ def min_values(self) -> list:
57
+ return self.entry.get("min_values") or []
58
+
59
+ @property
60
+ def max_values(self) -> list:
61
+ return self.entry.get("max_values") or []
62
+
63
+
64
+ @dataclass
65
+ class SimpleDataset(Dataset):
66
+ identifier: str
67
+ _metadata: DatasetMetadata
68
+ io: Any = None
69
+ catalog: Any = None
70
+
71
+ @property
72
+ def metadata(self) -> DatasetMetadata:
73
+ return self._metadata
74
+
75
+ def _next_sequence_number(self) -> int:
76
+ """Calculate the next sequence number.
77
+
78
+ Uses the current snapshot's sequence number + 1. Works efficiently
79
+ with load_history=False since we only need the most recent snapshot,
80
+ not the full history.
81
+
82
+ Returns:
83
+ The next sequence number (current snapshot's sequence + 1, or 1 if no snapshots).
84
+ """
85
+ if not self.metadata.snapshots:
86
+ # No snapshots yet - this is the first one
87
+ return 1
88
+
89
+ # Get the current (most recent) snapshot - should have the highest sequence number
+ current = self.snapshot()
+ if current:
+ seq = getattr(current, "sequence_number", None)
+ return int(seq) + 1 if seq is not None else 1
+ # No current snapshot could be resolved - treat this as the first sequence
+ return 1
94
+
95
+ def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
96
+ """Return a Snapshot.
97
+
98
+ - If `snapshot_id` is None, return the in-memory current snapshot.
99
+ - If a `snapshot_id` is provided, prefer a Firestore lookup via the
100
+ attached `catalog` (O(1) document get). Fall back to the in-memory
101
+ `metadata.snapshots` list only when no catalog is attached or the
102
+ remote lookup fails.
103
+ """
104
+ # Current snapshot: keep in memory for fast access
105
+ if snapshot_id is None:
106
+ return self.metadata.current_snapshot()
107
+
108
+ # Try Firestore document lookup when catalog attached
109
+ if self.catalog:
110
+ try:
111
+ collection, dataset_name = self.identifier.split(".")
112
+ doc = (
113
+ self.catalog._dataset_doc_ref(collection, dataset_name)
114
+ .collection("snapshots")
115
+ .document(str(snapshot_id))
116
+ .get()
117
+ )
118
+ if doc.exists:
119
+ sd = doc.to_dict() or {}
120
+ snap = Snapshot(
121
+ snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
122
+ timestamp_ms=int(sd.get("timestamp-ms", 0)),
123
+ author=sd.get("author"),
124
+ sequence_number=sd.get("sequence-number", 0),
125
+ user_created=sd.get("user-created"),
126
+ manifest_list=sd.get("manifest"),
127
+ schema_id=sd.get("schema-id"),
128
+ summary=sd.get("summary", {}),
129
+ operation_type=sd.get("operation-type"),
130
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
131
+ commit_message=sd.get("commit-message"),
132
+ )
133
+ return snap
134
+ except Exception:
135
+ # Be conservative: fall through to in-memory fallback
136
+ pass
137
+
138
+ # Fallback: search in-memory snapshots (only used when no catalog)
139
+ for s in self.metadata.snapshots:
140
+ if s.snapshot_id == snapshot_id:
141
+ return s
142
+
143
+ return None
144
+
145
+ def _get_node(self) -> str:
146
+ """Return the stable node identifier for this process.
147
+
148
+ Uses a module-level constant to avoid per-instance hashing/caching.
149
+ """
150
+ return _NODE
151
+
152
+ def snapshots(self) -> Iterable[Snapshot]:
153
+ return list(self.metadata.snapshots)
154
+
155
+ def schema(self, schema_id: Optional[str] = None) -> Optional[dict]:
156
+ """Return a stored schema description.
157
+
158
+ If `schema_id` is None, return the current schema (by
159
+ `metadata.current_schema_id` or last-known schema). If a
160
+ specific `schema_id` is provided, attempt to find it in the
161
+ in-memory `metadata.schemas` list and, failing that, fetch it
162
+ from the catalog's `schemas` subcollection when a catalog is
163
+ attached.
164
+
165
+ Returns the stored schema dict (contains keys like `schema_id`,
166
+ `columns`, `timestamp-ms`, etc.) or None if not found.
167
+ """
168
+ # Determine which schema id to use
169
+ sid = schema_id or self.metadata.current_schema_id
170
+
171
+ # If no sid and a raw schema is stored on the metadata, return it
172
+ if sid is None:
173
+ return getattr(self.metadata, "schema", None)
174
+
175
+ # Fast path: if this is the current schema id, prefer the cached
176
+ # current schema (99% case) rather than scanning the entire list.
177
+ sdict = None
178
+ if sid == self.metadata.current_schema_id:
179
+ if getattr(self.metadata, "schemas", None):
180
+ last = self.metadata.schemas[-1]
181
+ if last.get("schema_id") == sid:
182
+ sdict = last
183
+ else:
184
+ # If a raw schema is stored directly on metadata, use it.
185
+ raw = getattr(self.metadata, "schema", None)
186
+ if raw is not None:
187
+ sdict = {"schema_id": sid, "columns": raw}
188
+
189
+ # If not the current schema, or cached current not present,
190
+ # prefer to load the schema document from the backend (O(1) doc get).
191
+ if sdict is None and self.catalog:
192
+ try:
193
+ collection, dataset_name = self.identifier.split(".")
194
+ doc = (
195
+ self.catalog._dataset_doc_ref(collection, dataset_name)
196
+ .collection("schemas")
197
+ .document(sid)
198
+ .get()
199
+ )
200
+ sdict = doc.to_dict() or None
201
+ except Exception:
202
+ sdict = None
203
+
204
+ # As a last-resort when no catalog is attached, fall back to an
205
+ # in-memory search for compatibility (offline/unit-test mode).
206
+ if sdict is None and not self.catalog:
207
+ for s in self.metadata.schemas or []:
208
+ if s.get("schema_id") == sid:
209
+ sdict = s
210
+ break
211
+
212
+ if sdict is None:
213
+ return None
214
+
215
+ # Try to construct an Orso RelationSchema
216
+ from orso.schema import FlatColumn
217
+ from orso.schema import RelationSchema
218
+
219
+ # If metadata stored a raw schema
220
+ raw = sdict.get("columns")
221
+
222
+ columns = [
223
+ FlatColumn(
224
+ name=c.get("name"),
225
+ type=c.get("type"),
226
+ element_type=c.get("element-type"),
227
+ precision=c.get("precision"),
228
+ scale=c.get("scale"),
229
+ )
230
+ for c in raw
231
+ ]
232
+ orso_schema = RelationSchema(name=self.identifier, columns=columns)
233
+ return orso_schema
234
+
235
+ def append(self, table: Any, author: str = None, commit_message: Optional[str] = None):
236
+ """Append a pyarrow.Table:
237
+
238
+ - write a Parquet data file via `self.io`
239
+ - create a simple Parquet manifest (one entry)
240
+ - persist manifest and snapshot metadata using the attached `catalog`
241
+ """
242
+ import pyarrow as pa
243
+ import pyarrow.parquet as pq
244
+
245
+ snapshot_id = int(time.time() * 1000)
246
+
247
+ if not hasattr(table, "schema"):
248
+ raise TypeError("append() expects a pyarrow.Table-like object")
249
+
250
+ # Write table and build manifest entry
251
+ manifest_entry = self._write_table_and_build_entry(table)
252
+ entries = [manifest_entry.to_dict()]
253
+
254
+ # persist manifest: for append, merge previous manifest entries
255
+ # with the new entries so the snapshot's manifest is cumulative.
256
+ manifest_path = None
257
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
258
+ merged_entries = list(entries)
259
+
260
+ # If there is a previous snapshot with a manifest, try to read
261
+ # it and prepend its entries. Any read error is non-fatal and we
262
+ # fall back to writing only the new entries.
263
+ prev_snap = self.snapshot(None)
264
+ if prev_snap and getattr(prev_snap, "manifest_list", None):
265
+ prev_manifest_path = prev_snap.manifest_list
266
+ try:
267
+ # Prefer FileIO when available
268
+ inp = self.io.new_input(prev_manifest_path)
269
+ with inp.open() as f:
270
+ prev_data = f.read()
271
+ import pyarrow as pa
272
+ import pyarrow.parquet as pq
273
+
274
+ prev_table = pq.read_table(pa.BufferReader(prev_data))
275
+ prev_rows = prev_table.to_pylist()
276
+ merged_entries = prev_rows + merged_entries
277
+ except Exception:
278
+ # If we can't read the previous manifest, continue with
279
+ # just the new entries (don't fail the append).
280
+ pass
281
+
282
+ manifest_path = self.catalog.write_parquet_manifest(
283
+ snapshot_id, merged_entries, self.metadata.location
284
+ )
285
+
286
+ # snapshot metadata
287
+ if author is None:
288
+ raise ValueError("author must be provided when appending to a dataset")
289
+ # update metadata author/timestamp for this append
290
+ self.metadata.author = author
291
+ self.metadata.timestamp_ms = snapshot_id
292
+ # default commit message
293
+ if commit_message is None:
294
+ commit_message = f"commit by {author}"
295
+
296
+ recs = int(table.num_rows)
297
+ fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
298
+ # Calculate uncompressed size from the manifest entry
299
+ added_data_size = manifest_entry.uncompressed_size_in_bytes
300
+ added_data_files = 1
301
+ added_files_size = fsize
302
+ added_records = recs
303
+ deleted_data_files = 0
304
+ deleted_files_size = 0
305
+ deleted_data_size = 0
306
+ deleted_records = 0
307
+
308
+ prev = self.snapshot()
309
+ if prev and prev.summary:
310
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
311
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
312
+ prev_total_data_size = int(prev.summary.get("total-data-size", 0))
313
+ prev_total_records = int(prev.summary.get("total-records", 0))
314
+ else:
315
+ prev_total_files = 0
316
+ prev_total_size = 0
317
+ prev_total_data_size = 0
318
+ prev_total_records = 0
319
+
320
+ total_data_files = prev_total_files + added_data_files - deleted_data_files
321
+ total_files_size = prev_total_size + added_files_size - deleted_files_size
322
+ total_data_size = prev_total_data_size + added_data_size - deleted_data_size
323
+ total_records = prev_total_records + added_records - deleted_records
324
+
325
+ summary = {
326
+ "added-data-files": added_data_files,
327
+ "added-files-size": added_files_size,
328
+ "added-data-size": added_data_size,
329
+ "added-records": added_records,
330
+ "deleted-data-files": deleted_data_files,
331
+ "deleted-files-size": deleted_files_size,
332
+ "deleted-data-size": deleted_data_size,
333
+ "deleted-records": deleted_records,
334
+ "total-data-files": total_data_files,
335
+ "total-files-size": total_files_size,
336
+ "total-data-size": total_data_size,
337
+ "total-records": total_records,
338
+ }
339
+
340
+ # sequence number
341
+ try:
342
+ next_seq = self._next_sequence_number()
343
+ except Exception:
344
+ next_seq = 1
345
+
346
+ parent_id = self.metadata.current_snapshot_id
347
+
348
+ snap = Snapshot(
349
+ snapshot_id=snapshot_id,
350
+ timestamp_ms=snapshot_id,
351
+ author=author,
352
+ sequence_number=next_seq,
353
+ user_created=True,
354
+ operation_type="append",
355
+ parent_snapshot_id=parent_id,
356
+ manifest_list=manifest_path,
357
+ schema_id=self.metadata.current_schema_id,
358
+ commit_message=commit_message,
359
+ summary=summary,
360
+ )
361
+
362
+ self.metadata.snapshots.append(snap)
363
+ self.metadata.current_snapshot_id = snapshot_id
364
+
365
+ # persist metadata (let errors propagate)
366
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
367
+ self.catalog.save_snapshot(self.identifier, snap)
368
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
369
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
370
+
371
+ def _write_table_and_build_entry(self, table: Any):
372
+ """Write a PyArrow table to storage and return a ParquetManifestEntry.
373
+
374
+ This centralizes the IO and manifest construction so other operations
375
+ (e.g. `overwrite`) can reuse the same behavior as `append`.
376
+ """
377
+ # Write parquet file with collision-resistant name
378
+ fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
379
+ data_path = f"{self.metadata.location}/data/{fname}"
380
+
381
+ import pyarrow as pa
382
+ import pyarrow.parquet as pq
383
+
384
+ buf = pa.BufferOutputStream()
385
+ pq.write_table(table, buf, compression="zstd")
386
+ pdata = buf.getvalue().to_pybytes()
387
+
388
+ out = self.io.new_output(data_path).create()
389
+ out.write(pdata)
390
+ out.close()
391
+
392
+ # Build manifest entry with statistics
393
+ manifest_entry = build_parquet_manifest_entry(table, data_path, len(pdata))
394
+ return manifest_entry
395
+
396
+ def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
397
+ """Replace the dataset entirely with `table` in a single snapshot.
398
+
399
+ Semantics:
400
+ - Write the provided table as new data file(s)
401
+ - Create a new parquet manifest that contains only the new entries
402
+ - Create a snapshot that records previous files as deleted and the
403
+ new files as added (logical replace)
404
+ """
405
+ # Similar validation as append
406
+ snapshot_id = int(time.time() * 1000)
407
+
408
+ if not hasattr(table, "schema"):
409
+ raise TypeError("overwrite() expects a pyarrow.Table-like object")
410
+
411
+ if author is None:
412
+ raise ValueError("author must be provided when overwriting a dataset")
413
+
414
+ # Write new data and build manifest entries (single table -> single entry)
415
+ manifest_entry = self._write_table_and_build_entry(table)
416
+ new_entries = [manifest_entry.to_dict()]
417
+
418
+ # Write manifest containing only the new entries
419
+ manifest_path = None
420
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
421
+ manifest_path = self.catalog.write_parquet_manifest(
422
+ snapshot_id, new_entries, self.metadata.location
423
+ )
424
+
425
+ # Compute deltas: previous manifest becomes deleted
426
+ prev = self.snapshot(None)
427
+ prev_total_files = 0
428
+ prev_total_size = 0
429
+ prev_total_data_size = 0
430
+ prev_total_records = 0
431
+ if prev and prev.summary:
432
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
433
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
434
+ prev_total_data_size = int(prev.summary.get("total-data-size", 0))
435
+ prev_total_records = int(prev.summary.get("total-records", 0))
436
+
437
+ deleted_data_files = prev_total_files
438
+ deleted_files_size = prev_total_size
439
+ deleted_data_size = prev_total_data_size
440
+ deleted_records = prev_total_records
441
+
442
+ added_data_files = len(new_entries)
443
+ added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
444
+ added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
445
+ added_records = sum(e.get("record_count", 0) for e in new_entries)
446
+
447
+ total_data_files = added_data_files
448
+ total_files_size = added_files_size
449
+ total_data_size = added_data_size
450
+ total_records = added_records
451
+
452
+ summary = {
453
+ "added-data-files": added_data_files,
454
+ "added-files-size": added_files_size,
455
+ "added-data-size": added_data_size,
456
+ "added-records": added_records,
457
+ "deleted-data-files": deleted_data_files,
458
+ "deleted-files-size": deleted_files_size,
459
+ "deleted-data-size": deleted_data_size,
460
+ "deleted-records": deleted_records,
461
+ "total-data-files": total_data_files,
462
+ "total-files-size": total_files_size,
463
+ "total-data-size": total_data_size,
464
+ "total-records": total_records,
465
+ }
466
+
467
+ # sequence number
468
+ try:
469
+ next_seq = self._next_sequence_number()
470
+ except Exception:
471
+ next_seq = 1
472
+
473
+ parent_id = self.metadata.current_snapshot_id
474
+
475
+ if commit_message is None:
476
+ commit_message = f"overwrite by {author}"
477
+
478
+ snap = Snapshot(
479
+ snapshot_id=snapshot_id,
480
+ timestamp_ms=snapshot_id,
481
+ author=author,
482
+ sequence_number=next_seq,
483
+ user_created=True,
484
+ operation_type="overwrite",
485
+ parent_snapshot_id=parent_id,
486
+ manifest_list=manifest_path,
487
+ schema_id=self.metadata.current_schema_id,
488
+ commit_message=commit_message,
489
+ summary=summary,
490
+ )
491
+
492
+ # Append the new snapshot and update the current snapshot id
493
+ self.metadata.snapshots.append(snap)
494
+ self.metadata.current_snapshot_id = snapshot_id
495
+
496
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
497
+ self.catalog.save_snapshot(self.identifier, snap)
498
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
499
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
500
+
501
+ def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
502
+ """Add filenames to the dataset manifest without writing the files.
503
+
504
+ - `files` is a list of file paths (strings). Files are assumed to
505
+ already exist in storage; this method only updates the manifest.
506
+ - Does not add files that already appear in the current manifest
507
+ (deduplicates by `file_path`).
508
+ - Creates a cumulative manifest for the new snapshot (previous
509
+ entries + new unique entries).
510
+ """
511
+ if author is None:
512
+ raise ValueError("author must be provided when adding files to a dataset")
513
+
514
+ snapshot_id = int(time.time() * 1000)
515
+
516
+ # Gather previous summary and manifest entries
517
+ prev = self.snapshot(None)
518
+ prev_total_files = 0
519
+ prev_total_size = 0
520
+ prev_total_records = 0
521
+ prev_entries = []
522
+ if prev and prev.summary:
523
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
524
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
525
+ prev_total_records = int(prev.summary.get("total-records", 0))
526
+ if prev and getattr(prev, "manifest_list", None):
527
+ # try to read prev manifest entries
528
+ try:
529
+ import pyarrow as pa
530
+ import pyarrow.parquet as pq
531
+
532
+ inp = self.io.new_input(prev.manifest_list)
533
+ with inp.open() as f:
534
+ data = f.read()
535
+ table = pq.read_table(pa.BufferReader(data))
536
+ prev_entries = table.to_pylist()
537
+ except Exception:
538
+ prev_entries = []
539
+
540
+ existing = {
541
+ e.get("file_path") for e in prev_entries if isinstance(e, dict) and e.get("file_path")
542
+ }
543
+
544
+ # Build new entries for files that don't already exist. Only accept
545
+ # Parquet files and attempt to read lightweight metadata (bytes,
546
+ # row count, per-column min/max) from the Parquet footer when
547
+ # available.
548
+ new_entries = []
549
+ seen = set()
550
+ for fp in files:
551
+ if not fp or fp in existing or fp in seen:
552
+ continue
553
+ if not fp.lower().endswith(".parquet"):
554
+ # only accept parquet files
555
+ continue
556
+ seen.add(fp)
557
+
558
+ # Attempt to read file bytes and parquet metadata
559
+ # Use rugo's metadata reader which is much faster (microseconds per file)
560
+ try:
561
+ inp = self.io.new_input(fp)
562
+ with inp.open() as f:
563
+ data = f.read()
564
+
565
+ if data:
566
+ manifest_entry = build_parquet_manifest_minmax_entry(data, fp)
567
+ else:
568
+ # Empty file, create placeholder entry
569
+ manifest_entry = ParquetManifestEntry(
570
+ file_path=fp,
571
+ file_format="parquet",
572
+ record_count=0,
573
+ null_counts=[],
574
+ file_size_in_bytes=0,
575
+ uncompressed_size_in_bytes=0,
576
+ column_uncompressed_sizes_in_bytes=[],
577
+ min_k_hashes=[],
578
+ histogram_counts=[],
579
+ histogram_bins=0,
580
+ min_values=[],
581
+ max_values=[],
582
+ )
583
+ except Exception:
584
+ # If metadata read fails, fall back to placeholders
585
+ manifest_entry = ParquetManifestEntry(
586
+ file_path=fp,
587
+ file_format="parquet",
588
+ record_count=0,
589
+ null_counts=[],
590
+ file_size_in_bytes=0,
591
+ uncompressed_size_in_bytes=0,
592
+ column_uncompressed_sizes_in_bytes=[],
593
+ min_k_hashes=[],
594
+ histogram_counts=[],
595
+ histogram_bins=0,
596
+ min_values=[],
597
+ max_values=[],
598
+ )
599
+ new_entries.append(manifest_entry.to_dict())
600
+
601
+ merged_entries = prev_entries + new_entries
602
+
603
+ # write cumulative manifest
604
+ manifest_path = None
605
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
606
+ manifest_path = self.catalog.write_parquet_manifest(
607
+ snapshot_id, merged_entries, self.metadata.location
608
+ )
609
+
610
+ # Build summary deltas
611
+ added_data_files = len(new_entries)
+ added_files_size = 0
+ added_data_size = 0
+ added_records = 0
+ # Sum file sizes, uncompressed sizes and record counts from the new entries
+ for entry in new_entries:
+ added_files_size += entry.get("file_size_in_bytes", 0)
+ added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+ added_records += entry.get("record_count", 0)
618
+ deleted_data_files = 0
619
+ deleted_files_size = 0
620
+ deleted_data_size = 0
621
+ deleted_records = 0
622
+
623
+ prev_total_data_size = (
624
+ int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
625
+ )
626
+
627
+ total_data_files = prev_total_files + added_data_files - deleted_data_files
628
+ total_files_size = prev_total_size + added_files_size - deleted_files_size
629
+ total_data_size = prev_total_data_size + added_data_size - deleted_data_size
630
+ total_records = prev_total_records + added_records - deleted_records
631
+
632
+ summary = {
633
+ "added-data-files": added_data_files,
634
+ "added-files-size": added_files_size,
635
+ "added-data-size": added_data_size,
636
+ "added-records": added_records,
637
+ "deleted-data-files": deleted_data_files,
638
+ "deleted-files-size": deleted_files_size,
639
+ "deleted-data-size": deleted_data_size,
640
+ "deleted-records": deleted_records,
641
+ "total-data-files": total_data_files,
642
+ "total-files-size": total_files_size,
643
+ "total-data-size": total_data_size,
644
+ "total-records": total_records,
645
+ }
646
+
647
+ # Sequence number
648
+ try:
649
+ next_seq = self._next_sequence_number()
650
+ except Exception:
651
+ next_seq = 1
652
+
653
+ parent_id = self.metadata.current_snapshot_id
654
+
655
+ if commit_message is None:
656
+ commit_message = f"add files by {author}"
657
+
658
+ snap = Snapshot(
659
+ snapshot_id=snapshot_id,
660
+ timestamp_ms=snapshot_id,
661
+ author=author,
662
+ sequence_number=next_seq,
663
+ user_created=True,
664
+ operation_type="add-files",
665
+ parent_snapshot_id=parent_id,
666
+ manifest_list=manifest_path,
667
+ schema_id=self.metadata.current_schema_id,
668
+ commit_message=commit_message,
669
+ summary=summary,
670
+ )
671
+
672
+ self.metadata.snapshots.append(snap)
673
+ self.metadata.current_snapshot_id = snapshot_id
674
+
675
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
676
+ self.catalog.save_snapshot(self.identifier, snap)
677
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
678
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
679
+
680
+ def truncate_and_add_files(
681
+ self, files: list[str], author: str = None, commit_message: Optional[str] = None
682
+ ):
683
+ """Truncate dataset (logical) and set manifest to provided files.
684
+
685
+ - Writes a manifest that contains exactly the unique filenames provided.
686
+ - Does not delete objects from storage.
687
+ - Useful for replace/overwrite semantics.
688
+ """
689
+ if author is None:
690
+ raise ValueError("author must be provided when truncating/adding files")
691
+
692
+ snapshot_id = int(time.time() * 1000)
693
+
694
+ # Read previous summary for reporting deleted counts
695
+ prev = self.snapshot(None)
696
+ prev_total_files = 0
697
+ prev_total_size = 0
698
+ prev_total_records = 0
699
+ if prev and prev.summary:
700
+ try:
701
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
702
+ except Exception:
703
+ prev_total_files = 0
704
+ try:
705
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
706
+ except Exception:
707
+ prev_total_size = 0
708
+ try:
709
+ prev_total_records = int(prev.summary.get("total-records", 0))
710
+ except Exception:
711
+ prev_total_records = 0
712
+
713
+ # Build unique new entries (ignore duplicates in input). Only accept
714
+ # parquet files and try to read lightweight metadata from each file.
715
+ new_entries = []
716
+ seen = set()
717
+ for fp in files:
718
+ if not fp or fp in seen:
719
+ continue
720
+ if not fp.lower().endswith(".parquet"):
721
+ continue
722
+ seen.add(fp)
723
+
724
+ file_size = 0
+ record_count = 0
+ min_values = []
+ max_values = []
+ null_counts = []  # ensure defined even when the file cannot be read below
728
+ try:
729
+ import pyarrow as pa
730
+ import pyarrow.parquet as pq
731
+
732
+ data = None
733
+ if self.io and hasattr(self.io, "new_input"):
734
+ inp = self.io.new_input(fp)
735
+ with inp.open() as f:
736
+ data = f.read()
737
+ else:
738
+ if (
739
+ self.catalog
740
+ and getattr(self.catalog, "_storage_client", None)
741
+ and getattr(self.catalog, "gcs_bucket", None)
742
+ ):
743
+ bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
744
+ parsed = fp
745
+ if parsed.startswith("gs://"):
746
+ parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
747
+ blob = bucket.blob(parsed)
748
+ data = blob.download_as_bytes()
749
+
750
+ if data:
751
+ file_size = len(data)
752
+ pf = pq.ParquetFile(pa.BufferReader(data))
753
+ record_count = int(pf.metadata.num_rows or 0)
754
+
755
+ ncols = pf.metadata.num_columns
756
+ mins = [None] * ncols
757
+ maxs = [None] * ncols
758
+ null_counts = [0] * ncols
759
+ for rg in range(pf.num_row_groups):
760
+ for ci in range(ncols):
761
+ col_meta = pf.metadata.row_group(rg).column(ci)
762
+ stats = getattr(col_meta, "statistics", None)
763
+ if not stats:
764
+ continue
765
+ smin = getattr(stats, "min", None)
766
+ smax = getattr(stats, "max", None)
767
+ snull_count = getattr(stats, "null_count", None)
768
+ if smin is None and smax is None and snull_count is None:
769
+ continue
770
+
771
+ def _to_py(v):
772
+ try:
773
+ return int(v)
774
+ except Exception:
775
+ try:
776
+ return float(v)
777
+ except Exception:
778
+ try:
779
+ if isinstance(v, (bytes, bytearray)):
780
+ return v.decode("utf-8", errors="ignore")
781
+ except Exception:
782
+ pass
783
+ return v
784
+
785
+ if smin is not None:
786
+ sval = _to_py(smin)
787
+ if mins[ci] is None:
788
+ mins[ci] = sval
789
+ else:
790
+ try:
791
+ if sval < mins[ci]:
792
+ mins[ci] = sval
793
+ except Exception:
794
+ pass
795
+ if smax is not None:
796
+ sval = _to_py(smax)
797
+ if maxs[ci] is None:
798
+ maxs[ci] = sval
799
+ else:
800
+ try:
801
+ if sval > maxs[ci]:
802
+ maxs[ci] = sval
803
+ except Exception:
804
+ pass
805
+ if snull_count is not None:
806
+ try:
807
+ null_counts[ci] += int(snull_count)
808
+ except Exception:
809
+ pass
810
+
811
+ min_values = [m for m in mins if m is not None]
812
+ max_values = [m for m in maxs if m is not None]
813
+ except Exception:
814
+ file_size = 0
815
+ record_count = 0
816
+ min_values = []
817
+ max_values = []
818
+ null_counts = []
819
+
820
+ manifest_entry = ParquetManifestEntry(
821
+ file_path=fp,
822
+ file_format="parquet",
823
+ record_count=int(record_count),
824
+ null_counts=null_counts,
825
+ file_size_in_bytes=int(file_size),
826
+ uncompressed_size_in_bytes=int(file_size), # Use compressed size as estimate
827
+ column_uncompressed_sizes_in_bytes=[],
828
+ min_k_hashes=[],
829
+ histogram_counts=[],
830
+ histogram_bins=0,
831
+ min_values=min_values,
832
+ max_values=max_values,
833
+ )
834
+ new_entries.append(manifest_entry.to_dict())
835
+
836
+ manifest_path = None
837
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
838
+ manifest_path = self.catalog.write_parquet_manifest(
839
+ snapshot_id, new_entries, self.metadata.location
840
+ )
841
+
842
+ # Build summary: previous entries become deleted
843
+ deleted_data_files = prev_total_files
844
+ deleted_files_size = prev_total_size
845
+ deleted_data_size = (
846
+ int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
847
+ )
848
+ deleted_records = prev_total_records
849
+
850
+ added_data_files = len(new_entries)
+ added_files_size = 0
+ added_data_size = 0
+ added_records = 0
+ # Sum file sizes, uncompressed sizes and record counts from the new entries
+ for entry in new_entries:
+ added_files_size += entry.get("file_size_in_bytes", 0)
+ added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+ added_records += entry.get("record_count", 0)
857
+
858
+ total_data_files = added_data_files
859
+ total_files_size = added_files_size
860
+ total_data_size = added_data_size
861
+ total_records = added_records
862
+
863
+ summary = {
864
+ "added-data-files": added_data_files,
865
+ "added-files-size": added_files_size,
866
+ "added-data-size": added_data_size,
867
+ "added-records": added_records,
868
+ "deleted-data-files": deleted_data_files,
869
+ "deleted-files-size": deleted_files_size,
870
+ "deleted-data-size": deleted_data_size,
871
+ "deleted-records": deleted_records,
872
+ "total-data-files": total_data_files,
873
+ "total-files-size": total_files_size,
874
+ "total-data-size": total_data_size,
875
+ "total-records": total_records,
876
+ }
877
+
878
+ # Sequence number
879
+ try:
880
+ next_seq = self._next_sequence_number()
881
+ except Exception:
882
+ next_seq = 1
883
+
884
+ parent_id = self.metadata.current_snapshot_id
885
+
886
+ if commit_message is None:
887
+ commit_message = f"truncate and add files by {author}"
888
+
889
+ snap = Snapshot(
890
+ snapshot_id=snapshot_id,
891
+ timestamp_ms=snapshot_id,
892
+ author=author,
893
+ sequence_number=next_seq,
894
+ user_created=True,
895
+ operation_type="truncate-and-add-files",
896
+ parent_snapshot_id=parent_id,
897
+ manifest_list=manifest_path,
898
+ schema_id=self.metadata.current_schema_id,
899
+ commit_message=commit_message,
900
+ summary=summary,
901
+ )
902
+
903
+ # Replace in-memory snapshots: append snapshot and update current id
904
+ self.metadata.snapshots.append(snap)
905
+ self.metadata.current_snapshot_id = snapshot_id
906
+
907
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
908
+ self.catalog.save_snapshot(self.identifier, snap)
909
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
910
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
911
+
912
+ def scan(self, row_filter=None, snapshot_id: Optional[int] = None) -> Iterable[Datafile]:
913
+ """Return Datafile objects for the given snapshot.
914
+
915
+ - If `snapshot_id` is None, use the current snapshot.
916
+ """
917
+ # Determine snapshot to read using the dataset-level helper which
918
+ # prefers the in-memory current snapshot and otherwise performs a
919
+ # backend lookup for the requested id.
920
+ snap = self.snapshot(snapshot_id)
921
+
922
+ if snap is None or not getattr(snap, "manifest_list", None):
923
+ return iter(())
924
+
925
+ manifest_path = snap.manifest_list
926
+
927
+ # Read manifest via FileIO if available
928
+ try:
929
+ import pyarrow as pa
930
+ import pyarrow.parquet as pq
931
+
932
+ inp = self.io.new_input(manifest_path)
933
+ with inp.open() as f:
934
+ data = f.read()
935
+
936
+ if not data:
937
+ return iter(())
938
+
939
+ table = pq.read_table(pa.BufferReader(data))
940
+ rows = table.to_pylist()
941
+ for r in rows:
942
+ yield Datafile(entry=r)
943
+ except FileNotFoundError:
944
+ return iter(())
945
+ except Exception:
946
+ return iter(())
947
+
948
+ def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
949
+ """Refresh manifest statistics and create a new snapshot.
950
+
951
+ - `agent`: identifier for the agent performing the refresh (string)
952
+ - `author`: optional author to record; if omitted uses current snapshot author
953
+
954
+ This recalculates per-file statistics (min/max, record counts, sizes)
955
+ for every file in the current manifest, writes a new manifest and
956
+ creates a new snapshot with `user_created=False` and
957
+ `operation_type='statistics-refresh'`.
958
+
959
+ Returns the new `snapshot_id` on success or None on failure.
960
+ """
961
+ prev = self.snapshot(None)
962
+ if prev is None or not getattr(prev, "manifest_list", None):
963
+ raise ValueError("No current manifest available to refresh")
964
+
965
+ # Use same author/commit-timestamp as previous snapshot unless overridden
966
+ use_author = author if author is not None else getattr(prev, "author", None)
967
+
968
+ snapshot_id = int(time.time() * 1000)
969
+
970
+ # Rebuild manifest entries by re-reading each data file
971
+ entries = []
972
+ try:
973
+ # Read previous manifest entries
974
+ inp = self.io.new_input(prev.manifest_list)
975
+ with inp.open() as f:
976
+ prev_data = f.read()
977
+ import pyarrow as pa
978
+ import pyarrow.parquet as pq
979
+
980
+ prev_table = pq.read_table(pa.BufferReader(prev_data))
981
+ prev_rows = prev_table.to_pylist()
982
+ except Exception:
983
+ prev_rows = []
984
+
985
+ total_files = 0
986
+ total_size = 0
987
+ total_data_size = 0
988
+ total_records = 0
989
+
990
+ for ent in prev_rows:
991
+ if not isinstance(ent, dict):
992
+ continue
993
+ fp = ent.get("file_path")
994
+ if not fp:
995
+ continue
996
+ try:
997
+ inp = self.io.new_input(fp)
998
+ with inp.open() as f:
999
+ data = f.read()
1000
+ # Full statistics including histograms and k-hashes
1001
+ table = pq.read_table(pa.BufferReader(data))
1002
+ manifest_entry = build_parquet_manifest_entry(table, fp, len(data))
1003
+ dent = manifest_entry.to_dict()
1004
+ except Exception:
1005
+ # Fall back to original entry if re-read fails
1006
+ dent = ent
1007
+
1008
+ entries.append(dent)
1009
+ total_files += 1
1010
+ total_size += int(dent.get("file_size_in_bytes") or 0)
1011
+ total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
1012
+ total_records += int(dent.get("record_count") or 0)
1013
+
1014
+ # write new manifest
1015
+ manifest_path = self.catalog.write_parquet_manifest(
1016
+ snapshot_id, entries, self.metadata.location
1017
+ )
1018
+
1019
+ # Build summary
1020
+ summary = {
1021
+ "added-data-files": 0,
1022
+ "added-files-size": 0,
1023
+ "added-data-size": 0,
1024
+ "added-records": 0,
1025
+ "deleted-data-files": 0,
1026
+ "deleted-files-size": 0,
1027
+ "deleted-data-size": 0,
1028
+ "deleted-records": 0,
1029
+ "total-data-files": total_files,
1030
+ "total-files-size": total_size,
1031
+ "total-data-size": total_data_size,
1032
+ "total-records": total_records,
1033
+ }
1034
+
1035
+ # sequence number
1036
+ try:
1037
+ next_seq = self._next_sequence_number()
1038
+ except Exception:
1039
+ next_seq = 1
1040
+
1041
+ parent_id = self.metadata.current_snapshot_id
1042
+
1043
+ # Agent committer metadata
1044
+ agent_meta = {
1045
+ "timestamp": int(time.time() * 1000),
1046
+ "action": "statistics-refresh",
1047
+ "agent": agent,
1048
+ }
1049
+
1050
+ snap = Snapshot(
1051
+ snapshot_id=snapshot_id,
1052
+ timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
1053
+ author=use_author,
1054
+ sequence_number=next_seq,
1055
+ user_created=False,
1056
+ operation_type="statistics-refresh",
1057
+ parent_snapshot_id=parent_id,
1058
+ manifest_list=manifest_path,
1059
+ schema_id=self.metadata.current_schema_id,
1060
+ commit_message=getattr(prev, "commit_message", "statistics refresh"),
1061
+ summary=summary,
1062
+ )
1063
+
1064
+ # attach agent metadata under summary
1065
+ if snap.summary is None:
1066
+ snap.summary = {}
1067
+ snap.summary["agent-committer"] = agent_meta
1068
+
1069
+ # update in-memory metadata
1070
+ self.metadata.snapshots.append(snap)
1071
+ self.metadata.current_snapshot_id = snapshot_id
1072
+
1073
+ # persist
1074
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
1075
+ self.catalog.save_snapshot(self.identifier, snap)
1076
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
1077
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
1078
+
1079
+ return snapshot_id
1080
+
1081
+ def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
1082
+ """Delete all data files and manifests for this dataset.
1083
+
1084
+ This attempts to delete every data file referenced by existing
1085
+ Parquet manifests and then delete the manifest files themselves.
1086
+ Finally it clears the in-memory snapshot list and persists the
1087
+ empty snapshot set via the attached `catalog` (if available).
1088
+ """
1089
+ import pyarrow as pa
1090
+ import pyarrow.parquet as pq
1091
+
1092
+ io = self.io
1093
+ # Collect files referenced by existing manifests but do NOT delete
1094
+ # them from storage. Instead we will write a new empty manifest and
1095
+ # create a truncate snapshot that records these files as deleted.
1096
+ snaps = list(self.metadata.snapshots)
1097
+ removed_files = []
1098
+ removed_total_size = 0
1099
+ removed_data_size = 0
1100
+
1101
+ for snap in snaps:
1102
+ manifest_path = getattr(snap, "manifest_list", None)
1103
+ if not manifest_path:
1104
+ continue
1105
+
1106
+ # Read manifest via FileIO if available
1107
+ rows = []
1108
+ try:
1109
+ inp = io.new_input(manifest_path)
1110
+ with inp.open() as f:
1111
+ data = f.read()
1112
+ table = pq.read_table(pa.BufferReader(data))
1113
+ rows = table.to_pylist()
1114
+ except Exception:
1115
+ rows = []
1116
+
1117
+ for r in rows:
1118
+ fp = None
1119
+ fsize = 0
1120
+ data_size = 0
1121
+ if isinstance(r, dict):
1122
+ fp = r.get("file_path")
1123
+ fsize = int(r.get("file_size_in_bytes") or 0)
1124
+ data_size = int(r.get("uncompressed_size_in_bytes") or 0)
1125
+ if not fp and "data_file" in r and isinstance(r["data_file"], dict):
1126
+ fp = r["data_file"].get("file_path") or r["data_file"].get("path")
1127
+ fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
1128
+ data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)
1129
+
1130
+ if fp:
1131
+ removed_files.append(fp)
1132
+ removed_total_size += fsize
1133
+ removed_data_size += data_size
1134
+
1135
+ # Create a new empty Parquet manifest (entries=[]) to represent the
1136
+ # truncated dataset for the new snapshot. Do not delete objects.
1137
+ snapshot_id = int(time.time() * 1000)
1138
+
1139
+ # Do NOT write an empty Parquet manifest when there are no entries.
1140
+ # Per policy, create the snapshot without a manifest so older
1141
+ # snapshots remain readable and we avoid creating empty manifest files.
1142
+ manifest_path = None
1143
+
1144
+ # Build summary reflecting deleted files (tracked, not removed)
1145
+ deleted_count = len(removed_files)
1146
+ deleted_size = removed_total_size
1147
+
1148
+ summary = {
1149
+ "added-data-files": 0,
1150
+ "added-files-size": 0,
1151
+ "added-data-size": 0,
1152
+ "added-records": 0,
1153
+ "deleted-data-files": deleted_count,
1154
+ "deleted-files-size": deleted_size,
1155
+ "deleted-data-size": removed_data_size,
1156
+ "deleted-records": 0,
1157
+ "total-data-files": 0,
1158
+ "total-files-size": 0,
1159
+ "total-data-size": 0,
1160
+ "total-records": 0,
1161
+ }
1162
+
1163
+ # Sequence number
1164
+ try:
1165
+ next_seq = self._next_sequence_number()
1166
+ except Exception:
1167
+ next_seq = 1
1168
+
1169
+ if author is None:
1170
+ raise ValueError(
1171
+ "truncate() must be called with an explicit author; use truncate(author=...) in caller"
1172
+ )
1173
+ # update metadata author/timestamp for this truncate
1174
+ self.metadata.author = author
1175
+ self.metadata.timestamp_ms = snapshot_id
1176
+ # default commit message
1177
+ if commit_message is None:
1178
+ commit_message = f"commit by {author}"
1179
+
1180
+ parent_id = self.metadata.current_snapshot_id
1181
+
1182
+ snap = Snapshot(
1183
+ snapshot_id=snapshot_id,
1184
+ timestamp_ms=snapshot_id,
1185
+ author=author,
1186
+ sequence_number=next_seq,
1187
+ user_created=True,
1188
+ operation_type="truncate",
1189
+ parent_snapshot_id=parent_id,
1190
+ manifest_list=manifest_path,
1191
+ schema_id=self.metadata.current_schema_id,
1192
+ commit_message=commit_message,
1193
+ summary=summary,
1194
+ )
1195
+
1196
+ # Append new snapshot and update current snapshot id
1197
+ self.metadata.snapshots.append(snap)
1198
+ self.metadata.current_snapshot_id = snapshot_id
1199
+
1200
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
1201
+ self.catalog.save_snapshot(self.identifier, snap)
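
Example usage (illustrative sketch, not from the package): this diff only adds the dataset module, so the concrete `FileIO`, catalog and `DatasetMetadata` objects shown below are assumptions about the rest of opteryx-catalog rather than APIs confirmed here. The sketch shows how `SimpleDataset.append()` and `scan()` fit together: append writes a zstd-compressed Parquet file under `<location>/data/`, writes a cumulative manifest via the catalog, and commits an "append" snapshot; scan then yields `Datafile` wrappers for each manifest entry.

import pyarrow as pa

# `metadata`, `file_io` and `catalog` are hypothetical stand-ins:
#   metadata - a DatasetMetadata with location, schemas and snapshots
#   file_io  - an object exposing new_input()/new_output(), like the `io` used above
#   catalog  - an object exposing write_parquet_manifest(), save_snapshot(), save_dataset_metadata()
dataset = SimpleDataset(
    identifier="sales.orders",   # "<collection>.<dataset>", as split by snapshot() and schema()
    _metadata=metadata,
    io=file_io,
    catalog=catalog,
)

table = pa.table({"id": [1, 2, 3], "amount": [9.99, 4.50, 12.00]})

# Writes the data file, a cumulative manifest and a new "append" snapshot.
dataset.append(table, author="etl-bot", commit_message="daily load")

# Iterate the files recorded in the current snapshot's manifest.
for datafile in dataset.scan():
    print(datafile.file_path, datafile.record_count, datafile.file_size_in_bytes)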