opteryx-catalog 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,1199 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ import uuid
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+ from typing import Iterable
9
+ from typing import Optional
10
+
11
+ from .manifest import ParquetManifestEntry
12
+ from .manifest import build_parquet_manifest_entry
13
+ from .manifest import build_parquet_manifest_minmax_entry
14
+ from .metadata import DatasetMetadata
15
+ from .metadata import Snapshot
16
+ from .metastore import Dataset
17
+
18
+ # Stable node identifier for this process (hex-mac-hex-pid)
19
+ _NODE = f"{uuid.getnode():x}-{os.getpid():x}"
20
+
21
+
22
+ @dataclass
23
+ class Datafile:
24
+ """Wrapper for a manifest entry representing a data file."""
25
+
26
+ entry: dict
27
+
28
+ @property
29
+ def file_path(self) -> Optional[str]:
30
+ return self.entry.get("file_path")
31
+
32
+ @property
33
+ def record_count(self) -> int:
34
+ return int(self.entry.get("record_count") or 0)
35
+
36
+ @property
37
+ def file_size_in_bytes(self) -> int:
38
+ return int(self.entry.get("file_size_in_bytes") or 0)
39
+
40
+ def to_dict(self) -> dict:
41
+ return dict(self.entry)
42
+
43
+ @property
44
+ def min_k_hashes(self) -> list:
45
+ return self.entry.get("min_k_hashes") or []
46
+
47
+ @property
48
+ def histogram_counts(self) -> list:
49
+ return self.entry.get("histogram_counts") or []
50
+
51
+ @property
52
+ def histogram_bins(self) -> int:
53
+ return int(self.entry.get("histogram_bins") or 0)
54
+
55
+ @property
56
+ def min_values(self) -> list:
57
+ return self.entry.get("min_values") or []
58
+
59
+ @property
60
+ def max_values(self) -> list:
61
+ return self.entry.get("max_values") or []
62
+
63
+
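
For illustration, a `Datafile` is a thin read-only view over a manifest-entry dict; the entry below is a hypothetical example of the fields the properties above read, not taken from a real manifest.

example_entry = {
    "file_path": "gs://example-bucket/dataset/data/0001.parquet",  # hypothetical path
    "record_count": 1000,
    "file_size_in_bytes": 52341,
    "min_values": [0, "a"],
    "max_values": [999, "z"],
}
datafile = Datafile(entry=example_entry)
assert datafile.file_path == example_entry["file_path"]
assert datafile.record_count == 1000
assert datafile.histogram_bins == 0  # absent keys fall back to zero/empty defaults
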
64
+ @dataclass
65
+ class SimpleDataset(Dataset):
66
+ identifier: str
67
+ _metadata: DatasetMetadata
68
+ io: Any = None
69
+ catalog: Any = None
70
+
71
+ @property
72
+ def metadata(self) -> DatasetMetadata:
73
+ return self._metadata
74
+
75
+ def _next_sequence_number(self) -> int:
76
+ """Calculate the next sequence number.
77
+
78
+ Uses the current snapshot's sequence number + 1. Works efficiently
79
+ with load_history=False since we only need the most recent snapshot,
80
+ not the full history.
81
+
82
+ Returns:
83
+ The next sequence number (current snapshot's sequence + 1, or 1 if no snapshots).
84
+ """
85
+ if not self.metadata.snapshots:
86
+ # No snapshots yet - this is the first one
87
+ return 1
88
+
89
+ # Get the current (most recent) snapshot - should have the highest sequence number
90
+ current = self.snapshot()
91
+         seq = getattr(current, "sequence_number", None) if current else None
92
+         # Fall back to 1 when no current snapshot or sequence number is available
93
+         return int(seq) + 1 if seq is not None else 1
94
+
95
+ def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
96
+ """Return a Snapshot.
97
+
98
+ - If `snapshot_id` is None, return the in-memory current snapshot.
99
+ - If a `snapshot_id` is provided, prefer a Firestore lookup via the
100
+ attached `catalog` (O(1) document get). Fall back to the in-memory
101
+ `metadata.snapshots` list only when no catalog is attached or the
102
+ remote lookup fails.
103
+ """
104
+ # Current snapshot: keep in memory for fast access
105
+ if snapshot_id is None:
106
+ return self.metadata.current_snapshot()
107
+
108
+ # Try Firestore document lookup when catalog attached
109
+ if self.catalog:
110
+ try:
111
+ collection, dataset_name = self.identifier.split(".")
112
+ doc = (
113
+ self.catalog._dataset_doc_ref(collection, dataset_name)
114
+ .collection("snapshots")
115
+ .document(str(snapshot_id))
116
+ .get()
117
+ )
118
+ if doc.exists:
119
+ sd = doc.to_dict() or {}
120
+ snap = Snapshot(
121
+ snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
122
+ timestamp_ms=int(sd.get("timestamp-ms", 0)),
123
+ author=sd.get("author"),
124
+ sequence_number=sd.get("sequence-number", 0),
125
+ user_created=sd.get("user-created"),
126
+ manifest_list=sd.get("manifest"),
127
+ schema_id=sd.get("schema-id"),
128
+ summary=sd.get("summary", {}),
129
+ operation_type=sd.get("operation-type"),
130
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
131
+ commit_message=sd.get("commit-message"),
132
+ )
133
+ return snap
134
+ except Exception:
135
+ # Be conservative: fall through to in-memory fallback
136
+ pass
137
+
138
+ # Fallback: search in-memory snapshots (only used when no catalog)
139
+ for s in self.metadata.snapshots:
140
+ if s.snapshot_id == snapshot_id:
141
+ return s
142
+
143
+ return None
144
+
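
A minimal sketch of the two lookup paths described above; `ds` stands for an already-constructed SimpleDataset and the snapshot id is hypothetical.

current = ds.snapshot()              # in-memory current snapshot, no backend call
older = ds.snapshot(1700000000000)   # explicit id: Firestore document get when a catalog is attached
if older is not None:
    print(older.sequence_number, older.operation_type)
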
145
+ def _get_node(self) -> str:
146
+ """Return the stable node identifier for this process.
147
+
148
+ Uses a module-level constant to avoid per-instance hashing/caching.
149
+ """
150
+ return _NODE
151
+
152
+ def snapshots(self) -> Iterable[Snapshot]:
153
+ return list(self.metadata.snapshots)
154
+
155
+     def schema(self, schema_id: Optional[str] = None) -> Optional[Any]:
156
+ """Return a stored schema description.
157
+
158
+ If `schema_id` is None, return the current schema (by
159
+ `metadata.current_schema_id` or last-known schema). If a
160
+ specific `schema_id` is provided, attempt to find it in the
161
+ in-memory `metadata.schemas` list and, failing that, fetch it
162
+ from the catalog's `schemas` subcollection when a catalog is
163
+ attached.
164
+
165
+         Returns an Orso `RelationSchema` built from the stored column
166
+         definitions (or the raw stored schema when no schema id is
+         available), or None if the schema cannot be found.
167
+ """
168
+ # Determine which schema id to use
169
+ sid = schema_id or self.metadata.current_schema_id
170
+
171
+ # If no sid and a raw schema is stored on the metadata, return it
172
+ if sid is None:
173
+ return getattr(self.metadata, "schema", None)
174
+
175
+ # Fast path: if this is the current schema id, prefer the cached
176
+ # current schema (99% case) rather than scanning the entire list.
177
+ sdict = None
178
+ if sid == self.metadata.current_schema_id:
179
+ if getattr(self.metadata, "schemas", None):
180
+ last = self.metadata.schemas[-1]
181
+ if last.get("schema_id") == sid:
182
+ sdict = last
183
+ else:
184
+ # If a raw schema is stored directly on metadata, use it.
185
+ raw = getattr(self.metadata, "schema", None)
186
+ if raw is not None:
187
+ sdict = {"schema_id": sid, "columns": raw}
188
+
189
+ # If not the current schema, or cached current not present,
190
+ # prefer to load the schema document from the backend (O(1) doc get).
191
+ if sdict is None and self.catalog:
192
+ try:
193
+ collection, dataset_name = self.identifier.split(".")
194
+ doc = (
195
+ self.catalog._dataset_doc_ref(collection, dataset_name)
196
+ .collection("schemas")
197
+ .document(sid)
198
+ .get()
199
+ )
200
+ sdict = doc.to_dict() or None
201
+ except Exception:
202
+ sdict = None
203
+
204
+ # As a last-resort when no catalog is attached, fall back to an
205
+ # in-memory search for compatibility (offline/unit-test mode).
206
+ if sdict is None and not self.catalog:
207
+ for s in self.metadata.schemas or []:
208
+ if s.get("schema_id") == sid:
209
+ sdict = s
210
+ break
211
+
212
+ if sdict is None:
213
+ return None
214
+
215
+ # Try to construct an Orso RelationSchema
216
+ from orso.schema import FlatColumn
217
+ from orso.schema import RelationSchema
218
+
219
+         # Column definitions stored with the schema document
220
+         raw = sdict.get("columns")
+         if raw is None:
+             return None
221
+
222
+ columns = [
223
+ FlatColumn(
224
+ name=c.get("name"),
225
+ type=c.get("type"),
226
+ element_type=c.get("element-type"),
227
+ precision=c.get("precision"),
228
+ scale=c.get("scale"),
229
+ )
230
+ for c in raw
231
+ ]
232
+ orso_schema = RelationSchema(name=self.identifier, columns=columns)
233
+ return orso_schema
234
+
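
Based on the keys read above (`schema_id`, `columns`, and per-column `name`, `type`, `element-type`, `precision`, `scale`), a stored schema document plausibly looks like the hypothetical sketch below; `schema()` turns each column dict into an Orso FlatColumn and wraps them in a RelationSchema named after the dataset.

stored_schema = {
    "schema_id": "s-0001",  # hypothetical schema id
    "columns": [
        {"name": "id", "type": "BIGINT", "element-type": None, "precision": None, "scale": None},
        {"name": "name", "type": "VARCHAR", "element-type": None, "precision": None, "scale": None},
    ],
}
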
235
+     def append(self, table: Any, author: Optional[str] = None, commit_message: Optional[str] = None):
236
+ """Append a pyarrow.Table:
237
+
238
+ - write a Parquet data file via `self.io`
239
+ - create a simple Parquet manifest (one entry)
240
+ - persist manifest and snapshot metadata using the attached `catalog`
241
+ """
242
+ import pyarrow as pa
243
+ import pyarrow.parquet as pq
244
+
245
+ snapshot_id = int(time.time() * 1000)
246
+
247
+ if not hasattr(table, "schema"):
248
+ raise TypeError("append() expects a pyarrow.Table-like object")
249
+
250
+ # Write table and build manifest entry
251
+ manifest_entry = self._write_table_and_build_entry(table)
252
+ entries = [manifest_entry.to_dict()]
253
+
254
+ # persist manifest: for append, merge previous manifest entries
255
+ # with the new entries so the snapshot's manifest is cumulative.
256
+ manifest_path = None
257
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
258
+ merged_entries = list(entries)
259
+
260
+ # If there is a previous snapshot with a manifest, try to read
261
+ # it and prepend its entries. Any read error is non-fatal and we
262
+ # fall back to writing only the new entries.
263
+ prev_snap = self.snapshot(None)
264
+ if prev_snap and getattr(prev_snap, "manifest_list", None):
265
+ prev_manifest_path = prev_snap.manifest_list
266
+ try:
267
+ # Prefer FileIO when available
268
+ inp = self.io.new_input(prev_manifest_path)
269
+ with inp.open() as f:
270
+ prev_data = f.read()
271
+ import pyarrow as pa
272
+ import pyarrow.parquet as pq
273
+
274
+ prev_table = pq.read_table(pa.BufferReader(prev_data))
275
+ prev_rows = prev_table.to_pylist()
276
+ merged_entries = prev_rows + merged_entries
277
+ except Exception:
278
+ # If we can't read the previous manifest, continue with
279
+ # just the new entries (don't fail the append).
280
+ pass
281
+
282
+ manifest_path = self.catalog.write_parquet_manifest(
283
+ snapshot_id, merged_entries, self.metadata.location
284
+ )
285
+
286
+ # snapshot metadata
287
+ if author is None:
288
+ raise ValueError("author must be provided when appending to a dataset")
289
+ # update metadata author/timestamp for this append
290
+ self.metadata.author = author
291
+ self.metadata.timestamp_ms = snapshot_id
292
+ # default commit message
293
+ if commit_message is None:
294
+ commit_message = f"commit by {author}"
295
+
296
+ recs = int(table.num_rows)
297
+ fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
298
+ # Calculate uncompressed size from the manifest entry
299
+ added_data_size = manifest_entry.uncompressed_size_in_bytes
300
+ added_data_files = 1
301
+ added_files_size = fsize
302
+ added_records = recs
303
+ deleted_data_files = 0
304
+ deleted_files_size = 0
305
+ deleted_data_size = 0
306
+ deleted_records = 0
307
+
308
+ prev = self.snapshot()
309
+ if prev and prev.summary:
310
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
311
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
312
+ prev_total_data_size = int(prev.summary.get("total-data-size", 0))
313
+ prev_total_records = int(prev.summary.get("total-records", 0))
314
+ else:
315
+ prev_total_files = 0
316
+ prev_total_size = 0
317
+ prev_total_data_size = 0
318
+ prev_total_records = 0
319
+
320
+ total_data_files = prev_total_files + added_data_files - deleted_data_files
321
+ total_files_size = prev_total_size + added_files_size - deleted_files_size
322
+ total_data_size = prev_total_data_size + added_data_size - deleted_data_size
323
+ total_records = prev_total_records + added_records - deleted_records
324
+
325
+ summary = {
326
+ "added-data-files": added_data_files,
327
+ "added-files-size": added_files_size,
328
+ "added-data-size": added_data_size,
329
+ "added-records": added_records,
330
+ "deleted-data-files": deleted_data_files,
331
+ "deleted-files-size": deleted_files_size,
332
+ "deleted-data-size": deleted_data_size,
333
+ "deleted-records": deleted_records,
334
+ "total-data-files": total_data_files,
335
+ "total-files-size": total_files_size,
336
+ "total-data-size": total_data_size,
337
+ "total-records": total_records,
338
+ }
339
+
340
+ # sequence number
341
+ try:
342
+ next_seq = self._next_sequence_number()
343
+ except Exception:
344
+ next_seq = 1
345
+
346
+ parent_id = self.metadata.current_snapshot_id
347
+
348
+ snap = Snapshot(
349
+ snapshot_id=snapshot_id,
350
+ timestamp_ms=snapshot_id,
351
+ author=author,
352
+ sequence_number=next_seq,
353
+ user_created=True,
354
+ operation_type="append",
355
+ parent_snapshot_id=parent_id,
356
+ manifest_list=manifest_path,
357
+ schema_id=self.metadata.current_schema_id,
358
+ commit_message=commit_message,
359
+ summary=summary,
360
+ )
361
+
362
+ self.metadata.snapshots.append(snap)
363
+ self.metadata.current_snapshot_id = snapshot_id
364
+
365
+ # persist metadata (let errors propagate)
366
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
367
+ self.catalog.save_snapshot(self.identifier, snap)
368
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
369
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
370
+
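
A usage sketch for the append flow above, assuming a dataset already wired to a catalog and FileIO; the table contents and author are illustrative.

import pyarrow as pa

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
# author is mandatory; commit_message defaults to "commit by <author>"
ds.append(table, author="data-loader", commit_message="load daily extract")
latest = ds.snapshot()
print(latest.summary["added-records"], latest.summary["total-records"])
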
371
+ def _write_table_and_build_entry(self, table: Any):
372
+ """Write a PyArrow table to storage and return a ParquetManifestEntry.
373
+
374
+ This centralizes the IO and manifest construction so other operations
375
+ (e.g. `overwrite`) can reuse the same behavior as `append`.
376
+ """
377
+ # Write parquet file with collision-resistant name
378
+ fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
379
+ data_path = f"{self.metadata.location}/data/{fname}"
380
+
381
+ import pyarrow as pa
382
+ import pyarrow.parquet as pq
383
+
384
+ buf = pa.BufferOutputStream()
385
+ pq.write_table(table, buf, compression="zstd")
386
+ pdata = buf.getvalue().to_pybytes()
387
+
388
+ out = self.io.new_output(data_path).create()
389
+ out.write(pdata)
390
+ out.close()
391
+
392
+ # Build manifest entry with statistics
393
+ manifest_entry = build_parquet_manifest_entry(table, data_path, len(pdata))
394
+ return manifest_entry
395
+
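
The FileIO object is only exercised through `new_input(path).open()` and `new_output(path).create()`; the local-filesystem stand-in below is inferred from that usage and is not the package's real FileIO implementation.

import os


class LocalFileIO:
    """Toy FileIO exposing just the calls this module uses."""

    class _Output:
        def __init__(self, path: str):
            self._path = path

        def create(self):
            parent = os.path.dirname(self._path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            return open(self._path, "wb")  # supports .write() and .close() as used above

    class _Input:
        def __init__(self, path: str):
            self._path = path

        def open(self):
            return open(self._path, "rb")  # context manager yielding .read()

    def new_output(self, path: str):
        return LocalFileIO._Output(path)

    def new_input(self, path: str):
        return LocalFileIO._Input(path)
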
396
+     def overwrite(self, table: Any, author: Optional[str] = None, commit_message: Optional[str] = None):
397
+ """Replace the dataset entirely with `table` in a single snapshot.
398
+
399
+ Semantics:
400
+ - Write the provided table as new data file(s)
401
+ - Create a new parquet manifest that contains only the new entries
402
+ - Create a snapshot that records previous files as deleted and the
403
+ new files as added (logical replace)
404
+ """
405
+ # Similar validation as append
406
+ snapshot_id = int(time.time() * 1000)
407
+
408
+ if not hasattr(table, "schema"):
409
+ raise TypeError("overwrite() expects a pyarrow.Table-like object")
410
+
411
+ if author is None:
412
+ raise ValueError("author must be provided when overwriting a dataset")
413
+
414
+ # Write new data and build manifest entries (single table -> single entry)
415
+ manifest_entry = self._write_table_and_build_entry(table)
416
+ new_entries = [manifest_entry.to_dict()]
417
+
418
+ # Write manifest containing only the new entries
419
+ manifest_path = None
420
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
421
+ manifest_path = self.catalog.write_parquet_manifest(
422
+ snapshot_id, new_entries, self.metadata.location
423
+ )
424
+
425
+ # Compute deltas: previous manifest becomes deleted
426
+ prev = self.snapshot(None)
427
+ prev_total_files = 0
428
+ prev_total_size = 0
429
+ prev_total_data_size = 0
430
+ prev_total_records = 0
431
+ if prev and prev.summary:
432
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
433
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
434
+ prev_total_data_size = int(prev.summary.get("total-data-size", 0))
435
+ prev_total_records = int(prev.summary.get("total-records", 0))
436
+
437
+ deleted_data_files = prev_total_files
438
+ deleted_files_size = prev_total_size
439
+ deleted_data_size = prev_total_data_size
440
+ deleted_records = prev_total_records
441
+
442
+ added_data_files = len(new_entries)
443
+ added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
444
+ added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
445
+ added_records = sum(e.get("record_count", 0) for e in new_entries)
446
+
447
+ total_data_files = added_data_files
448
+ total_files_size = added_files_size
449
+ total_data_size = added_data_size
450
+ total_records = added_records
451
+
452
+ summary = {
453
+ "added-data-files": added_data_files,
454
+ "added-files-size": added_files_size,
455
+ "added-data-size": added_data_size,
456
+ "added-records": added_records,
457
+ "deleted-data-files": deleted_data_files,
458
+ "deleted-files-size": deleted_files_size,
459
+ "deleted-data-size": deleted_data_size,
460
+ "deleted-records": deleted_records,
461
+ "total-data-files": total_data_files,
462
+ "total-files-size": total_files_size,
463
+ "total-data-size": total_data_size,
464
+ "total-records": total_records,
465
+ }
466
+
467
+ # sequence number
468
+ try:
469
+ next_seq = self._next_sequence_number()
470
+ except Exception:
471
+ next_seq = 1
472
+
473
+ parent_id = self.metadata.current_snapshot_id
474
+
475
+ if commit_message is None:
476
+ commit_message = f"overwrite by {author}"
477
+
478
+ snap = Snapshot(
479
+ snapshot_id=snapshot_id,
480
+ timestamp_ms=snapshot_id,
481
+ author=author,
482
+ sequence_number=next_seq,
483
+ user_created=True,
484
+ operation_type="overwrite",
485
+ parent_snapshot_id=parent_id,
486
+ manifest_list=manifest_path,
487
+ schema_id=self.metadata.current_schema_id,
488
+ commit_message=commit_message,
489
+ summary=summary,
490
+ )
491
+
492
+         # Record the new snapshot in memory and make it current
493
+ self.metadata.snapshots.append(snap)
494
+ self.metadata.current_snapshot_id = snapshot_id
495
+
496
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
497
+ self.catalog.save_snapshot(self.identifier, snap)
498
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
499
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
500
+
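
The overwrite summary arithmetic above reports the previous totals as deletions and restarts the totals from the newly written entries only; a small worked example with made-up numbers:

prev_summary = {"total-data-files": 4, "total-files-size": 4000, "total-records": 400}
new_entry = {"file_size_in_bytes": 1500, "record_count": 120}

expected = {
    "deleted-data-files": prev_summary["total-data-files"],  # 4
    "deleted-files-size": prev_summary["total-files-size"],  # 4000
    "deleted-records": prev_summary["total-records"],        # 400
    "added-data-files": 1,
    "added-files-size": new_entry["file_size_in_bytes"],     # 1500
    "added-records": new_entry["record_count"],              # 120
    "total-data-files": 1,     # totals come from the new files only
    "total-files-size": 1500,
    "total-records": 120,
}
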
501
+     def add_files(self, files: list[str], author: Optional[str] = None, commit_message: Optional[str] = None):
502
+ """Add filenames to the dataset manifest without writing the files.
503
+
504
+ - `files` is a list of file paths (strings). Files are assumed to
505
+ already exist in storage; this method only updates the manifest.
506
+ - Does not add files that already appear in the current manifest
507
+ (deduplicates by `file_path`).
508
+ - Creates a cumulative manifest for the new snapshot (previous
509
+ entries + new unique entries).
510
+ """
511
+ if author is None:
512
+ raise ValueError("author must be provided when adding files to a dataset")
513
+
514
+ snapshot_id = int(time.time() * 1000)
515
+
516
+ # Gather previous summary and manifest entries
517
+ prev = self.snapshot(None)
518
+ prev_total_files = 0
519
+ prev_total_size = 0
520
+ prev_total_records = 0
521
+ prev_entries = []
522
+ if prev and prev.summary:
523
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
524
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
525
+ prev_total_records = int(prev.summary.get("total-records", 0))
526
+ if prev and getattr(prev, "manifest_list", None):
527
+ # try to read prev manifest entries
528
+ try:
529
+ import pyarrow as pa
530
+ import pyarrow.parquet as pq
531
+
532
+ inp = self.io.new_input(prev.manifest_list)
533
+ with inp.open() as f:
534
+ data = f.read()
535
+ table = pq.read_table(pa.BufferReader(data))
536
+ prev_entries = table.to_pylist()
537
+ except Exception:
538
+ prev_entries = []
539
+
540
+ existing = {
541
+ e.get("file_path") for e in prev_entries if isinstance(e, dict) and e.get("file_path")
542
+ }
543
+
544
+ # Build new entries for files that don't already exist. Only accept
545
+ # Parquet files and attempt to read lightweight metadata (bytes,
546
+ # row count, per-column min/max) from the Parquet footer when
547
+ # available.
548
+ new_entries = []
549
+ seen = set()
550
+ for fp in files:
551
+ if not fp or fp in existing or fp in seen:
552
+ continue
553
+ if not fp.lower().endswith(".parquet"):
554
+ # only accept parquet files
555
+ continue
556
+ seen.add(fp)
557
+
558
+ # Attempt to read file bytes and parquet metadata
559
+ # Use rugo's metadata reader which is much faster (microseconds per file)
560
+ try:
561
+ inp = self.io.new_input(fp)
562
+ with inp.open() as f:
563
+ data = f.read()
564
+
565
+ if data:
566
+ manifest_entry = build_parquet_manifest_minmax_entry(data, fp)
567
+ else:
568
+ # Empty file, create placeholder entry
569
+ manifest_entry = ParquetManifestEntry(
570
+ file_path=fp,
571
+ file_format="parquet",
572
+ record_count=0,
573
+ file_size_in_bytes=0,
574
+ uncompressed_size_in_bytes=0,
575
+ min_k_hashes=[],
576
+ histogram_counts=[],
577
+ histogram_bins=0,
578
+ min_values=[],
579
+ max_values=[],
580
+ )
581
+ except Exception:
582
+ # If metadata read fails, fall back to placeholders
583
+ manifest_entry = ParquetManifestEntry(
584
+ file_path=fp,
585
+ file_format="parquet",
586
+ record_count=0,
587
+ file_size_in_bytes=0,
588
+ uncompressed_size_in_bytes=0,
589
+ min_k_hashes=[],
590
+ histogram_counts=[],
591
+ histogram_bins=0,
592
+ min_values=[],
593
+ max_values=[],
594
+ )
595
+ new_entries.append(manifest_entry.to_dict())
596
+
597
+ merged_entries = prev_entries + new_entries
598
+
599
+ # write cumulative manifest
600
+ manifest_path = None
601
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
602
+ manifest_path = self.catalog.write_parquet_manifest(
603
+ snapshot_id, merged_entries, self.metadata.location
604
+ )
605
+
606
+ # Build summary deltas
607
+ added_data_files = len(new_entries)
608
+ added_files_size = 0
609
+ added_data_size = 0
610
+ added_records = 0
611
+         # Sum sizes and record counts from the new entries
612
+         for entry in new_entries:
613
+             added_files_size += entry.get("file_size_in_bytes", 0)
+             added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+             added_records += entry.get("record_count", 0)
614
+ deleted_data_files = 0
615
+ deleted_files_size = 0
616
+ deleted_data_size = 0
617
+ deleted_records = 0
618
+
619
+ prev_total_data_size = (
620
+ int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
621
+ )
622
+
623
+ total_data_files = prev_total_files + added_data_files - deleted_data_files
624
+ total_files_size = prev_total_size + added_files_size - deleted_files_size
625
+ total_data_size = prev_total_data_size + added_data_size - deleted_data_size
626
+ total_records = prev_total_records + added_records - deleted_records
627
+
628
+ summary = {
629
+ "added-data-files": added_data_files,
630
+ "added-files-size": added_files_size,
631
+ "added-data-size": added_data_size,
632
+ "added-records": added_records,
633
+ "deleted-data-files": deleted_data_files,
634
+ "deleted-files-size": deleted_files_size,
635
+ "deleted-data-size": deleted_data_size,
636
+ "deleted-records": deleted_records,
637
+ "total-data-files": total_data_files,
638
+ "total-files-size": total_files_size,
639
+ "total-data-size": total_data_size,
640
+ "total-records": total_records,
641
+ }
642
+
643
+ # Sequence number
644
+ try:
645
+ next_seq = self._next_sequence_number()
646
+ except Exception:
647
+ next_seq = 1
648
+
649
+ parent_id = self.metadata.current_snapshot_id
650
+
651
+ if commit_message is None:
652
+ commit_message = f"add files by {author}"
653
+
654
+ snap = Snapshot(
655
+ snapshot_id=snapshot_id,
656
+ timestamp_ms=snapshot_id,
657
+ author=author,
658
+ sequence_number=next_seq,
659
+ user_created=True,
660
+ operation_type="add-files",
661
+ parent_snapshot_id=parent_id,
662
+ manifest_list=manifest_path,
663
+ schema_id=self.metadata.current_schema_id,
664
+ commit_message=commit_message,
665
+ summary=summary,
666
+ )
667
+
668
+ self.metadata.snapshots.append(snap)
669
+ self.metadata.current_snapshot_id = snapshot_id
670
+
671
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
672
+ self.catalog.save_snapshot(self.identifier, snap)
673
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
674
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
675
+
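
A sketch of registering pre-existing files with the method above; the paths are hypothetical, non-parquet paths are skipped, and already-tracked paths are deduplicated.

ds.add_files(
    [
        "gs://example-bucket/dataset/data/part-0001.parquet",  # hypothetical
        "gs://example-bucket/dataset/data/part-0002.parquet",
        "gs://example-bucket/dataset/notes.txt",  # ignored: not a parquet file
    ],
    author="data-loader",
)
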
676
+ def truncate_and_add_files(
677
+         self, files: list[str], author: Optional[str] = None, commit_message: Optional[str] = None
678
+ ):
679
+ """Truncate dataset (logical) and set manifest to provided files.
680
+
681
+ - Writes a manifest that contains exactly the unique filenames provided.
682
+ - Does not delete objects from storage.
683
+ - Useful for replace/overwrite semantics.
684
+ """
685
+ if author is None:
686
+ raise ValueError("author must be provided when truncating/adding files")
687
+
688
+ snapshot_id = int(time.time() * 1000)
689
+
690
+ # Read previous summary for reporting deleted counts
691
+ prev = self.snapshot(None)
692
+ prev_total_files = 0
693
+ prev_total_size = 0
694
+ prev_total_records = 0
695
+ if prev and prev.summary:
696
+ try:
697
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
698
+ except Exception:
699
+ prev_total_files = 0
700
+ try:
701
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
702
+ except Exception:
703
+ prev_total_size = 0
704
+ try:
705
+ prev_total_records = int(prev.summary.get("total-records", 0))
706
+ except Exception:
707
+ prev_total_records = 0
708
+
709
+ # Build unique new entries (ignore duplicates in input). Only accept
710
+ # parquet files and try to read lightweight metadata from each file.
711
+ new_entries = []
712
+ seen = set()
713
+ for fp in files:
714
+ if not fp or fp in seen:
715
+ continue
716
+ if not fp.lower().endswith(".parquet"):
717
+ continue
718
+ seen.add(fp)
719
+
720
+ file_size = 0
721
+ record_count = 0
722
+ min_values = []
723
+ max_values = []
724
+ try:
725
+ import pyarrow as pa
726
+ import pyarrow.parquet as pq
727
+
728
+ data = None
729
+ if self.io and hasattr(self.io, "new_input"):
730
+ inp = self.io.new_input(fp)
731
+ with inp.open() as f:
732
+ data = f.read()
733
+ else:
734
+ if (
735
+ self.catalog
736
+ and getattr(self.catalog, "_storage_client", None)
737
+ and getattr(self.catalog, "gcs_bucket", None)
738
+ ):
739
+ bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
740
+ parsed = fp
741
+ if parsed.startswith("gs://"):
742
+ parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
743
+ blob = bucket.blob(parsed)
744
+ data = blob.download_as_bytes()
745
+
746
+ if data:
747
+ file_size = len(data)
748
+ pf = pq.ParquetFile(pa.BufferReader(data))
749
+ record_count = int(pf.metadata.num_rows or 0)
750
+
751
+ ncols = pf.metadata.num_columns
752
+ mins = [None] * ncols
753
+ maxs = [None] * ncols
754
+ for rg in range(pf.num_row_groups):
755
+ for ci in range(ncols):
756
+ col_meta = pf.metadata.row_group(rg).column(ci)
757
+ stats = getattr(col_meta, "statistics", None)
758
+ if not stats:
759
+ continue
760
+ smin = getattr(stats, "min", None)
761
+ smax = getattr(stats, "max", None)
762
+ if smin is None and smax is None:
763
+ continue
764
+
765
+ def _to_py(v):
766
+ try:
767
+ return int(v)
768
+ except Exception:
769
+ try:
770
+ return float(v)
771
+ except Exception:
772
+ try:
773
+ if isinstance(v, (bytes, bytearray)):
774
+ return v.decode("utf-8", errors="ignore")
775
+ except Exception:
776
+ pass
777
+ return v
778
+
779
+ if smin is not None:
780
+ sval = _to_py(smin)
781
+ if mins[ci] is None:
782
+ mins[ci] = sval
783
+ else:
784
+ try:
785
+ if sval < mins[ci]:
786
+ mins[ci] = sval
787
+ except Exception:
788
+ pass
789
+ if smax is not None:
790
+ sval = _to_py(smax)
791
+ if maxs[ci] is None:
792
+ maxs[ci] = sval
793
+ else:
794
+ try:
795
+ if sval > maxs[ci]:
796
+ maxs[ci] = sval
797
+ except Exception:
798
+ pass
799
+
800
+ min_values = [m for m in mins if m is not None]
801
+ max_values = [m for m in maxs if m is not None]
802
+ except Exception:
803
+ file_size = 0
804
+ record_count = 0
805
+ min_values = []
806
+ max_values = []
807
+
808
+ manifest_entry = ParquetManifestEntry(
809
+ file_path=fp,
810
+ file_format="parquet",
811
+ record_count=int(record_count),
812
+ file_size_in_bytes=int(file_size),
813
+ uncompressed_size_in_bytes=int(file_size), # Use compressed size as estimate
814
+ min_k_hashes=[],
815
+ histogram_counts=[],
816
+ histogram_bins=0,
817
+ min_values=min_values,
818
+ max_values=max_values,
819
+ )
820
+ new_entries.append(manifest_entry.to_dict())
821
+
822
+ manifest_path = None
823
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
824
+ manifest_path = self.catalog.write_parquet_manifest(
825
+ snapshot_id, new_entries, self.metadata.location
826
+ )
827
+
828
+ # Build summary: previous entries become deleted
829
+ deleted_data_files = prev_total_files
830
+ deleted_files_size = prev_total_size
831
+ deleted_data_size = (
832
+ int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
833
+ )
834
+ deleted_records = prev_total_records
835
+
836
+ added_data_files = len(new_entries)
837
+ added_files_size = 0
838
+ added_data_size = 0
839
+         # Sum sizes and record counts from the new entries
840
+         added_records = 0
841
+         for entry in new_entries:
842
+             added_files_size += entry.get("file_size_in_bytes", 0)
+             added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+             added_records += entry.get("record_count", 0)
843
+
844
+ total_data_files = added_data_files
845
+ total_files_size = added_files_size
846
+ total_data_size = added_data_size
847
+ total_records = added_records
848
+
849
+ summary = {
850
+ "added-data-files": added_data_files,
851
+ "added-files-size": added_files_size,
852
+ "added-data-size": added_data_size,
853
+ "added-records": added_records,
854
+ "deleted-data-files": deleted_data_files,
855
+ "deleted-files-size": deleted_files_size,
856
+ "deleted-data-size": deleted_data_size,
857
+ "deleted-records": deleted_records,
858
+ "total-data-files": total_data_files,
859
+ "total-files-size": total_files_size,
860
+ "total-data-size": total_data_size,
861
+ "total-records": total_records,
862
+ }
863
+
864
+ # Sequence number
865
+ try:
866
+ next_seq = self._next_sequence_number()
867
+ except Exception:
868
+ next_seq = 1
869
+
870
+ parent_id = self.metadata.current_snapshot_id
871
+
872
+ if commit_message is None:
873
+ commit_message = f"truncate and add files by {author}"
874
+
875
+ snap = Snapshot(
876
+ snapshot_id=snapshot_id,
877
+ timestamp_ms=snapshot_id,
878
+ author=author,
879
+ sequence_number=next_seq,
880
+ user_created=True,
881
+ operation_type="truncate-and-add-files",
882
+ parent_snapshot_id=parent_id,
883
+ manifest_list=manifest_path,
884
+ schema_id=self.metadata.current_schema_id,
885
+ commit_message=commit_message,
886
+ summary=summary,
887
+ )
888
+
889
+ # Replace in-memory snapshots: append snapshot and update current id
890
+ self.metadata.snapshots.append(snap)
891
+ self.metadata.current_snapshot_id = snapshot_id
892
+
893
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
894
+ self.catalog.save_snapshot(self.identifier, snap)
895
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
896
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
897
+
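
A sketch of the replace-style registration above; the previous files are only recorded as deleted in the snapshot summary, nothing is removed from storage.

ds.truncate_and_add_files(
    ["gs://example-bucket/dataset/data/rebuilt-0001.parquet"],  # hypothetical path
    author="data-loader",
    commit_message="rebuild from compacted files",
)
print(ds.snapshot().summary["deleted-data-files"])  # count of previously tracked files
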
898
+ def scan(
899
+ self, row_filter=None, row_limit=None, snapshot_id: Optional[int] = None
900
+ ) -> Iterable[Datafile]:
901
+ """Return Datafile objects for the given snapshot.
902
+
903
+ - If `snapshot_id` is None, use the current snapshot.
904
+         - Ignore `row_filter` for now and return all files listed in the
905
+         snapshot's parquet manifest (if present).
+         - If `row_limit` is set, stop yielding once the cumulative record
+         count of yielded files reaches the limit.
906
+ """
907
+ # Determine snapshot to read using the dataset-level helper which
908
+ # prefers the in-memory current snapshot and otherwise performs a
909
+ # backend lookup for the requested id.
910
+ snap = self.snapshot(snapshot_id)
911
+
912
+ if snap is None or not getattr(snap, "manifest_list", None):
913
+ return iter(())
914
+
915
+ manifest_path = snap.manifest_list
916
+
917
+ # Read manifest via FileIO if available
918
+ try:
919
+ import pyarrow as pa
920
+ import pyarrow.parquet as pq
921
+
922
+ inp = self.io.new_input(manifest_path)
923
+ with inp.open() as f:
924
+ data = f.read()
925
+
926
+ if not data:
927
+ return iter(())
928
+
929
+ table = pq.read_table(pa.BufferReader(data))
930
+ rows = table.to_pylist()
931
+ cum_rows = 0
932
+ for r in rows:
933
+ yield Datafile(entry=r)
934
+ try:
935
+ rc = int(r.get("record_count") or 0)
936
+ except Exception:
937
+ rc = 0
938
+ cum_rows += rc
939
+ if row_limit is not None and cum_rows >= row_limit:
940
+ break
941
+ except FileNotFoundError:
942
+ return iter(())
943
+ except Exception:
944
+ return iter(())
945
+
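
A sketch of consuming `scan()`: it yields `Datafile` wrappers from the snapshot's manifest, and `row_limit` stops iteration once the cumulative record count reaches the limit (the last yielded file may overshoot it).

total_rows = 0
for datafile in ds.scan(row_limit=10000):
    print(datafile.file_path, datafile.record_count)
    total_rows += datafile.record_count
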
946
+ def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
947
+ """Refresh manifest statistics and create a new snapshot.
948
+
949
+ - `agent`: identifier for the agent performing the refresh (string)
950
+ - `author`: optional author to record; if omitted uses current snapshot author
951
+
952
+ This recalculates per-file statistics (min/max, record counts, sizes)
953
+ for every file in the current manifest, writes a new manifest and
954
+ creates a new snapshot with `user_created=False` and
955
+ `operation_type='statistics-refresh'`.
956
+
957
+         Returns the new `snapshot_id`; raises ValueError when there is no current manifest to refresh.
958
+ """
959
+ prev = self.snapshot(None)
960
+ if prev is None or not getattr(prev, "manifest_list", None):
961
+ raise ValueError("No current manifest available to refresh")
962
+
963
+ # Use same author/commit-timestamp as previous snapshot unless overridden
964
+ use_author = author if author is not None else getattr(prev, "author", None)
965
+
966
+ snapshot_id = int(time.time() * 1000)
967
+
968
+ # Rebuild manifest entries by re-reading each data file
969
+ entries = []
970
+ try:
971
+ # Read previous manifest entries
972
+ inp = self.io.new_input(prev.manifest_list)
973
+ with inp.open() as f:
974
+ prev_data = f.read()
975
+ import pyarrow as pa
976
+ import pyarrow.parquet as pq
977
+
978
+ prev_table = pq.read_table(pa.BufferReader(prev_data))
979
+ prev_rows = prev_table.to_pylist()
980
+ except Exception:
981
+ prev_rows = []
982
+
983
+ total_files = 0
984
+ total_size = 0
985
+ total_data_size = 0
986
+ total_records = 0
987
+
988
+ for ent in prev_rows:
989
+ if not isinstance(ent, dict):
990
+ continue
991
+ fp = ent.get("file_path")
992
+ if not fp:
993
+ continue
994
+ try:
995
+ inp = self.io.new_input(fp)
996
+ with inp.open() as f:
997
+ data = f.read()
998
+ # Full statistics including histograms and k-hashes
999
+ table = pq.read_table(pa.BufferReader(data))
1000
+ manifest_entry = build_parquet_manifest_entry(table, fp, len(data))
1001
+ dent = manifest_entry.to_dict()
1002
+ except Exception:
1003
+ # Fall back to original entry if re-read fails
1004
+ dent = ent
1005
+
1006
+ entries.append(dent)
1007
+ total_files += 1
1008
+ total_size += int(dent.get("file_size_in_bytes") or 0)
1009
+ total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
1010
+ total_records += int(dent.get("record_count") or 0)
1011
+
1012
+ # write new manifest
1013
+ manifest_path = self.catalog.write_parquet_manifest(
1014
+ snapshot_id, entries, self.metadata.location
1015
+ )
1016
+
1017
+ # Build summary
1018
+ summary = {
1019
+ "added-data-files": 0,
1020
+ "added-files-size": 0,
1021
+ "added-data-size": 0,
1022
+ "added-records": 0,
1023
+ "deleted-data-files": 0,
1024
+ "deleted-files-size": 0,
1025
+ "deleted-data-size": 0,
1026
+ "deleted-records": 0,
1027
+ "total-data-files": total_files,
1028
+ "total-files-size": total_size,
1029
+ "total-data-size": total_data_size,
1030
+ "total-records": total_records,
1031
+ }
1032
+
1033
+ # sequence number
1034
+ try:
1035
+ next_seq = self._next_sequence_number()
1036
+ except Exception:
1037
+ next_seq = 1
1038
+
1039
+ parent_id = self.metadata.current_snapshot_id
1040
+
1041
+ # Agent committer metadata
1042
+ agent_meta = {
1043
+ "timestamp": int(time.time() * 1000),
1044
+ "action": "statistics-refresh",
1045
+ "agent": agent,
1046
+ }
1047
+
1048
+ snap = Snapshot(
1049
+ snapshot_id=snapshot_id,
1050
+ timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
1051
+ author=use_author,
1052
+ sequence_number=next_seq,
1053
+ user_created=False,
1054
+ operation_type="statistics-refresh",
1055
+ parent_snapshot_id=parent_id,
1056
+ manifest_list=manifest_path,
1057
+ schema_id=self.metadata.current_schema_id,
1058
+ commit_message=getattr(prev, "commit_message", "statistics refresh"),
1059
+ summary=summary,
1060
+ )
1061
+
1062
+ # attach agent metadata under summary
1063
+ if snap.summary is None:
1064
+ snap.summary = {}
1065
+ snap.summary["agent-committer"] = agent_meta
1066
+
1067
+ # update in-memory metadata
1068
+ self.metadata.snapshots.append(snap)
1069
+ self.metadata.current_snapshot_id = snapshot_id
1070
+
1071
+ # persist
1072
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
1073
+ self.catalog.save_snapshot(self.identifier, snap)
1074
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
1075
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
1076
+
1077
+ return snapshot_id
1078
+
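
A sketch of a maintenance-style call to the refresh above; the agent string is hypothetical and the returned id identifies the new statistics-refresh snapshot.

new_id = ds.refresh_manifest(agent="nightly-stats-job")  # hypothetical agent name
refreshed = ds.snapshot(new_id)
if refreshed is not None:
    assert refreshed.user_created is False
    assert refreshed.operation_type == "statistics-refresh"
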
1079
+     def truncate(self, author: Optional[str] = None, commit_message: Optional[str] = None) -> None:
1080
+ """Delete all data files and manifests for this dataset.
1081
+
1082
+ This attempts to delete every data file referenced by existing
1083
+ Parquet manifests and then delete the manifest files themselves.
1084
+ Finally it clears the in-memory snapshot list and persists the
1085
+ empty snapshot set via the attached `catalog` (if available).
1086
+ """
1087
+ import pyarrow as pa
1088
+ import pyarrow.parquet as pq
1089
+
1090
+ io = self.io
1091
+ # Collect files referenced by existing manifests but do NOT delete
1092
+ # them from storage. Instead we will write a new empty manifest and
1093
+ # create a truncate snapshot that records these files as deleted.
1094
+ snaps = list(self.metadata.snapshots)
1095
+ removed_files = []
1096
+ removed_total_size = 0
1097
+ removed_data_size = 0
1098
+
1099
+ for snap in snaps:
1100
+ manifest_path = getattr(snap, "manifest_list", None)
1101
+ if not manifest_path:
1102
+ continue
1103
+
1104
+ # Read manifest via FileIO if available
1105
+ rows = []
1106
+ try:
1107
+ inp = io.new_input(manifest_path)
1108
+ with inp.open() as f:
1109
+ data = f.read()
1110
+ table = pq.read_table(pa.BufferReader(data))
1111
+ rows = table.to_pylist()
1112
+ except Exception:
1113
+ rows = []
1114
+
1115
+ for r in rows:
1116
+ fp = None
1117
+ fsize = 0
1118
+ data_size = 0
1119
+ if isinstance(r, dict):
1120
+ fp = r.get("file_path")
1121
+ fsize = int(r.get("file_size_in_bytes") or 0)
1122
+ data_size = int(r.get("uncompressed_size_in_bytes") or 0)
1123
+ if not fp and "data_file" in r and isinstance(r["data_file"], dict):
1124
+ fp = r["data_file"].get("file_path") or r["data_file"].get("path")
1125
+ fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
1126
+ data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)
1127
+
1128
+ if fp:
1129
+ removed_files.append(fp)
1130
+ removed_total_size += fsize
1131
+ removed_data_size += data_size
1132
+
1133
+         # Represent the truncated dataset with a new snapshot; no objects are
1134
+         # deleted and (see below) no empty manifest file is written.
1135
+ snapshot_id = int(time.time() * 1000)
1136
+
1137
+ # Do NOT write an empty Parquet manifest when there are no entries.
1138
+ # Per policy, create the snapshot without a manifest so older
1139
+ # snapshots remain readable and we avoid creating empty manifest files.
1140
+ manifest_path = None
1141
+
1142
+ # Build summary reflecting deleted files (tracked, not removed)
1143
+ deleted_count = len(removed_files)
1144
+ deleted_size = removed_total_size
1145
+
1146
+ summary = {
1147
+ "added-data-files": 0,
1148
+ "added-files-size": 0,
1149
+ "added-data-size": 0,
1150
+ "added-records": 0,
1151
+ "deleted-data-files": deleted_count,
1152
+ "deleted-files-size": deleted_size,
1153
+ "deleted-data-size": removed_data_size,
1154
+ "deleted-records": 0,
1155
+ "total-data-files": 0,
1156
+ "total-files-size": 0,
1157
+ "total-data-size": 0,
1158
+ "total-records": 0,
1159
+ }
1160
+
1161
+ # Sequence number
1162
+ try:
1163
+ next_seq = self._next_sequence_number()
1164
+ except Exception:
1165
+ next_seq = 1
1166
+
1167
+ if author is None:
1168
+ raise ValueError(
1169
+ "truncate() must be called with an explicit author; use truncate(author=...) in caller"
1170
+ )
1171
+ # update metadata author/timestamp for this truncate
1172
+ self.metadata.author = author
1173
+ self.metadata.timestamp_ms = snapshot_id
1174
+ # default commit message
1175
+ if commit_message is None:
1176
+ commit_message = f"commit by {author}"
1177
+
1178
+ parent_id = self.metadata.current_snapshot_id
1179
+
1180
+ snap = Snapshot(
1181
+ snapshot_id=snapshot_id,
1182
+ timestamp_ms=snapshot_id,
1183
+ author=author,
1184
+ sequence_number=next_seq,
1185
+ user_created=True,
1186
+ operation_type="truncate",
1187
+ parent_snapshot_id=parent_id,
1188
+ manifest_list=manifest_path,
1189
+ schema_id=self.metadata.current_schema_id,
1190
+ commit_message=commit_message,
1191
+ summary=summary,
1192
+ )
1193
+
1194
+ # Append new snapshot and update current snapshot id
1195
+ self.metadata.snapshots.append(snap)
1196
+ self.metadata.current_snapshot_id = snapshot_id
1197
+
1198
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
1199
+ self.catalog.save_snapshot(self.identifier, snap)
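
Finally, a sketch of the logical truncate above; nothing is removed from storage, and because the truncate snapshot has no manifest a subsequent scan yields no files.

ds.truncate(author="data-loader", commit_message="reset dataset")
latest = ds.snapshot()
assert latest.operation_type == "truncate"
assert latest.manifest_list is None   # no manifest is written for the truncate snapshot
assert list(ds.scan()) == []          # current snapshot exposes no data files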