opteryx_catalog-0.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opteryx-catalog might be problematic.

@@ -0,0 +1,31 @@
+ """Opteryx lightweight catalog library.
+
+ This package provides base classes and simple datatypes for a custom
+ catalog implementation that stores dataset metadata in Firestore and
+ consolidated Parquet manifests in GCS.
+
+ Start here for building a Firestore+GCS backed catalog that writes
+ Parquet manifests and stores metadata/snapshots in Firestore.
+ """
+
+ from .catalog.dataset import SimpleDataset
+ from .catalog.manifest import DataFile
+ from .catalog.manifest import ManifestEntry
+ from .catalog.metadata import DatasetMetadata
+ from .catalog.metadata import Snapshot
+ from .catalog.metastore import Dataset
+ from .catalog.metastore import Metastore
+ from .catalog.metastore import View
+ from .opteryx_catalog import OpteryxCatalog
+
+ __all__ = [
+     "OpteryxCatalog",
+     "Metastore",
+     "Dataset",
+     "View",
+     "SimpleDataset",
+     "DatasetMetadata",
+     "Snapshot",
+     "DataFile",
+     "ManifestEntry",
+ ]
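
The `__all__` list above is the package's public surface. A minimal, hypothetical import sketch follows; the importable name opteryx_catalog is implied by the relative imports above but the constructor signatures behind these names are not visible in this diff.

# Hypothetical usage sketch; the names come from __all__ above, and the
# arguments OpteryxCatalog expects are not shown in this diff.
from opteryx_catalog import DataFile, ManifestEntry, OpteryxCatalog, SimpleDataset
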
@@ -0,0 +1,4 @@
+ from .compaction import DatasetCompactor
+ from .metastore import Metastore
+
+ __all__ = ["Metastore", "DatasetCompactor"]
@@ -0,0 +1,529 @@
+ """
+ Compaction module for optimizing dataset file layout.
+
+ Provides incremental compaction strategies to address the small files problem.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import time
+ from typing import List
+ from typing import Optional
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ from .manifest import build_parquet_manifest_entry
+ from .metadata import Snapshot
+
+ # Constants
+ TARGET_SIZE_MB = 128
+ TARGET_SIZE_BYTES = TARGET_SIZE_MB * 1024 * 1024
+ MIN_SIZE_MB = 100
+ MIN_SIZE_BYTES = MIN_SIZE_MB * 1024 * 1024
+ MAX_SIZE_MB = 140
+ MAX_SIZE_BYTES = MAX_SIZE_MB * 1024 * 1024
+ SMALL_FILE_MB = 64
+ SMALL_FILE_BYTES = SMALL_FILE_MB * 1024 * 1024
+ LARGE_FILE_MB = 196
+ LARGE_FILE_BYTES = LARGE_FILE_MB * 1024 * 1024
+ MAX_MEMORY_FILES = 2  # Maximum files to hold in memory at once
+ MAX_MEMORY_BYTES = 280 * 1024 * 1024  # 280MB
+
+
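These constants put every file into one of a few size bands: under 64 MB is a combine candidate, 100-140 MB is the acceptable band around the 128 MB target, 196 MB and above is a split candidate, and everything else is left alone by the brute strategy. A small illustrative sketch of that banding, assuming the constants above are in scope:

# Illustrative helper, not part of the package: maps an uncompressed size in
# bytes to the band the selection logic below works with.
def size_band(size_in_bytes: int) -> str:
    if size_in_bytes < SMALL_FILE_BYTES:
        return "small"  # candidate for combining
    if size_in_bytes >= LARGE_FILE_BYTES:
        return "large"  # candidate for splitting
    if MIN_SIZE_BYTES <= size_in_bytes <= MAX_SIZE_BYTES:
        return "acceptable"  # near the 128 MB target
    return "in-between"  # 64-100 MB or 140-196 MB: untouched by the brute strategy
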
+ class DatasetCompactor:
+     """
+     Incremental compaction for datasets to optimize file layout.
+
+     Supports two strategies:
+     - 'brute': Combines small files to reach target size (128MB)
+     - 'performance': Optimizes pruning by merging overlapping ranges
+
+     Each compact() call performs one compaction operation.
+     """
+
+     def __init__(
+         self,
+         dataset,
+         strategy: Optional[str] = None,
+         author: Optional[str] = None,
+         agent: Optional[str] = None,
+     ):
+         """
+         Initialize compactor for a dataset.
+
+         Args:
+             dataset: SimpleDataset instance to compact
+             strategy: 'brute', 'performance', or None (auto-detect)
+             author: Author name for snapshot metadata
+             agent: Agent identifier for snapshot metadata
+         """
+         self.dataset = dataset
+         self.author = author
+         self.agent = agent or "compactor"
+
+         # Auto-detect strategy if not specified
+         if strategy is None:
+             # Check if dataset has sort order - if so, performance mode is available
+             sort_orders = getattr(dataset.metadata, "sort_orders", [])
+             if sort_orders and len(sort_orders) > 0:
+                 self.strategy = "performance"
+                 self.decision = "auto"
+             else:
+                 self.strategy = "brute"
+                 self.decision = "no-sort"
+         else:
+             self.strategy = strategy
+             self.decision = "user"
+
+         # Get sort column if available
+         self.sort_column_id = None
+         if self.strategy == "performance":
+             sort_orders = getattr(dataset.metadata, "sort_orders", [])
+             if sort_orders and len(sort_orders) > 0:
+                 self.sort_column_id = sort_orders[0]
+             else:
+                 # Fallback to brute if performance requested but no sort order
+                 self.strategy = "brute"
+                 self.decision = "no-sort"
+
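Construction is the only place the strategy is chosen; a brief sketch of the three paths described in the docstring, with a hypothetical existing SimpleDataset instance called dataset:

# Hypothetical sketch: 'dataset' is assumed to be an existing SimpleDataset.
auto = DatasetCompactor(dataset)                          # picks 'performance' if sort_orders exist, else 'brute'
forced = DatasetCompactor(dataset, strategy="brute")      # decision recorded as 'user'
perf = DatasetCompactor(dataset, strategy="performance")  # silently falls back to 'brute' without a sort order
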
+     def compact(self, dry_run: bool = False) -> Optional[Snapshot]:
+         """
+         Perform one incremental compaction operation.
+
+         Args:
+             dry_run: If True, return plan without executing
+
+         Returns:
+             New Snapshot if compaction was performed, None if nothing to compact
+         """
+         # Get current manifest entries
+         current_snapshot = self.dataset.metadata.current_snapshot
+         if not current_snapshot:
+             return None
+
+         manifest_path = current_snapshot.manifest_list
+         if not manifest_path:
+             return None
+
+         # Read manifest entries
+         entries = self._read_manifest(manifest_path)
+         if not entries:
+             return None
+
+         # Select files to compact based on strategy
+         if self.strategy == "brute":
+             compaction_plan = self._select_brute_compaction(entries)
+         else:  # performance
+             compaction_plan = self._select_performance_compaction(entries)
+
+         if not compaction_plan:
+             return None
+
+         if dry_run:
+             # Return plan information (could extend this to return a structured plan)
+             return compaction_plan
+
+         # Execute compaction
+         new_snapshot = self._execute_compaction(entries, compaction_plan)
+         return new_snapshot
+
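Because each call performs at most one operation, a caller that wants to fully compact a dataset loops until compact() returns None. A hedged driver sketch, again assuming a hypothetical dataset:

# Hypothetical driver loop: each compact() call performs one operation and
# returns the new Snapshot, or None once there is nothing left to do.
compactor = DatasetCompactor(dataset, author="maintenance-job")
while True:
    snapshot = compactor.compact()
    if snapshot is None:
        break
    print(f"committed compaction snapshot {snapshot.snapshot_id}")

plan = compactor.compact(dry_run=True)  # returns the next plan dict (or None) without executing it
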
+     def _read_manifest(self, manifest_path: str) -> List[dict]:
+         """Read manifest entries from manifest file."""
+         try:
+             io = self.dataset.io
+             inp = io.new_input(manifest_path)
+             with inp.open() as f:
+                 data = f.read()
+             table = pq.read_table(pa.BufferReader(data))
+             return table.to_pylist()
+         except Exception:
+             return []
+
+     def _select_brute_compaction(self, entries: List[dict]) -> Optional[dict]:
+         """
+         Select files for brute force compaction.
+
+         Strategy:
+         1. Find files < 64MB (small files to eliminate)
+         2. Find files >= 196MB (large files to split)
+         3. Combine small files up to 128MB target
+         4. Split large files if any
+
+         Returns:
+             Compaction plan dict or None
+         """
+         small_files = []
+         large_files = []
+         acceptable_files = []
+
+         for entry in entries:
+             size = entry.get("uncompressed_size_in_bytes", 0)
+             if size < SMALL_FILE_BYTES:
+                 small_files.append(entry)
+             elif size >= LARGE_FILE_BYTES:
+                 large_files.append(entry)
+             elif MIN_SIZE_BYTES <= size <= MAX_SIZE_BYTES:
+                 acceptable_files.append(entry)
+
+         # Priority 1: Split large files
+         if large_files:
+             # Take first large file to split
+             return {
+                 "type": "split",
+                 "files": [large_files[0]],
+                 "reason": "file-too-large",
+             }
+
+         # Priority 2: Combine small files
+         if len(small_files) >= 2:
+             # Find combination that gets close to target
+             selected = []
+             total_size = 0
+
+             # Sort by size descending to fill efficiently
+             sorted_files = sorted(
+                 small_files, key=lambda x: x.get("uncompressed_size_in_bytes", 0), reverse=True
+             )
+
+             for entry in sorted_files:
+                 entry_size = entry.get("uncompressed_size_in_bytes", 0)
+                 if total_size + entry_size <= MAX_MEMORY_BYTES and len(selected) < MAX_MEMORY_FILES:
+                     selected.append(entry)
+                     total_size += entry_size
+                     # Stop if we've reached acceptable size
+                     if total_size >= MIN_SIZE_BYTES:
+                         break
+
+             if len(selected) >= 2:
+                 return {
+                     "type": "combine",
+                     "files": selected,
+                     "reason": "small-files",
+                 }
+
+         # No compaction needed
+         return None
+
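The plan returned by this selection step is a plain dict. For example, two small files described by manifest entries like the ones read above would yield a 'combine' plan; the sizes and paths below are made up for illustration.

# Illustrative input/output for _select_brute_compaction; only the keys the
# selector actually reads are shown, and the paths are hypothetical.
entries = [
    {"file_path": "gs://bucket/ds/data-a.parquet", "uncompressed_size_in_bytes": 40 * 1024 * 1024},
    {"file_path": "gs://bucket/ds/data-b.parquet", "uncompressed_size_in_bytes": 55 * 1024 * 1024},
]
# Expected plan shape (files are ordered largest first):
# {"type": "combine", "files": [<55 MB entry>, <40 MB entry>], "reason": "small-files"}
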
+     def _select_performance_compaction(self, entries: List[dict]) -> Optional[dict]:
+         """
+         Select files for performance-optimized compaction.
+
+         Strategy:
+         1. Find files >= 196MB to split
+         2. Find overlapping or adjacent ranges on sort column
+         3. Combine and split to eliminate overlap and reach target size
+
+         Returns:
+             Compaction plan dict or None
+         """
+         # Priority 1: Split large files (same as brute)
+         large_files = []
+         for entry in entries:
+             size = entry.get("uncompressed_size_in_bytes", 0)
+             if size >= LARGE_FILE_BYTES:
+                 large_files.append(entry)
+
+         if large_files:
+             return {
+                 "type": "split",
+                 "files": [large_files[0]],
+                 "reason": "file-too-large",
+             }
+
+         # Priority 2: Find overlapping ranges
+         # Get schema to find sort column name
+         schema = self.dataset.metadata.schema
+         if not schema or not self.sort_column_id:
+             # Fallback to brute logic
+             return self._select_brute_compaction(entries)
+
+         # Find sort column name from schema
+         sort_column_name = None
+         if hasattr(schema, "fields") and self.sort_column_id < len(schema.fields):
+             sort_column_name = schema.fields[self.sort_column_id].name
+         elif isinstance(schema, dict) and "fields" in schema:
+             fields = schema["fields"]
+             if self.sort_column_id < len(fields):
+                 sort_column_name = fields[self.sort_column_id].get("name")
+
+         if not sort_column_name:
+             # Can't find sort column, fallback to brute
+             return self._select_brute_compaction(entries)
+
+         # Extract ranges for each file
+         file_ranges = []
+         for entry in entries:
+             lower_bounds = entry.get("lower_bounds", {})
+             upper_bounds = entry.get("upper_bounds", {})
+
+             if sort_column_name in lower_bounds and sort_column_name in upper_bounds:
+                 min_val = lower_bounds[sort_column_name]
+                 max_val = upper_bounds[sort_column_name]
+                 size = entry.get("uncompressed_size_in_bytes", 0)
+                 file_ranges.append(
+                     {
+                         "entry": entry,
+                         "min": min_val,
+                         "max": max_val,
+                         "size": size,
+                     }
+                 )
+
+         if not file_ranges:
+             # No range information, fallback to brute
+             return self._select_brute_compaction(entries)
+
+         # Sort by min value
+         file_ranges.sort(key=lambda x: x["min"])
+
+         # Find first overlapping or adjacent group
+         for i in range(len(file_ranges) - 1):
+             current = file_ranges[i]
+             next_file = file_ranges[i + 1]
+
+             # Check for overlap or adjacency
+             if current["max"] >= next_file["min"]:
+                 # Found overlap or adjacency
+                 # Check if combining would be beneficial
+                 combined_size = current["size"] + next_file["size"]
+
+                 # Only combine if:
+                 # 1. Total size is within memory limits
+                 # 2. At least one file is below acceptable range
+                 # 3. Combined size would benefit from splitting OR result is in acceptable range
+                 if combined_size <= MAX_MEMORY_BYTES and (
+                     current["size"] < MIN_SIZE_BYTES
+                     or next_file["size"] < MIN_SIZE_BYTES
+                     or (current["max"] >= next_file["min"])  # Overlap exists
+                 ):
+                     return {
+                         "type": "combine-split",
+                         "files": [current["entry"], next_file["entry"]],
+                         "reason": "overlapping-ranges",
+                         "sort_column": sort_column_name,
+                     }
+
+         # No overlaps found, check for small files to combine
+         small_files = [fr for fr in file_ranges if fr["size"] < SMALL_FILE_BYTES]
+         if len(small_files) >= 2:
+             # Combine adjacent small files
+             selected = []
+             total_size = 0
+
+             for fr in small_files[:MAX_MEMORY_FILES]:
+                 if total_size + fr["size"] <= MAX_MEMORY_BYTES:
+                     selected.append(fr["entry"])
+                     total_size += fr["size"]
+                     if total_size >= MIN_SIZE_BYTES:
+                         break
+
+             if len(selected) >= 2:
+                 return {
+                     "type": "combine-split",
+                     "files": selected,
+                     "reason": "small-files",
+                     "sort_column": sort_column_name,
+                 }
+
+         # No compaction opportunities
+         return None
+
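In performance mode the selector works off the per-file lower_bounds/upper_bounds of the sort column. A sketch of entries whose ranges overlap, producing a 'combine-split' plan; the column name, paths, and values are illustrative only.

# Illustrative input for _select_performance_compaction, assuming the dataset's
# sort column resolves to "event_time". The two ranges overlap (max of the
# first >= min of the second), so the expected plan is:
# {"type": "combine-split", "files": [...], "reason": "overlapping-ranges", "sort_column": "event_time"}
entries = [
    {
        "file_path": "gs://bucket/ds/data-a.parquet",
        "uncompressed_size_in_bytes": 80 * 1024 * 1024,
        "lower_bounds": {"event_time": 100},
        "upper_bounds": {"event_time": 250},
    },
    {
        "file_path": "gs://bucket/ds/data-b.parquet",
        "uncompressed_size_in_bytes": 90 * 1024 * 1024,
        "lower_bounds": {"event_time": 200},
        "upper_bounds": {"event_time": 400},
    },
]
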
+     def _execute_compaction(self, all_entries: List[dict], plan: dict) -> Optional[Snapshot]:
+         """
+         Execute the compaction plan.
+
+         Args:
+             all_entries: All current manifest entries
+             plan: Compaction plan from selection methods
+
+         Returns:
+             New Snapshot or None if failed
+         """
+         plan_type = plan["type"]
+         files_to_compact = plan["files"]
+         sort_column = plan.get("sort_column")
+
+         # Read files to compact
+         tables = []
+         total_size = 0
+         for entry in files_to_compact:
+             file_path = entry.get("file_path")
+             if not file_path:
+                 continue
+
+             try:
+                 io = self.dataset.io
+                 inp = io.new_input(file_path)
+                 with inp.open() as f:
+                     data = f.read()
+                 table = pq.read_table(pa.BufferReader(data))
+                 tables.append(table)
+                 total_size += entry.get("uncompressed_size_in_bytes", 0)
+             except Exception:
+                 # Failed to read file, abort this compaction
+                 return None
+
+         if not tables:
+             return None
+
+         # Combine tables
+         combined = pa.concat_tables(tables)
+
+         # Sort if performance mode
+         if sort_column and plan_type == "combine-split":
+             try:
+                 # Sort by the sort column
+                 combined = combined.sort_by([(sort_column, "ascending")])
+             except Exception:
+                 # Sort failed, continue without sorting
+                 pass
+
+         # Determine how to split
+         output_tables = []
+         if plan_type == "split" or (plan_type == "combine-split" and total_size > MAX_SIZE_BYTES):
+             # Split into multiple files
+             output_tables = self._split_table(combined, TARGET_SIZE_BYTES)
+         else:
+             # Single output file
+             output_tables = [combined]
+
+         # Write new files and build manifest entries
+         new_entries = []
+         snapshot_id = int(time.time() * 1000)
+
+         for idx, table in enumerate(output_tables):
+             # Generate file path
+             file_name = f"data-{snapshot_id}-{idx:04d}.parquet"
+             file_path = os.path.join(self.dataset.metadata.location, file_name)
+
+             # Write parquet file
+             try:
+                 io = self.dataset.io
+                 out = io.new_output(file_path)
+                 with out.create() as f:
+                     pq.write_table(table, f)
+             except Exception:
+                 # Failed to write, abort
+                 return None
+
+             # Build manifest entry with full statistics
+             entry_dict = build_parquet_manifest_entry(table, file_path)
+             new_entries.append(entry_dict)
+
+         # Create new manifest with updated entries
+         # Remove old entries, add new entries
+         old_file_paths = {f["file_path"] for f in files_to_compact}
+         updated_entries = [e for e in all_entries if e.get("file_path") not in old_file_paths]
+         updated_entries.extend(new_entries)
+
+         # Write manifest
+         manifest_path = self.dataset.catalog.write_parquet_manifest(
+             snapshot_id, updated_entries, self.dataset.metadata.location
+         )
+
+         # Calculate summary statistics
+         deleted_files = len(files_to_compact)
+         deleted_size = sum(e.get("file_size_in_bytes", 0) for e in files_to_compact)
+         deleted_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in files_to_compact)
+         deleted_records = sum(e.get("record_count", 0) for e in files_to_compact)
+
+         added_files = len(new_entries)
+         added_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
+         added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
+         added_records = sum(e.get("record_count", 0) for e in new_entries)
+
+         total_files = len(updated_entries)
+         total_size = sum(e.get("file_size_in_bytes", 0) for e in updated_entries)
+         total_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in updated_entries)
+         total_records = sum(e.get("record_count", 0) for e in updated_entries)
+
+         # Build snapshot with agent metadata
+         current = self.dataset.metadata.current_snapshot
+         new_sequence = (current.sequence_number or 0) + 1 if current else 1
+
+         snapshot = Snapshot(
+             snapshot_id=snapshot_id,
+             timestamp_ms=snapshot_id,
+             author=self.author,
+             user_created=False,
+             sequence_number=new_sequence,
+             manifest_list=manifest_path,
+             operation_type="compact",
+             parent_snapshot_id=current.snapshot_id if current else None,
+             schema_id=getattr(self.dataset.metadata.schema, "schema_id", None),
+             commit_message=f"Compaction: {self.strategy} strategy, {deleted_files} files → {added_files} files",
+             summary={
+                 "added-data-files": added_files,
+                 "added-files-size": added_size,
+                 "added-data-size": added_data_size,
+                 "added-records": added_records,
+                 "deleted-data-files": deleted_files,
+                 "deleted-files-size": deleted_size,
+                 "deleted-data-size": deleted_data_size,
+                 "deleted-records": deleted_records,
+                 "total-data-files": total_files,
+                 "total-files-size": total_size,
+                 "total-data-size": total_data_size,
+                 "total-records": total_records,
+                 "agent_meta": {
+                     "committer": self.agent,
+                     "compaction-algorithm": self.strategy,
+                     "compaction-algorithm-decision": self.decision,
+                     "compaction-files-combined": deleted_files,
+                     "compaction-files-written": added_files,
+                 },
+             },
+         )
+
+         # Commit snapshot
+         try:
+             self.dataset.metadata.snapshots.append(snapshot)
+             self.dataset.metadata.current_snapshot = snapshot
+
+             # Persist metadata via catalog
+             if self.dataset.catalog:
+                 self.dataset.catalog.save_dataset_metadata(self.dataset.metadata)
+         except Exception:
+             return None
+
+         return snapshot
+
+     def _split_table(self, table: pa.Table, target_size: int) -> List[pa.Table]:
+         """
+         Split a table into multiple tables of approximately target size.
+
+         Args:
+             table: PyArrow table to split
+             target_size: Target size in bytes (uncompressed)
+
+         Returns:
+             List of tables
+         """
+         if not table or table.num_rows == 0:
+             return [table]
+
+         # Estimate size per row
+         total_size = sum(sum(chunk.size for chunk in col.chunks) for col in table.columns)
+
+         if total_size <= target_size:
+             return [table]
+
+         # Calculate rows per split
+         avg_row_size = total_size / table.num_rows
+         rows_per_split = int(target_size / avg_row_size)
+
+         if rows_per_split <= 0:
+             rows_per_split = 1
+
+         # Split into chunks
+         splits = []
+         offset = 0
+         while offset < table.num_rows:
+             end = min(offset + rows_per_split, table.num_rows)
+             split = table.slice(offset, end - offset)
+             splits.append(split)
+             offset = end
+
+         return splits if splits else [table]
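
The split arithmetic in _split_table is a straightforward estimate: average row size from the in-memory total, then a fixed row count per slice. For example, a table of 3,000,000 rows occupying roughly 300 MB in memory averages about 105 bytes per row, so a 128 MB target gives roughly 1,280,000 rows per slice and the loop emits three slices (two full, one partial). A self-contained sketch of the same slicing approach, using Table.nbytes for the size estimate rather than the per-chunk sum above:

# Standalone sketch mirroring _split_table's approach: estimate rows per slice
# from the in-memory size, then slice the table at fixed row offsets.
import pyarrow as pa

table = pa.table({"id": list(range(1_000_000)), "value": [float(i) for i in range(1_000_000)]})
target_size = 4 * 1024 * 1024  # 4 MB target, kept small so this example actually splits

avg_row_size = table.nbytes / table.num_rows
rows_per_split = max(1, int(target_size / avg_row_size))

slices = [
    table.slice(offset, min(rows_per_split, table.num_rows - offset))
    for offset in range(0, table.num_rows, rows_per_split)
]
print(len(slices), [s.num_rows for s in slices][:3])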