opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
opteryx_catalog/webhooks/__init__.py (added)
@@ -0,0 +1,230 @@
+ """Webhook system for Opteryx Catalog events.
+
+ This module provides webhook notification capabilities for key catalog events.
+ Webhooks can be delivered either directly via HTTP or asynchronously via
+ Google Cloud Tasks.
+
+ Configuration:
+     OPTERYX_WEBHOOK_DOMAIN: Base domain for webhook delivery (e.g., router.opteryx.app)
+     OPTERYX_WEBHOOK_QUEUE: Cloud Tasks queue path for async delivery
+         Format: projects/PROJECT/locations/LOCATION/queues/QUEUE
+         If not set, webhooks are sent directly via HTTP
+
+ Example:
+     export OPTERYX_WEBHOOK_DOMAIN=router.opteryx.app
+     export OPTERYX_WEBHOOK_QUEUE=projects/my-project/locations/us-central1/queues/webhooks
+
+ Webhook Endpoint:
+     All webhooks are sent to: https://{OPTERYX_WEBHOOK_DOMAIN}/event
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import time
+ from typing import Any
+ from typing import Optional
+
+ import requests
+
+
+ class WebhookManager:
+     """Manages webhook delivery for catalog events.
+
+     Supports two delivery modes:
+     1. Direct HTTP POST (when OPTERYX_WEBHOOK_QUEUE is not set)
+     2. Cloud Tasks async delivery (when OPTERYX_WEBHOOK_QUEUE is set)
+     """
+
+     def __init__(
+         self,
+         domain: Optional[str] = None,
+         queue_path: Optional[str] = None,
+         timeout: int = 10,
+     ):
+         """Initialize the webhook manager.
+
+         Args:
+             domain: Base domain for webhooks (e.g., 'hook.opteryx.app')
+                 Falls back to OPTERYX_WEBHOOK_DOMAIN env var
+             queue_path: Cloud Tasks queue path for async delivery
+                 Falls back to OPTERYX_WEBHOOK_QUEUE env var
+             timeout: HTTP timeout in seconds for direct delivery
+         """
+         self.domain = domain or os.getenv("OPTERYX_WEBHOOK_DOMAIN")
+         self.queue_path = queue_path or os.getenv("OPTERYX_WEBHOOK_QUEUE")
+         self.timeout = timeout
+         self.enabled = bool(self.domain)
+
+         # Initialize Cloud Tasks client only if needed
+         self._tasks_client = None
+         if self.enabled and self.queue_path:
+             try:
+                 from google.cloud import tasks_v2
+
+                 self._tasks_client = tasks_v2.CloudTasksClient()
+             except ImportError:
+                 # Cloud Tasks not available, fall back to direct HTTP
+                 self._tasks_client = None
+
+     def send(
+         self,
+         action: str,
+         workspace: str,
+         collection: str,
+         resource_type: str,
+         resource_name: str,
+         payload: Optional[dict[str, Any]] = None,
+     ) -> bool:
+         """Send a webhook notification.
+
+         Args:
+             action: Action type (create, delete, update, commit, execute)
+             workspace: Workspace name
+             collection: Collection name
+             resource_type: Type of resource (dataset, view, collection)
+             resource_name: Name of the resource
+             payload: Additional data to include in the webhook body
+
+         Returns:
+             True if webhook was sent successfully (or queued), False otherwise
+         """
+         if not self.enabled:
+             return False
+
+         # Simple endpoint URL
+         url = f"https://{self.domain}/event"
+
+         # Build the payload
+         body = self._build_payload(
+             action=action,
+             workspace=workspace,
+             collection=collection,
+             resource_type=resource_type,
+             resource_name=resource_name,
+             additional=payload or {},
+         )
+
+         # Deliver via Cloud Tasks or direct HTTP
+         if self._tasks_client and self.queue_path:
+             return self._send_via_cloud_tasks(url, body)
+         else:
+             return self._send_direct(url, body)
+
+     def _build_payload(
+         self,
+         action: str,
+         workspace: str,
+         collection: str,
+         resource_type: str,
+         resource_name: str,
+         additional: dict[str, Any],
+     ) -> dict[str, Any]:
+         """Build the webhook payload.
+
+         Returns a standardized payload with event metadata and additional data.
+         """
+         return {
+             "event": {
+                 "action": action,
+                 "workspace": workspace,
+                 "collection": collection,
+                 "resource_type": resource_type,
+                 "resource_name": resource_name,
+                 "timestamp": int(time.time() * 1000),  # milliseconds
+             },
+             "data": additional,
+         }
+
+     def _send_direct(self, url: str, payload: dict[str, Any]) -> bool:
+         """Send webhook directly via HTTP POST.
+
+         Args:
+             url: Full webhook URL
+             payload: JSON payload
+
+         Returns:
+             True if successful (2xx response), False otherwise
+         """
+         try:
+             response = requests.post(
+                 url,
+                 json=payload,
+                 timeout=self.timeout,
+                 headers={
+                     "Content-Type": "application/json",
+                     "User-Agent": "opteryx-catalog-webhook/1.0",
+                 },
+             )
+             return response.status_code >= 200 and response.status_code < 300
+         except Exception:
+             # Log errors in production; for now, silently fail
+             return False
+
+     def _send_via_cloud_tasks(self, url: str, payload: dict[str, Any]) -> bool:
+         """Send webhook asynchronously via Cloud Tasks.
+
+         Args:
+             url: Full webhook URL
+             payload: JSON payload
+
+         Returns:
+             True if task was created successfully, False otherwise
+         """
+         if not self._tasks_client:
+             # Fall back to direct delivery if client unavailable
+             return self._send_direct(url, payload)
+
+         try:
+             from google.cloud import tasks_v2
+
+             # Create the task
+             task = tasks_v2.Task(
+                 http_request=tasks_v2.HttpRequest(
+                     http_method=tasks_v2.HttpMethod.POST,
+                     url=url,
+                     headers={
+                         "Content-Type": "application/json",
+                         "User-Agent": "opteryx-catalog-webhook/1.0",
+                     },
+                     body=json.dumps(payload).encode(),
+                 )
+             )
+
+             # Queue the task
+             self._tasks_client.create_task(
+                 request=tasks_v2.CreateTaskRequest(
+                     parent=self.queue_path,
+                     task=task,
+                 )
+             )
+             return True
+         except Exception:
+             # Log errors in production; for now, silently fail
+             return False
+
+
+ # Global webhook manager instance
+ _webhook_manager: Optional[WebhookManager] = None
+
+
+ def get_webhook_manager() -> WebhookManager:
+     """Get or create the global webhook manager instance."""
+     global _webhook_manager
+     if _webhook_manager is None:
+         _webhook_manager = WebhookManager()
+     return _webhook_manager
+
+
+ def send_webhook(
+     action: str,
+     workspace: str,
+     collection: str,
+     resource_type: str,
+     resource_name: str,
+     payload: Optional[dict[str, Any]] = None,
+ ) -> bool:
+     """Convenience function to send a webhook via the global manager."""
+     manager = get_webhook_manager()
+     return manager.send(action, workspace, collection, resource_type, resource_name, payload)
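The module above exposes both the `WebhookManager` class and a module-level `send_webhook` convenience function. A minimal usage sketch based on the code in this hunk (the workspace, collection, and dataset names are illustrative; async delivery additionally needs the optional `google-cloud-tasks` dependency):

```python
import os

# Delivery is disabled unless a domain is configured; set it before first use,
# because the global manager is created lazily and reads the env var once.
os.environ["OPTERYX_WEBHOOK_DOMAIN"] = "router.opteryx.app"

from opteryx_catalog.webhooks import send_webhook

# POSTs to https://router.opteryx.app/event; True means a 2xx response (or queued task)
ok = send_webhook(
    action="commit",
    workspace="my-workspace",       # illustrative
    collection="sales",             # illustrative
    resource_type="dataset",
    resource_name="orders",         # illustrative
    payload={"snapshot_id": 1234},  # extra JSON-serializable data
)
print("delivered:", ok)
```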
opteryx_catalog/webhooks/events.py (added)
@@ -0,0 +1,177 @@
+ """Event definitions and payload builders for webhook notifications.
+
+ This module provides helper functions to create standardized payloads
+ for different types of catalog events.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+ from typing import Optional
+
+
+ def dataset_created_payload(
+     schema: Any,
+     location: Optional[str] = None,
+     properties: Optional[dict[str, Any]] = None,
+ ) -> dict[str, Any]:
+     """Build payload for dataset creation event.
+
+     Args:
+         schema: Dataset schema (arrow or pyiceberg schema)
+         location: GCS location of the dataset
+         properties: Additional dataset properties
+
+     Returns:
+         Payload dictionary with dataset metadata
+     """
+     payload = {
+         "location": location,
+         "properties": properties or {},
+     }
+
+     # Include schema information if available
+     try:
+         if hasattr(schema, "names"):  # PyArrow schema
+             payload["schema"] = {
+                 "fields": [
+                     {"name": name, "type": str(schema.field(name).type)} for name in schema.names
+                 ]
+             }
+     except Exception:
+         pass
+
+     return payload
+
+
+ def dataset_deleted_payload() -> dict[str, Any]:
+     """Build payload for dataset deletion event."""
+     return {}
+
+
+ def dataset_updated_payload(
+     description: Optional[str] = None,
+     properties: Optional[dict[str, Any]] = None,
+ ) -> dict[str, Any]:
+     """Build payload for dataset update event.
+
+     Args:
+         description: New description
+         properties: Updated properties
+
+     Returns:
+         Payload dictionary with updated fields
+     """
+     return {
+         "description": description,
+         "properties": properties or {},
+     }
+
+
+ def dataset_commit_payload(
+     snapshot_id: int,
+     sequence_number: int,
+     record_count: int,
+     file_count: int,
+ ) -> dict[str, Any]:
+     """Build payload for dataset commit (append) event.
+
+     Args:
+         snapshot_id: New snapshot ID
+         sequence_number: Sequence number of the commit
+         record_count: Number of records added
+         file_count: Number of files added
+
+     Returns:
+         Payload dictionary with commit metadata
+     """
+     return {
+         "snapshot_id": snapshot_id,
+         "sequence_number": sequence_number,
+         "record_count": record_count,
+         "file_count": file_count,
+     }
+
+
+ def collection_created_payload(
+     properties: Optional[dict[str, Any]] = None,
+ ) -> dict[str, Any]:
+     """Build payload for collection creation event.
+
+     Args:
+         properties: Collection properties
+
+     Returns:
+         Payload dictionary with collection metadata
+     """
+     return {
+         "properties": properties or {},
+     }
+
+
+ def view_created_payload(
+     definition: str,
+     properties: Optional[dict[str, Any]] = None,
+ ) -> dict[str, Any]:
+     """Build payload for view creation event.
+
+     Args:
+         definition: SQL definition of the view
+         properties: Additional view properties
+
+     Returns:
+         Payload dictionary with view metadata
+     """
+     return {
+         "definition": definition,
+         "properties": properties or {},
+     }
+
+
+ def view_deleted_payload() -> dict[str, Any]:
+     """Build payload for view deletion event."""
+     return {}
+
+
+ def view_updated_payload(
+     description: Optional[str] = None,
+     properties: Optional[dict[str, Any]] = None,
+ ) -> dict[str, Any]:
+     """Build payload for view update event.
+
+     Args:
+         description: New description
+         properties: Updated properties
+
+     Returns:
+         Payload dictionary with updated fields
+     """
+     return {
+         "description": description,
+         "properties": properties or {},
+     }
+
+
+ def view_executed_payload(
+     execution_time_ms: Optional[int] = None,
+     row_count: Optional[int] = None,
+     error: Optional[str] = None,
+ ) -> dict[str, Any]:
+     """Build payload for view execution event.
+
+     Args:
+         execution_time_ms: Execution time in milliseconds
+         row_count: Number of rows returned
+         error: Error message if execution failed
+
+     Returns:
+         Payload dictionary with execution metadata
+     """
+     payload = {}
+     if execution_time_ms is not None:
+         payload["execution_time_ms"] = execution_time_ms
+     if row_count is not None:
+         payload["row_count"] = row_count
+     if error is not None:
+         payload["error"] = error
+     return payload
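These builders produce the `data` half of the webhook body; pairing one with `send_webhook` from the package `__init__` looks roughly like this (the values are illustrative, not from a real commit):

```python
from opteryx_catalog.webhooks import send_webhook
from opteryx_catalog.webhooks.events import dataset_commit_payload

# Standardized commit payload, attached as the webhook's "data" field
payload = dataset_commit_payload(
    snapshot_id=8675309,  # illustrative
    sequence_number=42,   # illustrative
    record_count=10_000,
    file_count=3,
)
send_webhook("commit", "my-workspace", "sales", "dataset", "orders", payload)
```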
opteryx_catalog-0.4.26.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opteryx-catalog
- Version: 0.4.11
+ Version: 0.4.26
  Summary: Opteryx Cloud Catalog
  Author-email: joocer <justin.joyce@joocer.com>
  License: Apache License
@@ -218,12 +218,14 @@ Classifier: Operating System :: OS Independent
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: google-cloud-firestore==2.22.0
- Requires-Dist: google-cloud-storage==3.7.0
+ Requires-Dist: google-cloud-firestore==2.23.0
+ Requires-Dist: google-cloud-storage==3.8.0
  Requires-Dist: orso==0.0.226
  Requires-Dist: opteryx-core
- Requires-Dist: pyarrow==22.0.0
+ Requires-Dist: pyarrow==23.0.0
  Requires-Dist: requests==2.32.5
+ Provides-Extra: webhooks
+ Requires-Dist: google-cloud-tasks>=2.16.0; extra == "webhooks"
  Dynamic: license-file
 
  # pyiceberg-firestore-gcs
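The new `webhooks` extra above makes the Cloud Tasks dependency optional. A hedged sketch of the probe this enables at runtime, mirroring the `ImportError` fallback in `WebhookManager.__init__` from the first hunk (not code from the package itself):

```python
# With `pip install opteryx-catalog[webhooks]` this import succeeds and async
# delivery is available; without it, the manager falls back to direct HTTP POSTs.
try:
    from google.cloud import tasks_v2

    client = tasks_v2.CloudTasksClient()
except ImportError:
    client = None  # direct HTTP delivery path
```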
@@ -233,17 +235,18 @@ lightweight catalog interface. This package provides an opinionated
  catalog implementation for storing table metadata documents in Firestore and
  consolidated Parquet manifests in GCS.
 
- This project is intended to be used as a catalog component in GCP-based
- environments and provides utilities to interoperate with Avro/manifest-based
- workflows when needed.
+ **Important:** This library is *modelled after* Apache Iceberg but is **not
+ compatible** with Iceberg; it is a separate implementation with different
+ storage conventions and metadata layout. This library is the catalog and
+ metastore used by [opteryx.app](https://opteryx.app/) and uses **Firestore** as the primary
+ metastore and **GCS** for data and manifest storage.
 
  ---
 
  ## Features ✅
 
  - Firestore-backed catalog and collection storage
- - GCS-based table metadata storage (with optional compatibility mode)
- - GCS-based table metadata storage; export/import utilities provide Avro interoperability
+ - GCS-based table metadata storage; export/import utilities available for artifact conversion
  - Table creation, registration, listing, loading, renaming, and deletion
  - Commit operations that write updated metadata to GCS and persist references in Firestore
  - Simple, opinionated defaults (e.g., default GCS location derived from catalog properties)
@@ -299,7 +302,7 @@ print(tbl.metadata)
  - GCP authentication: Use `GOOGLE_APPLICATION_CREDENTIALS` or Application Default Credentials
  - `firestore_project` and `firestore_database` can be supplied when creating the catalog
  - `gcs_bucket` is recommended to allow `create_dataset` to write metadata automatically; otherwise pass `location` explicitly to `create_dataset`
- - The catalog does not write Avro/manifest-list artifacts in the hot path; use the provided export/import utilities for interoperability
+ - The catalog writes consolidated Parquet manifests and does not write manifest-list artifacts in the hot path. Use the provided export/import utilities for artifact conversion when necessary.
 
  Example environment variables:
 
@@ -308,15 +311,9 @@ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json"
  export GOOGLE_CLOUD_PROJECT="my-gcp-project"
  ```
 
- ### Interoperability
+ ### Manifest format
 
- This catalog implementation does not write Avro manifest-list/Avro manifest files
- in the hot path. Instead, table metadata is stored in Firestore and the runtime
- writes a consolidated Parquet manifest for fast query planning.
-
- If you need full Avro-compatible artifacts for other engines or tools, use the
- provided export/import utilities to transform between Avro manifests and the
- Parquet-first storage layout used by this catalog.
+ This catalog writes consolidated Parquet manifests for fast query planning and stores table metadata in Firestore. Manifests and data files are stored in GCS. If you need different artifact formats, use the provided export/import utilities to convert manifests outside the hot path.
 
  ## API overview 📚
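For context on the Firestore-plus-Parquet layout described above, the access pattern used by the bundled scripts (see `scripts/collect_byte_counts.py` later in this diff) is roughly the following; the project, database, bucket, and dataset names are placeholders:

```python
from opteryx_catalog import OpteryxCatalog

# Firestore holds the metadata documents; GCS holds data files and Parquet manifests
catalog = OpteryxCatalog(
    "my-workspace",                      # placeholder workspace
    firestore_project="my-gcp-project",  # placeholder
    firestore_database="my-database",    # placeholder
    gcs_bucket="my-bucket",              # placeholder
)

dataset = catalog.load_dataset("my_collection.my_dataset")
print(dataset.snapshot().summary.get("total-data-size"))
```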
opteryx_catalog-0.4.26.dist-info/RECORD (added)
@@ -0,0 +1,45 @@
+ opteryx_catalog/__init__.py,sha256=cqGY7bl6iMBIqY_x6VTc5fAFH23M3XQeJYrHPX6FglY,902
+ opteryx_catalog/exceptions.py,sha256=ZEaXmrrn030V8pfy8YMaLwzBWFms9OgZG21zVRGKlxM,652
+ opteryx_catalog/opteryx_catalog.py,sha256=yL_9baNhvKS1U5kRqGt1xOqEigHhbGsWBT-pTwWn2kw,40178
+ opteryx_catalog/catalog/__init__.py,sha256=yD7egf-dLd1z_CNXunz3ldLyLMMkSNbS3aKjGp3dKQY,119
+ opteryx_catalog/catalog/compaction.py,sha256=GFwRiQssFh80w1uUqIy-oiz35HzkrDXi9wfPPOjv9uU,19483
+ opteryx_catalog/catalog/dataset.py,sha256=XTW4HY7qZw3VIWIezO-xykQBrchZKGPmblQkBjUa9PY,59944
+ opteryx_catalog/catalog/manifest.py,sha256=CcPfgTXQV4QyspFoAqFYMmx3qrfXUFNVxPjGhHPyvN8,17475
+ opteryx_catalog/catalog/metadata.py,sha256=wGb1Z72oKrTJ7ANn7VOtuUq3Fu0RZo4233iGCoF_g1Y,3065
+ opteryx_catalog/catalog/metastore.py,sha256=mS4qaaOMzcIu730Jm0K_Nq-4sNI8kIX3UscevYO5E08,1997
+ opteryx_catalog/catalog/view.py,sha256=mUzfRGYqLRx_9BfZdGY5HNz6na9VMEPITrYKiI5m694,219
+ opteryx_catalog/iops/__init__.py,sha256=_CxR-hg8XUD2cIFucb3aHyTFqwi41QmEDf9gXzXt3ZU,171
+ opteryx_catalog/iops/base.py,sha256=1IW9qjDkQEMXvrA2J73VSBCdzkf2W5xVsWVnpNglL1U,1206
+ opteryx_catalog/iops/fileio.py,sha256=CBbeSXZw52lgcE8lqzKHREACqufiVUId0gwtTt8PSpY,4711
+ opteryx_catalog/iops/gcs.py,sha256=aB6hvSAQhbKTSyaLbAPgpXtSnvkI7fndXCRjaAZ1Dxo,8155
+ opteryx_catalog/maki_nage/__init__.py,sha256=yBJlF6TsfFHNVz21xzCG6Ty4FSQOdByWA3fAkCSATNo,298
+ opteryx_catalog/maki_nage/distogram.py,sha256=FMG4g423XOPAahd7Kg8YL5uB5UU5Ds39mV7JsFKlrTA,15976
+ opteryx_catalog/maki_nage/tests/_test_histogram.py,sha256=rlT1mD8YTC2Ag5HpMrOK1DM_vaic0JW-dPrfpIz72Rs,1314
+ opteryx_catalog/maki_nage/tests/test_bounds.py,sha256=4QCq0eaCKScv21917sRvvWn1No-zDei0FSIYx3eIqx8,508
+ opteryx_catalog/maki_nage/tests/test_count.py,sha256=_qCRIY19oKla9qH079sS0JFQw7cxzmiFIXiDxChVMkA,469
+ opteryx_catalog/maki_nage/tests/test_count_at.py,sha256=A2zbSZ6pusMvDoXQ90fr7gjIC4jORTpWbJyI9PnEtu4,2023
+ opteryx_catalog/maki_nage/tests/test_quantile.py,sha256=dE-iR7O-tNrbfum8O1-QTswLa1pwWAyIEeKZOd1wB8w,2235
+ opteryx_catalog/maki_nage/tests/test_stats.py,sha256=-6fhxttxP1QwbKGPjLBZmhB9I7aA5zJ3zlrAM9MmL9o,601
+ opteryx_catalog/maki_nage/tests/test_update.py,sha256=UNbxbJe-GbubjQ2QIC2OXNV-jal881jQOxE7LkMkX8M,1114
+ opteryx_catalog/webhooks/__init__.py,sha256=9vcC-m3Nzgmzu0x7jkU4ZEl1rXWovDEne5S1PcGll-w,7358
+ opteryx_catalog/webhooks/events.py,sha256=82rRZd4tagbS2Lc19xrmPubvafG9YDs7J6XS-BdHIF4,4370
+ opteryx_catalog-0.4.26.dist-info/licenses/LICENSE,sha256=mc5l20siqdcNQM54xALIWJhyaWsmQJ-NZt81UjgJejo,11351
+ scripts/collect_byte_counts.py,sha256=F0I_HIg0PRPSJ5vh6z-9IYa3g-WO5fvFVkjwYyt3Ggo,1393
+ scripts/create_dataset.py,sha256=K8zmQo3xbwc_yz2BxNK0IKj-DkDt3pFf13ycI6rgTHo,7798
+ scripts/emit_full_single_file.py,sha256=r9-y3P_J_HZaLa56F4ifQ5mcq8wZbKu-l-e2zRNytMw,2464
+ scripts/inspect_manifest_dryrun.py,sha256=i_6p9Bhfu3WlQVXDoGTEs4bBaVr09cvKLD9QQP5Taf4,13370
+ scripts/inspect_single_file.py,sha256=P8E-Csqj-Sm3PcqgP95lh9idbSyAMqxIRKsEoCne8wo,5242
+ scripts/inspect_single_file_gcs.py,sha256=R6-nQWxy-gHp3lFFBeB9vlsZVm6JD6FwUKHG_aFHmwc,3861
+ scripts/read_dataset.py,sha256=hpBa8Qv1Oj6ffVIUmELGSri2eYHPpdqLnWFKgKpG-FM,9610
+ tests/test_collections.py,sha256=JdAoRHw7tj4lIj2CQdLVrrbNiiaTD0onCKJCuI7EES0,882
+ tests/test_compaction.py,sha256=7MLnfbGi3j17ZON8Qi9oq4i1UWkW0JigX46BBFWecMk,7871
+ tests/test_dataset_metadata.py,sha256=bMzX2HiUnzFTyU3VkFuW5xjmFEP8cJSYPt1XF6IS0Qk,1019
+ tests/test_describe_uncompressed.py,sha256=UTXM0-6wUepNFM2V-I6smoGCc3K2BDUcElLpClDD54E,4280
+ tests/test_import.py,sha256=ZvoHW-rmcYqkW6TJKD_brgeePqHHbz2iTyRWKIBHGHk,137
+ tests/test_pyproject.py,sha256=o3rS_GOems1oYQDH3UATfqc6XUwDTKZF2Q4cspU-NYc,206
+ tests/test_refresh_manifest.py,sha256=5rGRmCNoQH5GeE-jGKgTDj8974eVT_GW6RQc5DYE9vo,10253
+ tests/test_webhooks.py,sha256=dB15wwX-aJ18AhIUS_eOulUZtOyH1Lf6YMQFueRa8z0,5978
+ opteryx_catalog-0.4.26.dist-info/METADATA,sha256=BD3PZ_VQfRcRtufRTngMTazm-lu2fPlXvfewOXnSvvo,22504
+ opteryx_catalog-0.4.26.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ opteryx_catalog-0.4.26.dist-info/top_level.txt,sha256=HWATr4Wgxbg3c1X3EcsJ6cnHoR6ZAdTe1LQ2VssIBUo,30
+ opteryx_catalog-0.4.26.dist-info/RECORD,,
opteryx_catalog-0.4.26.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.9.0)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
scripts/collect_byte_counts.py (added)
@@ -0,0 +1,42 @@
+ import os
+ import sys
+
+ from google.cloud import firestore
+
+ from opteryx_catalog import OpteryxCatalog
+
+ sys.path.insert(0, os.path.join(sys.path[0], ".."))  # Add parent dir for pyiceberg_firestore_gcs
+ sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+
+
+ FIRESTORE_DATABASE = os.environ.get("FIRESTORE_DATABASE")
+ BUCKET_NAME = os.environ.get("GCS_BUCKET")
+ GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID")
+
+
+ def get_workspaces():
+     firestore_client = firestore.Client(project=GCP_PROJECT_ID, database=FIRESTORE_DATABASE)
+     workspaces = firestore_client.collections()
+     yield from [w.id for w in workspaces]
+
+
+ for workspace in get_workspaces():
+     catalog = OpteryxCatalog(
+         workspace,
+         firestore_project=GCP_PROJECT_ID,
+         firestore_database=FIRESTORE_DATABASE,
+         gcs_bucket=BUCKET_NAME,
+     )
+
+     print(f"\nWorkspace '{workspace}':")
+     collections = catalog.list_collections()
+     for collection_name in collections:
+         print(f" Collection: {collection_name}")
+         collection = catalog.list_datasets(collection_name)
+         for dataset_name in collection:
+             print(f" Dataset: {collection_name}.{dataset_name}")
+             dataset = catalog.load_dataset(f"{collection_name}.{dataset_name}")
+             print(f" - {dataset_name} ({dataset.snapshot().summary.get('total-data-size')})")
+
+ if __name__ == "__main__":
+     pass
scripts/emit_full_single_file.py (added)
@@ -0,0 +1,81 @@
+ import json
+ import os
+ import sys
+ import time
+
+ # Recursively convert non-JSON types (bytes, pyarrow Buffers, etc.) to hex or JSON-safe types
+ from google.cloud import storage
+
+ from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes
+
+ # Add local paths to sys.path to use local code instead of installed packages
+ sys.path.insert(0, os.path.join(sys.path[0], ".."))  # Add parent dir for pyiceberg_firestore_gcs
+ sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+ sys.path.insert(1, os.path.join(sys.path[0], "../pyiceberg-firestore-gcs"))
+
+
+ TARGET = "gs://opteryx_data/opteryx/ops/audit_log/data/188fa239430f10c3-59275747aed0-2.parquet"
+ OUT = "artifacts/single_file_188fa239430f10c3_full.json"
+
+ _, rest = TARGET.split("://", 1)
+ bucket_name, path = rest.split("/", 1)
+ client = storage.Client()
+ blob = client.bucket(bucket_name).blob(path)
+ print("Downloading", TARGET)
+ data = blob.download_as_bytes()
+ print("Downloaded bytes:", len(data))
+
+
+ # read parquet bytes via a BufferReader
+ entry = build_parquet_manifest_entry_from_bytes(data, TARGET, len(data)).to_dict()
+
+ out = {
+     "_meta": {
+         "dataset": "opteryx.ops.audit_log",
+         "timestamp": int(time.time() * 1000),
+         "source": "single-file-full-json",
+     },
+     "file_path": TARGET,
+     "recomputed_full": entry,
+ }
+
+
+ def _hexify(obj):
+     # raw bytes-like
+     if isinstance(obj, (bytes, bytearray, memoryview)):
+         return obj.hex()
+     # pyarrow buffers/scalars -> try to extract python value
+     if hasattr(obj, "to_py"):
+         try:
+             val = obj.to_py()
+             return _hexify(val)
+         except Exception:
+             try:
+                 # fallback: bytes representation
+                 return bytes(obj).hex()
+             except Exception:
+                 return str(obj)
+     # dict/list/tuple recursion
+     if isinstance(obj, dict):
+         return {k: _hexify(v) for k, v in obj.items()}
+     if isinstance(obj, list):
+         return [_hexify(v) for v in obj]
+     if isinstance(obj, tuple):
+         return tuple(_hexify(v) for v in obj)
+     # primitive
+     if isinstance(obj, (str, int, float, bool)) or obj is None:
+         return obj
+     # fallback to str
+     try:
+         return str(obj)
+     except Exception:
+         return None
+
+
+ safe_out = _hexify(out)
+
+ os.makedirs(os.path.dirname(OUT), exist_ok=True)
+ with open(OUT, "w", encoding="utf-8") as of:
+     json.dump(safe_out, of, indent=2, ensure_ascii=False)
+
+ print("WROTE", OUT)