opteryx-catalog 0.4.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opteryx-catalog might be problematic. Click here for more details.
- opteryx_catalog/__init__.py +31 -0
- opteryx_catalog/catalog/__init__.py +4 -0
- opteryx_catalog/catalog/compaction.py +529 -0
- opteryx_catalog/catalog/dataset.py +1201 -0
- opteryx_catalog/catalog/manifest.py +438 -0
- opteryx_catalog/catalog/metadata.py +81 -0
- opteryx_catalog/catalog/metastore.py +68 -0
- opteryx_catalog/catalog/view.py +12 -0
- opteryx_catalog/exceptions.py +38 -0
- opteryx_catalog/iops/__init__.py +6 -0
- opteryx_catalog/iops/base.py +42 -0
- opteryx_catalog/iops/fileio.py +125 -0
- opteryx_catalog/iops/gcs.py +255 -0
- opteryx_catalog/opteryx_catalog.py +979 -0
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- opteryx_catalog-0.4.13.dist-info/METADATA +466 -0
- opteryx_catalog-0.4.13.dist-info/RECORD +28 -0
- opteryx_catalog-0.4.13.dist-info/WHEEL +5 -0
- opteryx_catalog-0.4.13.dist-info/licenses/LICENSE +201 -0
- opteryx_catalog-0.4.13.dist-info/top_level.txt +3 -0
- scripts/create_dataset.py +201 -0
- scripts/read_dataset.py +268 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +29 -0
- tests/test_import.py +5 -0
- tests/test_pyproject.py +8 -0
- tests/test_webhooks.py +177 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""Webhook system for Opteryx Catalog events.
|
|
2
|
+
|
|
3
|
+
This module provides webhook notification capabilities for key catalog events.
|
|
4
|
+
Webhooks can be delivered either directly via HTTP or asynchronously via
|
|
5
|
+
Google Cloud Tasks.
|
|
6
|
+
|
|
7
|
+
Configuration:
|
|
8
|
+
OPTERYX_WEBHOOK_DOMAIN: Base domain for webhook delivery (e.g., router.opteryx.app); webhooks are disabled when this is not set
|
|
9
|
+
OPTERYX_WEBHOOK_QUEUE: Cloud Tasks queue path for async delivery
|
|
10
|
+
Format: projects/PROJECT/locations/LOCATION/queues/QUEUE
|
|
11
|
+
If not set (or if the google-cloud-tasks library cannot be imported), webhooks are sent directly via HTTP
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
export OPTERYX_WEBHOOK_DOMAIN=router.opteryx.app
|
|
15
|
+
export OPTERYX_WEBHOOK_QUEUE=projects/my-project/locations/us-central1/queues/webhooks
|
|
16
|
+
|
|
17
|
+
Webhook Endpoint:
|
|
18
|
+
All webhooks are sent to: https://{OPTERYX_WEBHOOK_DOMAIN}/event
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations

import json
import logging
import os
import time
from typing import Any
from typing import Optional

import requests
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)


class WebhookManager:
    """Manages webhook delivery for catalog events.

    Supports two delivery modes:
    1. Direct HTTP POST (when OPTERYX_WEBHOOK_QUEUE is not set, or when the
       Cloud Tasks client could not be created)
    2. Cloud Tasks async delivery (when OPTERYX_WEBHOOK_QUEUE is set)

    Delivery is best-effort: ordinary failures are logged and reported through
    the boolean return value of ``send`` rather than raised to the caller.
    """

    # Headers attached to every webhook request, whichever delivery mode is used.
    _HEADERS = {
        "Content-Type": "application/json",
        "User-Agent": "opteryx-catalog-webhook/1.0",
    }

    def __init__(
        self,
        domain: Optional[str] = None,
        queue_path: Optional[str] = None,
        timeout: int = 10,
    ):
        """Initialize the webhook manager.

        Args:
            domain: Base domain for webhooks (e.g., 'router.opteryx.app')
                Falls back to OPTERYX_WEBHOOK_DOMAIN env var
            queue_path: Cloud Tasks queue path for async delivery
                Falls back to OPTERYX_WEBHOOK_QUEUE env var
            timeout: HTTP timeout in seconds for direct delivery
        """
        self.domain = domain or os.getenv("OPTERYX_WEBHOOK_DOMAIN")
        self.queue_path = queue_path or os.getenv("OPTERYX_WEBHOOK_QUEUE")
        self.timeout = timeout
        # Without a domain there is nowhere to deliver to, so sending is a no-op.
        self.enabled = bool(self.domain)

        # Initialize Cloud Tasks client only if needed
        self._tasks_client = None
        if self.enabled and self.queue_path:
            self._tasks_client = self._create_tasks_client()

    @staticmethod
    def _create_tasks_client() -> Any:
        """Create a Cloud Tasks client, or return None if that is not possible.

        Returning None makes the manager fall back to direct HTTP delivery.
        """
        try:
            from google.cloud import tasks_v2
        except ImportError:
            # Cloud Tasks not available, fall back to direct HTTP
            logger.debug("google-cloud-tasks is not installed; using direct HTTP delivery")
            return None

        try:
            return tasks_v2.CloudTasksClient()
        except Exception:
            # Constructing the client can fail for reasons other than a missing
            # package (for example, credentials cannot be resolved). Webhooks are
            # best-effort, so degrade to direct HTTP instead of raising.
            logger.warning(
                "Could not create Cloud Tasks client; using direct HTTP delivery",
                exc_info=True,
            )
            return None

    def send(
        self,
        action: str,
        workspace: str,
        collection: str,
        resource_type: str,
        resource_name: str,
        payload: Optional[dict[str, Any]] = None,
    ) -> bool:
        """Send a webhook notification.

        Args:
            action: Action type (create, delete, update, commit, execute)
            workspace: Workspace name
            collection: Collection name
            resource_type: Type of resource (dataset, view, collection)
            resource_name: Name of the resource
            payload: Additional data to include in the webhook body

        Returns:
            True if webhook was sent successfully (or queued), False otherwise.
            Always False when no webhook domain is configured.
        """
        if not self.enabled:
            return False

        # Simple endpoint URL
        url = f"https://{self.domain}/event"

        # Build the payload
        body = self._build_payload(
            action=action,
            workspace=workspace,
            collection=collection,
            resource_type=resource_type,
            resource_name=resource_name,
            additional=payload or {},
        )

        # Deliver via Cloud Tasks or direct HTTP
        if self._tasks_client and self.queue_path:
            return self._send_via_cloud_tasks(url, body)
        return self._send_direct(url, body)

    def _build_payload(
        self,
        action: str,
        workspace: str,
        collection: str,
        resource_type: str,
        resource_name: str,
        additional: dict[str, Any],
    ) -> dict[str, Any]:
        """Build the webhook payload.

        Returns a standardized payload with event metadata under ``event`` and
        the caller-supplied data under ``data``.
        """
        return {
            "event": {
                "action": action,
                "workspace": workspace,
                "collection": collection,
                "resource_type": resource_type,
                "resource_name": resource_name,
                "timestamp": int(time.time() * 1000),  # milliseconds
            },
            "data": additional,
        }

    def _send_direct(self, url: str, payload: dict[str, Any]) -> bool:
        """Send webhook directly via HTTP POST.

        Args:
            url: Full webhook URL
            payload: JSON payload

        Returns:
            True if successful (2xx response), False otherwise
        """
        try:
            response = requests.post(
                url,
                json=payload,
                timeout=self.timeout,
                headers=dict(self._HEADERS),
            )
            delivered = 200 <= response.status_code < 300
        except Exception:
            # Best-effort delivery: record the failure but never propagate it.
            logger.warning("Webhook delivery to %s failed", url, exc_info=True)
            return False

        if not delivered:
            logger.warning(
                "Webhook endpoint %s responded with HTTP %s", url, response.status_code
            )
        return delivered

    def _send_via_cloud_tasks(self, url: str, payload: dict[str, Any]) -> bool:
        """Send webhook asynchronously via Cloud Tasks.

        Args:
            url: Full webhook URL
            payload: JSON payload

        Returns:
            True if task was created successfully, False otherwise
        """
        if not self._tasks_client:
            # Fall back to direct delivery if client unavailable
            return self._send_direct(url, payload)

        try:
            from google.cloud import tasks_v2

            # Create the task
            task = tasks_v2.Task(
                http_request=tasks_v2.HttpRequest(
                    http_method=tasks_v2.HttpMethod.POST,
                    url=url,
                    headers=dict(self._HEADERS),
                    body=json.dumps(payload).encode(),
                )
            )

            # Queue the task
            self._tasks_client.create_task(
                request=tasks_v2.CreateTaskRequest(
                    parent=self.queue_path,
                    task=task,
                )
            )
        except Exception:
            # Best-effort delivery: record the failure but never propagate it.
            logger.warning(
                "Could not queue webhook for %s on %s", url, self.queue_path, exc_info=True
            )
            return False
        return True
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# Process-wide manager shared by the module-level helpers; built on first use
_webhook_manager: Optional[WebhookManager] = None


def get_webhook_manager() -> WebhookManager:
    """Return the shared WebhookManager, constructing it on the first call.

    The manager is created with no arguments and then cached in the
    module-level ``_webhook_manager`` variable for subsequent calls.
    """
    global _webhook_manager
    if _webhook_manager is not None:
        return _webhook_manager
    _webhook_manager = WebhookManager()
    return _webhook_manager
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def send_webhook(
    action: str,
    workspace: str,
    collection: str,
    resource_type: str,
    resource_name: str,
    payload: Optional[dict[str, Any]] = None,
) -> bool:
    """Send a webhook through the shared manager.

    All arguments are forwarded unchanged to ``WebhookManager.send`` on the
    instance returned by ``get_webhook_manager``; its boolean result is
    returned as-is.
    """
    return get_webhook_manager().send(
        action=action,
        workspace=workspace,
        collection=collection,
        resource_type=resource_type,
        resource_name=resource_name,
        payload=payload,
    )
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Event definitions and payload builders for webhook notifications.
|
|
2
|
+
|
|
3
|
+
This module provides helper functions to create standardized payloads
|
|
4
|
+
for different types of catalog events.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def dataset_created_payload(
    schema: Any,
    location: Optional[str] = None,
    properties: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """Assemble the webhook body for a dataset creation event.

    Args:
        schema: Dataset schema. Field details are only extracted from objects
            exposing ``names`` and ``field(name)`` (as a PyArrow schema does);
            any other value is ignored.
        location: Location of the dataset
        properties: Additional dataset properties

    Returns:
        Dictionary with ``location`` and ``properties``, plus a ``schema``
        entry listing each field's name and type when extraction succeeds.
    """
    result = {"location": location, "properties": properties or {}}

    # Schema extraction is best-effort: any failure leaves the payload without
    # a "schema" entry instead of failing the event.
    try:
        if hasattr(schema, "names"):
            described = []
            for column in schema.names:
                described.append({"name": column, "type": str(schema.field(column).type)})
            result["schema"] = {"fields": described}
    except Exception:
        pass

    return result
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def dataset_deleted_payload() -> dict[str, Any]:
    """Return the (empty) webhook body used for dataset deletion events."""
    return dict()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def dataset_updated_payload(
    description: Optional[str] = None,
    properties: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """Assemble the webhook body for a dataset update event.

    Args:
        description: New description
        properties: Updated properties; a missing or empty value becomes ``{}``

    Returns:
        Dictionary holding the ``description`` and ``properties`` entries
    """
    updated_properties = properties if properties else {}
    return dict(description=description, properties=updated_properties)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def dataset_commit_payload(
    snapshot_id: int,
    sequence_number: int,
    record_count: int,
    file_count: int,
) -> dict[str, Any]:
    """Assemble the webhook body for a dataset commit (append) event.

    Args:
        snapshot_id: New snapshot ID
        sequence_number: Sequence number of the commit
        record_count: Number of records added
        file_count: Number of files added

    Returns:
        Dictionary carrying the four values under keys of the same names
    """
    return dict(
        snapshot_id=snapshot_id,
        sequence_number=sequence_number,
        record_count=record_count,
        file_count=file_count,
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def collection_created_payload(
    properties: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """Assemble the webhook body for a collection creation event.

    Args:
        properties: Collection properties; a missing or empty value becomes ``{}``

    Returns:
        Dictionary with a single ``properties`` entry
    """
    collection_properties = properties if properties else {}
    return {"properties": collection_properties}
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def view_created_payload(
    definition: str,
    properties: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """Assemble the webhook body for a view creation event.

    Args:
        definition: SQL definition of the view
        properties: Additional view properties; a missing or empty value
            becomes ``{}``

    Returns:
        Dictionary holding the ``definition`` and ``properties`` entries
    """
    body: dict[str, Any] = {"definition": definition}
    body["properties"] = properties or {}
    return body
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def view_deleted_payload() -> dict[str, Any]:
    """Return the (empty) webhook body used for view deletion events."""
    return dict()
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def view_updated_payload(
    description: Optional[str] = None,
    properties: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """Assemble the webhook body for a view update event.

    Args:
        description: New description
        properties: Updated properties; a missing or empty value becomes ``{}``

    Returns:
        Dictionary holding the ``description`` and ``properties`` entries
    """
    body: dict[str, Any] = {"description": description}
    body["properties"] = properties if properties else {}
    return body
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def view_executed_payload(
    execution_time_ms: Optional[int] = None,
    row_count: Optional[int] = None,
    error: Optional[str] = None,
) -> dict[str, Any]:
    """Assemble the webhook body for a view execution event.

    Args:
        execution_time_ms: Execution time in milliseconds
        row_count: Number of rows returned
        error: Error message if execution failed

    Returns:
        Dictionary containing only the arguments that are not None, so falsy
        values such as ``0`` or ``""`` are still included
    """
    candidates = (
        ("execution_time_ms", execution_time_ms),
        ("row_count", row_count),
        ("error", error),
    )
    return {key: value for key, value in candidates if value is not None}
|