proxilion-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proxilion/__init__.py +136 -0
- proxilion/audit/__init__.py +133 -0
- proxilion/audit/base_exporters.py +527 -0
- proxilion/audit/compliance/__init__.py +130 -0
- proxilion/audit/compliance/base.py +457 -0
- proxilion/audit/compliance/eu_ai_act.py +603 -0
- proxilion/audit/compliance/iso27001.py +544 -0
- proxilion/audit/compliance/soc2.py +491 -0
- proxilion/audit/events.py +493 -0
- proxilion/audit/explainability.py +1173 -0
- proxilion/audit/exporters/__init__.py +58 -0
- proxilion/audit/exporters/aws_s3.py +636 -0
- proxilion/audit/exporters/azure_storage.py +608 -0
- proxilion/audit/exporters/cloud_base.py +468 -0
- proxilion/audit/exporters/gcp_storage.py +570 -0
- proxilion/audit/exporters/multi_exporter.py +498 -0
- proxilion/audit/hash_chain.py +652 -0
- proxilion/audit/logger.py +543 -0
- proxilion/caching/__init__.py +49 -0
- proxilion/caching/tool_cache.py +633 -0
- proxilion/context/__init__.py +73 -0
- proxilion/context/context_window.py +556 -0
- proxilion/context/message_history.py +505 -0
- proxilion/context/session.py +735 -0
- proxilion/contrib/__init__.py +51 -0
- proxilion/contrib/anthropic.py +609 -0
- proxilion/contrib/google.py +1012 -0
- proxilion/contrib/langchain.py +641 -0
- proxilion/contrib/mcp.py +893 -0
- proxilion/contrib/openai.py +646 -0
- proxilion/core.py +3058 -0
- proxilion/decorators.py +966 -0
- proxilion/engines/__init__.py +287 -0
- proxilion/engines/base.py +266 -0
- proxilion/engines/casbin_engine.py +412 -0
- proxilion/engines/opa_engine.py +493 -0
- proxilion/engines/simple.py +437 -0
- proxilion/exceptions.py +887 -0
- proxilion/guards/__init__.py +54 -0
- proxilion/guards/input_guard.py +522 -0
- proxilion/guards/output_guard.py +634 -0
- proxilion/observability/__init__.py +198 -0
- proxilion/observability/cost_tracker.py +866 -0
- proxilion/observability/hooks.py +683 -0
- proxilion/observability/metrics.py +798 -0
- proxilion/observability/session_cost_tracker.py +1063 -0
- proxilion/policies/__init__.py +67 -0
- proxilion/policies/base.py +304 -0
- proxilion/policies/builtin.py +486 -0
- proxilion/policies/registry.py +376 -0
- proxilion/providers/__init__.py +201 -0
- proxilion/providers/adapter.py +468 -0
- proxilion/providers/anthropic_adapter.py +330 -0
- proxilion/providers/gemini_adapter.py +391 -0
- proxilion/providers/openai_adapter.py +294 -0
- proxilion/py.typed +0 -0
- proxilion/resilience/__init__.py +81 -0
- proxilion/resilience/degradation.py +615 -0
- proxilion/resilience/fallback.py +555 -0
- proxilion/resilience/retry.py +554 -0
- proxilion/scheduling/__init__.py +57 -0
- proxilion/scheduling/priority_queue.py +419 -0
- proxilion/scheduling/scheduler.py +459 -0
- proxilion/security/__init__.py +244 -0
- proxilion/security/agent_trust.py +968 -0
- proxilion/security/behavioral_drift.py +794 -0
- proxilion/security/cascade_protection.py +869 -0
- proxilion/security/circuit_breaker.py +428 -0
- proxilion/security/cost_limiter.py +690 -0
- proxilion/security/idor_protection.py +460 -0
- proxilion/security/intent_capsule.py +849 -0
- proxilion/security/intent_validator.py +495 -0
- proxilion/security/memory_integrity.py +767 -0
- proxilion/security/rate_limiter.py +509 -0
- proxilion/security/scope_enforcer.py +680 -0
- proxilion/security/sequence_validator.py +636 -0
- proxilion/security/trust_boundaries.py +784 -0
- proxilion/streaming/__init__.py +70 -0
- proxilion/streaming/detector.py +761 -0
- proxilion/streaming/transformer.py +674 -0
- proxilion/timeouts/__init__.py +55 -0
- proxilion/timeouts/decorators.py +477 -0
- proxilion/timeouts/manager.py +545 -0
- proxilion/tools/__init__.py +69 -0
- proxilion/tools/decorators.py +493 -0
- proxilion/tools/registry.py +732 -0
- proxilion/types.py +339 -0
- proxilion/validation/__init__.py +93 -0
- proxilion/validation/pydantic_schema.py +351 -0
- proxilion/validation/schema.py +651 -0
- proxilion-0.0.1.dist-info/METADATA +872 -0
- proxilion-0.0.1.dist-info/RECORD +94 -0
- proxilion-0.0.1.dist-info/WHEEL +4 -0
- proxilion-0.0.1.dist-info/licenses/LICENSE +21 -0
proxilion/audit/exporters/__init__.py
@@ -0,0 +1,58 @@
"""
Cloud storage exporters for Proxilion audit logs.

This module provides exporters for sending audit logs to cloud storage:

- AWS S3 / S3-compatible storage
- GCP Cloud Storage / BigQuery
- Azure Blob Storage / ADLS Gen2
- Multi-cloud redundant export

Example:
    >>> from proxilion.audit.exporters import S3Exporter, CloudExporterConfig
    >>>
    >>> config = CloudExporterConfig(
    ...     provider="aws",
    ...     bucket_name="my-audit-logs",
    ...     prefix="proxilion/prod/",
    ...     region="us-west-2",
    ... )
    >>> exporter = S3Exporter(config)
"""

from __future__ import annotations

from proxilion.audit.exporters.aws_s3 import S3DataLakeExporter, S3Exporter
from proxilion.audit.exporters.azure_storage import AzureBlobExporter, AzureDataLakeExporter
from proxilion.audit.exporters.cloud_base import (
    CloudExporter,
    CloudExporterConfig,
    CompressionType,
    ExportBatch,
    ExportFormat,
    ExportResult,
)
from proxilion.audit.exporters.gcp_storage import BigQueryExporter, GCSExporter
from proxilion.audit.exporters.multi_exporter import FailureStrategy, MultiCloudExporter

__all__ = [
    # Base
    "CloudExporter",
    "CloudExporterConfig",
    "ExportResult",
    "ExportBatch",
    "CompressionType",
    "ExportFormat",
    # AWS
    "S3Exporter",
    "S3DataLakeExporter",
    # GCP
    "GCSExporter",
    "BigQueryExporter",
    # Azure
    "AzureBlobExporter",
    "AzureDataLakeExporter",
    # Multi-cloud
    "MultiCloudExporter",
    "FailureStrategy",
]
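The re-exported names above are meant to be used together: build a CloudExporterConfig, hand it to a provider-specific exporter, and inspect the returned ExportResult. A minimal sketch based on the module docstring; the empty events list is a placeholder for audit events produced elsewhere in proxilion, and reading result.success / result.error assumes ExportResult exposes its constructor fields as attributes.

from proxilion.audit.exporters import CloudExporterConfig, S3Exporter

# Mirrors the docstring example: one config object drives the exporter.
config = CloudExporterConfig(
    provider="aws",
    bucket_name="my-audit-logs",
    prefix="proxilion/prod/",
    region="us-west-2",
)
exporter = S3Exporter(config)

# Placeholder: real audit events would come from proxilion's audit logging layer.
events: list = []

result = exporter.export(events)
if not result.success:
    print(f"Audit export failed: {result.error}")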
proxilion/audit/exporters/aws_s3.py
@@ -0,0 +1,636 @@
"""
AWS S3 exporter for Proxilion audit logs.

Supports exporting audit logs to:
- Amazon S3
- S3-compatible storage (MinIO, DigitalOcean Spaces, etc.)
- AWS Data Lake with Athena/Glue integration

Uses boto3 if available, falls back to stdlib urllib with SigV4 signing.
"""

from __future__ import annotations

import hashlib
import hmac
import json
import logging
import os
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from typing import Any

from proxilion.audit.exporters.cloud_base import (
    BaseCloudExporter,
    CloudExporterConfig,
    ExportBatch,
    ExportResult,
)

logger = logging.getLogger(__name__)

# Check for boto3 availability
try:
    import boto3
    from botocore.config import Config as BotoConfig
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False


class S3Exporter(BaseCloudExporter):
    """
    Export audit logs to Amazon S3 or S3-compatible storage.

    Uses boto3 if installed, otherwise falls back to stdlib urllib
    with AWS Signature Version 4 signing.

    Example:
        >>> config = CloudExporterConfig(
        ...     provider="aws",
        ...     bucket_name="my-audit-logs",
        ...     prefix="proxilion/prod/",
        ...     region="us-west-2",
        ... )
        >>> exporter = S3Exporter(config)
        >>> result = exporter.export(events)
    """

    def __init__(self, config: CloudExporterConfig) -> None:
        """
        Initialize the S3 exporter.

        Args:
            config: Exporter configuration.
        """
        super().__init__(config)
        self._client = None
        self._initialize_client()

    def _initialize_client(self) -> None:
        """Initialize the S3 client."""
        if HAS_BOTO3:
            self._init_boto3_client()
        else:
            self._init_urllib_client()

    def _init_boto3_client(self) -> None:
        """Initialize boto3 S3 client."""
        client_config = BotoConfig(
            connect_timeout=self.config.connection_timeout,
            read_timeout=self.config.read_timeout,
            retries={"max_attempts": 0},  # We handle retries ourselves
        )

        client_kwargs: dict[str, Any] = {
            "config": client_config,
        }

        if self.config.region:
            client_kwargs["region_name"] = self.config.region

        if self.config.endpoint_url:
            client_kwargs["endpoint_url"] = self.config.endpoint_url

        if self.config.credentials_path:
            # Load credentials from file
            creds = self._load_credentials_file(self.config.credentials_path)
            client_kwargs["aws_access_key_id"] = creds.get("aws_access_key_id")
            client_kwargs["aws_secret_access_key"] = creds.get("aws_secret_access_key")
            if "aws_session_token" in creds:
                client_kwargs["aws_session_token"] = creds["aws_session_token"]

        self._client = boto3.client("s3", **client_kwargs)

    def _init_urllib_client(self) -> None:
        """Initialize urllib-based client with SigV4 signing."""
        # For urllib fallback, we'll use environment variables or credentials file
        self._aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
        self._aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
        self._aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

        if self.config.credentials_path:
            creds = self._load_credentials_file(self.config.credentials_path)
            self._aws_access_key = creds.get("aws_access_key_id", self._aws_access_key)
            self._aws_secret_key = creds.get("aws_secret_access_key", self._aws_secret_key)
            self._aws_session_token = creds.get("aws_session_token", self._aws_session_token)

    def _load_credentials_file(self, path: str) -> dict[str, str]:
        """Load credentials from a JSON file."""
        try:
            with open(path) as f:
                return json.load(f)
        except Exception as e:
            logger.warning(f"Failed to load credentials from {path}: {e}")
            return {}

    def export_batch(self, batch: ExportBatch) -> ExportResult:
        """
        Export a batch to S3.

        Args:
            batch: The batch to export.

        Returns:
            ExportResult with success/failure information.
        """
        start_time = time.time()

        try:
            # Prepare data
            data = batch.to_bytes(self.config.compression)
            key = self.generate_key(batch.created_at, batch.batch_id)
            checksum = self.compute_checksum(data)

            # Upload with retry
            self.with_retry(self._upload_object, key, data)

            duration_ms = (time.time() - start_time) * 1000

            logger.info(
                f"Exported {batch.event_count} events to s3://{self.config.bucket_name}/{key}"
            )

            return ExportResult(
                success=True,
                events_exported=batch.event_count,
                batch_id=batch.batch_id,
                destination=f"s3://{self.config.bucket_name}/{key}",
                duration_ms=duration_ms,
                bytes_written=len(data),
                checksum=checksum,
            )

        except Exception as e:
            duration_ms = (time.time() - start_time) * 1000
            logger.error(f"Failed to export batch {batch.batch_id}: {e}")

            return ExportResult(
                success=False,
                events_exported=0,
                batch_id=batch.batch_id,
                error=str(e),
                duration_ms=duration_ms,
            )

    def _upload_object(self, key: str, data: bytes) -> None:
        """
        Upload an object to S3.

        Args:
            key: Object key.
            data: Object data.
        """
        if HAS_BOTO3:
            self._upload_boto3(key, data)
        else:
            self._upload_urllib(key, data)

    def _upload_boto3(self, key: str, data: bytes) -> None:
        """Upload using boto3."""
        extra_args: dict[str, str] = {
            "ContentType": self.get_content_type(),
        }

        encoding = self.get_content_encoding()
        if encoding:
            extra_args["ContentEncoding"] = encoding

        self._client.put_object(
            Bucket=self.config.bucket_name,
            Key=key,
            Body=data,
            **extra_args,
        )

    def _upload_urllib(self, key: str, data: bytes) -> None:
        """Upload using urllib with SigV4 signing."""
        if not self._aws_access_key or not self._aws_secret_key:
            raise ValueError(
                "AWS credentials not configured. Set AWS_ACCESS_KEY_ID and "
                "AWS_SECRET_ACCESS_KEY environment variables or use credentials_path."
            )

        region = self.config.region or "us-east-1"
        service = "s3"
        host = self.config.endpoint_url or f"s3.{region}.amazonaws.com"

        # Remove protocol from host if present
        if host.startswith("https://"):
            host = host[8:]
        elif host.startswith("http://"):
            host = host[7:]

        # Build URL
        url = f"https://{host}/{self.config.bucket_name}/{key}"

        # Create signed request
        headers = self._sign_request(
            method="PUT",
            url=url,
            host=host,
            region=region,
            service=service,
            payload=data,
        )

        headers["Content-Type"] = self.get_content_type()
        encoding = self.get_content_encoding()
        if encoding:
            headers["Content-Encoding"] = encoding

        # Make request
        request = urllib.request.Request(url, data=data, headers=headers, method="PUT")

        try:
            with urllib.request.urlopen(
                request, timeout=self.config.read_timeout
            ) as response:
                if response.status not in (200, 201, 204):
                    raise ValueError(f"S3 upload failed with status {response.status}")
        except urllib.error.HTTPError as e:
            raise ValueError(f"S3 upload failed: {e.code} {e.reason}") from e

    def _sign_request(
        self,
        method: str,
        url: str,
        host: str,
        region: str,
        service: str,
        payload: bytes,
    ) -> dict[str, str]:
        """
        Sign a request using AWS Signature Version 4.

        Args:
            method: HTTP method.
            url: Request URL.
            host: Host header value.
            region: AWS region.
            service: AWS service name.
            payload: Request payload.

        Returns:
            Dict of headers including Authorization.
        """
        # Parse URL
        parsed = urllib.parse.urlparse(url)
        canonical_uri = parsed.path or "/"
        canonical_querystring = parsed.query

        # Current time
        t = datetime.now(timezone.utc)
        amz_date = t.strftime("%Y%m%dT%H%M%SZ")
        date_stamp = t.strftime("%Y%m%d")

        # Create payload hash
        payload_hash = hashlib.sha256(payload).hexdigest()

        # Create canonical headers
        headers_to_sign = {
            "host": host,
            "x-amz-date": amz_date,
            "x-amz-content-sha256": payload_hash,
        }

        if self._aws_session_token:
            headers_to_sign["x-amz-security-token"] = self._aws_session_token

        signed_headers = ";".join(sorted(headers_to_sign.keys()))

        canonical_headers = ""
        for key in sorted(headers_to_sign.keys()):
            canonical_headers += f"{key}:{headers_to_sign[key]}\n"

        # Create canonical request
        canonical_request = (
            f"{method}\n"
            f"{canonical_uri}\n"
            f"{canonical_querystring}\n"
            f"{canonical_headers}\n"
            f"{signed_headers}\n"
            f"{payload_hash}"
        )

        # Create string to sign
        algorithm = "AWS4-HMAC-SHA256"
        credential_scope = f"{date_stamp}/{region}/{service}/aws4_request"
        string_to_sign = (
            f"{algorithm}\n"
            f"{amz_date}\n"
            f"{credential_scope}\n"
            f"{hashlib.sha256(canonical_request.encode()).hexdigest()}"
        )

        # Create signing key
        def sign(key: bytes, msg: str) -> bytes:
            return hmac.new(key, msg.encode(), hashlib.sha256).digest()

        k_date = sign(f"AWS4{self._aws_secret_key}".encode(), date_stamp)
        k_region = sign(k_date, region)
        k_service = sign(k_region, service)
        k_signing = sign(k_service, "aws4_request")

        # Create signature
        signature = hmac.new(k_signing, string_to_sign.encode(), hashlib.sha256).hexdigest()

        # Create authorization header
        authorization_header = (
            f"{algorithm} "
            f"Credential={self._aws_access_key}/{credential_scope}, "
            f"SignedHeaders={signed_headers}, "
            f"Signature={signature}"
        )

        # Build final headers
        result_headers = {
            "x-amz-date": amz_date,
            "x-amz-content-sha256": payload_hash,
            "Authorization": authorization_header,
        }

        if self._aws_session_token:
            result_headers["x-amz-security-token"] = self._aws_session_token

        return result_headers

    def health_check(self) -> bool:
        """
        Check if we can connect to S3.

        Returns:
            True if healthy.
        """
        try:
            if HAS_BOTO3:
                self._client.head_bucket(Bucket=self.config.bucket_name)
            else:
                # Try to list bucket (HEAD not easily done with urllib)
                region = self.config.region or "us-east-1"
                host = self.config.endpoint_url or f"s3.{region}.amazonaws.com"
                if host.startswith("https://"):
                    host = host[8:]
                url = f"https://{host}/{self.config.bucket_name}?max-keys=1"

                headers = self._sign_request(
                    method="GET",
                    url=url,
                    host=host,
                    region=region,
                    service="s3",
                    payload=b"",
                )

                request = urllib.request.Request(url, headers=headers)
                with urllib.request.urlopen(request, timeout=10) as response:
                    return response.status == 200

            return True
        except Exception as e:
            logger.warning(f"S3 health check failed: {e}")
            return False

    def list_exports(
        self,
        start_date: datetime | None = None,
        end_date: datetime | None = None,
        max_keys: int = 1000,
    ) -> list[str]:
        """
        List exported files in the bucket.

        Args:
            start_date: Filter to exports after this date.
            end_date: Filter to exports before this date.
            max_keys: Maximum number of keys to return.

        Returns:
            List of object keys.
        """
        if not HAS_BOTO3:
            raise NotImplementedError("list_exports requires boto3")

        prefix = self.config.prefix

        # Add date prefix if filtering
        if start_date:
            prefix += f"{start_date.year:04d}/"

        paginator = self._client.get_paginator("list_objects_v2")
        pages = paginator.paginate(
            Bucket=self.config.bucket_name,
            Prefix=prefix,
            MaxKeys=max_keys,
        )

        keys = []
        for page in pages:
            for obj in page.get("Contents", []):
                key = obj["Key"]

                # Filter by date if needed
                if end_date:
                    # Extract date from key path
                    parts = key.split("/")
                    if len(parts) >= 4:
                        try:
                            year = int(parts[-4])
                            month = int(parts[-3])
                            day = int(parts[-2])
                            key_date = datetime(year, month, day, tzinfo=timezone.utc)
                            if key_date > end_date:
                                continue
                        except (ValueError, IndexError):
                            pass

                keys.append(key)

                if len(keys) >= max_keys:
                    break

        return keys


class S3DataLakeExporter(S3Exporter):
    """
    Export audit logs to S3 with Data Lake / Athena optimization.

    Extends S3Exporter with:
    - Partitioned paths compatible with Athena/Glue
    - Optional Parquet format for analytics
    - Hive-style partition naming

    Example:
        >>> config = CloudExporterConfig(
        ...     provider="aws",
        ...     bucket_name="my-data-lake",
        ...     prefix="audit/proxilion/",
        ...     format=ExportFormat.PARQUET,
        ... )
        >>> exporter = S3DataLakeExporter(config)
    """

    def __init__(
        self,
        config: CloudExporterConfig,
        use_hive_partitions: bool = True,
    ) -> None:
        """
        Initialize the Data Lake exporter.

        Args:
            config: Exporter configuration.
            use_hive_partitions: Use Hive-style partition naming (year=YYYY/).
        """
        super().__init__(config)
        self.use_hive_partitions = use_hive_partitions

    def generate_key(
        self,
        timestamp: datetime | None = None,
        batch_id: str | None = None,
    ) -> str:
        """
        Generate an object key with Hive-style partitioning.

        Format: {prefix}/year=YYYY/month=MM/day=DD/hour=HH/{batch_id}.{ext}

        Args:
            timestamp: Timestamp for partitioning.
            batch_id: Unique batch identifier.

        Returns:
            The generated object key.
        """
        if timestamp is None:
            timestamp = datetime.now(timezone.utc)

        if batch_id is None:
            with self._lock:
                self._batch_counter += 1
                batch_id = f"{timestamp.strftime('%Y%m%d%H%M%S')}_{self._batch_counter:06d}"

        # Determine file extension
        ext = self.config.format.value
        if self.config.compression.value != "none":
            ext += f".{self.config.compression.value}"

        # Build partitioned path
        if self.use_hive_partitions:
            key = (
                f"{self.config.prefix}"
                f"year={timestamp.year:04d}/"
                f"month={timestamp.month:02d}/"
                f"day={timestamp.day:02d}/"
                f"hour={timestamp.hour:02d}/"
                f"{batch_id}.{ext}"
            )
        else:
            key = super().generate_key(timestamp, batch_id)

        return key

    def get_athena_table_ddl(
        self,
        table_name: str,
        database: str = "default",
    ) -> str:
        """
        Generate Athena CREATE TABLE DDL for the audit logs.

        Args:
            table_name: Name for the Athena table.
            database: Athena database name.

        Returns:
            CREATE TABLE DDL statement.
        """
        location = f"s3://{self.config.bucket_name}/{self.config.prefix}"

        if self.config.format.value == "parquet":
            serde = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
            input_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
            output_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
        else:
            serde = "org.openx.data.jsonserde.JsonSerDe"
            input_format = "org.apache.hadoop.mapred.TextInputFormat"
            output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

        ddl = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS `{database}`.`{table_name}` (
    `event_id` string,
    `timestamp` timestamp,
    `sequence_number` bigint,
    `event_type` string,
    `user_id` string,
    `user_roles` array<string>,
    `session_id` string,
    `agent_id` string,
    `tool_name` string,
    `tool_arguments` string,
    `authorization_allowed` boolean,
    `authorization_reason` string,
    `policies_evaluated` array<string>,
    `event_hash` string,
    `previous_hash` string
)
PARTITIONED BY (
    `year` int,
    `month` int,
    `day` int,
    `hour` int
)
ROW FORMAT SERDE '{serde}'
STORED AS INPUTFORMAT '{input_format}'
OUTPUTFORMAT '{output_format}'
LOCATION '{location}'
TBLPROPERTIES ('has_encrypted_data'='false');
"""
        return ddl.strip()

    def get_glue_schema(self) -> dict[str, Any]:
        """
        Get Glue catalog schema for the audit logs.

        Returns:
            Schema dictionary for Glue catalog.
        """
        columns = [
            {"Name": "event_id", "Type": "string"},
            {"Name": "timestamp", "Type": "timestamp"},
            {"Name": "sequence_number", "Type": "bigint"},
            {"Name": "event_type", "Type": "string"},
            {"Name": "user_id", "Type": "string"},
            {"Name": "user_roles", "Type": "array<string>"},
            {"Name": "session_id", "Type": "string"},
            {"Name": "agent_id", "Type": "string"},
            {"Name": "tool_name", "Type": "string"},
            {"Name": "tool_arguments", "Type": "string"},
            {"Name": "authorization_allowed", "Type": "boolean"},
            {"Name": "authorization_reason", "Type": "string"},
            {"Name": "policies_evaluated", "Type": "array<string>"},
            {"Name": "event_hash", "Type": "string"},
            {"Name": "previous_hash", "Type": "string"},
        ]

        partition_keys = [
            {"Name": "year", "Type": "int"},
            {"Name": "month", "Type": "int"},
            {"Name": "day", "Type": "int"},
            {"Name": "hour", "Type": "int"},
        ]

        return {
            "Columns": columns,
            "PartitionKeys": partition_keys,
            "Location": f"s3://{self.config.bucket_name}/{self.config.prefix}",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            "SerdeInfo": {
                "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe",
            },
        }
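The Hive-style key layout and the Athena DDL generator above are designed to line up: objects written under year=/month=/day=/hour= prefixes become queryable once the generated CREATE EXTERNAL TABLE statement has been run and partitions are loaded (for example with MSCK REPAIR TABLE). A small sketch using only the constructor and method signatures shown above; the exact key suffix depends on the configured format and compression defaults.

from datetime import datetime, timezone

from proxilion.audit.exporters import (
    CloudExporterConfig,
    ExportFormat,
    S3DataLakeExporter,
)

config = CloudExporterConfig(
    provider="aws",
    bucket_name="my-data-lake",
    prefix="audit/proxilion/",
    format=ExportFormat.PARQUET,
)
exporter = S3DataLakeExporter(config, use_hive_partitions=True)

# Partitioned key, e.g. audit/proxilion/year=2025/month=01/day=02/hour=03/batch-1.parquet
# (a compression suffix is appended when compression is enabled).
key = exporter.generate_key(
    timestamp=datetime(2025, 1, 2, 3, tzinfo=timezone.utc),
    batch_id="batch-1",
)
print(key)

# One-time setup: register the table so Athena/Glue can query the exports.
print(exporter.get_athena_table_ddl("proxilion_audit", database="audit"))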