proxilion-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. proxilion/__init__.py +136 -0
  2. proxilion/audit/__init__.py +133 -0
  3. proxilion/audit/base_exporters.py +527 -0
  4. proxilion/audit/compliance/__init__.py +130 -0
  5. proxilion/audit/compliance/base.py +457 -0
  6. proxilion/audit/compliance/eu_ai_act.py +603 -0
  7. proxilion/audit/compliance/iso27001.py +544 -0
  8. proxilion/audit/compliance/soc2.py +491 -0
  9. proxilion/audit/events.py +493 -0
  10. proxilion/audit/explainability.py +1173 -0
  11. proxilion/audit/exporters/__init__.py +58 -0
  12. proxilion/audit/exporters/aws_s3.py +636 -0
  13. proxilion/audit/exporters/azure_storage.py +608 -0
  14. proxilion/audit/exporters/cloud_base.py +468 -0
  15. proxilion/audit/exporters/gcp_storage.py +570 -0
  16. proxilion/audit/exporters/multi_exporter.py +498 -0
  17. proxilion/audit/hash_chain.py +652 -0
  18. proxilion/audit/logger.py +543 -0
  19. proxilion/caching/__init__.py +49 -0
  20. proxilion/caching/tool_cache.py +633 -0
  21. proxilion/context/__init__.py +73 -0
  22. proxilion/context/context_window.py +556 -0
  23. proxilion/context/message_history.py +505 -0
  24. proxilion/context/session.py +735 -0
  25. proxilion/contrib/__init__.py +51 -0
  26. proxilion/contrib/anthropic.py +609 -0
  27. proxilion/contrib/google.py +1012 -0
  28. proxilion/contrib/langchain.py +641 -0
  29. proxilion/contrib/mcp.py +893 -0
  30. proxilion/contrib/openai.py +646 -0
  31. proxilion/core.py +3058 -0
  32. proxilion/decorators.py +966 -0
  33. proxilion/engines/__init__.py +287 -0
  34. proxilion/engines/base.py +266 -0
  35. proxilion/engines/casbin_engine.py +412 -0
  36. proxilion/engines/opa_engine.py +493 -0
  37. proxilion/engines/simple.py +437 -0
  38. proxilion/exceptions.py +887 -0
  39. proxilion/guards/__init__.py +54 -0
  40. proxilion/guards/input_guard.py +522 -0
  41. proxilion/guards/output_guard.py +634 -0
  42. proxilion/observability/__init__.py +198 -0
  43. proxilion/observability/cost_tracker.py +866 -0
  44. proxilion/observability/hooks.py +683 -0
  45. proxilion/observability/metrics.py +798 -0
  46. proxilion/observability/session_cost_tracker.py +1063 -0
  47. proxilion/policies/__init__.py +67 -0
  48. proxilion/policies/base.py +304 -0
  49. proxilion/policies/builtin.py +486 -0
  50. proxilion/policies/registry.py +376 -0
  51. proxilion/providers/__init__.py +201 -0
  52. proxilion/providers/adapter.py +468 -0
  53. proxilion/providers/anthropic_adapter.py +330 -0
  54. proxilion/providers/gemini_adapter.py +391 -0
  55. proxilion/providers/openai_adapter.py +294 -0
  56. proxilion/py.typed +0 -0
  57. proxilion/resilience/__init__.py +81 -0
  58. proxilion/resilience/degradation.py +615 -0
  59. proxilion/resilience/fallback.py +555 -0
  60. proxilion/resilience/retry.py +554 -0
  61. proxilion/scheduling/__init__.py +57 -0
  62. proxilion/scheduling/priority_queue.py +419 -0
  63. proxilion/scheduling/scheduler.py +459 -0
  64. proxilion/security/__init__.py +244 -0
  65. proxilion/security/agent_trust.py +968 -0
  66. proxilion/security/behavioral_drift.py +794 -0
  67. proxilion/security/cascade_protection.py +869 -0
  68. proxilion/security/circuit_breaker.py +428 -0
  69. proxilion/security/cost_limiter.py +690 -0
  70. proxilion/security/idor_protection.py +460 -0
  71. proxilion/security/intent_capsule.py +849 -0
  72. proxilion/security/intent_validator.py +495 -0
  73. proxilion/security/memory_integrity.py +767 -0
  74. proxilion/security/rate_limiter.py +509 -0
  75. proxilion/security/scope_enforcer.py +680 -0
  76. proxilion/security/sequence_validator.py +636 -0
  77. proxilion/security/trust_boundaries.py +784 -0
  78. proxilion/streaming/__init__.py +70 -0
  79. proxilion/streaming/detector.py +761 -0
  80. proxilion/streaming/transformer.py +674 -0
  81. proxilion/timeouts/__init__.py +55 -0
  82. proxilion/timeouts/decorators.py +477 -0
  83. proxilion/timeouts/manager.py +545 -0
  84. proxilion/tools/__init__.py +69 -0
  85. proxilion/tools/decorators.py +493 -0
  86. proxilion/tools/registry.py +732 -0
  87. proxilion/types.py +339 -0
  88. proxilion/validation/__init__.py +93 -0
  89. proxilion/validation/pydantic_schema.py +351 -0
  90. proxilion/validation/schema.py +651 -0
  91. proxilion-0.0.1.dist-info/METADATA +872 -0
  92. proxilion-0.0.1.dist-info/RECORD +94 -0
  93. proxilion-0.0.1.dist-info/WHEEL +4 -0
  94. proxilion-0.0.1.dist-info/licenses/LICENSE +21 -0
proxilion/audit/exporters/__init__.py
@@ -0,0 +1,58 @@
+ """
+ Cloud storage exporters for Proxilion audit logs.
+
+ This module provides exporters for sending audit logs to cloud storage:
+
+ - AWS S3 / S3-compatible storage
+ - GCP Cloud Storage / BigQuery
+ - Azure Blob Storage / ADLS Gen2
+ - Multi-cloud redundant export
+
+ Example:
+     >>> from proxilion.audit.exporters import S3Exporter, CloudExporterConfig
+     >>>
+     >>> config = CloudExporterConfig(
+     ...     provider="aws",
+     ...     bucket_name="my-audit-logs",
+     ...     prefix="proxilion/prod/",
+     ...     region="us-west-2",
+     ... )
+     >>> exporter = S3Exporter(config)
+ """
+
+ from __future__ import annotations
+
+ from proxilion.audit.exporters.aws_s3 import S3DataLakeExporter, S3Exporter
+ from proxilion.audit.exporters.azure_storage import AzureBlobExporter, AzureDataLakeExporter
+ from proxilion.audit.exporters.cloud_base import (
+     CloudExporter,
+     CloudExporterConfig,
+     CompressionType,
+     ExportBatch,
+     ExportFormat,
+     ExportResult,
+ )
+ from proxilion.audit.exporters.gcp_storage import BigQueryExporter, GCSExporter
+ from proxilion.audit.exporters.multi_exporter import FailureStrategy, MultiCloudExporter
+
+ __all__ = [
+     # Base
+     "CloudExporter",
+     "CloudExporterConfig",
+     "ExportResult",
+     "ExportBatch",
+     "CompressionType",
+     "ExportFormat",
+     # AWS
+     "S3Exporter",
+     "S3DataLakeExporter",
+     # GCP
+     "GCSExporter",
+     "BigQueryExporter",
+     # Azure
+     "AzureBlobExporter",
+     "AzureDataLakeExporter",
+     # Multi-cloud
+     "MultiCloudExporter",
+     "FailureStrategy",
+ ]
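
The `__all__` surface above spans the single-cloud exporters and the multi-cloud wrapper. A minimal sketch of redundant export follows; it assumes MultiCloudExporter accepts a list of child exporters plus a FailureStrategy (its constructor and the FailureStrategy members are not shown in this diff), and that `events` is the list of audit events produced elsewhere in proxilion.audit:

# Hedged sketch: redundant export to AWS and GCP. The MultiCloudExporter
# call signature and the FailureStrategy.CONTINUE member are assumptions,
# not confirmed by this diff.
from proxilion.audit.exporters import (
    CloudExporterConfig,
    FailureStrategy,
    GCSExporter,
    MultiCloudExporter,
    S3Exporter,
)

s3 = S3Exporter(CloudExporterConfig(
    provider="aws", bucket_name="audit-primary",
    prefix="proxilion/prod/", region="us-west-2",
))
gcs = GCSExporter(CloudExporterConfig(
    provider="gcp", bucket_name="audit-replica", prefix="proxilion/prod/",
))

exporter = MultiCloudExporter([s3, gcs], failure_strategy=FailureStrategy.CONTINUE)
result = exporter.export(events)  # events: audit events from proxilion.audit (assumed)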
proxilion/audit/exporters/aws_s3.py
@@ -0,0 +1,636 @@
+ """
+ AWS S3 exporter for Proxilion audit logs.
+
+ Supports exporting audit logs to:
+ - Amazon S3
+ - S3-compatible storage (MinIO, DigitalOcean Spaces, etc.)
+ - AWS Data Lake with Athena/Glue integration
+
+ Uses boto3 if available, falls back to stdlib urllib with SigV4 signing.
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import hmac
+ import json
+ import logging
+ import os
+ import time
+ import urllib.error
+ import urllib.parse
+ import urllib.request
+ from datetime import datetime, timezone
+ from typing import Any
+
+ from proxilion.audit.exporters.cloud_base import (
+     BaseCloudExporter,
+     CloudExporterConfig,
+     ExportBatch,
+     ExportResult,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Check for boto3 availability
+ try:
+     import boto3
+     from botocore.config import Config as BotoConfig
+     HAS_BOTO3 = True
+ except ImportError:
+     HAS_BOTO3 = False
+
+
+ class S3Exporter(BaseCloudExporter):
+     """
+     Export audit logs to Amazon S3 or S3-compatible storage.
+
+     Uses boto3 if installed, otherwise falls back to stdlib urllib
+     with AWS Signature Version 4 signing.
+
+     Example:
+         >>> config = CloudExporterConfig(
+         ...     provider="aws",
+         ...     bucket_name="my-audit-logs",
+         ...     prefix="proxilion/prod/",
+         ...     region="us-west-2",
+         ... )
+         >>> exporter = S3Exporter(config)
+         >>> result = exporter.export(events)
+     """
+
+     def __init__(self, config: CloudExporterConfig) -> None:
+         """
+         Initialize the S3 exporter.
+
+         Args:
+             config: Exporter configuration.
+         """
+         super().__init__(config)
+         self._client = None
+         self._initialize_client()
+
+     def _initialize_client(self) -> None:
+         """Initialize the S3 client."""
+         if HAS_BOTO3:
+             self._init_boto3_client()
+         else:
+             self._init_urllib_client()
+
+     def _init_boto3_client(self) -> None:
+         """Initialize boto3 S3 client."""
+         client_config = BotoConfig(
+             connect_timeout=self.config.connection_timeout,
+             read_timeout=self.config.read_timeout,
+             retries={"max_attempts": 0},  # We handle retries ourselves
+         )
+
+         client_kwargs: dict[str, Any] = {
+             "config": client_config,
+         }
+
+         if self.config.region:
+             client_kwargs["region_name"] = self.config.region
+
+         if self.config.endpoint_url:
+             client_kwargs["endpoint_url"] = self.config.endpoint_url
+
+         if self.config.credentials_path:
+             # Load credentials from file
+             creds = self._load_credentials_file(self.config.credentials_path)
+             client_kwargs["aws_access_key_id"] = creds.get("aws_access_key_id")
+             client_kwargs["aws_secret_access_key"] = creds.get("aws_secret_access_key")
+             if "aws_session_token" in creds:
+                 client_kwargs["aws_session_token"] = creds["aws_session_token"]
+
+         self._client = boto3.client("s3", **client_kwargs)
+
+     def _init_urllib_client(self) -> None:
+         """Initialize urllib-based client with SigV4 signing."""
+         # For urllib fallback, we'll use environment variables or credentials file
+         self._aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
+         self._aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
+         self._aws_session_token = os.environ.get("AWS_SESSION_TOKEN")
+
+         if self.config.credentials_path:
+             creds = self._load_credentials_file(self.config.credentials_path)
+             self._aws_access_key = creds.get("aws_access_key_id", self._aws_access_key)
+             self._aws_secret_key = creds.get("aws_secret_access_key", self._aws_secret_key)
+             self._aws_session_token = creds.get("aws_session_token", self._aws_session_token)
+
+     def _load_credentials_file(self, path: str) -> dict[str, str]:
+         """Load credentials from a JSON file."""
+         try:
+             with open(path) as f:
+                 return json.load(f)
+         except Exception as e:
+             logger.warning(f"Failed to load credentials from {path}: {e}")
+             return {}
+
+     def export_batch(self, batch: ExportBatch) -> ExportResult:
+         """
+         Export a batch to S3.
+
+         Args:
+             batch: The batch to export.
+
+         Returns:
+             ExportResult with success/failure information.
+         """
+         start_time = time.time()
+
+         try:
+             # Prepare data
+             data = batch.to_bytes(self.config.compression)
+             key = self.generate_key(batch.created_at, batch.batch_id)
+             checksum = self.compute_checksum(data)
+
+             # Upload with retry
+             self.with_retry(self._upload_object, key, data)
+
+             duration_ms = (time.time() - start_time) * 1000
+
+             logger.info(
+                 f"Exported {batch.event_count} events to s3://{self.config.bucket_name}/{key}"
+             )
+
+             return ExportResult(
+                 success=True,
+                 events_exported=batch.event_count,
+                 batch_id=batch.batch_id,
+                 destination=f"s3://{self.config.bucket_name}/{key}",
+                 duration_ms=duration_ms,
+                 bytes_written=len(data),
+                 checksum=checksum,
+             )
+
+         except Exception as e:
+             duration_ms = (time.time() - start_time) * 1000
+             logger.error(f"Failed to export batch {batch.batch_id}: {e}")
+
+             return ExportResult(
+                 success=False,
+                 events_exported=0,
+                 batch_id=batch.batch_id,
+                 error=str(e),
+                 duration_ms=duration_ms,
+             )
+
+     def _upload_object(self, key: str, data: bytes) -> None:
+         """
+         Upload an object to S3.
+
+         Args:
+             key: Object key.
+             data: Object data.
+         """
+         if HAS_BOTO3:
+             self._upload_boto3(key, data)
+         else:
+             self._upload_urllib(key, data)
+
+     def _upload_boto3(self, key: str, data: bytes) -> None:
+         """Upload using boto3."""
+         extra_args: dict[str, str] = {
+             "ContentType": self.get_content_type(),
+         }
+
+         encoding = self.get_content_encoding()
+         if encoding:
+             extra_args["ContentEncoding"] = encoding
+
+         self._client.put_object(
+             Bucket=self.config.bucket_name,
+             Key=key,
+             Body=data,
+             **extra_args,
+         )
+
+     def _upload_urllib(self, key: str, data: bytes) -> None:
+         """Upload using urllib with SigV4 signing."""
+         if not self._aws_access_key or not self._aws_secret_key:
+             raise ValueError(
+                 "AWS credentials not configured. Set AWS_ACCESS_KEY_ID and "
+                 "AWS_SECRET_ACCESS_KEY environment variables or use credentials_path."
+             )
+
+         region = self.config.region or "us-east-1"
+         service = "s3"
+         host = self.config.endpoint_url or f"s3.{region}.amazonaws.com"
+
+         # Remove protocol from host if present
+         if host.startswith("https://"):
+             host = host[8:]
+         elif host.startswith("http://"):
+             host = host[7:]
+
+         # Build URL
+         url = f"https://{host}/{self.config.bucket_name}/{key}"
+
+         # Create signed request
+         headers = self._sign_request(
+             method="PUT",
+             url=url,
+             host=host,
+             region=region,
+             service=service,
+             payload=data,
+         )
+
+         headers["Content-Type"] = self.get_content_type()
+         encoding = self.get_content_encoding()
+         if encoding:
+             headers["Content-Encoding"] = encoding
+
+         # Make request
+         request = urllib.request.Request(url, data=data, headers=headers, method="PUT")
+
+         try:
+             with urllib.request.urlopen(
+                 request, timeout=self.config.read_timeout
+             ) as response:
+                 if response.status not in (200, 201, 204):
+                     raise ValueError(f"S3 upload failed with status {response.status}")
+         except urllib.error.HTTPError as e:
+             raise ValueError(f"S3 upload failed: {e.code} {e.reason}") from e
+
+     def _sign_request(
+         self,
+         method: str,
+         url: str,
+         host: str,
+         region: str,
+         service: str,
+         payload: bytes,
+     ) -> dict[str, str]:
+         """
+         Sign a request using AWS Signature Version 4.
+
+         Args:
+             method: HTTP method.
+             url: Request URL.
+             host: Host header value.
+             region: AWS region.
+             service: AWS service name.
+             payload: Request payload.
+
+         Returns:
+             Dict of headers including Authorization.
+         """
+         # Parse URL
+         parsed = urllib.parse.urlparse(url)
+         canonical_uri = parsed.path or "/"
+         canonical_querystring = parsed.query
+
+         # Current time
+         t = datetime.now(timezone.utc)
+         amz_date = t.strftime("%Y%m%dT%H%M%SZ")
+         date_stamp = t.strftime("%Y%m%d")
+
+         # Create payload hash
+         payload_hash = hashlib.sha256(payload).hexdigest()
+
+         # Create canonical headers
+         headers_to_sign = {
+             "host": host,
+             "x-amz-date": amz_date,
+             "x-amz-content-sha256": payload_hash,
+         }
+
+         if self._aws_session_token:
+             headers_to_sign["x-amz-security-token"] = self._aws_session_token
+
+         signed_headers = ";".join(sorted(headers_to_sign.keys()))
+
+         canonical_headers = ""
+         for key in sorted(headers_to_sign.keys()):
+             canonical_headers += f"{key}:{headers_to_sign[key]}\n"
+
+         # Create canonical request
+         canonical_request = (
+             f"{method}\n"
+             f"{canonical_uri}\n"
+             f"{canonical_querystring}\n"
+             f"{canonical_headers}\n"
+             f"{signed_headers}\n"
+             f"{payload_hash}"
+         )
+
+         # Create string to sign
+         algorithm = "AWS4-HMAC-SHA256"
+         credential_scope = f"{date_stamp}/{region}/{service}/aws4_request"
+         string_to_sign = (
+             f"{algorithm}\n"
+             f"{amz_date}\n"
+             f"{credential_scope}\n"
+             f"{hashlib.sha256(canonical_request.encode()).hexdigest()}"
+         )
+
+         # Create signing key
+         def sign(key: bytes, msg: str) -> bytes:
+             return hmac.new(key, msg.encode(), hashlib.sha256).digest()
+
+         k_date = sign(f"AWS4{self._aws_secret_key}".encode(), date_stamp)
+         k_region = sign(k_date, region)
+         k_service = sign(k_region, service)
+         k_signing = sign(k_service, "aws4_request")
+
+         # Create signature
+         signature = hmac.new(k_signing, string_to_sign.encode(), hashlib.sha256).hexdigest()
+
+         # Create authorization header
+         authorization_header = (
+             f"{algorithm} "
+             f"Credential={self._aws_access_key}/{credential_scope}, "
+             f"SignedHeaders={signed_headers}, "
+             f"Signature={signature}"
+         )
+
+         # Build final headers
+         result_headers = {
+             "x-amz-date": amz_date,
+             "x-amz-content-sha256": payload_hash,
+             "Authorization": authorization_header,
+         }
+
+         if self._aws_session_token:
+             result_headers["x-amz-security-token"] = self._aws_session_token
+
+         return result_headers
+
+     def health_check(self) -> bool:
+         """
+         Check if we can connect to S3.
+
+         Returns:
+             True if healthy.
+         """
+         try:
+             if HAS_BOTO3:
+                 self._client.head_bucket(Bucket=self.config.bucket_name)
+             else:
+                 # Try to list bucket (HEAD not easily done with urllib)
+                 region = self.config.region or "us-east-1"
+                 host = self.config.endpoint_url or f"s3.{region}.amazonaws.com"
+                 if host.startswith("https://"):
+                     host = host[8:]
+                 url = f"https://{host}/{self.config.bucket_name}?max-keys=1"
+
+                 headers = self._sign_request(
+                     method="GET",
+                     url=url,
+                     host=host,
+                     region=region,
+                     service="s3",
+                     payload=b"",
+                 )
+
+                 request = urllib.request.Request(url, headers=headers)
+                 with urllib.request.urlopen(request, timeout=10) as response:
+                     return response.status == 200
+
+             return True
+         except Exception as e:
+             logger.warning(f"S3 health check failed: {e}")
+             return False
+
+     def list_exports(
+         self,
+         start_date: datetime | None = None,
+         end_date: datetime | None = None,
+         max_keys: int = 1000,
+     ) -> list[str]:
+         """
+         List exported files in the bucket.
+
+         Args:
+             start_date: Filter to exports after this date.
+             end_date: Filter to exports before this date.
+             max_keys: Maximum number of keys to return.
+
+         Returns:
+             List of object keys.
+         """
+         if not HAS_BOTO3:
+             raise NotImplementedError("list_exports requires boto3")
+
+         prefix = self.config.prefix
+
+         # Add date prefix if filtering
+         if start_date:
+             prefix += f"{start_date.year:04d}/"
+
+         paginator = self._client.get_paginator("list_objects_v2")
+         pages = paginator.paginate(
+             Bucket=self.config.bucket_name,
+             Prefix=prefix,
+             MaxKeys=max_keys,
+         )
+
+         keys = []
+         for page in pages:
+             for obj in page.get("Contents", []):
+                 key = obj["Key"]
+
+                 # Filter by date if needed
+                 if end_date:
+                     # Extract date from key path
+                     parts = key.split("/")
+                     if len(parts) >= 4:
+                         try:
+                             year = int(parts[-4])
+                             month = int(parts[-3])
+                             day = int(parts[-2])
+                             key_date = datetime(year, month, day, tzinfo=timezone.utc)
+                             if key_date > end_date:
+                                 continue
+                         except (ValueError, IndexError):
+                             pass
+
+                 keys.append(key)
+
+                 if len(keys) >= max_keys:
+                     break
+
+         return keys
+
+
+ class S3DataLakeExporter(S3Exporter):
+     """
+     Export audit logs to S3 with Data Lake / Athena optimization.
+
+     Extends S3Exporter with:
+     - Partitioned paths compatible with Athena/Glue
+     - Optional Parquet format for analytics
+     - Hive-style partition naming
+
+     Example:
+         >>> config = CloudExporterConfig(
+         ...     provider="aws",
+         ...     bucket_name="my-data-lake",
+         ...     prefix="audit/proxilion/",
+         ...     format=ExportFormat.PARQUET,
+         ... )
+         >>> exporter = S3DataLakeExporter(config)
+     """
+
+     def __init__(
+         self,
+         config: CloudExporterConfig,
+         use_hive_partitions: bool = True,
+     ) -> None:
+         """
+         Initialize the Data Lake exporter.
+
+         Args:
+             config: Exporter configuration.
+             use_hive_partitions: Use Hive-style partition naming (year=YYYY/).
+         """
+         super().__init__(config)
+         self.use_hive_partitions = use_hive_partitions
+
+     def generate_key(
+         self,
+         timestamp: datetime | None = None,
+         batch_id: str | None = None,
+     ) -> str:
+         """
+         Generate an object key with Hive-style partitioning.
+
+         Format: {prefix}/year=YYYY/month=MM/day=DD/hour=HH/{batch_id}.{ext}
+
+         Args:
+             timestamp: Timestamp for partitioning.
+             batch_id: Unique batch identifier.
+
+         Returns:
+             The generated object key.
+         """
+         if timestamp is None:
+             timestamp = datetime.now(timezone.utc)
+
+         if batch_id is None:
+             with self._lock:
+                 self._batch_counter += 1
+                 batch_id = f"{timestamp.strftime('%Y%m%d%H%M%S')}_{self._batch_counter:06d}"
+
+         # Determine file extension
+         ext = self.config.format.value
+         if self.config.compression.value != "none":
+             ext += f".{self.config.compression.value}"
+
+         # Build partitioned path
+         if self.use_hive_partitions:
+             key = (
+                 f"{self.config.prefix}"
+                 f"year={timestamp.year:04d}/"
+                 f"month={timestamp.month:02d}/"
+                 f"day={timestamp.day:02d}/"
+                 f"hour={timestamp.hour:02d}/"
+                 f"{batch_id}.{ext}"
+             )
+         else:
+             key = super().generate_key(timestamp, batch_id)
+
+         return key
+
+     def get_athena_table_ddl(
+         self,
+         table_name: str,
+         database: str = "default",
+     ) -> str:
+         """
+         Generate Athena CREATE TABLE DDL for the audit logs.
+
+         Args:
+             table_name: Name for the Athena table.
+             database: Athena database name.
+
+         Returns:
+             CREATE TABLE DDL statement.
+         """
+         location = f"s3://{self.config.bucket_name}/{self.config.prefix}"
+
+         if self.config.format.value == "parquet":
+             serde = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
+             input_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
+             output_format = "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
+         else:
+             serde = "org.openx.data.jsonserde.JsonSerDe"
+             input_format = "org.apache.hadoop.mapred.TextInputFormat"
+             output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
+
+         ddl = f"""
+ CREATE EXTERNAL TABLE IF NOT EXISTS `{database}`.`{table_name}` (
+     `event_id` string,
+     `timestamp` timestamp,
+     `sequence_number` bigint,
+     `event_type` string,
+     `user_id` string,
+     `user_roles` array<string>,
+     `session_id` string,
+     `agent_id` string,
+     `tool_name` string,
+     `tool_arguments` string,
+     `authorization_allowed` boolean,
+     `authorization_reason` string,
+     `policies_evaluated` array<string>,
+     `event_hash` string,
+     `previous_hash` string
+ )
+ PARTITIONED BY (
+     `year` int,
+     `month` int,
+     `day` int,
+     `hour` int
+ )
+ ROW FORMAT SERDE '{serde}'
+ STORED AS INPUTFORMAT '{input_format}'
+ OUTPUTFORMAT '{output_format}'
+ LOCATION '{location}'
+ TBLPROPERTIES ('has_encrypted_data'='false');
+ """
+         return ddl.strip()
+
+     def get_glue_schema(self) -> dict[str, Any]:
+         """
+         Get Glue catalog schema for the audit logs.
+
+         Returns:
+             Schema dictionary for Glue catalog.
+         """
+         columns = [
+             {"Name": "event_id", "Type": "string"},
+             {"Name": "timestamp", "Type": "timestamp"},
+             {"Name": "sequence_number", "Type": "bigint"},
+             {"Name": "event_type", "Type": "string"},
+             {"Name": "user_id", "Type": "string"},
+             {"Name": "user_roles", "Type": "array<string>"},
+             {"Name": "session_id", "Type": "string"},
+             {"Name": "agent_id", "Type": "string"},
+             {"Name": "tool_name", "Type": "string"},
+             {"Name": "tool_arguments", "Type": "string"},
+             {"Name": "authorization_allowed", "Type": "boolean"},
+             {"Name": "authorization_reason", "Type": "string"},
+             {"Name": "policies_evaluated", "Type": "array<string>"},
+             {"Name": "event_hash", "Type": "string"},
+             {"Name": "previous_hash", "Type": "string"},
+         ]
+
+         partition_keys = [
+             {"Name": "year", "Type": "int"},
+             {"Name": "month", "Type": "int"},
+             {"Name": "day", "Type": "int"},
+             {"Name": "hour", "Type": "int"},
+         ]
+
+         return {
+             "Columns": columns,
+             "PartitionKeys": partition_keys,
+             "Location": f"s3://{self.config.bucket_name}/{self.config.prefix}",
+             "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
+             "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
+             "SerdeInfo": {
+                 "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe",
+             },
+         }
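
Because generate_key emits Hive-style partitions (year=/month=/day=/hour=) that match the PARTITIONED BY columns in the Athena DDL, a table created from get_athena_table_ddl can query exports directly once partitions are loaded. A short usage sketch built from the signatures visible in this file; the `events` list and the ExportFormat re-export from proxilion.audit.exporters are the only assumptions:

from proxilion.audit.exporters import CloudExporterConfig, ExportFormat
from proxilion.audit.exporters.aws_s3 import S3DataLakeExporter

config = CloudExporterConfig(
    provider="aws",
    bucket_name="my-data-lake",
    prefix="audit/proxilion/",
    region="us-west-2",
    format=ExportFormat.PARQUET,
)
exporter = S3DataLakeExporter(config, use_hive_partitions=True)

# Objects land under audit/proxilion/year=YYYY/month=MM/day=DD/hour=HH/
result = exporter.export(events)  # events: audit events batched upstream (assumed)

# One-time Athena setup; refresh partitions after new exports land:
print(exporter.get_athena_table_ddl("proxilion_audit", database="security_logs"))
# In Athena: MSCK REPAIR TABLE security_logs.proxilion_audit;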