atlan-application-sdk 0.1.1rc39__py3-none-any.whl → 0.1.1rc41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. application_sdk/activities/.cursor/BUGBOT.md +424 -0
  2. application_sdk/activities/metadata_extraction/sql.py +400 -25
  3. application_sdk/application/__init__.py +2 -0
  4. application_sdk/application/metadata_extraction/sql.py +3 -0
  5. application_sdk/clients/.cursor/BUGBOT.md +280 -0
  6. application_sdk/clients/models.py +42 -0
  7. application_sdk/clients/sql.py +127 -87
  8. application_sdk/clients/temporal.py +3 -1
  9. application_sdk/common/.cursor/BUGBOT.md +316 -0
  10. application_sdk/common/aws_utils.py +259 -11
  11. application_sdk/common/utils.py +145 -9
  12. application_sdk/constants.py +8 -0
  13. application_sdk/decorators/.cursor/BUGBOT.md +279 -0
  14. application_sdk/handlers/__init__.py +8 -1
  15. application_sdk/handlers/sql.py +63 -22
  16. application_sdk/inputs/.cursor/BUGBOT.md +250 -0
  17. application_sdk/interceptors/.cursor/BUGBOT.md +320 -0
  18. application_sdk/interceptors/cleanup.py +171 -0
  19. application_sdk/interceptors/events.py +6 -6
  20. application_sdk/observability/decorators/observability_decorator.py +36 -22
  21. application_sdk/outputs/.cursor/BUGBOT.md +295 -0
  22. application_sdk/outputs/iceberg.py +4 -0
  23. application_sdk/outputs/json.py +6 -0
  24. application_sdk/outputs/parquet.py +13 -3
  25. application_sdk/server/.cursor/BUGBOT.md +442 -0
  26. application_sdk/server/fastapi/__init__.py +59 -3
  27. application_sdk/server/fastapi/models.py +27 -0
  28. application_sdk/services/objectstore.py +16 -3
  29. application_sdk/version.py +1 -1
  30. application_sdk/workflows/.cursor/BUGBOT.md +218 -0
  31. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/METADATA +1 -1
  32. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/RECORD +35 -24
  33. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/WHEEL +0 -0
  34. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/LICENSE +0 -0
  35. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/NOTICE +0 -0
@@ -0,0 +1,424 @@
+ # Activity Code Review Guidelines - Temporal Activities
+
+ ## Context-Specific Patterns
+
+ This directory contains Temporal activity implementations that perform the actual work of workflows. Activities handle external I/O, database operations, and non-deterministic tasks.
+
+ ### Phase 1: Critical Activity Safety Issues
+
+ **External Resource Safety:**
+
+ - All external connections (database, API, file) must have explicit timeouts
+ - Connection failures must be handled gracefully with proper retry logic
+ - Resource cleanup must happen in finally blocks or context managers (see the sketch below)
+ - Sensitive data must not be logged or exposed in error messages
+ - All user inputs must be validated before processing
+
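+ A minimal sketch of the timeout-plus-cleanup points from the list above; `create_database_client` and `fetch_metadata` are hypothetical helpers, and the timeout values are illustrative:
+
+ ```python
+ import asyncio
+
+ async def fetch_with_timeout(config: dict) -> list:
+     # Bound how long we wait for the connection itself
+     client = await asyncio.wait_for(create_database_client(config), timeout=10)
+     try:
+         # Bound the actual work independently of the connect timeout
+         return await asyncio.wait_for(client.fetch_metadata(), timeout=60)
+     finally:
+         await client.close()  # cleanup runs even on timeout or error
+ ```
+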
+ **Activity Timeout Management:**
+
+ - Activities must respect Temporal heartbeat timeouts for long-running operations
+ - Progress should be reported via heartbeat for operations > 30 seconds
+ - Activities should check for cancellation requests periodically
+ - Timeout values must be realistic for the operation being performed
+
+ ```python
+ # ✅ DO: Proper activity with heartbeat and cancellation
+ @activity.defn
+ async def process_large_dataset_activity(dataset_config: dict) -> dict:
+     total_records = await get_record_count(dataset_config)
+     processed = 0
+
+     async for batch in process_in_batches(dataset_config):
+         # Report progress; heartbeating is also how cancellation gets delivered
+         activity.heartbeat({"progress": processed, "total": total_records})
+
+         try:
+             await process_batch(batch)
+             processed += len(batch)
+         except Exception as e:
+             activity.logger.error(f"Batch processing failed: {e}", exc_info=True)
+             raise
+
+     return {"processed_records": processed}
+
+ # ❌ NEVER: Long-running activity without heartbeat
+ @activity.defn
+ async def bad_process_activity(data):
+     # No heartbeat, no cancellation check, no progress reporting
+     return await process_all_data_at_once(data)
+ ```
+
+ ### Phase 2: Activity Architecture Patterns
+
+ **Resource Management:**
+
+ - Use connection pooling for database operations
+ - Implement proper connection context managers
+ - Clean up temporary files and resources
+ - Handle partial failures gracefully
+ - Implement idempotent operations where possible (see the sketch below)
+
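+ A sketch of the idempotency point, assuming an object-store client with `exists`/`put` semantics; all names here are illustrative:
+
+ ```python
+ @activity.defn
+ async def upload_result_activity(run_id: str, payload: bytes) -> str:
+     key = f"results/{run_id}.json"  # deterministic key makes retries safe
+     store = get_object_store()      # hypothetical pooled client
+     if await store.exists(key):
+         return key                  # a retry after success becomes a no-op
+     await store.put(key, payload)
+     return key
+ ```
+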
+ **Default Value Management:**
+
+ - **Always define sensible defaults**: Activity parameters should have reasonable default values where appropriate
+ - **Avoid required parameters for inferable values**: Values like `owner_id` that can be derived (e.g., from `application_name:run_id`) should not be required parameters
+ - **Default TTL values**: Lock operations, cache entries, and timeouts should have documented default values (e.g., 300 seconds for locks)
+ - **Environment-based defaults**: Different environments (dev/prod) may need different defaults
+
+ ```python
+ # ✅ DO: Proper default value management
+ @activity.defn
+ async def acquire_distributed_lock_activity(
+     lock_name: str,
+     max_locks: int = 10,  # Sensible default
+     ttl_seconds: int = 300,  # 5 minutes default
+     owner_id: Optional[str] = None  # Will be inferred
+ ) -> dict:
+     """Acquire a distributed lock with proper defaults."""
+
+     # Infer owner_id from the activity's workflow context if not provided
+     if owner_id is None:
+         info = activity.info()
+         owner_id = f"{info.workflow_type}:{info.workflow_id}"
+
+     # Validate parameters
+     if max_locks <= 0:
+         raise ValueError(f"max_locks must be positive, got: {max_locks}")
+
+     return await lock_manager.acquire_lock(lock_name, max_locks, ttl_seconds, owner_id)
+
+ # ❌ REJECT: Poor parameter management
+ @activity.defn
+ async def bad_acquire_lock_activity(
+     lock_name: str,
+     max_locks: int,  # No default
+     ttl_seconds: int,  # No default
+     owner_id: str,  # Required but could be inferred
+     application_name: str,  # Redundant - should be inferred
+     run_id: str  # Redundant - should be inferred
+ ) -> dict:
+     # Forces users to pass values that could be automatically determined
+     pass
+ ```
+
+ **Error Handling and Retries:**
+
+ - Distinguish between retryable and non-retryable errors
+ - Use specific exception types for different error conditions
+ - Log errors with sufficient context for debugging
+ - Implement exponential backoff for retryable operations
+ - Preserve error context across retries
+
+ ```python
+ # ✅ DO: Proper error handling with context
+ @activity.defn
+ async def extract_metadata_activity(connection_config: dict) -> dict:
+     client = None
+     try:
+         client = await create_database_client(connection_config)
+         await client.validate_connection()
+
+         metadata = await client.extract_metadata()
+
+         activity.logger.info(
+             f"Extracted metadata for {len(metadata)} objects",
+             extra={"database": connection_config.get("database", "unknown")}
+         )
+
+         return metadata
+
+     except ConnectionError as e:
+         # Retryable error
+         activity.logger.warning(f"Connection failed, will retry: {e}")
+         raise  # Let Temporal handle retry
+
+     except ValidationError as e:
+         # Non-retryable error
+         activity.logger.error(f"Invalid connection config: {e}")
+         raise ApplicationError(f"Configuration validation failed: {e}", non_retryable=True)
+
+     finally:
+         if client:
+             await client.close()
+ ```
+
+ **Resource Validation and Limits:**
+
+ - **Key length validation**: Ensure generated keys (Redis, cache) don't exceed system limits
+ - **Memory constraints**: Validate that operations won't exceed available memory
+ - **Connection limits**: Check that concurrent operations stay within connection pool limits
+ - **Processing time estimates**: Validate that operations can complete within activity timeouts
+
+ ```python
+ # ✅ DO: Resource validation
+ @activity.defn
+ async def process_with_validation_activity(
+     resource_name: str,
+     data_size_mb: int,
+     max_processing_time_minutes: int = 30
+ ) -> dict:
+     """Process data with proper resource validation."""
+
+     # Validate resource constraints
+     name_bytes = len(resource_name.encode("utf-8"))
+     if name_bytes > 512 * 1024 * 1024:  # Redis keys are capped at 512MB
+         raise ValueError(f"Resource name too long: {name_bytes} bytes")
+
+     if data_size_mb > 1000:  # 1GB memory limit
+         raise ValueError(f"Data size {data_size_mb}MB exceeds 1GB limit")
+
+     # Validate processing time against the activity timeout, if one is set
+     activity_timeout = activity.info().start_to_close_timeout
+     if activity_timeout and max_processing_time_minutes * 60 > activity_timeout.total_seconds():
+         raise ValueError(f"Processing time {max_processing_time_minutes}m exceeds timeout")
+
+     return await process_data(resource_name, data_size_mb)
+ ```
+
+ ### Phase 3: Activity Testing Requirements
+
+ **Activity Testing Standards:**
+
+ - Test activities independently from workflows
+ - Mock external dependencies (databases, APIs, file systems)
+ - Test timeout and cancellation behaviors
+ - Test retry scenarios with different error types
+ - Include performance tests for long-running activities
+ - Test heartbeat and progress reporting (see the sketch below)
+
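+ A minimal sketch of an isolated activity test using `temporalio.testing.ActivityEnvironment`, which runs an activity without a server and surfaces heartbeats through a callback. It exercises the Phase 1 heartbeat example; in a real test its external helpers would be mocked:
+
+ ```python
+ import pytest
+ from temporalio.testing import ActivityEnvironment
+
+ @pytest.mark.asyncio
+ async def test_activity_reports_heartbeats():
+     env = ActivityEnvironment()
+     heartbeats = []
+     env.on_heartbeat = lambda *details: heartbeats.append(details)
+
+     # Run the activity directly, outside any workflow or worker
+     result = await env.run(process_large_dataset_activity, {"source": "test"})
+
+     assert result["processed_records"] >= 0
+     assert heartbeats, "long-running activities must report heartbeats"
+ ```
+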
+ **Integration Testing:**
+
+ - Use test databases/services for integration tests
+ - Test real connection failures and recovery
+ - Verify proper resource cleanup
+ - Test activity behavior under load
+ - Include end-to-end tests with real workflows
+
+ ### Phase 4: Performance and Scalability
+
+ **Activity Performance:**
+
+ - Use async/await for all I/O operations
+ - Implement proper batching for bulk operations
+ - Use streaming for large datasets
+ - Monitor activity execution time and resource usage
+ - Optimize database queries and API calls
+
+ **Memory Management:**
+
+ - Process large datasets in chunks, not all at once
+ - Use generators for memory-efficient iteration
+ - Clean up large objects explicitly
+ - Monitor memory usage in long-running activities
+ - Use appropriate data types and structures
+
+ ```python
+ # ✅ DO: Memory-efficient processing
+ @activity.defn
+ async def process_large_file_activity(file_path: str, chunk_size: int = 1000) -> dict:
+     processed_count = 0
+
+     async with aiofiles.open(file_path, 'r') as file:
+         chunk = []
+         async for line in file:
+             chunk.append(line.strip())
+
+             if len(chunk) >= chunk_size:
+                 await process_chunk(chunk)
+                 processed_count += len(chunk)
+                 chunk = []
+
+                 # Report progress and check for cancellation
+                 activity.heartbeat({"processed": processed_count})
+
+     # Process remaining items
+     if chunk:
+         await process_chunk(chunk)
+         processed_count += len(chunk)
+
+     return {"total_processed": processed_count}
+
+ # ❌ NEVER: Load entire file into memory
+ @activity.defn
+ async def bad_file_activity(file_path: str):
+     with open(file_path, 'r') as file:
+         all_lines = file.readlines()  # Memory intensive!
+     return process_all_lines(all_lines)
+ ```
+
+ **Parallelization Opportunities:**
+
+ - **Flag sequential operations**: When processing multiple files or resources, suggest parallel processing (see the sketch below)
+ - **Batch operations**: Group related operations to reduce overhead
+ - **Connection reuse**: Optimize connection usage across operations
+ - **Async patterns**: Ensure I/O operations don't block other processing
+
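+ A minimal sketch of bounded parallel processing with `asyncio.gather` and a semaphore, so multiple resources are handled concurrently without exhausting connections; `process_one` is a hypothetical helper:
+
+ ```python
+ import asyncio
+
+ async def process_resources(resources: list, max_concurrency: int = 5) -> list:
+     semaphore = asyncio.Semaphore(max_concurrency)
+
+     async def bounded(resource):
+         async with semaphore:
+             return await process_one(resource)
+
+     # Up to max_concurrency operations in flight instead of a sequential loop
+     return await asyncio.gather(*(bounded(r) for r in resources))
+ ```
+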
+ ### Phase 5: Activity Maintainability
+
+ **Code Organization:**
+
+ - Keep activities focused on a single responsibility
+ - Use dependency injection for external services (see the sketch below)
+ - Implement proper logging with activity context
+ - Document activity parameters and return values
+ - Follow consistent naming conventions
+
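+ A sketch of the dependency-injection point: temporalio allows `@activity.defn` on methods, so dependencies can be constructed once at worker startup and held on a class. `MetadataStore` is a hypothetical service:
+
+ ```python
+ class ExtractionActivities:
+     def __init__(self, store: MetadataStore) -> None:
+         self.store = store  # injected once, reused across invocations
+
+     @activity.defn
+     async def save_metadata_activity(self, metadata: dict) -> None:
+         await self.store.save(metadata)
+
+ # At worker startup, register the bound method as the activity:
+ # activities=[ExtractionActivities(store).save_metadata_activity]
+ ```
+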
+ **Configuration and Environment:**
+
+ - Externalize all configuration parameters
+ - Use environment-specific settings appropriately
+ - Validate configuration before using it (see the sketch below)
+ - Support development and production configurations
+ - Document all required configuration options
+
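+ A sketch of validating externalized configuration before use, assuming pydantic is available; the field names and environment variables are illustrative:
+
+ ```python
+ import os
+ from pydantic import BaseModel, Field, ValidationError
+
+ class ExtractionConfig(BaseModel):
+     database_url: str
+     batch_size: int = Field(default=1000, gt=0)
+     timeout_seconds: int = Field(default=300, gt=0)
+
+ def load_config() -> ExtractionConfig:
+     try:
+         return ExtractionConfig(
+             database_url=os.environ["DATABASE_URL"],
+             batch_size=int(os.environ.get("BATCH_SIZE", "1000")),
+         )
+     except (KeyError, ValidationError) as e:
+         # Fail fast with a clear message instead of failing mid-activity
+         raise RuntimeError(f"Invalid extraction config: {e}") from e
+ ```
+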
+ **Error Context Enhancement:**
+
+ - **Operation identification**: Include the specific operation that failed in error messages
+ - **Parameter context**: Log relevant parameters (sanitized) when operations fail
+ - **Resource state**: Include information about resource availability/state in errors
+ - **Recovery suggestions**: Where possible, include suggestions for resolving errors
+
+ ```python
+ # ✅ DO: Enhanced error context
+ @activity.defn
+ async def enhanced_error_activity(
+     database_name: str,
+     table_names: List[str],
+     timeout_seconds: int = 300
+ ) -> dict:
+     """Activity with comprehensive error context."""
+
+     try:
+         result = await extract_table_metadata(database_name, table_names, timeout_seconds)
+         return result
+
+     except ConnectionTimeout as e:
+         activity.logger.error(
+             f"Database connection timeout during metadata extraction: {e}",
+             extra={
+                 "database": database_name,
+                 "tables_requested": len(table_names),
+                 "timeout_used": timeout_seconds,
+                 "suggestion": "Consider increasing timeout or reducing table count"
+             }
+         )
+         raise ApplicationError(
+             f"Metadata extraction timed out after {timeout_seconds}s for database '{database_name}' "
+             f"with {len(table_names)} tables. Consider reducing scope or increasing timeout.",
+             non_retryable=True
+         )
+
+     except InsufficientPrivileges as e:
+         activity.logger.error(
+             f"Insufficient database privileges for metadata extraction: {e}",
+             extra={
+                 "database": database_name,
+                 "required_privileges": ["SELECT", "INFORMATION_SCHEMA_READ"],
+                 "suggestion": "Grant required database privileges to connection user"
+             }
+         )
+         raise ApplicationError(
+             f"Missing database privileges for '{database_name}'. "
+             f"Ensure connection user has SELECT and INFORMATION_SCHEMA access.",
+             non_retryable=True
+         )
+ ```
+
+ ---
+
+ ## Activity-Specific Anti-Patterns
+
+ **Always Reject:**
+
+ - Activities without proper timeout handling
+ - Long-running activities without heartbeat reporting
+ - Missing resource cleanup (connections, files, etc.)
+ - Generic exception handling without specific error types
+ - Activities that don't handle cancellation
+ - Synchronous I/O operations in async activities
+ - Missing logging for error conditions
+ - Activities without proper input validation
+
+ **Parameter Management Anti-Patterns:**
+
+ - **Over-parameterization**: Requiring parameters that can be inferred from context
+ - **Missing defaults**: Parameters without reasonable default values
+ - **No validation**: Accepting parameters without validating constraints
+ - **Redundant parameters**: Multiple parameters representing the same concept
+
+ **Resource Management Anti-Patterns:**
+
+ ```python
+ # ❌ REJECT: Poor resource management
+ @activity.defn
+ async def bad_database_activity(query: str):
+     # No connection pooling, no cleanup, no error handling
+     conn = await psycopg.AsyncConnection.connect("host=localhost...")
+     cursor = await conn.execute(query)  # No timeout
+     return await cursor.fetchall()  # Connection never closed
+
+ # ✅ REQUIRE: Proper resource management
+ @activity.defn
+ async def good_database_activity(query: str, params: tuple = ()) -> list:
+     async with get_connection_pool().acquire() as conn:
+         try:
+             # Set query timeout
+             async with conn.cursor() as cursor:
+                 await cursor.execute(query, params)
+                 return await cursor.fetchall()
+         except Exception as e:
+             activity.logger.error(f"Database query failed: {query[:100]}...", exc_info=True)
+             raise
+         # Connection automatically returned to pool
+ ```
+
+ **Heartbeat and Cancellation Anti-Patterns:**
+
+ ```python
+ # ❌ REJECT: No heartbeat or cancellation handling
+ @activity.defn
+ async def bad_long_running_activity(data_list: list):
+     results = []
+     for item in data_list:  # Could take hours
+         result = await expensive_operation(item)
+         results.append(result)
+     return results
+
+ # ✅ REQUIRE: Proper heartbeat and cancellation
+ @activity.defn
+ async def good_long_running_activity(data_list: list) -> list:
+     results = []
+     total_items = len(data_list)
+
+     for i, item in enumerate(data_list):
+         # Check for cancellation and report progress
+         activity.heartbeat({
+             "processed": i,
+             "total": total_items,
+             "percent_complete": (i / total_items) * 100
+         })
+
+         try:
+             result = await expensive_operation(item)
+             results.append(result)
+         except Exception as e:
+             activity.logger.error(f"Processing failed for item {i}: {e}")
+             raise
+
+     return results
+ ```
+
+ ## Educational Context for Activity Reviews
+
+ When reviewing activity code, emphasize:
+
+ 1. **Reliability Impact**: "Activities are where the real work happens. Proper error handling and resource management in activities determine whether workflows succeed or fail under real-world conditions."
+
+ 2. **Performance Impact**: "Activity performance directly affects workflow execution time. Inefficient activities create bottlenecks that slow down entire business processes."
+
+ 3. **Observability Impact**: "Activity logging and heartbeat reporting are essential for monitoring long-running processes. Without proper observability, debugging workflow issues becomes nearly impossible."
+
+ 4. **Resource Impact**: "Activities consume actual system resources. Poor resource management in activities can cause memory leaks, connection pool exhaustion, and system instability."
+
+ 5. **Cancellation Impact**: "Activities that don't handle cancellation properly can continue consuming resources even after workflows are cancelled, leading to resource waste and potential system overload."
+
+ 6. **Parameter Design Impact**: "Well-designed activity parameters with sensible defaults make activities easier to use and less error-prone. Over-parameterization creates maintenance burden and increases the chance of configuration errors."