@cloudstreamsoftware/claude-tools 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +152 -37
- package/agents/INDEX.md +183 -0
- package/agents/architect.md +247 -0
- package/agents/build-error-resolver.md +555 -0
- package/agents/catalyst-deployer.md +132 -0
- package/agents/code-reviewer.md +121 -0
- package/agents/compliance-auditor.md +148 -0
- package/agents/creator-architect.md +395 -0
- package/agents/deluge-reviewer.md +98 -0
- package/agents/doc-updater.md +471 -0
- package/agents/e2e-runner.md +711 -0
- package/agents/planner.md +122 -0
- package/agents/refactor-cleaner.md +309 -0
- package/agents/security-reviewer.md +582 -0
- package/agents/tdd-guide.md +302 -0
- package/bin/cloudstream-setup.js +16 -6
- package/config/versions.json +63 -0
- package/dist/hooks/hooks.json +209 -0
- package/dist/index.js +47 -0
- package/dist/lib/asset-value.js +609 -0
- package/dist/lib/client-manager.js +300 -0
- package/dist/lib/command-matcher.js +242 -0
- package/dist/lib/cross-session-patterns.js +754 -0
- package/dist/lib/intent-classifier.js +1075 -0
- package/dist/lib/package-manager.js +374 -0
- package/dist/lib/recommendation-engine.js +597 -0
- package/dist/lib/session-memory.js +489 -0
- package/dist/lib/skill-effectiveness.js +486 -0
- package/dist/lib/skill-matcher.js +595 -0
- package/dist/lib/tutorial-metrics.js +242 -0
- package/dist/lib/tutorial-progress.js +209 -0
- package/dist/lib/tutorial-renderer.js +431 -0
- package/dist/lib/utils.js +380 -0
- package/dist/lib/verify-formatter.js +143 -0
- package/dist/lib/workflow-state.js +249 -0
- package/hooks/hooks.json +209 -0
- package/package.json +5 -1
- package/scripts/aggregate-sessions.js +290 -0
- package/scripts/branch-name-validator.js +291 -0
- package/scripts/build.js +101 -0
- package/scripts/commands/client-switch.js +231 -0
- package/scripts/deprecate-skill.js +610 -0
- package/scripts/diagnose.js +324 -0
- package/scripts/doc-freshness.js +168 -0
- package/scripts/generate-weekly-digest.js +393 -0
- package/scripts/health-check.js +270 -0
- package/scripts/hooks/credential-check.js +101 -0
- package/scripts/hooks/evaluate-session.js +81 -0
- package/scripts/hooks/pre-compact.js +66 -0
- package/scripts/hooks/prompt-analyzer.js +276 -0
- package/scripts/hooks/prompt-router.js +422 -0
- package/scripts/hooks/quality-gate-enforcer.js +371 -0
- package/scripts/hooks/session-end.js +156 -0
- package/scripts/hooks/session-start.js +195 -0
- package/scripts/hooks/skill-injector.js +333 -0
- package/scripts/hooks/suggest-compact.js +58 -0
- package/scripts/lib/asset-value.js +609 -0
- package/scripts/lib/client-manager.js +300 -0
- package/scripts/lib/command-matcher.js +242 -0
- package/scripts/lib/cross-session-patterns.js +754 -0
- package/scripts/lib/intent-classifier.js +1075 -0
- package/scripts/lib/package-manager.js +374 -0
- package/scripts/lib/recommendation-engine.js +597 -0
- package/scripts/lib/session-memory.js +489 -0
- package/scripts/lib/skill-effectiveness.js +486 -0
- package/scripts/lib/skill-matcher.js +595 -0
- package/scripts/lib/tutorial-metrics.js +242 -0
- package/scripts/lib/tutorial-progress.js +209 -0
- package/scripts/lib/tutorial-renderer.js +431 -0
- package/scripts/lib/utils.js +380 -0
- package/scripts/lib/verify-formatter.js +143 -0
- package/scripts/lib/workflow-state.js +249 -0
- package/scripts/onboard.js +363 -0
- package/scripts/quarterly-report.js +692 -0
- package/scripts/setup-package-manager.js +204 -0
- package/scripts/sync-upstream.js +391 -0
- package/scripts/test.js +108 -0
- package/scripts/tutorial-runner.js +351 -0
- package/scripts/validate-all.js +201 -0
- package/scripts/verifiers/agents.js +245 -0
- package/scripts/verifiers/config.js +186 -0
- package/scripts/verifiers/environment.js +123 -0
- package/scripts/verifiers/hooks.js +188 -0
- package/scripts/verifiers/index.js +38 -0
- package/scripts/verifiers/persistence.js +140 -0
- package/scripts/verifiers/plugin.js +215 -0
- package/scripts/verifiers/skills.js +209 -0
- package/scripts/verify-setup.js +164 -0
- package/skills/INDEX.md +157 -0
- package/skills/backend-patterns/SKILL.md +586 -0
- package/skills/backend-patterns/catalyst-patterns.md +128 -0
- package/skills/bigquery-patterns/SKILL.md +27 -0
- package/skills/bigquery-patterns/performance-optimization.md +518 -0
- package/skills/bigquery-patterns/query-patterns.md +372 -0
- package/skills/bigquery-patterns/schema-design.md +78 -0
- package/skills/cloudstream-project-template/SKILL.md +20 -0
- package/skills/cloudstream-project-template/structure.md +65 -0
- package/skills/coding-standards/SKILL.md +524 -0
- package/skills/coding-standards/deluge-standards.md +83 -0
- package/skills/compliance-patterns/SKILL.md +28 -0
- package/skills/compliance-patterns/hipaa/audit-requirements.md +251 -0
- package/skills/compliance-patterns/hipaa/baa-process.md +298 -0
- package/skills/compliance-patterns/hipaa/data-archival-strategy.md +387 -0
- package/skills/compliance-patterns/hipaa/phi-handling.md +52 -0
- package/skills/compliance-patterns/pci-dss/saq-a-requirements.md +307 -0
- package/skills/compliance-patterns/pci-dss/tokenization-patterns.md +382 -0
- package/skills/compliance-patterns/pci-dss/zoho-checkout-patterns.md +56 -0
- package/skills/compliance-patterns/soc2/access-controls.md +344 -0
- package/skills/compliance-patterns/soc2/audit-logging.md +458 -0
- package/skills/compliance-patterns/soc2/change-management.md +403 -0
- package/skills/compliance-patterns/soc2/deluge-execution-logging.md +407 -0
- package/skills/consultancy-workflows/SKILL.md +19 -0
- package/skills/consultancy-workflows/client-isolation.md +21 -0
- package/skills/consultancy-workflows/documentation-automation.md +454 -0
- package/skills/consultancy-workflows/handoff-procedures.md +257 -0
- package/skills/consultancy-workflows/knowledge-capture.md +513 -0
- package/skills/consultancy-workflows/time-tracking.md +26 -0
- package/skills/continuous-learning/SKILL.md +84 -0
- package/skills/continuous-learning/config.json +18 -0
- package/skills/continuous-learning/evaluate-session.sh +60 -0
- package/skills/continuous-learning-v2/SKILL.md +126 -0
- package/skills/continuous-learning-v2/config.json +61 -0
- package/skills/frontend-patterns/SKILL.md +635 -0
- package/skills/frontend-patterns/zoho-widget-patterns.md +103 -0
- package/skills/gcp-data-engineering/SKILL.md +36 -0
- package/skills/gcp-data-engineering/bigquery/performance-optimization.md +337 -0
- package/skills/gcp-data-engineering/dataflow/error-handling.md +496 -0
- package/skills/gcp-data-engineering/dataflow/pipeline-patterns.md +444 -0
- package/skills/gcp-data-engineering/dbt/model-organization.md +63 -0
- package/skills/gcp-data-engineering/dbt/testing-patterns.md +503 -0
- package/skills/gcp-data-engineering/medallion-architecture/bronze-layer.md +60 -0
- package/skills/gcp-data-engineering/medallion-architecture/gold-layer.md +311 -0
- package/skills/gcp-data-engineering/medallion-architecture/layer-transitions.md +517 -0
- package/skills/gcp-data-engineering/medallion-architecture/silver-layer.md +305 -0
- package/skills/gcp-data-engineering/zoho-to-gcp/data-extraction.md +543 -0
- package/skills/gcp-data-engineering/zoho-to-gcp/real-time-vs-batch.md +337 -0
- package/skills/security-review/SKILL.md +498 -0
- package/skills/security-review/compliance-checklist.md +53 -0
- package/skills/strategic-compact/SKILL.md +67 -0
- package/skills/tdd-workflow/SKILL.md +413 -0
- package/skills/tdd-workflow/zoho-testing.md +124 -0
- package/skills/tutorial/SKILL.md +249 -0
- package/skills/tutorial/docs/ACCESSIBILITY.md +169 -0
- package/skills/tutorial/lessons/00-philosophy-and-workflow.md +198 -0
- package/skills/tutorial/lessons/01-basics.md +81 -0
- package/skills/tutorial/lessons/02-training.md +86 -0
- package/skills/tutorial/lessons/03-commands.md +109 -0
- package/skills/tutorial/lessons/04-workflows.md +115 -0
- package/skills/tutorial/lessons/05-compliance.md +116 -0
- package/skills/tutorial/lessons/06-zoho.md +121 -0
- package/skills/tutorial/lessons/07-hooks-system.md +277 -0
- package/skills/tutorial/lessons/08-mcp-servers.md +316 -0
- package/skills/tutorial/lessons/09-client-management.md +215 -0
- package/skills/tutorial/lessons/10-testing-e2e.md +260 -0
- package/skills/tutorial/lessons/11-skills-deep-dive.md +272 -0
- package/skills/tutorial/lessons/12-rules-system.md +326 -0
- package/skills/tutorial/lessons/13-golden-standard-graduation.md +213 -0
- package/skills/tutorial/lessons/14-fork-setup-and-sync.md +312 -0
- package/skills/tutorial/lessons/15-living-examples-system.md +221 -0
- package/skills/tutorial/tracks/accelerated/README.md +134 -0
- package/skills/tutorial/tracks/accelerated/assessment/checkpoint-1.md +161 -0
- package/skills/tutorial/tracks/accelerated/assessment/checkpoint-2.md +175 -0
- package/skills/tutorial/tracks/accelerated/day-1-core-concepts.md +234 -0
- package/skills/tutorial/tracks/accelerated/day-2-essential-commands.md +270 -0
- package/skills/tutorial/tracks/accelerated/day-3-workflow-mastery.md +305 -0
- package/skills/tutorial/tracks/accelerated/day-4-compliance-zoho.md +304 -0
- package/skills/tutorial/tracks/accelerated/day-5-hooks-skills.md +344 -0
- package/skills/tutorial/tracks/accelerated/day-6-client-testing.md +386 -0
- package/skills/tutorial/tracks/accelerated/day-7-graduation.md +369 -0
- package/skills/zoho-patterns/CHANGELOG.md +108 -0
- package/skills/zoho-patterns/SKILL.md +446 -0
- package/skills/zoho-patterns/analytics/dashboard-patterns.md +352 -0
- package/skills/zoho-patterns/analytics/zoho-to-bigquery-pipeline.md +427 -0
- package/skills/zoho-patterns/catalyst/appsail-deployment.md +349 -0
- package/skills/zoho-patterns/catalyst/context-close-patterns.md +354 -0
- package/skills/zoho-patterns/catalyst/cron-batch-processing.md +374 -0
- package/skills/zoho-patterns/catalyst/function-patterns.md +439 -0
- package/skills/zoho-patterns/creator/form-design.md +304 -0
- package/skills/zoho-patterns/creator/publish-api-patterns.md +313 -0
- package/skills/zoho-patterns/creator/widget-integration.md +306 -0
- package/skills/zoho-patterns/creator/workflow-automation.md +253 -0
- package/skills/zoho-patterns/deluge/api-patterns.md +468 -0
- package/skills/zoho-patterns/deluge/batch-processing.md +403 -0
- package/skills/zoho-patterns/deluge/cross-app-integration.md +356 -0
- package/skills/zoho-patterns/deluge/error-handling.md +423 -0
- package/skills/zoho-patterns/deluge/syntax-reference.md +65 -0
- package/skills/zoho-patterns/integration/cors-proxy-architecture.md +426 -0
- package/skills/zoho-patterns/integration/crm-books-native-sync.md +277 -0
- package/skills/zoho-patterns/integration/oauth-token-management.md +461 -0
- package/skills/zoho-patterns/integration/zoho-flow-patterns.md +334 -0
@@ -0,0 +1,496 @@
# Dataflow Error Handling

> Patterns for dead letter queues, retry logic, poison message handling, and monitoring in CloudStream Dataflow pipelines.

## Dead Letter Queue (DLQ) Pattern

The DLQ pattern routes unprocessable records to a separate destination for later investigation, preventing a single bad record from blocking the entire pipeline.

```python
# dlq_pipeline.py
import apache_beam as beam
from apache_beam.io.gcp.bigquery import WriteToBigQuery
import json
import traceback
from datetime import datetime

class ProcessWithDLQ(beam.DoFn):
    """Process records, routing failures to dead letter queue."""

    # Output tags
    VALID_TAG = 'valid'
    DLQ_TAG = 'dlq'

    def process(self, element):
        try:
            record = json.loads(element.decode('utf-8'))

            # Validation checks
            if not record.get('record_id'):
                raise ValueError("Missing required field: record_id")

            if record.get('amount') and float(record['amount']) < 0:
                raise ValueError(f"Negative amount: {record['amount']}")

            # Type coercion
            cleaned = {
                'record_id': str(record['record_id']),
                'amount': float(record.get('amount', 0)),
                'stage': record.get('stage', 'Unknown'),
                '_processed_at': datetime.utcnow().isoformat()
            }

            yield beam.pvalue.TaggedOutput(self.VALID_TAG, cleaned)

        except Exception as e:
            yield beam.pvalue.TaggedOutput(self.DLQ_TAG, {
                'raw_message': element.decode('utf-8') if isinstance(element, bytes) else str(element),
                'error_type': type(e).__name__,
                'error_message': str(e),
                'stack_trace': traceback.format_exc(),
                'failed_at': datetime.utcnow().isoformat(),
                'pipeline_name': 'zoho_deals_streaming'
            })

def run():
    with beam.Pipeline(options=get_streaming_options()) as p:
        messages = (
            p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription='projects/cloudstream-prod/subscriptions/zoho-deals-sub'
            )
        )

        results = messages | 'Process' >> beam.ParDo(
            ProcessWithDLQ()
        ).with_outputs(ProcessWithDLQ.VALID_TAG, ProcessWithDLQ.DLQ_TAG)

        # Valid records to silver
        results[ProcessWithDLQ.VALID_TAG] | 'WriteValid' >> WriteToBigQuery(
            table='cloudstream-prod:silver.zoho_deals',
            write_disposition='WRITE_APPEND'
        )

        # Failed records to DLQ table
        results[ProcessWithDLQ.DLQ_TAG] | 'WriteDLQ' >> WriteToBigQuery(
            table='cloudstream-prod:ops.dead_letter_queue',
            write_disposition='WRITE_APPEND'
        )
```

## Error Tagging and Routing

```python
# Multi-category error routing
class ErrorRouter(beam.DoFn):
    """Route errors to appropriate handlers based on type."""

    RETRYABLE = 'retryable'    # Transient errors (API timeout, rate limit)
    VALIDATION = 'validation'  # Data quality failures
    POISON = 'poison'          # Unrecoverable (corrupt data, schema mismatch)

    def process(self, element):
        error_type = element.get('error_type', '')
        error_msg = element.get('error_message', '')

        if any(t in error_type for t in ['TimeoutError', 'ConnectionError', 'ServiceUnavailable']):
            yield beam.pvalue.TaggedOutput(self.RETRYABLE, element)
        elif any(t in error_type for t in ['ValueError', 'TypeError', 'ValidationError']):
            yield beam.pvalue.TaggedOutput(self.VALIDATION, element)
        else:
            yield beam.pvalue.TaggedOutput(self.POISON, element)

# Usage in pipeline
routed = (
    dlq_records
    | 'RouteErrors' >> beam.ParDo(ErrorRouter()).with_outputs(
        ErrorRouter.RETRYABLE, ErrorRouter.VALIDATION, ErrorRouter.POISON
    )
)

# Retryable → back to input topic with delay
routed[ErrorRouter.RETRYABLE] | 'RepublishRetryable' >> beam.ParDo(RepublishWithDelay())

# Validation → quarantine table for data team
routed[ErrorRouter.VALIDATION] | 'WriteValidation' >> WriteToBigQuery(
    table='cloudstream-prod:ops.validation_failures'
)

# Poison → alert and archive
routed[ErrorRouter.POISON] | 'AlertPoison' >> beam.ParDo(AlertAndArchive())
```

## Retry with Exponential Backoff

```python
import time
import random
from google.api_core import retry as api_retry
from google.api_core.exceptions import ServiceUnavailable, TooManyRequests

class RetryableTransform(beam.DoFn):
    """Transform with built-in retry logic for external API calls."""

    MAX_RETRIES = 5
    BASE_DELAY = 1.0  # seconds
    MAX_DELAY = 60.0  # seconds

    def process(self, element):
        for attempt in range(self.MAX_RETRIES):
            try:
                result = self._call_external_api(element)
                yield result
                return
            except (ServiceUnavailable, TooManyRequests, ConnectionError) as e:
                if attempt == self.MAX_RETRIES - 1:
                    # Final attempt failed - route to DLQ
                    yield beam.pvalue.TaggedOutput('dlq', {
                        'record': element,
                        'error': str(e),
                        'attempts': self.MAX_RETRIES,
                        'failed_at': datetime.utcnow().isoformat()
                    })
                    return

                # Exponential backoff with jitter
                delay = min(
                    self.BASE_DELAY * (2 ** attempt) + random.uniform(0, 1),
                    self.MAX_DELAY
                )
                time.sleep(delay)

            except Exception as e:
                # Non-retryable error - immediate DLQ
                yield beam.pvalue.TaggedOutput('dlq', {
                    'record': element,
                    'error': str(e),
                    'error_type': 'non_retryable',
                    'failed_at': datetime.utcnow().isoformat()
                })
                return

    def _call_external_api(self, element):
        """Call Zoho API or other external service."""
        # Implementation here
        pass


# For BigQuery write retries, use built-in retry strategy
WriteToBigQuery(
    table='...',
    insert_retry_strategy='RETRY_ON_TRANSIENT_ERROR',
    # Default: retries on 5xx errors and rate limiting
)
```

## Poison Message Handling

```python
class PoisonMessageHandler(beam.DoFn):
    """Handle messages that repeatedly fail processing."""

    def __init__(self, max_attempts=3):
        self.max_attempts = max_attempts

    def process(self, element, window=beam.DoFn.WindowParam):
        # Check retry count from PubSub attributes
        attributes = element.attributes if hasattr(element, 'attributes') else {}
        retry_count = int(attributes.get('retry_count', '0'))

        if retry_count >= self.max_attempts:
            # This is a poison message - archive and alert
            yield beam.pvalue.TaggedOutput('poison', {
                'message': element.data.decode('utf-8'),
                'retry_count': retry_count,
                'first_seen': attributes.get('first_seen', 'unknown'),
                'last_attempt': datetime.utcnow().isoformat(),
                'reason': 'max_retries_exceeded'
            })
        else:
            try:
                result = self._process_message(element)
                yield result
            except Exception as e:
                # Increment retry count and republish
                yield beam.pvalue.TaggedOutput('retry', {
                    'data': element.data,
                    'attributes': {
                        **attributes,
                        'retry_count': str(retry_count + 1),
                        'first_seen': attributes.get('first_seen', datetime.utcnow().isoformat()),
                        'last_error': str(e)
                    }
                })
```

```sql
-- DLQ table schema for tracking poison messages
CREATE TABLE `project.ops.dead_letter_queue` (
  pipeline_name STRING,
  source_topic STRING,
  raw_message STRING,
  error_type STRING,
  error_message STRING,
  stack_trace STRING,
  retry_count INT64,
  first_seen_at TIMESTAMP,
  failed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP(),
  resolved_at TIMESTAMP,
  resolution STRING  -- 'reprocessed', 'discarded', 'manual_fix'
)
PARTITION BY DATE(failed_at)
CLUSTER BY pipeline_name, error_type;
```

## Monitoring with Cloud Monitoring

```python
# Custom metrics for pipeline health
from google.cloud import monitoring_v3
from google.protobuf.timestamp_pb2 import Timestamp
import time

class MetricsReporter(beam.DoFn):
    """Report custom metrics to Cloud Monitoring."""

    def setup(self):
        self.client = monitoring_v3.MetricServiceClient()
        self.project_name = "projects/cloudstream-prod"

    def process(self, element):
        # Report processing metrics
        self._write_metric(
            'custom.googleapis.com/dataflow/records_processed',
            1,
            labels={'pipeline': 'zoho_deals', 'status': 'success'}
        )
        yield element

    def _write_metric(self, metric_type, value, labels=None):
        series = monitoring_v3.TimeSeries()
        series.metric.type = metric_type
        if labels:
            for k, v in labels.items():
                series.metric.labels[k] = v

        series.resource.type = 'global'
        now = time.time()
        interval = monitoring_v3.TimeInterval(
            end_time=Timestamp(seconds=int(now))
        )
        point = monitoring_v3.Point(
            interval=interval,
            value=monitoring_v3.TypedValue(int64_value=value)
        )
        series.points = [point]

        self.client.create_time_series(
            request={"name": self.project_name, "time_series": [series]}
        )
```

## Alerting Policies

```hcl
# terraform/dataflow_alerts.tf
resource "google_monitoring_alert_policy" "dlq_rate" {
  display_name = "Dataflow DLQ Rate Too High"
  combiner     = "OR"

  conditions {
    display_name = "DLQ records > 100 in 5 minutes"
    condition_threshold {
      filter          = "metric.type=\"custom.googleapis.com/dataflow/dlq_count\" AND resource.type=\"global\""
      comparison      = "COMPARISON_GT"
      threshold_value = 100
      duration        = "300s"
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_SUM"
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.slack_data_alerts.id
  ]

  alert_strategy {
    auto_close = "1800s" # Auto-close after 30 minutes of recovery
  }
}

resource "google_monitoring_alert_policy" "pipeline_backlog" {
  display_name = "Dataflow Backlog Growing"
  combiner     = "OR"

  conditions {
    display_name = "PubSub unacked messages > 10000"
    condition_threshold {
      filter          = "metric.type=\"pubsub.googleapis.com/subscription/num_unacked_messages\""
      comparison      = "COMPARISON_GT"
      threshold_value = 10000
      duration        = "600s"
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.slack_data_alerts.id
  ]
}

resource "google_monitoring_alert_policy" "pipeline_errors" {
  display_name = "Dataflow System Errors"
  combiner     = "OR"

  conditions {
    display_name = "Dataflow worker errors > 5/min"
    condition_threshold {
      filter          = "metric.type=\"dataflow.googleapis.com/job/system_error_count\""
      comparison      = "COMPARISON_GT"
      threshold_value = 5
      duration        = "60s"
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.id
  ]
}
```

## Graceful Shutdown

```python
import signal
import apache_beam as beam

class GracefulPipeline:
    """Handle pipeline shutdown gracefully, flushing in-flight records."""

    def __init__(self):
        self.shutdown_requested = False
        signal.signal(signal.SIGTERM, self._handle_shutdown)

    def _handle_shutdown(self, signum, frame):
        """Handle SIGTERM from Dataflow for drain operations."""
        self.shutdown_requested = True
        # Dataflow handles drain automatically when you call:
        # gcloud dataflow jobs drain JOB_ID --region=us-central1

    def run(self):
        """
        Dataflow drain vs cancel:
        - drain: Finishes processing in-flight elements, stops reading new input
        - cancel: Immediately stops all processing (may lose data)

        Always prefer drain for graceful shutdown.
        """
        pass
```

```bash
# Graceful shutdown commands
# DRAIN: Process remaining elements, then stop (recommended)
gcloud dataflow jobs drain JOB_ID --region=us-central1

# CANCEL: Immediate stop (data loss possible)
gcloud dataflow jobs cancel JOB_ID --region=us-central1

# UPDATE: Replace running pipeline with new version (no downtime)
gcloud dataflow jobs run NEW_JOB_NAME \
  --gcs-location gs://cloudstream-dataflow-templates/template.json \
  --region us-central1 \
  --update
```

## Checkpoint/Restart Patterns

```python
# For batch pipelines: checkpoint progress in GCS
class CheckpointedBatchPipeline:
    """Resume batch processing from last successful checkpoint."""

    CHECKPOINT_PATH = 'gs://cloudstream-dataflow-checkpoints/{pipeline}/{date}.json'

    def get_last_checkpoint(self, pipeline_name):
        """Read last successful offset from GCS."""
        from google.cloud import storage
        client = storage.Client()
        bucket = client.bucket('cloudstream-dataflow-checkpoints')
        blob = bucket.blob(f'{pipeline_name}/latest.json')

        if blob.exists():
            checkpoint = json.loads(blob.download_as_string())
            return checkpoint.get('last_processed_id')
        return None

    def save_checkpoint(self, pipeline_name, last_id):
        """Save checkpoint after successful batch completion."""
        from google.cloud import storage
        client = storage.Client()
        bucket = client.bucket('cloudstream-dataflow-checkpoints')
        blob = bucket.blob(f'{pipeline_name}/latest.json')
        blob.upload_from_string(json.dumps({
            'last_processed_id': last_id,
            'checkpoint_time': datetime.utcnow().isoformat(),
            'pipeline_name': pipeline_name
        }))

# For streaming pipelines: PubSub handles checkpointing automatically
# via acknowledgment. Unacked messages will be redelivered.
```

## Error Rate Thresholds

```python
# Dynamic error rate monitoring within pipeline
class ErrorRateMonitor(beam.DoFn):
    """Track error rates and trigger alerts when thresholds are exceeded."""

    # Thresholds
    WARN_THRESHOLD = 0.02      # 2% error rate = warning
    ERROR_THRESHOLD = 0.05     # 5% error rate = alert
    CRITICAL_THRESHOLD = 0.10  # 10% error rate = pause pipeline

    def __init__(self):
        self.total_counter = beam.metrics.Metrics.counter('pipeline', 'total_records')
        self.error_counter = beam.metrics.Metrics.counter('pipeline', 'error_records')
        self.error_rate_gauge = beam.metrics.Metrics.gauge('pipeline', 'error_rate')

    def process(self, element):
        self.total_counter.inc()

        if element.get('_has_error'):
            self.error_counter.inc()
            # Beam metrics are reported to Cloud Monitoring automatically
            # Alert policies trigger based on error_rate gauge

        yield element
```

```sql
-- Query DLQ for error rate analysis
SELECT
  DATE(failed_at) AS error_date,
  pipeline_name,
  error_type,
  COUNT(*) AS error_count,
  COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY DATE(failed_at), pipeline_name) AS pct_of_errors
FROM `project.ops.dead_letter_queue`
WHERE failed_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
  AND resolved_at IS NULL
GROUP BY 1, 2, 3
ORDER BY error_count DESC;
```

## Best Practices

1. **Always implement a DLQ** - Never drop records silently; route failures for investigation
2. **Separate retryable from poison** - Transient errors should retry; corrupt data should not
3. **Use exponential backoff** - Avoid a thundering herd when a service recovers
4. **Prefer drain over cancel** - Graceful shutdown preserves data integrity
5. **Monitor error rates, not just counts** - A spike from 0.1% to 2% is more significant than a fixed count of 100 errors
6. **Set the PubSub ack deadline appropriately** - Too short causes unnecessary retries; too long delays DLQ routing
7. **Archive DLQ records with context** - Include the stack trace, retry count, and raw message for debugging

> **WARNING**: The PubSub default ack deadline is 10 seconds. For pipelines with heavy processing, extend it to 60-120 seconds to prevent duplicate processing.