@cloudstreamsoftware/claude-tools 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. package/README.md +152 -37
  2. package/agents/INDEX.md +183 -0
  3. package/agents/architect.md +247 -0
  4. package/agents/build-error-resolver.md +555 -0
  5. package/agents/catalyst-deployer.md +132 -0
  6. package/agents/code-reviewer.md +121 -0
  7. package/agents/compliance-auditor.md +148 -0
  8. package/agents/creator-architect.md +395 -0
  9. package/agents/deluge-reviewer.md +98 -0
  10. package/agents/doc-updater.md +471 -0
  11. package/agents/e2e-runner.md +711 -0
  12. package/agents/planner.md +122 -0
  13. package/agents/refactor-cleaner.md +309 -0
  14. package/agents/security-reviewer.md +582 -0
  15. package/agents/tdd-guide.md +302 -0
  16. package/config/versions.json +63 -0
  17. package/dist/hooks/hooks.json +209 -0
  18. package/dist/index.js +47 -0
  19. package/dist/lib/asset-value.js +609 -0
  20. package/dist/lib/client-manager.js +300 -0
  21. package/dist/lib/command-matcher.js +242 -0
  22. package/dist/lib/cross-session-patterns.js +754 -0
  23. package/dist/lib/intent-classifier.js +1075 -0
  24. package/dist/lib/package-manager.js +374 -0
  25. package/dist/lib/recommendation-engine.js +597 -0
  26. package/dist/lib/session-memory.js +489 -0
  27. package/dist/lib/skill-effectiveness.js +486 -0
  28. package/dist/lib/skill-matcher.js +595 -0
  29. package/dist/lib/tutorial-metrics.js +242 -0
  30. package/dist/lib/tutorial-progress.js +209 -0
  31. package/dist/lib/tutorial-renderer.js +431 -0
  32. package/dist/lib/utils.js +380 -0
  33. package/dist/lib/verify-formatter.js +143 -0
  34. package/dist/lib/workflow-state.js +249 -0
  35. package/hooks/hooks.json +209 -0
  36. package/package.json +5 -1
  37. package/scripts/aggregate-sessions.js +290 -0
  38. package/scripts/branch-name-validator.js +291 -0
  39. package/scripts/build.js +101 -0
  40. package/scripts/commands/client-switch.js +231 -0
  41. package/scripts/deprecate-skill.js +610 -0
  42. package/scripts/diagnose.js +324 -0
  43. package/scripts/doc-freshness.js +168 -0
  44. package/scripts/generate-weekly-digest.js +393 -0
  45. package/scripts/health-check.js +270 -0
  46. package/scripts/hooks/credential-check.js +101 -0
  47. package/scripts/hooks/evaluate-session.js +81 -0
  48. package/scripts/hooks/pre-compact.js +66 -0
  49. package/scripts/hooks/prompt-analyzer.js +276 -0
  50. package/scripts/hooks/prompt-router.js +422 -0
  51. package/scripts/hooks/quality-gate-enforcer.js +371 -0
  52. package/scripts/hooks/session-end.js +156 -0
  53. package/scripts/hooks/session-start.js +195 -0
  54. package/scripts/hooks/skill-injector.js +333 -0
  55. package/scripts/hooks/suggest-compact.js +58 -0
  56. package/scripts/lib/asset-value.js +609 -0
  57. package/scripts/lib/client-manager.js +300 -0
  58. package/scripts/lib/command-matcher.js +242 -0
  59. package/scripts/lib/cross-session-patterns.js +754 -0
  60. package/scripts/lib/intent-classifier.js +1075 -0
  61. package/scripts/lib/package-manager.js +374 -0
  62. package/scripts/lib/recommendation-engine.js +597 -0
  63. package/scripts/lib/session-memory.js +489 -0
  64. package/scripts/lib/skill-effectiveness.js +486 -0
  65. package/scripts/lib/skill-matcher.js +595 -0
  66. package/scripts/lib/tutorial-metrics.js +242 -0
  67. package/scripts/lib/tutorial-progress.js +209 -0
  68. package/scripts/lib/tutorial-renderer.js +431 -0
  69. package/scripts/lib/utils.js +380 -0
  70. package/scripts/lib/verify-formatter.js +143 -0
  71. package/scripts/lib/workflow-state.js +249 -0
  72. package/scripts/onboard.js +363 -0
  73. package/scripts/quarterly-report.js +692 -0
  74. package/scripts/setup-package-manager.js +204 -0
  75. package/scripts/sync-upstream.js +391 -0
  76. package/scripts/test.js +108 -0
  77. package/scripts/tutorial-runner.js +351 -0
  78. package/scripts/validate-all.js +201 -0
  79. package/scripts/verifiers/agents.js +245 -0
  80. package/scripts/verifiers/config.js +186 -0
  81. package/scripts/verifiers/environment.js +123 -0
  82. package/scripts/verifiers/hooks.js +188 -0
  83. package/scripts/verifiers/index.js +38 -0
  84. package/scripts/verifiers/persistence.js +140 -0
  85. package/scripts/verifiers/plugin.js +215 -0
  86. package/scripts/verifiers/skills.js +209 -0
  87. package/scripts/verify-setup.js +164 -0
  88. package/skills/INDEX.md +157 -0
  89. package/skills/backend-patterns/SKILL.md +586 -0
  90. package/skills/backend-patterns/catalyst-patterns.md +128 -0
  91. package/skills/bigquery-patterns/SKILL.md +27 -0
  92. package/skills/bigquery-patterns/performance-optimization.md +518 -0
  93. package/skills/bigquery-patterns/query-patterns.md +372 -0
  94. package/skills/bigquery-patterns/schema-design.md +78 -0
  95. package/skills/cloudstream-project-template/SKILL.md +20 -0
  96. package/skills/cloudstream-project-template/structure.md +65 -0
  97. package/skills/coding-standards/SKILL.md +524 -0
  98. package/skills/coding-standards/deluge-standards.md +83 -0
  99. package/skills/compliance-patterns/SKILL.md +28 -0
  100. package/skills/compliance-patterns/hipaa/audit-requirements.md +251 -0
  101. package/skills/compliance-patterns/hipaa/baa-process.md +298 -0
  102. package/skills/compliance-patterns/hipaa/data-archival-strategy.md +387 -0
  103. package/skills/compliance-patterns/hipaa/phi-handling.md +52 -0
  104. package/skills/compliance-patterns/pci-dss/saq-a-requirements.md +307 -0
  105. package/skills/compliance-patterns/pci-dss/tokenization-patterns.md +382 -0
  106. package/skills/compliance-patterns/pci-dss/zoho-checkout-patterns.md +56 -0
  107. package/skills/compliance-patterns/soc2/access-controls.md +344 -0
  108. package/skills/compliance-patterns/soc2/audit-logging.md +458 -0
  109. package/skills/compliance-patterns/soc2/change-management.md +403 -0
  110. package/skills/compliance-patterns/soc2/deluge-execution-logging.md +407 -0
  111. package/skills/consultancy-workflows/SKILL.md +19 -0
  112. package/skills/consultancy-workflows/client-isolation.md +21 -0
  113. package/skills/consultancy-workflows/documentation-automation.md +454 -0
  114. package/skills/consultancy-workflows/handoff-procedures.md +257 -0
  115. package/skills/consultancy-workflows/knowledge-capture.md +513 -0
  116. package/skills/consultancy-workflows/time-tracking.md +26 -0
  117. package/skills/continuous-learning/SKILL.md +84 -0
  118. package/skills/continuous-learning/config.json +18 -0
  119. package/skills/continuous-learning/evaluate-session.sh +60 -0
  120. package/skills/continuous-learning-v2/SKILL.md +126 -0
  121. package/skills/continuous-learning-v2/config.json +61 -0
  122. package/skills/frontend-patterns/SKILL.md +635 -0
  123. package/skills/frontend-patterns/zoho-widget-patterns.md +103 -0
  124. package/skills/gcp-data-engineering/SKILL.md +36 -0
  125. package/skills/gcp-data-engineering/bigquery/performance-optimization.md +337 -0
  126. package/skills/gcp-data-engineering/dataflow/error-handling.md +496 -0
  127. package/skills/gcp-data-engineering/dataflow/pipeline-patterns.md +444 -0
  128. package/skills/gcp-data-engineering/dbt/model-organization.md +63 -0
  129. package/skills/gcp-data-engineering/dbt/testing-patterns.md +503 -0
  130. package/skills/gcp-data-engineering/medallion-architecture/bronze-layer.md +60 -0
  131. package/skills/gcp-data-engineering/medallion-architecture/gold-layer.md +311 -0
  132. package/skills/gcp-data-engineering/medallion-architecture/layer-transitions.md +517 -0
  133. package/skills/gcp-data-engineering/medallion-architecture/silver-layer.md +305 -0
  134. package/skills/gcp-data-engineering/zoho-to-gcp/data-extraction.md +543 -0
  135. package/skills/gcp-data-engineering/zoho-to-gcp/real-time-vs-batch.md +337 -0
  136. package/skills/security-review/SKILL.md +498 -0
  137. package/skills/security-review/compliance-checklist.md +53 -0
  138. package/skills/strategic-compact/SKILL.md +67 -0
  139. package/skills/tdd-workflow/SKILL.md +413 -0
  140. package/skills/tdd-workflow/zoho-testing.md +124 -0
  141. package/skills/tutorial/SKILL.md +249 -0
  142. package/skills/tutorial/docs/ACCESSIBILITY.md +169 -0
  143. package/skills/tutorial/lessons/00-philosophy-and-workflow.md +198 -0
  144. package/skills/tutorial/lessons/01-basics.md +81 -0
  145. package/skills/tutorial/lessons/02-training.md +86 -0
  146. package/skills/tutorial/lessons/03-commands.md +109 -0
  147. package/skills/tutorial/lessons/04-workflows.md +115 -0
  148. package/skills/tutorial/lessons/05-compliance.md +116 -0
  149. package/skills/tutorial/lessons/06-zoho.md +121 -0
  150. package/skills/tutorial/lessons/07-hooks-system.md +277 -0
  151. package/skills/tutorial/lessons/08-mcp-servers.md +316 -0
  152. package/skills/tutorial/lessons/09-client-management.md +215 -0
  153. package/skills/tutorial/lessons/10-testing-e2e.md +260 -0
  154. package/skills/tutorial/lessons/11-skills-deep-dive.md +272 -0
  155. package/skills/tutorial/lessons/12-rules-system.md +326 -0
  156. package/skills/tutorial/lessons/13-golden-standard-graduation.md +213 -0
  157. package/skills/tutorial/lessons/14-fork-setup-and-sync.md +312 -0
  158. package/skills/tutorial/lessons/15-living-examples-system.md +221 -0
  159. package/skills/tutorial/tracks/accelerated/README.md +134 -0
  160. package/skills/tutorial/tracks/accelerated/assessment/checkpoint-1.md +161 -0
  161. package/skills/tutorial/tracks/accelerated/assessment/checkpoint-2.md +175 -0
  162. package/skills/tutorial/tracks/accelerated/day-1-core-concepts.md +234 -0
  163. package/skills/tutorial/tracks/accelerated/day-2-essential-commands.md +270 -0
  164. package/skills/tutorial/tracks/accelerated/day-3-workflow-mastery.md +305 -0
  165. package/skills/tutorial/tracks/accelerated/day-4-compliance-zoho.md +304 -0
  166. package/skills/tutorial/tracks/accelerated/day-5-hooks-skills.md +344 -0
  167. package/skills/tutorial/tracks/accelerated/day-6-client-testing.md +386 -0
  168. package/skills/tutorial/tracks/accelerated/day-7-graduation.md +369 -0
  169. package/skills/zoho-patterns/CHANGELOG.md +108 -0
  170. package/skills/zoho-patterns/SKILL.md +446 -0
  171. package/skills/zoho-patterns/analytics/dashboard-patterns.md +352 -0
  172. package/skills/zoho-patterns/analytics/zoho-to-bigquery-pipeline.md +427 -0
  173. package/skills/zoho-patterns/catalyst/appsail-deployment.md +349 -0
  174. package/skills/zoho-patterns/catalyst/context-close-patterns.md +354 -0
  175. package/skills/zoho-patterns/catalyst/cron-batch-processing.md +374 -0
  176. package/skills/zoho-patterns/catalyst/function-patterns.md +439 -0
  177. package/skills/zoho-patterns/creator/form-design.md +304 -0
  178. package/skills/zoho-patterns/creator/publish-api-patterns.md +313 -0
  179. package/skills/zoho-patterns/creator/widget-integration.md +306 -0
  180. package/skills/zoho-patterns/creator/workflow-automation.md +253 -0
  181. package/skills/zoho-patterns/deluge/api-patterns.md +468 -0
  182. package/skills/zoho-patterns/deluge/batch-processing.md +403 -0
  183. package/skills/zoho-patterns/deluge/cross-app-integration.md +356 -0
  184. package/skills/zoho-patterns/deluge/error-handling.md +423 -0
  185. package/skills/zoho-patterns/deluge/syntax-reference.md +65 -0
  186. package/skills/zoho-patterns/integration/cors-proxy-architecture.md +426 -0
  187. package/skills/zoho-patterns/integration/crm-books-native-sync.md +277 -0
  188. package/skills/zoho-patterns/integration/oauth-token-management.md +461 -0
  189. package/skills/zoho-patterns/integration/zoho-flow-patterns.md +334 -0
package/skills/gcp-data-engineering/dataflow/error-handling.md (new file)
@@ -0,0 +1,496 @@
# Dataflow Error Handling

> Patterns for dead letter queues, retry logic, poison message handling, and monitoring in CloudStream Dataflow pipelines.

## Dead Letter Queue (DLQ) Pattern

The DLQ pattern routes unprocessable records to a separate destination for later investigation, preventing a single bad record from blocking the entire pipeline.

```python
# dlq_pipeline.py
import apache_beam as beam
from apache_beam.io.gcp.bigquery import WriteToBigQuery
import json
import traceback
from datetime import datetime


class ProcessWithDLQ(beam.DoFn):
    """Process records, routing failures to dead letter queue."""

    # Output tags
    VALID_TAG = 'valid'
    DLQ_TAG = 'dlq'

    def process(self, element):
        try:
            record = json.loads(element.decode('utf-8'))

            # Validation checks
            if not record.get('record_id'):
                raise ValueError("Missing required field: record_id")

            if record.get('amount') and float(record['amount']) < 0:
                raise ValueError(f"Negative amount: {record['amount']}")

            # Type coercion
            cleaned = {
                'record_id': str(record['record_id']),
                'amount': float(record.get('amount', 0)),
                'stage': record.get('stage', 'Unknown'),
                '_processed_at': datetime.utcnow().isoformat()
            }

            yield beam.pvalue.TaggedOutput(self.VALID_TAG, cleaned)

        except Exception as e:
            yield beam.pvalue.TaggedOutput(self.DLQ_TAG, {
                'raw_message': element.decode('utf-8') if isinstance(element, bytes) else str(element),
                'error_type': type(e).__name__,
                'error_message': str(e),
                'stack_trace': traceback.format_exc(),
                'failed_at': datetime.utcnow().isoformat(),
                'pipeline_name': 'zoho_deals_streaming'
            })


def run():
    with beam.Pipeline(options=get_streaming_options()) as p:
        messages = (
            p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription='projects/cloudstream-prod/subscriptions/zoho-deals-sub'
            )
        )

        results = messages | 'Process' >> beam.ParDo(
            ProcessWithDLQ()
        ).with_outputs(ProcessWithDLQ.VALID_TAG, ProcessWithDLQ.DLQ_TAG)

        # Valid records to silver
        results[ProcessWithDLQ.VALID_TAG] | 'WriteValid' >> WriteToBigQuery(
            table='cloudstream-prod:silver.zoho_deals',
            write_disposition='WRITE_APPEND'
        )

        # Failed records to DLQ table
        results[ProcessWithDLQ.DLQ_TAG] | 'WriteDLQ' >> WriteToBigQuery(
            table='cloudstream-prod:ops.dead_letter_queue',
            write_disposition='WRITE_APPEND'
        )
```
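
The `run()` function above relies on a `get_streaming_options()` helper that is not shown in this file. A minimal sketch of what such a helper could look like, built from standard Beam pipeline options (the temp bucket name is an assumption):

```python
# Hypothetical helper assumed by run() above; adjust names to your environment.
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions, PipelineOptions, StandardOptions
)

def get_streaming_options():
    """Build streaming options for the Dataflow runner."""
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    options.view_as(StandardOptions).streaming = True  # required for ReadFromPubSub

    gcp = options.view_as(GoogleCloudOptions)
    gcp.project = 'cloudstream-prod'
    gcp.region = 'us-central1'
    gcp.temp_location = 'gs://cloudstream-dataflow-temp/tmp'  # assumed bucket
    return options
```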

## Error Tagging and Routing

```python
# Multi-category error routing
class ErrorRouter(beam.DoFn):
    """Route errors to appropriate handlers based on type."""

    RETRYABLE = 'retryable'    # Transient errors (API timeout, rate limit)
    VALIDATION = 'validation'  # Data quality failures
    POISON = 'poison'          # Unrecoverable (corrupt data, schema mismatch)

    def process(self, element):
        error_type = element.get('error_type', '')
        error_msg = element.get('error_message', '')

        if any(t in error_type for t in ['TimeoutError', 'ConnectionError', 'ServiceUnavailable']):
            yield beam.pvalue.TaggedOutput(self.RETRYABLE, element)
        elif any(t in error_type for t in ['ValueError', 'TypeError', 'ValidationError']):
            yield beam.pvalue.TaggedOutput(self.VALIDATION, element)
        else:
            yield beam.pvalue.TaggedOutput(self.POISON, element)


# Usage in pipeline
routed = (
    dlq_records
    | 'RouteErrors' >> beam.ParDo(ErrorRouter()).with_outputs(
        ErrorRouter.RETRYABLE, ErrorRouter.VALIDATION, ErrorRouter.POISON
    )
)

# Retryable → back to input topic with delay
routed[ErrorRouter.RETRYABLE] | 'RepublishRetryable' >> beam.ParDo(RepublishWithDelay())

# Validation → quarantine table for data team
routed[ErrorRouter.VALIDATION] | 'WriteValidation' >> WriteToBigQuery(
    table='cloudstream-prod:ops.validation_failures'
)

# Poison → alert and archive
routed[ErrorRouter.POISON] | 'AlertPoison' >> beam.ParDo(AlertAndArchive())
```
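
`RepublishWithDelay` and `AlertAndArchive` are referenced above but not defined in this file. A hedged sketch of one way the republisher could look, publishing straight back to the source topic with the Pub/Sub client (the topic name and attribute handling are assumptions; Pub/Sub has no native delayed delivery, so a real implementation might publish to a dedicated retry topic or rely on consumer-side backoff keyed on `retry_count`):

```python
# Sketch only: republish retryable DLQ records to the input topic.
from google.cloud import pubsub_v1

class RepublishWithDelay(beam.DoFn):
    """Republish retryable failures with an incremented retry_count attribute."""

    TOPIC = 'projects/cloudstream-prod/topics/zoho-deals'  # assumed topic name

    def setup(self):
        self.publisher = pubsub_v1.PublisherClient()

    def process(self, element):
        retry_count = int(element.get('retry_count', 0)) + 1
        future = self.publisher.publish(
            self.TOPIC,
            data=element.get('raw_message', '').encode('utf-8'),
            retry_count=str(retry_count),  # Pub/Sub attributes must be strings
        )
        future.result(timeout=60)  # surface publish errors in the worker
```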

## Retry with Exponential Backoff

```python
import time
import random
from datetime import datetime

import apache_beam as beam
from google.api_core.exceptions import ServiceUnavailable, TooManyRequests


class RetryableTransform(beam.DoFn):
    """Transform with built-in retry logic for external API calls."""

    MAX_RETRIES = 5
    BASE_DELAY = 1.0  # seconds
    MAX_DELAY = 60.0  # seconds

    def process(self, element):
        for attempt in range(self.MAX_RETRIES):
            try:
                result = self._call_external_api(element)
                yield result
                return
            except (ServiceUnavailable, TooManyRequests, ConnectionError) as e:
                if attempt == self.MAX_RETRIES - 1:
                    # Final attempt failed - route to DLQ
                    yield beam.pvalue.TaggedOutput('dlq', {
                        'record': element,
                        'error': str(e),
                        'attempts': self.MAX_RETRIES,
                        'failed_at': datetime.utcnow().isoformat()
                    })
                    return

                # Exponential backoff with jitter
                delay = min(
                    self.BASE_DELAY * (2 ** attempt) + random.uniform(0, 1),
                    self.MAX_DELAY
                )
                time.sleep(delay)

            except Exception as e:
                # Non-retryable error - immediate DLQ
                yield beam.pvalue.TaggedOutput('dlq', {
                    'record': element,
                    'error': str(e),
                    'error_type': 'non_retryable',
                    'failed_at': datetime.utcnow().isoformat()
                })
                return

    def _call_external_api(self, element):
        """Call Zoho API or other external service."""
        # Implementation here
        pass


# For BigQuery write retries, use the built-in retry strategy
WriteToBigQuery(
    table='...',
    insert_retry_strategy='RETRY_ON_TRANSIENT_ERROR',
    # Default: retries on 5xx errors and rate limiting
)
```
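
`_call_external_api` is deliberately left as a stub above. For illustration only, one possible body for a Zoho CRM enrichment call (the endpoint, module, and token helper are assumptions, not part of this package); raising `TooManyRequests`/`ServiceUnavailable` keeps the retry loop above in charge of backoff:

```python
# Illustrative only: one possible _call_external_api implementation.
import requests
from google.api_core.exceptions import ServiceUnavailable, TooManyRequests

class ZohoEnrichTransform(RetryableTransform):
    def _call_external_api(self, element):
        """Enrich a record from Zoho CRM; raise retryable errors for the loop above."""
        response = requests.get(
            f"https://www.zohoapis.com/crm/v2/Deals/{element['record_id']}",  # assumed endpoint
            headers={'Authorization': f"Zoho-oauthtoken {self._get_access_token()}"},  # hypothetical helper
            timeout=30,
        )
        if response.status_code == 429:
            raise TooManyRequests(response.text)     # retried with backoff
        if response.status_code >= 500:
            raise ServiceUnavailable(response.text)  # retried with backoff
        response.raise_for_status()
        return {**element, 'crm_data': response.json()['data'][0]}
```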

## Poison Message Handling

```python
class PoisonMessageHandler(beam.DoFn):
    """Handle messages that repeatedly fail processing."""

    def __init__(self, max_attempts=3):
        self.max_attempts = max_attempts

    def process(self, element, window=beam.DoFn.WindowParam):
        # Check retry count from PubSub attributes
        attributes = element.attributes if hasattr(element, 'attributes') else {}
        retry_count = int(attributes.get('retry_count', '0'))

        if retry_count >= self.max_attempts:
            # This is a poison message - archive and alert
            yield beam.pvalue.TaggedOutput('poison', {
                'message': element.data.decode('utf-8'),
                'retry_count': retry_count,
                'first_seen': attributes.get('first_seen', 'unknown'),
                'last_attempt': datetime.utcnow().isoformat(),
                'reason': 'max_retries_exceeded'
            })
        else:
            try:
                result = self._process_message(element)
                yield result
            except Exception as e:
                # Increment retry count and republish
                yield beam.pvalue.TaggedOutput('retry', {
                    'data': element.data,
                    'attributes': {
                        **attributes,
                        'retry_count': str(retry_count + 1),
                        'first_seen': attributes.get('first_seen', datetime.utcnow().isoformat()),
                        'last_error': str(e)
                    }
                })
```

```sql
-- DLQ table schema for tracking poison messages
CREATE TABLE `project.ops.dead_letter_queue` (
  pipeline_name STRING,
  source_topic STRING,
  raw_message STRING,
  error_type STRING,
  error_message STRING,
  stack_trace STRING,
  retry_count INT64,
  first_seen_at TIMESTAMP,
  failed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP(),
  resolved_at TIMESTAMP,
  resolution STRING  -- 'reprocessed', 'discarded', 'manual_fix'
)
PARTITION BY DATE(failed_at)
CLUSTER BY pipeline_name, error_type;
```

## Monitoring with Cloud Monitoring

```python
# Custom metrics for pipeline health
from google.cloud import monitoring_v3
from google.protobuf.timestamp_pb2 import Timestamp
import time


class MetricsReporter(beam.DoFn):
    """Report custom metrics to Cloud Monitoring."""

    def setup(self):
        self.client = monitoring_v3.MetricServiceClient()
        self.project_name = "projects/cloudstream-prod"

    def process(self, element):
        # Report processing metrics
        self._write_metric(
            'custom.googleapis.com/dataflow/records_processed',
            1,
            labels={'pipeline': 'zoho_deals', 'status': 'success'}
        )
        yield element

    def _write_metric(self, metric_type, value, labels=None):
        series = monitoring_v3.TimeSeries()
        series.metric.type = metric_type
        if labels:
            for k, v in labels.items():
                series.metric.labels[k] = v

        series.resource.type = 'global'
        now = time.time()
        interval = monitoring_v3.TimeInterval(
            end_time=Timestamp(seconds=int(now))
        )
        point = monitoring_v3.Point(
            interval=interval,
            value=monitoring_v3.TypedValue(int64_value=value)
        )
        series.points = [point]

        self.client.create_time_series(
            request={"name": self.project_name, "time_series": [series]}
        )
```
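
Wiring the reporter into the DLQ pipeline is a one-line `ParDo` (a sketch below, reusing names from the first example). Note that calling `create_time_series` once per element can run into Cloud Monitoring write limits on a busy stream; the Beam counters shown in the Error Rate Thresholds section are the lighter-weight option for high-volume metrics.

```python
# Sketch: report a custom metric for each record written to silver.
monitored = (
    results[ProcessWithDLQ.VALID_TAG]
    | 'ReportMetrics' >> beam.ParDo(MetricsReporter())
)
```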

## Alerting Policies

```hcl
# terraform/dataflow_alerts.tf
resource "google_monitoring_alert_policy" "dlq_rate" {
  display_name = "Dataflow DLQ Rate Too High"
  combiner     = "OR"

  conditions {
    display_name = "DLQ records > 100 in 5 minutes"
    condition_threshold {
      filter          = "metric.type=\"custom.googleapis.com/dataflow/dlq_count\" AND resource.type=\"global\""
      comparison      = "COMPARISON_GT"
      threshold_value = 100
      duration        = "300s"
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_SUM"
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.slack_data_alerts.id
  ]

  alert_strategy {
    auto_close = "1800s"  # Auto-close after 30 minutes of recovery
  }
}

resource "google_monitoring_alert_policy" "pipeline_backlog" {
  display_name = "Dataflow Backlog Growing"
  combiner     = "OR"

  conditions {
    display_name = "PubSub unacked messages > 10000"
    condition_threshold {
      filter          = "metric.type=\"pubsub.googleapis.com/subscription/num_unacked_messages\""
      comparison      = "COMPARISON_GT"
      threshold_value = 10000
      duration        = "600s"
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.slack_data_alerts.id
  ]
}

resource "google_monitoring_alert_policy" "pipeline_errors" {
  display_name = "Dataflow System Errors"
  combiner     = "OR"

  conditions {
    display_name = "Dataflow worker errors > 5/min"
    condition_threshold {
      filter          = "metric.type=\"dataflow.googleapis.com/job/system_error_count\""
      comparison      = "COMPARISON_GT"
      threshold_value = 5
      duration        = "60s"
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.id
  ]
}
```

## Graceful Shutdown

```python
import signal
import apache_beam as beam


class GracefulPipeline:
    """Handle pipeline shutdown gracefully, flushing in-flight records."""

    def __init__(self):
        self.shutdown_requested = False
        signal.signal(signal.SIGTERM, self._handle_shutdown)

    def _handle_shutdown(self, signum, frame):
        """Handle SIGTERM from Dataflow for drain operations."""
        self.shutdown_requested = True
        # Dataflow handles drain automatically when you call:
        #   gcloud dataflow jobs drain JOB_ID --region=us-central1

    def run(self):
        """
        Dataflow drain vs cancel:
        - drain: finishes processing in-flight elements, stops reading new input
        - cancel: immediately stops all processing (may lose data)

        Always prefer drain for graceful shutdown.
        """
        pass
```

```bash
# Graceful shutdown commands
# DRAIN: Process remaining elements, then stop (recommended)
gcloud dataflow jobs drain JOB_ID --region=us-central1

# CANCEL: Immediate stop (data loss possible)
gcloud dataflow jobs cancel JOB_ID --region=us-central1

# UPDATE: Replace running pipeline with new version (no downtime)
gcloud dataflow jobs run NEW_JOB_NAME \
  --gcs-location gs://cloudstream-dataflow-templates/template.json \
  --region us-central1 \
  --update
```

## Checkpoint/Restart Patterns

```python
# For batch pipelines: checkpoint progress in GCS
import json
from datetime import datetime

from google.cloud import storage


class CheckpointedBatchPipeline:
    """Resume batch processing from last successful checkpoint."""

    CHECKPOINT_PATH = 'gs://cloudstream-dataflow-checkpoints/{pipeline}/{date}.json'

    def get_last_checkpoint(self, pipeline_name):
        """Read last successful offset from GCS."""
        client = storage.Client()
        bucket = client.bucket('cloudstream-dataflow-checkpoints')
        blob = bucket.blob(f'{pipeline_name}/latest.json')

        if blob.exists():
            checkpoint = json.loads(blob.download_as_string())
            return checkpoint.get('last_processed_id')
        return None

    def save_checkpoint(self, pipeline_name, last_id):
        """Save checkpoint after successful batch completion."""
        client = storage.Client()
        bucket = client.bucket('cloudstream-dataflow-checkpoints')
        blob = bucket.blob(f'{pipeline_name}/latest.json')
        blob.upload_from_string(json.dumps({
            'last_processed_id': last_id,
            'checkpoint_time': datetime.utcnow().isoformat(),
            'pipeline_name': pipeline_name
        }))


# For streaming pipelines: PubSub handles checkpointing automatically
# via acknowledgment. Unacked messages will be redelivered.
```
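
A brief usage sketch of the checkpoint helpers around a batch run (`run_batch` is a hypothetical function standing in for the actual pipeline launch):

```python
# Sketch: resume an incremental batch job from the stored checkpoint.
checkpointer = CheckpointedBatchPipeline()
pipeline_name = 'zoho_deals_batch'  # assumed pipeline name

last_id = checkpointer.get_last_checkpoint(pipeline_name)  # None on the first run
start_after = last_id or '0'

# Hypothetical: process records with id > start_after and return the highest id written.
new_last_id = run_batch(start_after)

# Save only after the batch completes without errors, so a failed run is retried
# from the previous checkpoint instead of silently skipping records.
checkpointer.save_checkpoint(pipeline_name, new_last_id)
```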

## Error Rate Thresholds

```python
# Dynamic error rate monitoring within pipeline
class ErrorRateMonitor(beam.DoFn):
    """Track error rates and trigger alerts when thresholds are exceeded."""

    # Thresholds
    WARN_THRESHOLD = 0.02      # 2% error rate = warning
    ERROR_THRESHOLD = 0.05     # 5% error rate = alert
    CRITICAL_THRESHOLD = 0.10  # 10% error rate = pause pipeline

    def __init__(self):
        self.total_counter = beam.metrics.Metrics.counter('pipeline', 'total_records')
        self.error_counter = beam.metrics.Metrics.counter('pipeline', 'error_records')

    def process(self, element):
        self.total_counter.inc()

        if element.get('_has_error'):
            self.error_counter.inc()
        # On Dataflow, Beam counters are exported to Cloud Monitoring as custom
        # metrics; alert policies can compare error_records against total_records
        # using the thresholds above.

        yield element
```
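
The thresholds above are not evaluated inside the `DoFn` itself; one way to apply them is to read the counters back from the `PipelineResult` after (or while) the job runs. A sketch using Beam's metrics query API:

```python
# Sketch: compare the Beam counters against the thresholds after a run.
from apache_beam.metrics.metric import MetricsFilter

def check_error_rate(result):
    """result is the PipelineResult returned by pipeline.run()."""
    def counter_value(name):
        counters = result.metrics().query(
            MetricsFilter().with_namespace('pipeline').with_name(name)
        )['counters']
        return counters[0].committed if counters else 0

    total = counter_value('total_records')
    errors = counter_value('error_records')
    rate = errors / total if total else 0.0

    if rate >= ErrorRateMonitor.CRITICAL_THRESHOLD:
        print(f"CRITICAL: error rate {rate:.1%} - consider draining the pipeline")
    elif rate >= ErrorRateMonitor.ERROR_THRESHOLD:
        print(f"ERROR: error rate {rate:.1%}")
    elif rate >= ErrorRateMonitor.WARN_THRESHOLD:
        print(f"WARN: error rate {rate:.1%}")
```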

```sql
-- Query DLQ for error rate analysis
SELECT
  DATE(failed_at) AS error_date,
  pipeline_name,
  error_type,
  COUNT(*) AS error_count,
  COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY DATE(failed_at), pipeline_name) AS pct_of_errors
FROM `project.ops.dead_letter_queue`
WHERE failed_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
  AND resolved_at IS NULL
GROUP BY 1, 2, 3
ORDER BY error_count DESC;
```

## Best Practices

1. **Always implement DLQ** - Never drop records silently; route failures somewhere they can be investigated
2. **Separate retryable from poison** - Transient errors should be retried; corrupt data should not
3. **Use exponential backoff** - Avoid a thundering herd when a downstream service recovers
4. **Prefer drain over cancel** - Graceful shutdown preserves data integrity
5. **Monitor error rates, not just counts** - A spike from 0.1% to 2% is more significant than any absolute count of 100 errors
6. **Set the PubSub ack deadline appropriately** - Too short causes unnecessary retries; too long delays DLQ routing
7. **Archive DLQ records with context** - Include the stack trace, retry count, and raw message for debugging

> **WARNING**: The PubSub default ack deadline is 10 seconds. For pipelines with heavy per-message processing, extend it to 60-120 seconds to prevent duplicate processing.
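
The deadline can be raised on an existing subscription with `gcloud pubsub subscriptions update SUB --ack-deadline=120` or with the Pub/Sub client; a small sketch of the latter, reusing the subscription from the DLQ example (the 120-second value is an example, and the maximum is 600 seconds):

```python
# Sketch: raise the ack deadline so slow per-message processing does not trigger redelivery.
from google.cloud import pubsub_v1
from google.protobuf import field_mask_pb2

subscriber = pubsub_v1.SubscriberClient()
subscription_path = 'projects/cloudstream-prod/subscriptions/zoho-deals-sub'

subscriber.update_subscription(
    request={
        "subscription": {
            "name": subscription_path,
            "ack_deadline_seconds": 120,  # default is 10s
        },
        "update_mask": field_mask_pb2.FieldMask(paths=["ack_deadline_seconds"]),
    }
)
```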