@cloudstreamsoftware/claude-tools 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/README.md +152 -37
  2. package/agents/INDEX.md +183 -0
  3. package/agents/architect.md +247 -0
  4. package/agents/build-error-resolver.md +555 -0
  5. package/agents/catalyst-deployer.md +132 -0
  6. package/agents/code-reviewer.md +121 -0
  7. package/agents/compliance-auditor.md +148 -0
  8. package/agents/creator-architect.md +395 -0
  9. package/agents/deluge-reviewer.md +98 -0
  10. package/agents/doc-updater.md +471 -0
  11. package/agents/e2e-runner.md +711 -0
  12. package/agents/planner.md +122 -0
  13. package/agents/refactor-cleaner.md +309 -0
  14. package/agents/security-reviewer.md +582 -0
  15. package/agents/tdd-guide.md +302 -0
  16. package/bin/cloudstream-setup.js +16 -6
  17. package/config/versions.json +63 -0
  18. package/dist/hooks/hooks.json +209 -0
  19. package/dist/index.js +47 -0
  20. package/dist/lib/asset-value.js +609 -0
  21. package/dist/lib/client-manager.js +300 -0
  22. package/dist/lib/command-matcher.js +242 -0
  23. package/dist/lib/cross-session-patterns.js +754 -0
  24. package/dist/lib/intent-classifier.js +1075 -0
  25. package/dist/lib/package-manager.js +374 -0
  26. package/dist/lib/recommendation-engine.js +597 -0
  27. package/dist/lib/session-memory.js +489 -0
  28. package/dist/lib/skill-effectiveness.js +486 -0
  29. package/dist/lib/skill-matcher.js +595 -0
  30. package/dist/lib/tutorial-metrics.js +242 -0
  31. package/dist/lib/tutorial-progress.js +209 -0
  32. package/dist/lib/tutorial-renderer.js +431 -0
  33. package/dist/lib/utils.js +380 -0
  34. package/dist/lib/verify-formatter.js +143 -0
  35. package/dist/lib/workflow-state.js +249 -0
  36. package/hooks/hooks.json +209 -0
  37. package/package.json +5 -1
  38. package/scripts/aggregate-sessions.js +290 -0
  39. package/scripts/branch-name-validator.js +291 -0
  40. package/scripts/build.js +101 -0
  41. package/scripts/commands/client-switch.js +231 -0
  42. package/scripts/deprecate-skill.js +610 -0
  43. package/scripts/diagnose.js +324 -0
  44. package/scripts/doc-freshness.js +168 -0
  45. package/scripts/generate-weekly-digest.js +393 -0
  46. package/scripts/health-check.js +270 -0
  47. package/scripts/hooks/credential-check.js +101 -0
  48. package/scripts/hooks/evaluate-session.js +81 -0
  49. package/scripts/hooks/pre-compact.js +66 -0
  50. package/scripts/hooks/prompt-analyzer.js +276 -0
  51. package/scripts/hooks/prompt-router.js +422 -0
  52. package/scripts/hooks/quality-gate-enforcer.js +371 -0
  53. package/scripts/hooks/session-end.js +156 -0
  54. package/scripts/hooks/session-start.js +195 -0
  55. package/scripts/hooks/skill-injector.js +333 -0
  56. package/scripts/hooks/suggest-compact.js +58 -0
  57. package/scripts/lib/asset-value.js +609 -0
  58. package/scripts/lib/client-manager.js +300 -0
  59. package/scripts/lib/command-matcher.js +242 -0
  60. package/scripts/lib/cross-session-patterns.js +754 -0
  61. package/scripts/lib/intent-classifier.js +1075 -0
  62. package/scripts/lib/package-manager.js +374 -0
  63. package/scripts/lib/recommendation-engine.js +597 -0
  64. package/scripts/lib/session-memory.js +489 -0
  65. package/scripts/lib/skill-effectiveness.js +486 -0
  66. package/scripts/lib/skill-matcher.js +595 -0
  67. package/scripts/lib/tutorial-metrics.js +242 -0
  68. package/scripts/lib/tutorial-progress.js +209 -0
  69. package/scripts/lib/tutorial-renderer.js +431 -0
  70. package/scripts/lib/utils.js +380 -0
  71. package/scripts/lib/verify-formatter.js +143 -0
  72. package/scripts/lib/workflow-state.js +249 -0
  73. package/scripts/onboard.js +363 -0
  74. package/scripts/quarterly-report.js +692 -0
  75. package/scripts/setup-package-manager.js +204 -0
  76. package/scripts/sync-upstream.js +391 -0
  77. package/scripts/test.js +108 -0
  78. package/scripts/tutorial-runner.js +351 -0
  79. package/scripts/validate-all.js +201 -0
  80. package/scripts/verifiers/agents.js +245 -0
  81. package/scripts/verifiers/config.js +186 -0
  82. package/scripts/verifiers/environment.js +123 -0
  83. package/scripts/verifiers/hooks.js +188 -0
  84. package/scripts/verifiers/index.js +38 -0
  85. package/scripts/verifiers/persistence.js +140 -0
  86. package/scripts/verifiers/plugin.js +215 -0
  87. package/scripts/verifiers/skills.js +209 -0
  88. package/scripts/verify-setup.js +164 -0
  89. package/skills/INDEX.md +157 -0
  90. package/skills/backend-patterns/SKILL.md +586 -0
  91. package/skills/backend-patterns/catalyst-patterns.md +128 -0
  92. package/skills/bigquery-patterns/SKILL.md +27 -0
  93. package/skills/bigquery-patterns/performance-optimization.md +518 -0
  94. package/skills/bigquery-patterns/query-patterns.md +372 -0
  95. package/skills/bigquery-patterns/schema-design.md +78 -0
  96. package/skills/cloudstream-project-template/SKILL.md +20 -0
  97. package/skills/cloudstream-project-template/structure.md +65 -0
  98. package/skills/coding-standards/SKILL.md +524 -0
  99. package/skills/coding-standards/deluge-standards.md +83 -0
  100. package/skills/compliance-patterns/SKILL.md +28 -0
  101. package/skills/compliance-patterns/hipaa/audit-requirements.md +251 -0
  102. package/skills/compliance-patterns/hipaa/baa-process.md +298 -0
  103. package/skills/compliance-patterns/hipaa/data-archival-strategy.md +387 -0
  104. package/skills/compliance-patterns/hipaa/phi-handling.md +52 -0
  105. package/skills/compliance-patterns/pci-dss/saq-a-requirements.md +307 -0
  106. package/skills/compliance-patterns/pci-dss/tokenization-patterns.md +382 -0
  107. package/skills/compliance-patterns/pci-dss/zoho-checkout-patterns.md +56 -0
  108. package/skills/compliance-patterns/soc2/access-controls.md +344 -0
  109. package/skills/compliance-patterns/soc2/audit-logging.md +458 -0
  110. package/skills/compliance-patterns/soc2/change-management.md +403 -0
  111. package/skills/compliance-patterns/soc2/deluge-execution-logging.md +407 -0
  112. package/skills/consultancy-workflows/SKILL.md +19 -0
  113. package/skills/consultancy-workflows/client-isolation.md +21 -0
  114. package/skills/consultancy-workflows/documentation-automation.md +454 -0
  115. package/skills/consultancy-workflows/handoff-procedures.md +257 -0
  116. package/skills/consultancy-workflows/knowledge-capture.md +513 -0
  117. package/skills/consultancy-workflows/time-tracking.md +26 -0
  118. package/skills/continuous-learning/SKILL.md +84 -0
  119. package/skills/continuous-learning/config.json +18 -0
  120. package/skills/continuous-learning/evaluate-session.sh +60 -0
  121. package/skills/continuous-learning-v2/SKILL.md +126 -0
  122. package/skills/continuous-learning-v2/config.json +61 -0
  123. package/skills/frontend-patterns/SKILL.md +635 -0
  124. package/skills/frontend-patterns/zoho-widget-patterns.md +103 -0
  125. package/skills/gcp-data-engineering/SKILL.md +36 -0
  126. package/skills/gcp-data-engineering/bigquery/performance-optimization.md +337 -0
  127. package/skills/gcp-data-engineering/dataflow/error-handling.md +496 -0
  128. package/skills/gcp-data-engineering/dataflow/pipeline-patterns.md +444 -0
  129. package/skills/gcp-data-engineering/dbt/model-organization.md +63 -0
  130. package/skills/gcp-data-engineering/dbt/testing-patterns.md +503 -0
  131. package/skills/gcp-data-engineering/medallion-architecture/bronze-layer.md +60 -0
  132. package/skills/gcp-data-engineering/medallion-architecture/gold-layer.md +311 -0
  133. package/skills/gcp-data-engineering/medallion-architecture/layer-transitions.md +517 -0
  134. package/skills/gcp-data-engineering/medallion-architecture/silver-layer.md +305 -0
  135. package/skills/gcp-data-engineering/zoho-to-gcp/data-extraction.md +543 -0
  136. package/skills/gcp-data-engineering/zoho-to-gcp/real-time-vs-batch.md +337 -0
  137. package/skills/security-review/SKILL.md +498 -0
  138. package/skills/security-review/compliance-checklist.md +53 -0
  139. package/skills/strategic-compact/SKILL.md +67 -0
  140. package/skills/tdd-workflow/SKILL.md +413 -0
  141. package/skills/tdd-workflow/zoho-testing.md +124 -0
  142. package/skills/tutorial/SKILL.md +249 -0
  143. package/skills/tutorial/docs/ACCESSIBILITY.md +169 -0
  144. package/skills/tutorial/lessons/00-philosophy-and-workflow.md +198 -0
  145. package/skills/tutorial/lessons/01-basics.md +81 -0
  146. package/skills/tutorial/lessons/02-training.md +86 -0
  147. package/skills/tutorial/lessons/03-commands.md +109 -0
  148. package/skills/tutorial/lessons/04-workflows.md +115 -0
  149. package/skills/tutorial/lessons/05-compliance.md +116 -0
  150. package/skills/tutorial/lessons/06-zoho.md +121 -0
  151. package/skills/tutorial/lessons/07-hooks-system.md +277 -0
  152. package/skills/tutorial/lessons/08-mcp-servers.md +316 -0
  153. package/skills/tutorial/lessons/09-client-management.md +215 -0
  154. package/skills/tutorial/lessons/10-testing-e2e.md +260 -0
  155. package/skills/tutorial/lessons/11-skills-deep-dive.md +272 -0
  156. package/skills/tutorial/lessons/12-rules-system.md +326 -0
  157. package/skills/tutorial/lessons/13-golden-standard-graduation.md +213 -0
  158. package/skills/tutorial/lessons/14-fork-setup-and-sync.md +312 -0
  159. package/skills/tutorial/lessons/15-living-examples-system.md +221 -0
  160. package/skills/tutorial/tracks/accelerated/README.md +134 -0
  161. package/skills/tutorial/tracks/accelerated/assessment/checkpoint-1.md +161 -0
  162. package/skills/tutorial/tracks/accelerated/assessment/checkpoint-2.md +175 -0
  163. package/skills/tutorial/tracks/accelerated/day-1-core-concepts.md +234 -0
  164. package/skills/tutorial/tracks/accelerated/day-2-essential-commands.md +270 -0
  165. package/skills/tutorial/tracks/accelerated/day-3-workflow-mastery.md +305 -0
  166. package/skills/tutorial/tracks/accelerated/day-4-compliance-zoho.md +304 -0
  167. package/skills/tutorial/tracks/accelerated/day-5-hooks-skills.md +344 -0
  168. package/skills/tutorial/tracks/accelerated/day-6-client-testing.md +386 -0
  169. package/skills/tutorial/tracks/accelerated/day-7-graduation.md +369 -0
  170. package/skills/zoho-patterns/CHANGELOG.md +108 -0
  171. package/skills/zoho-patterns/SKILL.md +446 -0
  172. package/skills/zoho-patterns/analytics/dashboard-patterns.md +352 -0
  173. package/skills/zoho-patterns/analytics/zoho-to-bigquery-pipeline.md +427 -0
  174. package/skills/zoho-patterns/catalyst/appsail-deployment.md +349 -0
  175. package/skills/zoho-patterns/catalyst/context-close-patterns.md +354 -0
  176. package/skills/zoho-patterns/catalyst/cron-batch-processing.md +374 -0
  177. package/skills/zoho-patterns/catalyst/function-patterns.md +439 -0
  178. package/skills/zoho-patterns/creator/form-design.md +304 -0
  179. package/skills/zoho-patterns/creator/publish-api-patterns.md +313 -0
  180. package/skills/zoho-patterns/creator/widget-integration.md +306 -0
  181. package/skills/zoho-patterns/creator/workflow-automation.md +253 -0
  182. package/skills/zoho-patterns/deluge/api-patterns.md +468 -0
  183. package/skills/zoho-patterns/deluge/batch-processing.md +403 -0
  184. package/skills/zoho-patterns/deluge/cross-app-integration.md +356 -0
  185. package/skills/zoho-patterns/deluge/error-handling.md +423 -0
  186. package/skills/zoho-patterns/deluge/syntax-reference.md +65 -0
  187. package/skills/zoho-patterns/integration/cors-proxy-architecture.md +426 -0
  188. package/skills/zoho-patterns/integration/crm-books-native-sync.md +277 -0
  189. package/skills/zoho-patterns/integration/oauth-token-management.md +461 -0
  190. package/skills/zoho-patterns/integration/zoho-flow-patterns.md +334 -0
package/skills/gcp-data-engineering/dataflow/pipeline-patterns.md
@@ -0,0 +1,444 @@
# Dataflow Pipeline Patterns

> Apache Beam pipeline templates for batch and streaming workloads in CloudStream's GCP data platform.

## Batch Pipeline Templates

### TextIO: CSV/JSON File Processing

```python
# batch_csv_to_bigquery.py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigquery import WriteToBigQuery, BigQueryDisposition
import csv
import io
from datetime import datetime

class ParseCSV(beam.DoFn):
    """Parse CSV rows into dictionaries."""

    def __init__(self, headers):
        self.headers = headers

    def process(self, element):
        reader = csv.reader(io.StringIO(element))
        for row in reader:
            if len(row) == len(self.headers):
                record = dict(zip(self.headers, row))
                record['_ingestion_timestamp'] = datetime.utcnow().isoformat()
                record['_source_system'] = 'csv_import'
                yield record

def run_csv_pipeline(input_path, output_table, headers):
    options = PipelineOptions(
        runner='DataflowRunner',
        project='cloudstream-prod',
        region='us-central1',
        temp_location='gs://cloudstream-dataflow-temp/tmp',
        staging_location='gs://cloudstream-dataflow-temp/staging',
        max_num_workers=10,
        machine_type='n1-standard-4',
    )

    with beam.Pipeline(options=options) as p:
        (
            p
            | 'ReadCSV' >> beam.io.ReadFromText(input_path, skip_header_lines=1)
            | 'ParseRows' >> beam.ParDo(ParseCSV(headers))
            | 'WriteBQ' >> WriteToBigQuery(
                table=output_table,
                write_disposition=BigQueryDisposition.WRITE_APPEND,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                custom_gcs_temp_location='gs://cloudstream-dataflow-temp/bq-tmp'
            )
        )
```
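
A usage sketch; the bucket path, destination table, and header list here are illustrative, not part of the package:

```python
# Hypothetical invocation -- adjust paths and headers to the actual source file
run_csv_pipeline(
    input_path='gs://cloudstream-landing/csv/deals/*.csv',
    output_table='cloudstream-prod:bronze.csv_deals',
    headers=['record_id', 'deal_name', 'amount', 'stage', 'created_time'],
)
```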

### BigQueryIO: Table-to-Table Transforms

```python
# bigquery_transform_pipeline.py
import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery
from datetime import datetime

def enrich_record(record):
    """Add computed fields to deal records."""
    record['deal_age_days'] = (
        datetime.now() - datetime.fromisoformat(record['created_time'])
    ).days if record.get('created_time') else None
    record['amount_bucket'] = (
        'small' if (record.get('amount') or 0) < 10000
        else 'medium' if (record.get('amount') or 0) < 100000
        else 'large'
    )
    return record

def run():
    # get_pipeline_options() is defined in the Pipeline Options section below
    with beam.Pipeline(options=get_pipeline_options()) as p:
        (
            p
            | 'ReadSilver' >> ReadFromBigQuery(
                query="""
                    SELECT record_id, deal_name, amount, stage, created_time, owner_id
                    FROM `cloudstream-prod.silver.zoho_deals`
                    WHERE _ingestion_date = CURRENT_DATE()
                """,
                use_standard_sql=True
            )
            | 'Enrich' >> beam.Map(enrich_record)
            | 'WriteGold' >> WriteToBigQuery(
                table='cloudstream-prod:gold.enriched_deals',
                write_disposition='WRITE_TRUNCATE',
                schema='SCHEMA_AUTODETECT'
            )
        )
```

### AvroIO: Schema-Aware Processing

```python
# avro_pipeline.py
import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro, WriteToAvro

AVRO_SCHEMA = {
    "type": "record",
    "name": "ZohoDeal",
    "fields": [
        {"name": "record_id", "type": "string"},
        {"name": "deal_name", "type": ["null", "string"]},
        {"name": "amount", "type": ["null", "double"]},
        {"name": "stage", "type": "string"},
        {"name": "close_date", "type": ["null", "string"]},
    ]
}

def run():
    with beam.Pipeline(options=get_pipeline_options()) as p:
        (
            p
            | 'ReadAvro' >> ReadFromAvro('gs://cloudstream-landing/avro/deals/*.avro')
            | 'FilterActive' >> beam.Filter(lambda r: r['stage'] != 'Closed Lost')
            | 'WriteAvro' >> WriteToAvro(
                'gs://cloudstream-processed/deals/active',
                schema=AVRO_SCHEMA,
                file_name_suffix='.avro'
            )
        )
```

## Streaming Pipeline Templates

### PubSub to BigQuery (Real-Time Ingestion)

```python
# streaming_pubsub_to_bq.py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigquery import WriteToBigQuery
from datetime import datetime
import json

class ParsePubSubMessage(beam.DoFn):
    def process(self, element):
        message = json.loads(element.decode('utf-8'))
        message['_ingestion_timestamp'] = datetime.utcnow().isoformat()
        message['_ingestion_date'] = datetime.utcnow().strftime('%Y-%m-%d')
        yield message

def run():
    options = PipelineOptions(
        streaming=True,
        project='cloudstream-prod',
        region='us-central1',
        temp_location='gs://cloudstream-dataflow-temp/tmp',
        # Streaming-specific options
        enable_streaming_engine=True,  # Reduces worker resource usage
        experiments=['enable_streaming_engine'],
    )

    with beam.Pipeline(options=options) as p:
        (
            p
            | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription='projects/cloudstream-prod/subscriptions/zoho-events-sub'
            )
            | 'Parse' >> beam.ParDo(ParsePubSubMessage())
            | 'WriteBQ' >> WriteToBigQuery(
                table='cloudstream-prod:bronze.zoho_realtime_events',
                write_disposition='WRITE_APPEND',
                insert_retry_strategy='RETRY_ON_TRANSIENT_ERROR',
                # Use streaming inserts for low latency
                method='STREAMING_INSERTS'
            )
        )
```

### PubSub to GCS (Raw Archive)

```python
# streaming_pubsub_to_gcs.py
import apache_beam as beam
from apache_beam.io.fileio import WriteToFiles, FileSink, destination_prefix_naming
from apache_beam.transforms.window import FixedWindows
import json

class JsonSink(FileSink):
    """Newline-delimited JSON sink for WriteToFiles."""

    def open(self, fh):
        self._fh = fh

    def write(self, record):
        self._fh.write(json.dumps(record).encode('utf-8') + b'\n')

    def flush(self):
        self._fh.flush()

def run():
    with beam.Pipeline(options=streaming_options()) as p:
        (
            p
            | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic='projects/cloudstream-prod/topics/zoho-webhooks'
            )
            | 'Parse' >> beam.Map(lambda x: json.loads(x.decode('utf-8')))
            | 'Window5Min' >> beam.WindowInto(FixedWindows(300))  # 5-minute windows
            | 'WriteGCS' >> WriteToFiles(
                path='gs://cloudstream-landing/streaming/',
                sink=JsonSink(),
                file_naming=destination_prefix_naming(suffix='.json')
            )
        )
```

## Windowing Strategies

### Fixed Windows (Regular Intervals)

```python
# Aggregate every 5 minutes
windowed = (
    events
    | 'FixedWindow' >> beam.WindowInto(
        beam.window.FixedWindows(300),  # 5 minutes = 300 seconds
        trigger=beam.trigger.AfterWatermark(
            late=beam.trigger.AfterCount(100)  # Handle late data
        ),
        accumulation_mode=beam.trigger.AccumulationMode.ACCUMULATING
    )
    | 'CountPerWindow' >> beam.CombineGlobally(
        beam.combiners.CountCombineFn()
    ).without_defaults()  # Required when combining under non-global windows
)
```

### Sliding Windows (Overlapping)

```python
# 1-hour window, sliding every 5 minutes (for moving averages)
sliding = (
    events
    | 'SlidingWindow' >> beam.WindowInto(
        beam.window.SlidingWindows(
            size=3600,  # 1-hour window
            period=300  # Slide every 5 minutes
        )
    )
    | 'AvgAmount' >> beam.CombineGlobally(
        beam.combiners.MeanCombineFn()
    ).without_defaults()
)
```

### Session Windows (Activity-Based)

```python
# Group events by user session (10-minute gap = new session)
sessions = (
    events
    | 'KeyByUser' >> beam.Map(lambda e: (e['user_id'], e))
    | 'SessionWindow' >> beam.WindowInto(
        beam.window.Sessions(600)  # 10-minute gap
    )
    | 'GroupSessions' >> beam.GroupByKey()
    | 'AnalyzeSession' >> beam.ParDo(SessionAnalyzer())
)
```
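
`SessionAnalyzer` is left undefined above; a minimal sketch follows. The `timestamp` field is an assumption about the event payload, not part of the package:

```python
class SessionAnalyzer(beam.DoFn):
    """Summarize one user session: event count plus start/end times.

    Assumes each event dict carries a 'timestamp' field (hypothetical).
    """

    def process(self, element):
        user_id, events = element      # output of GroupByKey: (key, iterable)
        events = list(events)
        timestamps = sorted(e['timestamp'] for e in events)
        yield {
            'user_id': user_id,
            'event_count': len(events),
            'session_start': timestamps[0],
            'session_end': timestamps[-1],
        }
```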

## Watermarks and Late Data

```python
# Handle late-arriving data with allowed lateness
late_handling = (
    events
    | 'WindowWithLateness' >> beam.WindowInto(
        beam.window.FixedWindows(300),
        trigger=beam.trigger.AfterWatermark(
            early=beam.trigger.AfterProcessingTime(60),  # Early results every 60s
            late=beam.trigger.AfterCount(1)  # Fire on each late element
        ),
        allowed_lateness=beam.window.Duration(seconds=86400),  # 24h late tolerance
        accumulation_mode=beam.trigger.AccumulationMode.ACCUMULATING
    )
)
```

> **WARNING**: Setting `allowed_lateness` too high increases memory usage. For Zoho webhook data, 1 hour is typically sufficient. Use 24 hours only for batch replay scenarios.
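
Applied to the webhook case, that guidance is just a smaller `Duration` (a sketch reusing the `events` collection from above):

```python
# 1-hour late tolerance for Zoho webhook streams, per the warning above
webhook_windows = events | 'WindowWebhooks' >> beam.WindowInto(
    beam.window.FixedWindows(300),
    allowed_lateness=beam.window.Duration(seconds=3600),
)
```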

## Side Inputs for Enrichment

```python
# Enrich streaming deals with customer dimension data
import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery
import json

class EnrichWithCustomer(beam.DoFn):
    def process(self, deal, customers):
        customer = customers.get(deal['customer_id'], {})
        deal['customer_name'] = customer.get('account_name', 'Unknown')
        deal['customer_industry'] = customer.get('industry', 'Unknown')
        yield deal

def run():
    with beam.Pipeline(options=options) as p:
        # Side input: customer dimension (read once at pipeline start)
        customers = (
            p
            | 'ReadCustomers' >> ReadFromBigQuery(
                query='SELECT record_id, account_name, industry FROM silver.zoho_accounts'
            )
            | 'KeyById' >> beam.Map(lambda c: (c['record_id'], c))
            | 'ToDict' >> beam.combiners.ToDict()
        )

        # Main input: streaming deals
        deals = (
            p
            | 'ReadDeals' >> beam.io.ReadFromPubSub(topic=DEALS_TOPIC)
            | 'ParseDeal' >> beam.Map(json.loads)
        )

        # Enrich with side input
        enriched = (
            deals
            | 'Enrich' >> beam.ParDo(
                EnrichWithCustomer(),
                customers=beam.pvalue.AsSingleton(customers)
            )
        )
```

## Pipeline Options

```python
# Common pipeline options for CloudStream
from datetime import datetime
from apache_beam.options.pipeline_options import (
    PipelineOptions, GoogleCloudOptions, StandardOptions, WorkerOptions
)

def get_pipeline_options(streaming=False, job_name='cloudstream-pipeline'):
    options = PipelineOptions()

    google_options = options.view_as(GoogleCloudOptions)
    google_options.project = 'cloudstream-prod'
    google_options.region = 'us-central1'
    google_options.temp_location = 'gs://cloudstream-dataflow-temp/tmp'
    google_options.staging_location = 'gs://cloudstream-dataflow-temp/staging'
    google_options.job_name = f'{job_name}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    google_options.service_account_email = 'dataflow-sa@cloudstream-prod.iam.gserviceaccount.com'

    worker_options = options.view_as(WorkerOptions)
    worker_options.machine_type = 'n1-standard-4'
    worker_options.max_num_workers = 5 if streaming else 10
    worker_options.disk_size_gb = 50
    worker_options.network = 'cloudstream-vpc'
    worker_options.subnetwork = 'regions/us-central1/subnetworks/dataflow-subnet'

    if streaming:
        options.view_as(StandardOptions).streaming = True

    return options
```

## Autoscaling Configuration

```python
# Autoscaling options (extends get_pipeline_options() above)
from apache_beam.options.pipeline_options import DebugOptions

worker_options.autoscaling_algorithm = 'THROUGHPUT_BASED'
worker_options.num_workers = 2        # Initial workers
worker_options.max_num_workers = 20   # Max scale-out

# For cost-sensitive batch jobs:
worker_options.use_public_ips = False  # Reduces network cost

# `experiments` is defined on DebugOptions, not WorkerOptions
options.view_as(DebugOptions).experiments = [
    'shuffle_mode=service',  # Use Dataflow Shuffle (faster, less disk)
    'use_runner_v2',         # Dataflow Runner v2 (better autoscaling)
]
```

## Flex Templates for Parameterized Pipelines

```python
# flex_template_pipeline.py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery

class CloudStreamOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--source_table', required=True, help='Source BQ table')
        parser.add_argument('--dest_table', required=True, help='Destination BQ table')
        parser.add_argument('--processing_date', help='Date to process (YYYY-MM-DD)')
        parser.add_argument('--mode', default='incremental', choices=['full', 'incremental'])

def run():
    options = PipelineOptions().view_as(CloudStreamOptions)

    if options.mode == 'incremental':
        query = f"""
            SELECT * FROM `{options.source_table}`
            WHERE _ingestion_date = '{options.processing_date}'
        """
    else:
        query = f"SELECT * FROM `{options.source_table}`"

    with beam.Pipeline(options=options) as p:
        (
            p
            | 'Read' >> ReadFromBigQuery(query=query, use_standard_sql=True)
            | 'Transform' >> beam.ParDo(TransformDoFn())  # TransformDoFn: user-defined
            | 'Write' >> WriteToBigQuery(table=options.dest_table)
        )
```

```json
// flex_template_metadata.json
{
  "name": "CloudStream Bronze-to-Silver",
  "description": "Parameterized pipeline for medallion layer transitions",
  "parameters": [
    {"name": "source_table", "label": "Source Table", "helpText": "Fully qualified BQ table"},
    {"name": "dest_table", "label": "Destination Table", "helpText": "Fully qualified BQ table"},
    {"name": "processing_date", "label": "Processing Date", "helpText": "YYYY-MM-DD format"},
    {"name": "mode", "label": "Mode", "helpText": "full or incremental", "isOptional": true}
  ]
}
```

```bash
# Build and deploy Flex Template
gcloud dataflow flex-template build \
  gs://cloudstream-dataflow-templates/bronze-to-silver.json \
  --image-gcr-path gcr.io/cloudstream-prod/dataflow/bronze-to-silver:latest \
  --sdk-language PYTHON \
  --metadata-file flex_template_metadata.json

# Launch from template. source_table uses the dot form so it can be
# interpolated into a standard SQL query; dest_table keeps the
# project:dataset.table form accepted by WriteToBigQuery.
gcloud dataflow flex-template run "bronze-to-silver-$(date +%Y%m%d)" \
  --template-file-gcs-location gs://cloudstream-dataflow-templates/bronze-to-silver.json \
  --region us-central1 \
  --parameters source_table=cloudstream-prod.bronze.zoho_deals \
  --parameters dest_table=cloudstream-prod:silver.zoho_deals \
  --parameters processing_date=2024-01-15 \
  --parameters mode=incremental
```

## Best Practices

1. **Use Flex Templates** for reusable, parameterized pipelines
2. **Enable Streaming Engine** for streaming jobs (reduces cost 30-50%)
3. **Use Dataflow Shuffle** for batch jobs (`shuffle_mode=service`)
4. **Set appropriate machine types** - n1-standard-4 for most, n1-highmem for memory-intensive
5. **Monitor with Cloud Monitoring** - Track throughput, watermark lag, backlog
6. **Use VPC-native networking** - No public IPs reduces cost and improves security
7. **Test locally first** - Use `DirectRunner` before deploying to `DataflowRunner` (see the sketch below)
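
A minimal local smoke test of the CSV parser from the TextIO section; the sample path and header list are illustrative:

```python
# Local run with DirectRunner; ParseCSV is defined in the TextIO section above.
# The sample file path and headers are hypothetical.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def test_parse_csv_locally():
    options = PipelineOptions(runner='DirectRunner')
    with beam.Pipeline(options=options) as p:
        (
            p
            | 'ReadSample' >> beam.io.ReadFromText(
                'tests/data/deals_sample.csv', skip_header_lines=1
            )
            | 'ParseRows' >> beam.ParDo(ParseCSV(['record_id', 'deal_name', 'amount', 'stage']))
            | 'Print' >> beam.Map(print)  # Inspect records instead of writing to BigQuery
        )
```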

> **COST WARNING**: Streaming pipelines run 24/7. A single n1-standard-4 worker costs ~$100/month. Design pipelines to scale down to 1 worker during low-traffic periods.
package/skills/gcp-data-engineering/dbt/model-organization.md
@@ -0,0 +1,63 @@
# dbt Model Organization

## Directory Structure

```
models/
├── staging/                    # Bronze → Silver (1:1 with source)
│   ├── zoho/
│   │   ├── stg_zoho__contacts.sql
│   │   ├── stg_zoho__deals.sql
│   │   └── _zoho__sources.yml
│   └── gcs/
│       └── stg_gcs__raw_events.sql
├── intermediate/               # Silver transformations
│   ├── int_contacts_enriched.sql
│   └── int_deals_with_contacts.sql
└── marts/                      # Gold layer (business-ready)
    ├── finance/
    │   ├── fct_revenue.sql
    │   └── dim_customers.sql
    └── operations/
        ├── fct_pipeline.sql
        └── dim_sales_reps.sql
```
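
Layer-level materializations can be set once in `dbt_project.yml` rather than per model. A sketch; the project key `cloudstream` is an assumption:

```yaml
# dbt_project.yml (excerpt; project name is assumed)
models:
  cloudstream:
    staging:
      +materialized: view        # Cheap to rebuild, always current
    intermediate:
      +materialized: ephemeral   # Inlined into downstream models
    marts:
      +materialized: table       # Business-ready, queried directly
```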

## Staging Model Pattern

```sql
-- models/staging/zoho/stg_zoho__contacts.sql
WITH source AS (
    SELECT * FROM {{ source('zoho', 'contacts_raw') }}
),

cleaned AS (
    SELECT
        id AS contact_id,
        TRIM(LOWER(email)) AS email,
        INITCAP(first_name) AS first_name,
        INITCAP(last_name) AS last_name,
        PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%S', created_time) AS created_at,
        PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%S', modified_time) AS modified_at
    FROM source
    WHERE id IS NOT NULL
)

SELECT * FROM cleaned
```
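
The intermediate layer builds on staging models via `ref()`. A minimal sketch of `int_deals_with_contacts.sql` from the tree above; the join key and column names are assumptions:

```sql
-- models/intermediate/int_deals_with_contacts.sql
-- Columns and join key are hypothetical; adjust to the staging schemas.
SELECT
    d.deal_id,
    d.amount,
    d.stage,
    c.contact_id,
    c.email
FROM {{ ref('stg_zoho__deals') }} AS d
LEFT JOIN {{ ref('stg_zoho__contacts') }} AS c
    ON d.contact_id = c.contact_id
```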

## Testing Pattern

```yaml
# models/staging/zoho/_zoho__sources.yml
version: 2
sources:
  - name: zoho
    tables:
      - name: contacts_raw
        columns:
          - name: id
            tests:
              - not_null
              - unique
          - name: email
            tests:
              - not_null
```
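
Source tests guard the raw input; the same pattern applies to the cleaned staging model. A sketch; the file name and column list are assumptions:

```yaml
# models/staging/zoho/_zoho__models.yml (hypothetical file)
version: 2
models:
  - name: stg_zoho__contacts
    columns:
      - name: contact_id
        tests:
          - not_null
          - unique
      - name: email
        tests:
          - not_null
```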