@cloudstreamsoftware/claude-tools 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +152 -37
- package/agents/INDEX.md +183 -0
- package/agents/architect.md +247 -0
- package/agents/build-error-resolver.md +555 -0
- package/agents/catalyst-deployer.md +132 -0
- package/agents/code-reviewer.md +121 -0
- package/agents/compliance-auditor.md +148 -0
- package/agents/creator-architect.md +395 -0
- package/agents/deluge-reviewer.md +98 -0
- package/agents/doc-updater.md +471 -0
- package/agents/e2e-runner.md +711 -0
- package/agents/planner.md +122 -0
- package/agents/refactor-cleaner.md +309 -0
- package/agents/security-reviewer.md +582 -0
- package/agents/tdd-guide.md +302 -0
- package/bin/cloudstream-setup.js +16 -6
- package/config/versions.json +63 -0
- package/dist/hooks/hooks.json +209 -0
- package/dist/index.js +47 -0
- package/dist/lib/asset-value.js +609 -0
- package/dist/lib/client-manager.js +300 -0
- package/dist/lib/command-matcher.js +242 -0
- package/dist/lib/cross-session-patterns.js +754 -0
- package/dist/lib/intent-classifier.js +1075 -0
- package/dist/lib/package-manager.js +374 -0
- package/dist/lib/recommendation-engine.js +597 -0
- package/dist/lib/session-memory.js +489 -0
- package/dist/lib/skill-effectiveness.js +486 -0
- package/dist/lib/skill-matcher.js +595 -0
- package/dist/lib/tutorial-metrics.js +242 -0
- package/dist/lib/tutorial-progress.js +209 -0
- package/dist/lib/tutorial-renderer.js +431 -0
- package/dist/lib/utils.js +380 -0
- package/dist/lib/verify-formatter.js +143 -0
- package/dist/lib/workflow-state.js +249 -0
- package/hooks/hooks.json +209 -0
- package/package.json +5 -1
- package/scripts/aggregate-sessions.js +290 -0
- package/scripts/branch-name-validator.js +291 -0
- package/scripts/build.js +101 -0
- package/scripts/commands/client-switch.js +231 -0
- package/scripts/deprecate-skill.js +610 -0
- package/scripts/diagnose.js +324 -0
- package/scripts/doc-freshness.js +168 -0
- package/scripts/generate-weekly-digest.js +393 -0
- package/scripts/health-check.js +270 -0
- package/scripts/hooks/credential-check.js +101 -0
- package/scripts/hooks/evaluate-session.js +81 -0
- package/scripts/hooks/pre-compact.js +66 -0
- package/scripts/hooks/prompt-analyzer.js +276 -0
- package/scripts/hooks/prompt-router.js +422 -0
- package/scripts/hooks/quality-gate-enforcer.js +371 -0
- package/scripts/hooks/session-end.js +156 -0
- package/scripts/hooks/session-start.js +195 -0
- package/scripts/hooks/skill-injector.js +333 -0
- package/scripts/hooks/suggest-compact.js +58 -0
- package/scripts/lib/asset-value.js +609 -0
- package/scripts/lib/client-manager.js +300 -0
- package/scripts/lib/command-matcher.js +242 -0
- package/scripts/lib/cross-session-patterns.js +754 -0
- package/scripts/lib/intent-classifier.js +1075 -0
- package/scripts/lib/package-manager.js +374 -0
- package/scripts/lib/recommendation-engine.js +597 -0
- package/scripts/lib/session-memory.js +489 -0
- package/scripts/lib/skill-effectiveness.js +486 -0
- package/scripts/lib/skill-matcher.js +595 -0
- package/scripts/lib/tutorial-metrics.js +242 -0
- package/scripts/lib/tutorial-progress.js +209 -0
- package/scripts/lib/tutorial-renderer.js +431 -0
- package/scripts/lib/utils.js +380 -0
- package/scripts/lib/verify-formatter.js +143 -0
- package/scripts/lib/workflow-state.js +249 -0
- package/scripts/onboard.js +363 -0
- package/scripts/quarterly-report.js +692 -0
- package/scripts/setup-package-manager.js +204 -0
- package/scripts/sync-upstream.js +391 -0
- package/scripts/test.js +108 -0
- package/scripts/tutorial-runner.js +351 -0
- package/scripts/validate-all.js +201 -0
- package/scripts/verifiers/agents.js +245 -0
- package/scripts/verifiers/config.js +186 -0
- package/scripts/verifiers/environment.js +123 -0
- package/scripts/verifiers/hooks.js +188 -0
- package/scripts/verifiers/index.js +38 -0
- package/scripts/verifiers/persistence.js +140 -0
- package/scripts/verifiers/plugin.js +215 -0
- package/scripts/verifiers/skills.js +209 -0
- package/scripts/verify-setup.js +164 -0
- package/skills/INDEX.md +157 -0
- package/skills/backend-patterns/SKILL.md +586 -0
- package/skills/backend-patterns/catalyst-patterns.md +128 -0
- package/skills/bigquery-patterns/SKILL.md +27 -0
- package/skills/bigquery-patterns/performance-optimization.md +518 -0
- package/skills/bigquery-patterns/query-patterns.md +372 -0
- package/skills/bigquery-patterns/schema-design.md +78 -0
- package/skills/cloudstream-project-template/SKILL.md +20 -0
- package/skills/cloudstream-project-template/structure.md +65 -0
- package/skills/coding-standards/SKILL.md +524 -0
- package/skills/coding-standards/deluge-standards.md +83 -0
- package/skills/compliance-patterns/SKILL.md +28 -0
- package/skills/compliance-patterns/hipaa/audit-requirements.md +251 -0
- package/skills/compliance-patterns/hipaa/baa-process.md +298 -0
- package/skills/compliance-patterns/hipaa/data-archival-strategy.md +387 -0
- package/skills/compliance-patterns/hipaa/phi-handling.md +52 -0
- package/skills/compliance-patterns/pci-dss/saq-a-requirements.md +307 -0
- package/skills/compliance-patterns/pci-dss/tokenization-patterns.md +382 -0
- package/skills/compliance-patterns/pci-dss/zoho-checkout-patterns.md +56 -0
- package/skills/compliance-patterns/soc2/access-controls.md +344 -0
- package/skills/compliance-patterns/soc2/audit-logging.md +458 -0
- package/skills/compliance-patterns/soc2/change-management.md +403 -0
- package/skills/compliance-patterns/soc2/deluge-execution-logging.md +407 -0
- package/skills/consultancy-workflows/SKILL.md +19 -0
- package/skills/consultancy-workflows/client-isolation.md +21 -0
- package/skills/consultancy-workflows/documentation-automation.md +454 -0
- package/skills/consultancy-workflows/handoff-procedures.md +257 -0
- package/skills/consultancy-workflows/knowledge-capture.md +513 -0
- package/skills/consultancy-workflows/time-tracking.md +26 -0
- package/skills/continuous-learning/SKILL.md +84 -0
- package/skills/continuous-learning/config.json +18 -0
- package/skills/continuous-learning/evaluate-session.sh +60 -0
- package/skills/continuous-learning-v2/SKILL.md +126 -0
- package/skills/continuous-learning-v2/config.json +61 -0
- package/skills/frontend-patterns/SKILL.md +635 -0
- package/skills/frontend-patterns/zoho-widget-patterns.md +103 -0
- package/skills/gcp-data-engineering/SKILL.md +36 -0
- package/skills/gcp-data-engineering/bigquery/performance-optimization.md +337 -0
- package/skills/gcp-data-engineering/dataflow/error-handling.md +496 -0
- package/skills/gcp-data-engineering/dataflow/pipeline-patterns.md +444 -0
- package/skills/gcp-data-engineering/dbt/model-organization.md +63 -0
- package/skills/gcp-data-engineering/dbt/testing-patterns.md +503 -0
- package/skills/gcp-data-engineering/medallion-architecture/bronze-layer.md +60 -0
- package/skills/gcp-data-engineering/medallion-architecture/gold-layer.md +311 -0
- package/skills/gcp-data-engineering/medallion-architecture/layer-transitions.md +517 -0
- package/skills/gcp-data-engineering/medallion-architecture/silver-layer.md +305 -0
- package/skills/gcp-data-engineering/zoho-to-gcp/data-extraction.md +543 -0
- package/skills/gcp-data-engineering/zoho-to-gcp/real-time-vs-batch.md +337 -0
- package/skills/security-review/SKILL.md +498 -0
- package/skills/security-review/compliance-checklist.md +53 -0
- package/skills/strategic-compact/SKILL.md +67 -0
- package/skills/tdd-workflow/SKILL.md +413 -0
- package/skills/tdd-workflow/zoho-testing.md +124 -0
- package/skills/tutorial/SKILL.md +249 -0
- package/skills/tutorial/docs/ACCESSIBILITY.md +169 -0
- package/skills/tutorial/lessons/00-philosophy-and-workflow.md +198 -0
- package/skills/tutorial/lessons/01-basics.md +81 -0
- package/skills/tutorial/lessons/02-training.md +86 -0
- package/skills/tutorial/lessons/03-commands.md +109 -0
- package/skills/tutorial/lessons/04-workflows.md +115 -0
- package/skills/tutorial/lessons/05-compliance.md +116 -0
- package/skills/tutorial/lessons/06-zoho.md +121 -0
- package/skills/tutorial/lessons/07-hooks-system.md +277 -0
- package/skills/tutorial/lessons/08-mcp-servers.md +316 -0
- package/skills/tutorial/lessons/09-client-management.md +215 -0
- package/skills/tutorial/lessons/10-testing-e2e.md +260 -0
- package/skills/tutorial/lessons/11-skills-deep-dive.md +272 -0
- package/skills/tutorial/lessons/12-rules-system.md +326 -0
- package/skills/tutorial/lessons/13-golden-standard-graduation.md +213 -0
- package/skills/tutorial/lessons/14-fork-setup-and-sync.md +312 -0
- package/skills/tutorial/lessons/15-living-examples-system.md +221 -0
- package/skills/tutorial/tracks/accelerated/README.md +134 -0
- package/skills/tutorial/tracks/accelerated/assessment/checkpoint-1.md +161 -0
- package/skills/tutorial/tracks/accelerated/assessment/checkpoint-2.md +175 -0
- package/skills/tutorial/tracks/accelerated/day-1-core-concepts.md +234 -0
- package/skills/tutorial/tracks/accelerated/day-2-essential-commands.md +270 -0
- package/skills/tutorial/tracks/accelerated/day-3-workflow-mastery.md +305 -0
- package/skills/tutorial/tracks/accelerated/day-4-compliance-zoho.md +304 -0
- package/skills/tutorial/tracks/accelerated/day-5-hooks-skills.md +344 -0
- package/skills/tutorial/tracks/accelerated/day-6-client-testing.md +386 -0
- package/skills/tutorial/tracks/accelerated/day-7-graduation.md +369 -0
- package/skills/zoho-patterns/CHANGELOG.md +108 -0
- package/skills/zoho-patterns/SKILL.md +446 -0
- package/skills/zoho-patterns/analytics/dashboard-patterns.md +352 -0
- package/skills/zoho-patterns/analytics/zoho-to-bigquery-pipeline.md +427 -0
- package/skills/zoho-patterns/catalyst/appsail-deployment.md +349 -0
- package/skills/zoho-patterns/catalyst/context-close-patterns.md +354 -0
- package/skills/zoho-patterns/catalyst/cron-batch-processing.md +374 -0
- package/skills/zoho-patterns/catalyst/function-patterns.md +439 -0
- package/skills/zoho-patterns/creator/form-design.md +304 -0
- package/skills/zoho-patterns/creator/publish-api-patterns.md +313 -0
- package/skills/zoho-patterns/creator/widget-integration.md +306 -0
- package/skills/zoho-patterns/creator/workflow-automation.md +253 -0
- package/skills/zoho-patterns/deluge/api-patterns.md +468 -0
- package/skills/zoho-patterns/deluge/batch-processing.md +403 -0
- package/skills/zoho-patterns/deluge/cross-app-integration.md +356 -0
- package/skills/zoho-patterns/deluge/error-handling.md +423 -0
- package/skills/zoho-patterns/deluge/syntax-reference.md +65 -0
- package/skills/zoho-patterns/integration/cors-proxy-architecture.md +426 -0
- package/skills/zoho-patterns/integration/crm-books-native-sync.md +277 -0
- package/skills/zoho-patterns/integration/oauth-token-management.md +461 -0
- package/skills/zoho-patterns/integration/zoho-flow-patterns.md +334 -0

package/skills/gcp-data-engineering/dataflow/pipeline-patterns.md

@@ -0,0 +1,444 @@

# Dataflow Pipeline Patterns

> Apache Beam pipeline templates for batch and streaming workloads in CloudStream's GCP data platform.

## Batch Pipeline Templates

### TextIO: CSV/JSON File Processing

```python
# batch_csv_to_bigquery.py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
from apache_beam.io.gcp.bigquery import WriteToBigQuery, BigQueryDisposition
import csv
import io
from datetime import datetime

class ParseCSV(beam.DoFn):
    """Parse CSV rows into dictionaries."""

    def __init__(self, headers):
        self.headers = headers

    def process(self, element):
        reader = csv.reader(io.StringIO(element))
        for row in reader:
            if len(row) == len(self.headers):
                record = dict(zip(self.headers, row))
                record['_ingestion_timestamp'] = datetime.utcnow().isoformat()
                record['_source_system'] = 'csv_import'
                yield record

def run_csv_pipeline(input_path, output_table, headers):
    options = PipelineOptions(
        runner='DataflowRunner',
        project='cloudstream-prod',
        region='us-central1',
        temp_location='gs://cloudstream-dataflow-temp/tmp',
        staging_location='gs://cloudstream-dataflow-temp/staging',
        max_num_workers=10,
        machine_type='n1-standard-4',
    )

    with beam.Pipeline(options=options) as p:
        (
            p
            | 'ReadCSV' >> beam.io.ReadFromText(input_path, skip_header_lines=1)
            | 'ParseRows' >> beam.ParDo(ParseCSV(headers))
            | 'WriteBQ' >> WriteToBigQuery(
                table=output_table,
                write_disposition=BigQueryDisposition.WRITE_APPEND,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                custom_gcs_temp_location='gs://cloudstream-dataflow-temp/bq-tmp'
            )
        )
```
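
A minimal invocation sketch of `run_csv_pipeline` for reference; the landing path, target table, and header list below are illustrative assumptions rather than values shipped in the package:

```python
# Hypothetical invocation; adjust the path, table, and headers to the actual feed.
run_csv_pipeline(
    input_path='gs://cloudstream-landing/csv/contacts/*.csv',   # assumed landing path
    output_table='cloudstream-prod:bronze.csv_contacts',        # assumed bronze table
    headers=['id', 'email', 'first_name', 'last_name', 'created_time'],
)
```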

### BigQueryIO: Table-to-Table Transforms

```python
# bigquery_transform_pipeline.py
import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery
from datetime import datetime

def enrich_record(record):
    """Add computed fields to deal records."""
    record['deal_age_days'] = (
        datetime.now() - datetime.fromisoformat(record['created_time'])
    ).days if record.get('created_time') else None
    record['amount_bucket'] = (
        'small' if (record.get('amount') or 0) < 10000
        else 'medium' if (record.get('amount') or 0) < 100000
        else 'large'
    )
    return record

def run():
    with beam.Pipeline(options=get_pipeline_options()) as p:
        (
            p
            | 'ReadSilver' >> ReadFromBigQuery(
                query="""
                    SELECT record_id, deal_name, amount, stage, created_time, owner_id
                    FROM `cloudstream-prod.silver.zoho_deals`
                    WHERE _ingestion_date = CURRENT_DATE()
                """,
                use_standard_sql=True
            )
            | 'Enrich' >> beam.Map(enrich_record)
            | 'WriteGold' >> WriteToBigQuery(
                table='cloudstream-prod:gold.enriched_deals',
                write_disposition='WRITE_TRUNCATE',
                schema='SCHEMA_AUTODETECT'
            )
        )
```

### AvroIO: Schema-Aware Processing

```python
# avro_pipeline.py
import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro, WriteToAvro

AVRO_SCHEMA = {
    "type": "record",
    "name": "ZohoDeal",
    "fields": [
        {"name": "record_id", "type": "string"},
        {"name": "deal_name", "type": ["null", "string"]},
        {"name": "amount", "type": ["null", "double"]},
        {"name": "stage", "type": "string"},
        {"name": "close_date", "type": ["null", "string"]},
    ]
}

def run():
    with beam.Pipeline(options=get_pipeline_options()) as p:
        (
            p
            | 'ReadAvro' >> ReadFromAvro('gs://cloudstream-landing/avro/deals/*.avro')
            | 'FilterActive' >> beam.Filter(lambda r: r['stage'] != 'Closed Lost')
            | 'WriteAvro' >> WriteToAvro(
                'gs://cloudstream-processed/deals/active',
                schema=AVRO_SCHEMA,
                file_name_suffix='.avro'
            )
        )
```

## Streaming Pipeline Templates

### PubSub to BigQuery (Real-Time Ingestion)

```python
# streaming_pubsub_to_bq.py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.io.gcp.bigquery import WriteToBigQuery
from datetime import datetime
import json

class ParsePubSubMessage(beam.DoFn):
    def process(self, element):
        message = json.loads(element.decode('utf-8'))
        message['_ingestion_timestamp'] = datetime.utcnow().isoformat()
        message['_ingestion_date'] = datetime.utcnow().strftime('%Y-%m-%d')
        yield message

def run():
    options = PipelineOptions(
        streaming=True,
        project='cloudstream-prod',
        region='us-central1',
        temp_location='gs://cloudstream-dataflow-temp/tmp',
        # Streaming-specific options
        enable_streaming_engine=True,  # Reduces worker resource usage
        experiments=['enable_streaming_engine'],
    )

    with beam.Pipeline(options=options) as p:
        (
            p
            | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription='projects/cloudstream-prod/subscriptions/zoho-events-sub'
            )
            | 'Parse' >> beam.ParDo(ParsePubSubMessage())
            | 'WriteBQ' >> WriteToBigQuery(
                table='cloudstream-prod:bronze.zoho_realtime_events',
                write_disposition='WRITE_APPEND',
                insert_retry_strategy='RETRY_ON_TRANSIENT_ERROR',
                # Use streaming inserts for low latency
                method='STREAMING_INSERTS'
            )
        )
```

### PubSub to GCS (Raw Archive)

```python
# streaming_pubsub_to_gcs.py
import apache_beam as beam
from apache_beam.io.fileio import WriteToFiles, FileSink
from apache_beam.transforms.window import FixedWindows
import json

class JsonSink(FileSink):
    def open(self, fh):
        # Called once per output file; keep the handle for write()
        self._file_handle = fh

    def write(self, record):
        self._file_handle.write(json.dumps(record).encode('utf-8') + b'\n')

    def flush(self):
        pass

def run():
    with beam.Pipeline(options=streaming_options()) as p:
        (
            p
            | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic='projects/cloudstream-prod/topics/zoho-webhooks'
            )
            | 'Parse' >> beam.Map(lambda x: json.loads(x.decode('utf-8')))
            | 'Window5Min' >> beam.WindowInto(FixedWindows(300))  # 5-minute windows
            | 'WriteGCS' >> WriteToFiles(
                path='gs://cloudstream-landing/streaming/',
                sink=JsonSink(),
                file_naming=beam.io.fileio.destination_prefix_naming(suffix='.json')
            )
        )
```

## Windowing Strategies

### Fixed Windows (Regular Intervals)

```python
# Aggregate every 5 minutes
windowed = (
    events
    | 'FixedWindow' >> beam.WindowInto(
        beam.window.FixedWindows(300),  # 5 minutes = 300 seconds
        trigger=beam.trigger.AfterWatermark(
            late=beam.trigger.AfterCount(100)  # Handle late data
        ),
        accumulation_mode=beam.trigger.AccumulationMode.ACCUMULATING
    )
    | 'CountPerWindow' >> beam.CombineGlobally(beam.combiners.CountCombineFn())
)
```

### Sliding Windows (Overlapping)

```python
# 1-hour window, sliding every 5 minutes (for moving averages)
sliding = (
    events
    | 'SlidingWindow' >> beam.WindowInto(
        beam.window.SlidingWindows(
            size=3600,  # 1-hour window
            period=300  # Slide every 5 minutes
        )
    )
    | 'AvgAmount' >> beam.CombineGlobally(
        beam.combiners.MeanCombineFn()
    ).without_defaults()
)
```

### Session Windows (Activity-Based)

```python
# Group events by user session (10-minute gap = new session)
sessions = (
    events
    | 'KeyByUser' >> beam.Map(lambda e: (e['user_id'], e))
    | 'SessionWindow' >> beam.WindowInto(
        beam.window.Sessions(600)  # 10-minute gap
    )
    | 'GroupSessions' >> beam.GroupByKey()
    | 'AnalyzeSession' >> beam.ParDo(SessionAnalyzer())
)
```
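
`SessionAnalyzer` is referenced but not defined in this skill; a minimal sketch of what such a DoFn could look like, assuming the grouped `(user_id, events)` shape produced above (the output field names are assumptions):

```python
class SessionAnalyzer(beam.DoFn):
    """Sketch: summarize one user session per (user_id, events) group."""

    def process(self, element, window=beam.DoFn.WindowParam):
        user_id, events = element
        events = list(events)
        yield {
            'user_id': user_id,
            'event_count': len(events),
            # Session bounds come from the merged session window itself
            'session_start': window.start.to_utc_datetime().isoformat(),
            'session_end': window.end.to_utc_datetime().isoformat(),
        }
```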

## Watermarks and Late Data

```python
# Handle late-arriving data with allowed lateness
late_handling = (
    events
    | 'WindowWithLateness' >> beam.WindowInto(
        beam.window.FixedWindows(300),
        trigger=beam.trigger.AfterWatermark(
            early=beam.trigger.AfterProcessingTime(60),  # Early results every 60s
            late=beam.trigger.AfterCount(1)              # Fire on each late element
        ),
        allowed_lateness=beam.window.Duration(seconds=86400),  # 24h late tolerance
        accumulation_mode=beam.trigger.AccumulationMode.ACCUMULATING
    )
)
```

> **WARNING**: Setting `allowed_lateness` too high increases memory usage. For Zoho webhook data, 1 hour is typically sufficient. Use 24 hours only for batch replay scenarios.
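
Applying that guidance, a possible tighter configuration for the webhook stream (a sketch only: 1-hour lateness and discarding fired panes to keep buffered state small):

```python
webhook_late_handling = (
    events
    | 'Window5MinWebhooks' >> beam.WindowInto(
        beam.window.FixedWindows(300),
        trigger=beam.trigger.AfterWatermark(
            late=beam.trigger.AfterCount(1)
        ),
        allowed_lateness=beam.window.Duration(seconds=3600),  # 1h is enough for webhooks
        # DISCARDING emits only the new elements in each late pane,
        # so buffered state stays smaller than with ACCUMULATING
        accumulation_mode=beam.trigger.AccumulationMode.DISCARDING
    )
)
```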

## Side Inputs for Enrichment

```python
# Enrich streaming deals with customer dimension data
class EnrichWithCustomer(beam.DoFn):
    def process(self, deal, customers):
        customer = customers.get(deal['customer_id'], {})
        deal['customer_name'] = customer.get('account_name', 'Unknown')
        deal['customer_industry'] = customer.get('industry', 'Unknown')
        yield deal

def run():
    with beam.Pipeline(options=options) as p:
        # Side input: customer dimension (refreshed periodically)
        customers = (
            p
            | 'ReadCustomers' >> ReadFromBigQuery(
                query='SELECT record_id, account_name, industry FROM silver.zoho_accounts'
            )
            | 'KeyById' >> beam.Map(lambda c: (c['record_id'], c))
            | 'ToDict' >> beam.combiners.ToDict()
        )

        # Main input: streaming deals
        deals = (
            p
            | 'ReadDeals' >> beam.io.ReadFromPubSub(topic=DEALS_TOPIC)
            | 'ParseDeal' >> beam.Map(json.loads)
        )

        # Enrich with side input
        enriched = (
            deals
            | 'Enrich' >> beam.ParDo(
                EnrichWithCustomer(),
                customers=beam.pvalue.AsSingleton(customers)
            )
        )
```

## Pipeline Options

```python
# Common pipeline options for CloudStream
from datetime import datetime

from apache_beam.options.pipeline_options import (
    PipelineOptions, GoogleCloudOptions, StandardOptions, WorkerOptions, SetupOptions
)

def get_pipeline_options(streaming=False, job_name='cloudstream-pipeline'):
    options = PipelineOptions()

    google_options = options.view_as(GoogleCloudOptions)
    google_options.project = 'cloudstream-prod'
    google_options.region = 'us-central1'
    google_options.temp_location = 'gs://cloudstream-dataflow-temp/tmp'
    google_options.staging_location = 'gs://cloudstream-dataflow-temp/staging'
    google_options.job_name = f'{job_name}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    google_options.service_account_email = 'dataflow-sa@cloudstream-prod.iam.gserviceaccount.com'

    worker_options = options.view_as(WorkerOptions)
    worker_options.machine_type = 'n1-standard-4'
    worker_options.max_num_workers = 10 if not streaming else 5
    worker_options.disk_size_gb = 50
    worker_options.network = 'cloudstream-vpc'
    worker_options.subnetwork = 'regions/us-central1/subnetworks/dataflow-subnet'

    if streaming:
        options.view_as(StandardOptions).streaming = True

    return options
```
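
A brief usage sketch; the job names are illustrative:

```python
# Nightly batch transform vs. always-on streaming ingest (assumed job names)
batch_opts = get_pipeline_options(job_name='bronze-to-silver-deals')
stream_opts = get_pipeline_options(streaming=True, job_name='zoho-events-ingest')
```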

## Autoscaling Configuration

```python
# Autoscaling options (continues the get_pipeline_options() setup above)
from apache_beam.options.pipeline_options import DebugOptions

worker_options.autoscaling_algorithm = 'THROUGHPUT_BASED'
worker_options.num_workers = 2       # Initial workers
worker_options.max_num_workers = 20  # Max scale-out

# For cost-sensitive batch jobs:
worker_options.use_public_ips = False  # Reduces network cost

# Experiments live on DebugOptions rather than WorkerOptions
options.view_as(DebugOptions).experiments = [
    'shuffle_mode=service',  # Use Dataflow Shuffle service (faster, less worker disk)
    'use_runner_v2',         # Dataflow Runner v2 (better autoscaling)
]
```

## Flex Templates for Parameterized Pipelines

```python
# flex_template_pipeline.py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery

class CloudStreamOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--source_table', required=True, help='Source BQ table')
        parser.add_argument('--dest_table', required=True, help='Destination BQ table')
        parser.add_argument('--processing_date', help='Date to process (YYYY-MM-DD)')
        parser.add_argument('--mode', default='incremental', choices=['full', 'incremental'])

def run():
    options = PipelineOptions().view_as(CloudStreamOptions)

    with beam.Pipeline(options=options) as p:
        query = f"""
            SELECT * FROM `{options.source_table}`
            WHERE _ingestion_date = '{options.processing_date}'
        """ if options.mode == 'incremental' else f"""
            SELECT * FROM `{options.source_table}`
        """

        (
            p
            | 'Read' >> ReadFromBigQuery(query=query, use_standard_sql=True)
            | 'Transform' >> beam.ParDo(TransformDoFn())
            | 'Write' >> WriteToBigQuery(table=options.dest_table)
        )
```

```yaml
# flex_template_metadata.json
{
  "name": "CloudStream Bronze-to-Silver",
  "description": "Parameterized pipeline for medallion layer transitions",
  "parameters": [
    {"name": "source_table", "label": "Source Table", "helpText": "Fully qualified BQ table"},
    {"name": "dest_table", "label": "Destination Table", "helpText": "Fully qualified BQ table"},
    {"name": "processing_date", "label": "Processing Date", "helpText": "YYYY-MM-DD format"},
    {"name": "mode", "label": "Mode", "helpText": "full or incremental", "isOptional": true}
  ]
}
```

```bash
# Build and deploy Flex Template
gcloud dataflow flex-template build \
  gs://cloudstream-dataflow-templates/bronze-to-silver.json \
  --image-gcr-path gcr.io/cloudstream-prod/dataflow/bronze-to-silver:latest \
  --sdk-language PYTHON \
  --metadata-file flex_template_metadata.json

# Launch from template
gcloud dataflow flex-template run "bronze-to-silver-$(date +%Y%m%d)" \
  --template-file-gcs-location gs://cloudstream-dataflow-templates/bronze-to-silver.json \
  --region us-central1 \
  --parameters source_table=cloudstream-prod:bronze.zoho_deals \
  --parameters dest_table=cloudstream-prod:silver.zoho_deals \
  --parameters processing_date=2024-01-15 \
  --parameters mode=incremental
```

## Best Practices

1. **Use Flex Templates** for reusable, parameterized pipelines
2. **Enable Streaming Engine** for streaming jobs (reduces cost 30-50%)
3. **Use Dataflow Shuffle** for batch jobs (`shuffle_mode=service`)
4. **Set appropriate machine types** - n1-standard-4 for most jobs, n1-highmem-* for memory-intensive transforms
5. **Monitor with Cloud Monitoring** - track throughput, watermark lag, and backlog
6. **Use VPC-native networking** - disabling public IPs reduces cost and improves security
7. **Test locally first** - run with `DirectRunner` before deploying to `DataflowRunner` (see the sketch below)
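
For point 7, a minimal local-test sketch using Beam's testing utilities on the default `DirectRunner`; the module name, sample row, and headers are assumptions based on the `batch_csv_to_bigquery.py` snippet above:

```python
# test_parse_csv_local.py
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from batch_csv_to_bigquery import ParseCSV  # module name assumed from the snippet above

def test_parse_csv_adds_metadata():
    headers = ['id', 'email']
    with TestPipeline() as p:  # uses DirectRunner by default
        parsed = (
            p
            | beam.Create(['42,jane@example.com'])
            | beam.ParDo(ParseCSV(headers))
        )

        def check(records):
            # One parsed row, keyed by header, with ingestion metadata attached
            assert records[0]['id'] == '42'
            assert '_ingestion_timestamp' in records[0]

        assert_that(parsed, check)
```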

> **COST WARNING**: Streaming pipelines run 24/7. A single n1-standard-4 worker costs ~$100/month. Design pipelines to scale down to 1 worker during low-traffic periods.

package/skills/gcp-data-engineering/dbt/model-organization.md

@@ -0,0 +1,63 @@

# dbt Model Organization

## Directory Structure
```
models/
├── staging/                     # Bronze → Silver (1:1 with source)
│   ├── zoho/
│   │   ├── stg_zoho__contacts.sql
│   │   ├── stg_zoho__deals.sql
│   │   └── _zoho__sources.yml
│   └── gcs/
│       └── stg_gcs__raw_events.sql
├── intermediate/                # Silver transformations
│   ├── int_contacts_enriched.sql
│   └── int_deals_with_contacts.sql
└── marts/                       # Gold layer (business-ready)
    ├── finance/
    │   ├── fct_revenue.sql
    │   └── dim_customers.sql
    └── operations/
        ├── fct_pipeline.sql
        └── dim_sales_reps.sql
```
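
This layout pairs naturally with folder-level materialization defaults in `dbt_project.yml`. A sketch, assuming a project named `cloudstream` (the materialization choices shown are conventional defaults, not taken from the package):

```yaml
# dbt_project.yml (excerpt, hypothetical)
models:
  cloudstream:
    staging:
      +materialized: view       # cheap, always reflects the latest bronze data
    intermediate:
      +materialized: ephemeral  # compiled into downstream queries, not persisted
    marts:
      +materialized: table      # business-ready gold tables
```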

## Staging Model Pattern
```sql
-- models/staging/zoho/stg_zoho__contacts.sql
WITH source AS (
    SELECT * FROM {{ source('zoho', 'contacts_raw') }}
),

cleaned AS (
    SELECT
        id AS contact_id,
        TRIM(LOWER(email)) AS email,
        INITCAP(first_name) AS first_name,
        INITCAP(last_name) AS last_name,
        PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%S', created_time) AS created_at,
        PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%S', modified_time) AS modified_at
    FROM source
    WHERE id IS NOT NULL
)

SELECT * FROM cleaned
```

## Testing Pattern
```yaml
# models/staging/zoho/_zoho__sources.yml
version: 2
sources:
  - name: zoho
    tables:
      - name: contacts_raw
        columns:
          - name: id
            tests:
              - not_null
              - unique
          - name: email
            tests:
              - not_null
```
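
The same generic tests can be applied one layer up, on the staging model's renamed columns; a sketch of a companion properties file (the `_zoho__models.yml` filename is an assumption):

```yaml
# models/staging/zoho/_zoho__models.yml (hypothetical)
version: 2
models:
  - name: stg_zoho__contacts
    columns:
      - name: contact_id
        tests:
          - not_null
          - unique
      - name: email
        tests:
          - not_null
```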