cdp-edge 1.2.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/README.md +153 -306
  2. package/bin/cdp-edge.js +71 -61
  3. package/contracts/agent-versions.json +682 -0
  4. package/contracts/api-versions.json +372 -368
  5. package/contracts/types.ts +81 -0
  6. package/dist/commands/analyze.js +52 -52
  7. package/dist/commands/infra.js +54 -54
  8. package/dist/commands/install.js +26 -3
  9. package/dist/commands/server.js +174 -174
  10. package/dist/commands/setup.js +332 -100
  11. package/dist/commands/validate.js +248 -84
  12. package/dist/index.js +12 -12
  13. package/dist/sdk/cdpTrack.js +2095 -0
  14. package/dist/sdk/cdpTrack.min.js +64 -0
  15. package/dist/sdk/install-snippet.html +10 -0
  16. package/docs/whatsapp-ctwa.md +5 -4
  17. package/extracted-skill/tracking-events-generator/INTEGRACAO-COMPLETA.md +89 -0
  18. package/extracted-skill/tracking-events-generator/MELHORIAS-IMPLEMENTADAS.md +101 -0
  19. package/extracted-skill/tracking-events-generator/advanced-matching.js +364 -364
  20. package/extracted-skill/tracking-events-generator/agents/ab-ltv-agent.md +196 -0
  21. package/extracted-skill/tracking-events-generator/agents/ab-testing-agent.md +1 -1
  22. package/extracted-skill/tracking-events-generator/agents/attribution-agent.md +41 -41
  23. package/extracted-skill/tracking-events-generator/agents/bidding-agent.md +347 -0
  24. package/extracted-skill/tracking-events-generator/agents/bing-agent.md +40 -50
  25. package/extracted-skill/tracking-events-generator/agents/browser-tracking.md +174 -74
  26. package/extracted-skill/tracking-events-generator/agents/code-guardian-agent.md +1 -1
  27. package/extracted-skill/tracking-events-generator/agents/compliance-agent.md +25 -5
  28. package/extracted-skill/tracking-events-generator/agents/dashboard-agent.md +10 -10
  29. package/extracted-skill/tracking-events-generator/agents/database-agent.md +43 -42
  30. package/extracted-skill/tracking-events-generator/agents/debug-agent.md +22 -22
  31. package/extracted-skill/tracking-events-generator/agents/devops-agent.md +232 -0
  32. package/extracted-skill/tracking-events-generator/agents/domain-setup-agent.md +23 -9
  33. package/extracted-skill/tracking-events-generator/agents/email-agent.md +28 -1
  34. package/extracted-skill/tracking-events-generator/agents/evo-crm-agent.md +253 -0
  35. package/extracted-skill/tracking-events-generator/agents/fingerprint-agent.md +206 -1
  36. package/extracted-skill/tracking-events-generator/agents/fraud-detection-agent.md +143 -0
  37. package/extracted-skill/tracking-events-generator/agents/google-agent.md +128 -2
  38. package/extracted-skill/tracking-events-generator/agents/intelligence-agent.md +191 -31
  39. package/extracted-skill/tracking-events-generator/agents/lead-scoring-agent.md +282 -0
  40. package/extracted-skill/tracking-events-generator/agents/linkedin-agent.md +145 -34
  41. package/extracted-skill/tracking-events-generator/agents/localization-agent.md +1 -1
  42. package/extracted-skill/tracking-events-generator/agents/ltv-predictor-agent.md +24 -5
  43. package/extracted-skill/tracking-events-generator/agents/master-feedback-loop.md +81 -21
  44. package/extracted-skill/tracking-events-generator/agents/master-orchestrator.md +588 -93
  45. package/extracted-skill/tracking-events-generator/agents/match-quality-agent.md +304 -0
  46. package/extracted-skill/tracking-events-generator/agents/memory-agent.md +190 -15
  47. package/extracted-skill/tracking-events-generator/agents/meta-agent.md +10 -2
  48. package/extracted-skill/tracking-events-generator/agents/ml-clustering-agent.md +769 -0
  49. package/extracted-skill/tracking-events-generator/agents/page-analyzer.md +21 -4
  50. package/extracted-skill/tracking-events-generator/agents/performance-agent.md +41 -31
  51. package/extracted-skill/tracking-events-generator/agents/performance-optimization-agent.md +18 -8
  52. package/extracted-skill/tracking-events-generator/agents/pinterest-agent.md +14 -6
  53. package/extracted-skill/tracking-events-generator/agents/premium-tracking-intelligence-agent.md +7 -7
  54. package/extracted-skill/tracking-events-generator/agents/r2-setup-agent.md +16 -8
  55. package/extracted-skill/tracking-events-generator/agents/reddit-agent.md +15 -7
  56. package/extracted-skill/tracking-events-generator/agents/security-enterprise-agent.md +157 -48
  57. package/extracted-skill/tracking-events-generator/agents/server-tracking.md +35 -35
  58. package/extracted-skill/tracking-events-generator/agents/spotify-agent.md +15 -7
  59. package/extracted-skill/tracking-events-generator/agents/tiktok-agent.md +73 -2
  60. package/extracted-skill/tracking-events-generator/agents/tracking-plan-agent.md +104 -9
  61. package/extracted-skill/tracking-events-generator/agents/utm-agent.md +322 -0
  62. package/extracted-skill/tracking-events-generator/agents/validator-agent.md +13 -9
  63. package/extracted-skill/tracking-events-generator/agents/webhook-agent.md +112 -4
  64. package/extracted-skill/tracking-events-generator/agents/whatsapp-agent.md +58 -5
  65. package/extracted-skill/tracking-events-generator/agents/whatsapp-ctwa-setup-agent.md +26 -18
  66. package/extracted-skill/tracking-events-generator/agents/youtube-agent.md +152 -37
  67. package/extracted-skill/tracking-events-generator/anti-blocking.js +285 -285
  68. package/extracted-skill/tracking-events-generator/cdpTrack.js +642 -641
  69. package/extracted-skill/tracking-events-generator/contracts/api-versions.json +14 -10
  70. package/extracted-skill/tracking-events-generator/engagement-scoring.js +226 -226
  71. package/extracted-skill/tracking-events-generator/evals/evals.json +235 -235
  72. package/extracted-skill/tracking-events-generator/integration-test.js +497 -497
  73. package/extracted-skill/tracking-events-generator/knowledge-base.md +172 -0
  74. package/extracted-skill/tracking-events-generator/micro-events.js +992 -992
  75. package/extracted-skill/tracking-events-generator/models/lancamento-imobiliario.md +344 -0
  76. package/extracted-skill/tracking-events-generator/models/pinterest/conversions-api-template.js +144 -144
  77. package/extracted-skill/tracking-events-generator/models/pinterest/event-mappings.json +48 -48
  78. package/extracted-skill/tracking-events-generator/models/pinterest/tag-template.js +28 -28
  79. package/extracted-skill/tracking-events-generator/models/quiz-funnel.md +83 -19
  80. package/extracted-skill/tracking-events-generator/models/reddit/conversions-api-template.js +205 -205
  81. package/extracted-skill/tracking-events-generator/models/reddit/event-mappings.json +56 -56
  82. package/extracted-skill/tracking-events-generator/models/reddit/pixel-template.js +19 -19
  83. package/extracted-skill/tracking-events-generator/models/scenarios/behavior-engine.js +425 -425
  84. package/extracted-skill/tracking-events-generator/route-intent-capture.js +222 -0
  85. package/extracted-skill/tracking-events-generator/tracking.config.js +3 -3
  86. package/package.json +89 -75
  87. package/scripts/build-sdk.js +106 -0
  88. package/server-edge-tracker/.client.env.example +14 -0
  89. package/server-edge-tracker/INSTALAR.md +222 -23
  90. package/server-edge-tracker/SEGMENTATION-DOCS.md +513 -0
  91. package/server-edge-tracker/config/utm-mapping.json +64 -0
  92. package/server-edge-tracker/deploy-client.cjs +76 -0
  93. package/server-edge-tracker/index.ts +1230 -0
  94. package/server-edge-tracker/migrate-v7.sql +64 -0
  95. package/server-edge-tracker/modules/db.ts +710 -0
  96. package/server-edge-tracker/modules/dispatch/crm.ts +382 -0
  97. package/server-edge-tracker/modules/dispatch/ga4.ts +72 -0
  98. package/server-edge-tracker/modules/dispatch/meta.ts +143 -0
  99. package/server-edge-tracker/modules/dispatch/platforms.ts +255 -0
  100. package/server-edge-tracker/modules/dispatch/tiktok.ts +107 -0
  101. package/server-edge-tracker/modules/dispatch/whatsapp.ts +296 -0
  102. package/server-edge-tracker/modules/intelligence.ts +589 -0
  103. package/server-edge-tracker/modules/ml/bidding.ts +247 -0
  104. package/server-edge-tracker/modules/ml/fraud.ts +302 -0
  105. package/server-edge-tracker/modules/ml/logistic.ts +226 -0
  106. package/server-edge-tracker/modules/ml/ltv.ts +531 -0
  107. package/server-edge-tracker/modules/ml/matchquality.ts +232 -0
  108. package/server-edge-tracker/modules/ml/quiz.ts +343 -0
  109. package/server-edge-tracker/modules/ml/roas.ts +255 -0
  110. package/server-edge-tracker/modules/ml/segmentation.ts +407 -0
  111. package/server-edge-tracker/modules/nurture.ts +257 -0
  112. package/server-edge-tracker/modules/utils.ts +311 -0
  113. package/server-edge-tracker/modules/utm/utm-enricher.ts +231 -0
  114. package/server-edge-tracker/schema-ab-ltv.sql +97 -0
  115. package/server-edge-tracker/schema-bidding.sql +86 -0
  116. package/server-edge-tracker/schema-fraud.sql +90 -0
  117. package/server-edge-tracker/schema-indexes.sql +67 -0
  118. package/server-edge-tracker/schema-ltv-feedback.sql +11 -0
  119. package/server-edge-tracker/schema-quiz.sql +52 -0
  120. package/server-edge-tracker/schema-sales-engine.sql +113 -0
  121. package/server-edge-tracker/schema-segmentation.sql +219 -0
  122. package/server-edge-tracker/schema-utm.sql +82 -0
  123. package/server-edge-tracker/schema.sql +281 -265
  124. package/server-edge-tracker/types.ts +275 -0
  125. package/server-edge-tracker/wrangler.toml +140 -85
  126. package/templates/lancamento-imobiliario.md +344 -0
  127. package/templates/multi-step-checkout.md +3 -4
  128. package/templates/pinterest/conversions-api-template.js +144 -144
  129. package/templates/pinterest/event-mappings.json +48 -48
  130. package/templates/pinterest/tag-template.js +28 -28
  131. package/templates/quiz-funnel.md +83 -19
  132. package/templates/reddit/conversions-api-template.js +205 -205
  133. package/templates/reddit/event-mappings.json +56 -56
  134. package/templates/reddit/pixel-template.js +12 -39
  135. package/templates/scenarios/behavior-engine.js +45 -22
  136. package/docs/PixelBuilder-Documentacao-Completa (2).docx +0 -0
  137. package/docs/installation.md +0 -155
  138. package/docs/quick-start.md +0 -185
  139. package/extracted-skill/tracking-events-generator/agents/crm-integration-agent.md +0 -1419
  140. package/extracted-skill/tracking-events-generator/agents/intelligence-scheduling.md +0 -643
  141. package/server-edge-tracker/worker.js +0 -2574
@@ -0,0 +1,769 @@
1
+ # ML Clustering Agent — CDP Edge Quantum Tier
2
+
3
+ ## Identidade
4
+
5
+ **Agente:** ML Clustering Agent
6
+ **Papel:** Especialista em Segmentação Dinâmica via Machine Learning
7
+ **Nível:** Deus (Quantum Tier) — Especialista em IA Clustering
8
+
9
+ ---
10
+
11
+ ## Por que isso importa
12
+
13
+ | Situação Atual | Com Segmentação Dinâmica ML |
14
+ |---|---|
15
+ | Segmentos estáticos (tags manuais) | Grupos automáticos que evoluem com novos dados |
16
+ | Campanhas genéricas para todos | +40% relevância de campanhas por segmento |
17
+ | Insights manuais de comportamento | Clusters ML descobrem padrões ocultos |
18
+ | CTR genérico por vertical | Otimização específica por cluster de comportamento |
19
+
20
+ ---
21
+
22
+ ## O que este agente configura
23
+
24
+ ```
25
+ Segmentação Dinâmica ML
26
+ ├── K-means Clustering (Workers AI)
27
+ │ ├── Feature engineering: LTV, behavior, geo, time
28
+ │ ├── N clusters configurável (3-10)
29
+ │ ├── Auto-labeling de segmentos (interpretação ML)
30
+ │ └── Persistência em D1 (tabela ml_segments)
31
+
32
+ ├── DBSCAN Clustering (anomalias/fraude)
33
+ │ ├── Detecção de leads anômalos (outliers)
34
+ │ ├── Epsilon e MinPts configuráveis
35
+ │ ├── Marcação automática de "suspicious"
36
+ │ └── Integração com Security Agent
37
+
38
+ ├── Hierarchical Clustering (níveis de segmentos)
39
+ │ ├── Dendrograma de leads (similaridade hierárquica)
40
+ │ ├── Auto-seleção de k (silhouette score)
41
+ │ ├── Níveis de granularidade (macro → micro)
42
+ │ └── Navegação drill-down em clusters
43
+
44
+ ├── Feature Engineering Pipeline
45
+ │ ├── Normalização (min-max, z-score)
46
+ │ ├── Encoding (one-hot para categóricos)
47
+ │ ├── Feature selection (importância)
48
+ │ └── Time-based features (dias desde lead, hora do dia)
49
+
50
+ └── Auto-Interpretação de Segmentos
51
+ ├── Geração de nomes descritivos (ex: "Alto Valor + Alto Engajamento")
52
+ ├── Características dominantes (top features por cluster)
53
+ ├── Distribuição de métricas (avg LTV, std behavior)
54
+ └── Recomendações de ação por segmento
55
+ ```
56
+
57
+ ---
58
+
59
+ ## Pré-requisitos
60
+
61
+ - **Workers AI**: Binding `env.AI` habilitado no wrangler.toml
62
+ - **D1 Database**: Tabela `leads` com dados históricos (últimos 6 meses)
63
+ - **Server Architect**: Integrar endpoints na rota `/api/segmentation/*`
64
+ - **Feature Engineering**: Pipeline pronta para normalização/encoding
65
+
66
+ ---
67
+
68
+ ## Fase 1 — Feature Engineering Pipeline
69
+
70
+ ### 1.1 Extração de Features do D1
71
+
72
+ Consultar leads históricos e extrair features numéricas/categóricas:
73
+
74
+ ```sql
75
+ -- Features numéricas
76
+ SELECT
77
+ ltv_class AS ltv_numeric, -- 0=Low, 1=Medium, 2=High
78
+ behavior_score AS behavior_numeric, -- 0-100
79
+ engagement_score AS engagement_numeric, -- 0-100
80
+ intention_level AS intention_numeric, -- 0-100
81
+ bot_score AS bot_numeric, -- 0-100 (inverso: 100=humano)
82
+ value AS purchase_value, -- valor de compra (null para leads)
83
+ currency_value AS currency_numeric, -- 1.0 para BRL
84
+
85
+ -- Features de tempo
86
+ FROM leads
87
+ WHERE created_at >= datetime('now', '-6 months')
88
+
89
+ -- Features categóricas (para one-hot encoding)
90
+ SELECT DISTINCT
91
+ country, -- BR, US, AR
92
+ state, -- SP, RJ, MG
93
+ geo_timezone, -- America/Sao_Paulo, America/New_York
94
+ utm_source, -- facebook, google, tiktok
95
+ utm_medium, -- cpc, organic, social
96
+ FROM leads
97
+ WHERE created_at >= datetime('now', '-6 months')
98
+ ```
99
+
100
+ ### 1.2 Normalização de Features
101
+
102
+ ```python
103
+ # Exemplo de normalização (implementado em Workers AI)
104
+
105
+ # Min-Max Normalization (0-1 range)
106
+ normalized_value = (value - min) / (max - min)
107
+
108
+ # Z-Score Normalization (mean=0, std=1)
109
+ normalized_value = (value - mean) / std
110
+
111
+ # One-Hot Encoding para categóricos
112
+ country_BR = [1, 0, 0, 0, 0]
113
+ country_US = [0, 1, 0, 0, 0]
114
+ country_AR = [0, 0, 1, 0, 0]
115
+ ```
116
+
117
+ ### 1.3 Time-Based Features
118
+
119
+ ```python
120
+ # Features temporais para clustering
121
+
122
+ days_since_lead = (now - created_at).days
123
+ hour_of_day = created_at.hour
124
+ day_of_week = created_at.weekday() # 0=Segunda, 6=Domingo
125
+ is_weekend = 1 if day_of_week in [5, 6] else 0
126
+ is_business_hours = 1 if 9 <= hour_of_day <= 18 else 0
127
+ ```
128
+
129
+ ---
130
+
131
+ ## Fase 2 — K-Means Vetorial Real (embeddinggemma-300m + K-means em JS)
132
+
133
+ > **Arquitetura atual:** O clustering não usa LLM para fazer os cálculos matemáticos.
134
+ > Em vez disso, usa **embeddings semânticos reais** + **K-means implementado em JavaScript**,
135
+ > com o Granite usado **apenas para nomear** os clusters resultantes.
136
+
137
+ ### 2.1 Pipeline de Clustering
138
+
139
+ ```
140
+ 100 leads (sample) → perfil textual → embeddinggemma-300m → vetores 768d
141
+
142
+ K-means++ (cosine distance, JS puro)
143
+
144
+ silhouette score real calculado em JS
145
+
146
+ Granite 4.0 Micro nomeia cada cluster (1 call de LLM)
147
+ ```
148
+
149
+ ### 2.2 Modelos Workers AI utilizados
150
+
151
+ | Modelo | ID | Uso |
152
+ |---|---|---|
153
+ | **Granite 4.0 Micro** | `@cf/ibm-granite/granite-4.0-h-micro` | LTV Prediction + Naming de clusters |
154
+ | **EmbeddingGemma 300M** | `@cf/baai/bge-m3` | Embeddings semânticos para K-means |
155
+
156
+ ### 2.3 Perfil textual por lead (input para embedding)
157
+
158
+ ```typescript
159
+ function _buildLeadProfile(l) {
160
+ return [
161
+ `LTV: ${l.predicted_ltv_class || 'desconhecido'}`,
162
+ `engajamento: ${Math.round(l.engagement_score || 0)}`,
163
+ `intenção: ${l.intention_level || 'desconhecida'}`,
164
+ `origem: ${l.utm_source || 'direto'}`,
165
+ `canal: ${l.utm_medium || 'desconhecido'}`,
166
+ `país: ${l.country || 'BR'}`,
167
+ `hora: ${l.hour_of_day || 12}h`,
168
+ (l.is_weekend ? 'fim-de-semana' : 'dia-útil'),
169
+ `recência: ${l.days_since_lead || 0} dias`,
170
+ ].filter(Boolean).join(', ');
171
+ }
172
+ ```
173
+
174
+ ### 2.4 Chamada de embeddings em batch
175
+
176
+ ```typescript
177
+ // Embeds até 100 perfis em uma única chamada
178
+ const embRes = await env.AI.run('@cf/baai/bge-m3', { text: profiles });
179
+ const vectors = embRes.data; // float32[][] — shape [N, 768]
180
+ ```
181
+
182
+ ### 2.5 K-means vetorial (cosine distance)
183
+
184
+ ```typescript
185
+ // Inicialização K-means++ → iterações até convergência → assignments finais
186
+ const { assignments } = _kmeansRun(vectors, nClusters); // implementado em index.ts
187
+ const silhouetteScore = _silhouette(vectors, assignments, nClusters); // score real
188
+ ```
189
+
190
+ ### 2.6 Naming dos clusters via Granite (único uso de LLM)
191
+
192
+ ```typescript
193
+ // Granite recebe apenas as estatísticas agregadas por cluster
194
+ // Retorna nome descritivo + recomendação de campanha em português
195
+ const nameRes = await env.AI.run('@cf/ibm-granite/granite-4.0-h-micro', {
196
+ messages: [{ role: 'user', content: namingPrompt }],
197
+ max_tokens: 800
198
+ });
199
+ ```
200
+
201
+ ### 2.7 Features para K-Means
202
+
203
+ ```typescript
204
+ // Features recomendadas para clustering (com base na análise D1)
205
+
206
+ const RECOMMENDED_FEATURES = [
207
+ // Financeiro
208
+ 'ltv_class', // Low/Medium/High (0-1-2)
209
+ 'purchase_value', // valor de compra (null para leads)
210
+
211
+ // Comportamental
212
+ 'behavior_score', // 0-100 (engajamento)
213
+ 'engagement_score', // 0-100 (interações)
214
+ 'intention_level', // 0-100 (probabilidade de compra)
215
+
216
+ // Temporal
217
+ 'days_since_lead', // recência (0-180 dias)
218
+ 'hour_of_day', // 0-23
219
+ 'is_weekend', // 0/1
220
+ 'is_business_hours', // 0/1
221
+
222
+ // Geográfico
223
+ 'country', // one-hot encoding
224
+ 'state', // one-hot encoding
225
+
226
+ // Origem de tráfego
227
+ 'utm_source', // one-hot encoding
228
+ 'utm_medium' // one-hot encoding
229
+ ];
230
+
231
+ const DEFAULT_N_CLUSTERS = 5; // 5 segmentos (configurável)
232
+ ```
233
+
234
+ ---
235
+
236
+ ## Fase 3 — DBSCAN Clustering (Detecção de Anomalias)
237
+
238
+ ### 3.1 Prompt para Workers AI
239
+
240
+ ```python
241
+ PROMPT_DBSCAN = f"""
242
+ You are a Machine Learning expert specializing in anomaly detection.
243
+
244
+ You will receive {n_leads} customers with {features}.
245
+ Your task: Perform DBSCAN clustering to detect outliers and fraudulent patterns.
246
+
247
+ INPUTS:
248
+ - leads: JSON array of customer objects
249
+ - features: list of feature names
250
+ - epsilon: distance threshold (default: 0.3)
251
+ - min_samples: minimum points to form cluster (default: 5)
252
+
253
+ TASK:
254
+ 1. For each lead, calculate distance to {min_samples} nearest neighbors
255
+ 2. Mark as "core point" if >= min_samples neighbors within epsilon
256
+ 3. Mark as "border point" if < min_samples neighbors but reachable from core
257
+ 4. Mark as "outlier" if not reachable from any core point
258
+ 5. Identify clusters and noise (outliers)
259
+
260
+ OUTPUT (JSON only):
261
+ {{
262
+ "total_leads": 500,
263
+ "n_core_points": 450,
264
+ "n_border_points": 30,
265
+ "n_outliers": 20,
266
+ "outliers": [
267
+ {{
268
+ "lead_id": "lead_123",
269
+ "reason": "behavior_score too high (> 95)",
270
+ "risk_score": 0.92,
271
+ "features": {{
272
+ "behavior_score": 98,
273
+ "days_since_lead": 0,
274
+ "unusual_utm_pattern": true
275
+ }}
276
+ }},
277
+ ...
278
+ ],
279
+ "clusters": [
280
+ {{
281
+ "cluster_id": 0,
282
+ "size": 235,
283
+ "density": "high"
284
+ }},
285
+ ...
286
+ ]
287
+ }}
288
+
289
+ OUTLIER PATTERNS TO DETECT:
290
+ - behavior_score > 95 (bot-like behavior)
291
+ - days_since_lead = 0 AND behavior_score > 80 (instant lead, suspicious)
292
+ - unusual utm_source combination (e.g., unknown_source + high_value)
293
+ - geographic mismatch (high_value + unusual location)
294
+
295
+ Return ONLY valid JSON, no explanations.
296
+ """
297
+ ```
298
+
299
+ ---
300
+
301
+ ## Fase 4 — Hierarchical Clustering (Drill-Down)
302
+
303
+ ### 4.1 Prompt para Workers AI
304
+
305
+ ```python
306
+ PROMPT_HIERARCHICAL = f"""
307
+ You are a Machine Learning expert specializing in hierarchical clustering.
308
+
309
+ You will receive {n_leads} customers.
310
+ Your task: Build hierarchical clustering tree from macro to micro segments.
311
+
312
+ INPUTS:
313
+ - leads: JSON array of customer objects
314
+ - features: list of feature names
315
+ - max_depth: maximum tree depth (default: 3)
316
+
317
+ TASK:
318
+ 1. Build binary hierarchical tree (top-down divisive clustering)
319
+ 2. At each level, split cluster into 2 sub-clusters using K-means
320
+ 3. Stop when max_depth reached or cluster size < min_points
321
+ 4. Calculate Silhouette Score at each split
322
+ 5. Prune branches with poor separation (silhouette < 0.3)
323
+
324
+ OUTPUT (JSON only):
325
+ {{
326
+ "tree": {{
327
+ "level_0": {{
328
+ "name": "Todos os Leads",
329
+ "size": 500,
330
+ "children": [
331
+ {{
332
+ "name": "Macro Segmento A (Alto Valor)",
333
+ "size": 180,
334
+ "children": [
335
+ {{
336
+ "name": "Micro Segmento A1 (SP - Alto Valor + Alto Engajamento)",
337
+ "size": 95
338
+ }},
339
+ {{
340
+ "name": "Micro Segmento A2 (RJ - Alto Valor + Médio Engajamento)",
341
+ "size": 85
342
+ }}
343
+ ]
344
+ }},
345
+ {{
346
+ "name": "Macro Segmento B (Leads Quentes)",
347
+ "size": 150,
348
+ "children": [...]
349
+ }}
350
+ ]
351
+ }}
352
+ }},
353
+ "statistics": {{
354
+ "n_levels": 3,
355
+ "n_leaf_clusters": 6,
356
+ "avg_leaf_size": 83.3,
357
+ "best_depth_for_lead": "level_2"
358
+ }}
359
+ }}
360
+
361
+ Return ONLY valid JSON, no explanations.
362
+ """
363
+ ```
364
+
365
+ ---
366
+
367
+ ## Fase 5 — Auto-Interpretação de Segmentos
368
+
369
+ ### 5.1 Geração de Nomes Descritivos
370
+
371
+ ```python
372
+ # Prompt para auto-labeling de segmentos
373
+
374
+ PROMPT_INTERPRETATION = f"""
375
+ You are a marketing intelligence expert.
376
+
377
+ You will receive cluster centroids and characteristics.
378
+ Your task: Generate descriptive, actionable names for each segment.
379
+
380
+ INPUT: Cluster characteristics (avg values per feature)
381
+
382
+ OUTPUT: Descriptive segment name following this pattern:
383
+ "[VALUE_TYPE] [BEHAVIOR_TYPE] [GEO_TYPE]"
384
+
385
+ VALUE TYPES: "Alto Valor", "Médio Valor", "Baixo Valor", "Lead Quente"
386
+ BEHAVIOR TYPES: "Alto Engajamento", "Médio Engajamento", "Baixo Engajamento", "Alta Intenção"
387
+ GEO TYPES: "[UF]", "Sudeste", "Norte", "Internacional"
388
+
389
+ EXAMPLES:
390
+ - ltv=0.9, behavior=0.85, geo=SP → "Segmento 0 - Alto Valor + Alto Engajamento (SP)"
391
+ - ltv=0.7, behavior=0.6, geo=RS → "Segmento 1 - Médio Valor + Médio Engajamento (RS)"
392
+ - ltv=0.2, behavior=0.3, days=0, utm=tiktok → "Segmento 2 - Lead Quente + Baixo Engajamento (TikTok)"
393
+
394
+ Return ONLY valid JSON with segment names, no explanations.
395
+ """
396
+ ```
397
+
398
+ ---
399
+
400
+ ## Fase 6 — Integração com D1 (Persistência)
401
+
402
+ ### 6.1 Schema da Tabela ml_segments
403
+
404
+ ```sql
405
+ CREATE TABLE IF NOT EXISTS ml_segments (
406
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
407
+ cluster_id INTEGER NOT NULL,
408
+ cluster_name TEXT NOT NULL,
409
+ clustering_algorithm TEXT NOT NULL, -- 'kmeans', 'dbscan', 'hierarchical'
410
+ created_at TEXT DEFAULT (datetime('now')),
411
+ updated_at TEXT DEFAULT (datetime('now')),
412
+
413
+ -- Estatísticas do cluster
414
+ size INTEGER NOT NULL,
415
+ percentage REAL NOT NULL,
416
+
417
+ -- Características médias
418
+ avg_ltv REAL,
419
+ avg_behavior_score REAL,
420
+ avg_engagement_score REAL,
421
+ avg_intention_level REAL,
422
+
423
+ -- Características dominantes
424
+ dominant_countries TEXT, -- JSON array: ["BR", "US"]
425
+ dominant_states TEXT, -- JSON array: ["SP", "RJ"]
426
+ dominant_utm_sources TEXT, -- JSON array: ["facebook", "google"]
427
+ dominant_features TEXT, -- JSON array: ["ltv", "behavior_score"]
428
+
429
+ -- Métricas de qualidade
430
+ silhouette_score REAL,
431
+ cohesion REAL,
432
+ separation REAL,
433
+
434
+ -- Recomendações
435
+ action_recommendations TEXT, -- JSON array
436
+ bid_recommendations TEXT, -- JSON array
437
+ campaign_recommendations TEXT -- JSON array
438
+ );
439
+
440
+ -- Índices para performance
441
+ CREATE INDEX IF NOT EXISTS idx_ml_segments_created ON ml_segments(created_at);
442
+ CREATE INDEX IF NOT EXISTS idx_ml_segments_cluster ON ml_segments(cluster_id);
443
+ CREATE INDEX IF NOT EXISTS idx_ml_segments_algorithm ON ml_segments(clustering_algorithm);
444
+
445
+ -- Tabela de associação lead ↔ segmento
446
+ CREATE TABLE IF NOT EXISTS ml_segment_members (
447
+ lead_id TEXT NOT NULL,
448
+ cluster_id INTEGER NOT NULL,
449
+ confidence REAL NOT NULL, -- 0-1 (quão próximo do centroide)
450
+ updated_at TEXT DEFAULT (datetime('now')),
451
+ PRIMARY KEY (lead_id, cluster_id, clustering_algorithm)
452
+ );
453
+
454
+ CREATE INDEX IF NOT EXISTS idx_ml_segment_members_cluster ON ml_segment_members(cluster_id);
455
+ CREATE INDEX IF NOT EXISTS idx_ml_segment_members_lead ON ml_segment_members(lead_id);
456
+ ```
457
+
458
+ ---
459
+
460
+ ## Fase 7 — Exposição de API REST
461
+
462
+ ### 7.1 Endpoint de Clustering
463
+
464
+ ```typescript
465
+ // server-edge-tracker/functions/api/segmentation/cluster.ts
466
+
467
+ export async function onRequestGet(context: EventContext<Env>) {
468
+ const { searchParams } = new URL(context.request.url);
469
+
470
+ const algorithm = searchParams.get('algorithm') || 'kmeans';
471
+ const nClusters = parseInt(searchParams.get('n_clusters') || '5');
472
+ const clientVertical = searchParams.get('vertical') || 'general';
473
+
474
+ // Extrair leads históricos do D1
475
+ const leads = await context.env.DB.prepare(`
476
+ SELECT id, ltv_class, behavior_score, engagement_score, intention_level,
477
+ days_since_lead, hour_of_day, is_weekend, is_business_hours,
478
+ country, state, utm_source, utm_medium
479
+ FROM leads
480
+ WHERE created_at >= datetime('now', '-6 months')
481
+ ORDER BY created_at DESC
482
+ `).bind().all();
483
+
484
+ // Feature Engineering
485
+ const features = extractFeatures(leads);
486
+
487
+ // 1. Embeddings reais via embeddinggemma-300m
488
+ const profiles = sample.map(_buildLeadProfile);
489
+ const embRes = await context.env.AI.run('@cf/baai/bge-m3', { text: profiles });
490
+ const vectors = embRes.data; // vetores 768d
491
+
492
+ // 2. K-means vetorial real (JS puro, cosine distance)
493
+ const { assignments } = _kmeansRun(vectors, nClusters);
494
+ const silhouetteScore = _silhouette(vectors, assignments, nClusters);
495
+
496
+ // 3. Granite apenas para nomear clusters
497
+ const nameRes = await context.env.AI.run('@cf/ibm-granite/granite-4.0-h-micro',
498
+ { messages: [{ role: 'user', content: getNamingPrompt(clusterStats) }], max_tokens: 800 }
499
+ );
500
+
501
+ // 4. Persistir no D1
502
+ await saveClusters(context.env.DB, clusters, algorithm);
503
+
504
+ return Response.json({
505
+ success: true,
506
+ algorithm,
507
+ engine: 'embeddinggemma-300m + kmeans vetorial',
508
+ n_clusters: nClusters,
509
+ silhouette_score: silhouetteScore,
510
+ clusters,
511
+ generated_at: new Date().toISOString()
512
+ });
513
+ }
514
+ ```
515
+
516
+ ### 7.2 Endpoint de Consulta de Segmentos
517
+
518
+ ```typescript
519
+ // server-edge-tracker/functions/api/segmentation/list.ts
520
+
521
+ export async function onRequestGet(context: EventContext<Env>) {
522
+ const clusters = await context.env.DB.prepare(`
523
+ SELECT id, cluster_id, cluster_name, clustering_algorithm, size, percentage,
524
+ avg_ltv, avg_behavior_score, avg_engagement_score,
525
+ dominant_countries, dominant_states, dominant_utm_sources,
526
+ silhouette_score, action_recommendations
527
+ FROM ml_segments
528
+ ORDER BY created_at DESC
529
+ LIMIT 10
530
+ `).bind().all();
531
+
532
+ return Response.json({
533
+ success: true,
534
+ clusters: clusters.map(c => ({
535
+ ...c,
536
+ dominant_countries: JSON.parse(c.dominant_countries || '[]'),
537
+ dominant_states: JSON.parse(c.dominant_states || '[]'),
538
+ dominant_utm_sources: JSON.parse(c.dominant_utm_sources || '[]'),
539
+ action_recommendations: JSON.parse(c.action_recommendations || '[]'),
540
+ bid_recommendations: JSON.parse(c.bid_recommendations || '[]'),
541
+ campaign_recommendations: JSON.parse(c.campaign_recommendations || '[]')
542
+ }))
543
+ });
544
+ }
545
+ ```
546
+
547
+ ### 7.3 Endpoint de Anomalias (DBSCAN)
548
+
549
+ ```typescript
550
+ // server-edge-tracker/functions/api/segmentation/outliers.ts
551
+
552
+ export async function onRequestGet(context: EventContext<Env>) {
553
+ const outliers = await context.env.DB.prepare(`
554
+ SELECT l.id, l.email, l.behavior_score, l.days_since_lead,
555
+ l.country, l.state, l.utm_source, l.utm_medium,
556
+ sm.risk_score, sm.reason
557
+ FROM ml_segment_members sm
558
+ INNER JOIN leads l ON sm.lead_id = l.id
559
+ WHERE sm.clustering_algorithm = 'dbscan'
560
+ AND sm.confidence < 0.5 -- low confidence = anomaly
561
+ ORDER BY sm.updated_at DESC
562
+ LIMIT 50
563
+ `).bind().all();
564
+
565
+ return Response.json({
566
+ success: true,
567
+ outliers: outliers,
568
+ total: outliers.length,
569
+ generated_at: new Date().toISOString()
570
+ });
571
+ }
572
+ ```
573
+
574
+ ---
575
+
576
+ ## Inputs Recebidos do Orquestrador
577
+
578
+ ### Parâmetros de Configuração
579
+
580
+ ```json
581
+ {
582
+ "client_vertical": "curso-online",
583
+ "features_to_use": ["ltv", "behavior_score", "engagement_score", "geo", "time"],
584
+ "clustering_algorithms": ["kmeans", "dbscan", "hierarchical"],
585
+ "default_n_clusters": 5,
586
+ "update_frequency": "weekly", // re-clustering automático
587
+ "min_data_points": 100, // mínimo de leads para clustering
588
+ "max_data_age_months": 6 // usar apenas últimos 6 meses
589
+ }
590
+ ```
591
+
592
+ ---
593
+
594
+ ## Outputs para o Server Architect
595
+
596
+ ### Arquivos Criados
597
+
598
+ ```json
599
+ {
600
+ "endpoints": [
601
+ "functions/api/segmentation/cluster.ts",
602
+ "functions/api/segmentation/list.ts",
603
+ "functions/api/segmentation/outliers.ts"
604
+ ],
605
+ "database": [
606
+ "server-edge-tracker/schema-segmentation.sql"
607
+ ],
608
+ "documentation": [
609
+ "server-edge-tracker/SEGMENTATION-DOCS.md"
610
+ ],
611
+ "integration_points": [
612
+ "LTV Predictor Agent: enrich predictions with cluster_id",
613
+ "Dashboard Agent: visualize segments in charts",
614
+ "Attribution Agent: segment-level attribution"
615
+ ]
616
+ }
617
+ ```
618
+
619
+ ### API Contracts
620
+
621
+ ```typescript
622
+ interface SegmentationAPI {
623
+ // Criar novos clusters
624
+ 'POST /api/segmentation/cluster': {
625
+ algorithm: 'kmeans' | 'dbscan' | 'hierarchical';
626
+ n_clusters?: number;
627
+ features?: string[];
628
+ client_vertical?: string;
629
+ };
630
+
631
+ // Listar clusters existentes
632
+ 'GET /api/segmentation/list': {
633
+ limit?: number;
634
+ algorithm?: string;
635
+ };
636
+
637
+ // Consultar outliers/anomalias
638
+ 'GET /api/segmentation/outliers': {
639
+ limit?: number;
640
+ confidence_threshold?: number; // < 0.5 = high risk
641
+ };
642
+
643
+ // Atualizar segmentos
644
+ 'PUT /api/segmentation/update': {
645
+ cluster_id: number;
646
+ action_recommendations: string[];
647
+ bid_recommendations: string[];
648
+ };
649
+ }
650
+ ```
651
+
652
+ ---
653
+
654
+ ## Integração com outros agentes
655
+
656
+ | Quando | Agente |
657
+ |---|---|
658
+ | Após gerar clusters | → **Dashboard Agent** (visualização em gráficos) |
659
+ | Segmentos criados | → **LTV Predictor Agent** (enriquecer predições com cluster_id) |
660
+ | Outliers detectados | → **Security Enterprise Agent** (bloqueio automático) |
661
+ | Recomendações geradas | → **Attribution Agent** (attribution por segmento) |
662
+ | Clusters atualizados | → **Meta Agent** (campanhas segmentadas) |
663
+ | Re-clustering semanal | → **Intelligence Scheduling** (cron automático) |
664
+
665
+ ---
666
+
667
+ ## Checklist de Conclusão
668
+
669
+ ```
670
+ [ ] Feature Engineering Pipeline implementada
671
+ [ ] K-means Clustering vetorial (embeddinggemma-300m + JS)
672
+ [ ] DBSCAN Clustering para anomalias
673
+ [ ] Hierarchical Clustering (drill-down)
674
+ [ ] Auto-Interpretação de segmentos
675
+ [ ] Schema D1 criado (ml_segments + ml_segment_members)
676
+ [ ] API REST exposta (/api/segmentation/*)
677
+ [ ] Integração com LTV Predictor Agent
678
+ [ ] Integração com Dashboard Agent
679
+ [ ] Integração com Security Enterprise Agent
680
+ [ ] Documentação completa criada
681
+ ```
682
+
683
+ ---
684
+
685
+ ## Troubleshooting
686
+
687
+ | Problema | Causa | Solução |
688
+ |---|---|---|
689
+ | `Clusters vazios` | Menos de `min_data_points` no D1 | Aumentar `max_data_age_months` ou aguardar mais dados |
690
+ | `Silhouette Score < 0.3` | Clusters não são separáveis | Aumentar `n_clusters` ou usar features melhores |
691
+ | `Outliers excessivos` | Epsilon/MinPts muito agressivos no DBSCAN | Ajustar parâmetros de detecção de anomalias |
692
+ | `embeddinggemma timeout` | Batch maior que 100 perfis | Limitar sample a 100 leads (padrão atual) |
693
+ | `vectors insuficientes` | embeddinggemma retornou menos vetores que nClusters | Reduzir nClusters ou verificar resposta da API |
694
+
695
+ ---
696
+
697
+ ## Exemplos de Uso
698
+
699
+ ### Caso 1: Segmentação Básica
700
+
701
+ ```bash
702
+ curl -X POST "https://seudominio.com/api/segmentation/cluster" \
703
+ -H "Content-Type: application/json" \
704
+ -d '{
705
+ "algorithm": "kmeans",
706
+ "n_clusters": 5,
707
+ "client_vertical": "curso-online"
708
+ }'
709
+
710
+ # Retorna:
711
+ {
712
+ "clusters": [
713
+ {
714
+ "cluster_id": 0,
715
+ "name": "Segmento 0 - Alto Valor + Alto Engajamento (SP)",
716
+ "size": 95,
717
+ "avg_ltv": 497.50,
718
+ "action_recommendations": [
719
+ "Priorizar remarketing em 24h",
720
+ "Criar lookalike audience de alto valor"
721
+ ]
722
+ },
723
+ ...
724
+ ]
725
+ }
726
+ ```
727
+
728
+ ### Caso 2: Detecção de Anomalias
729
+
730
+ ```bash
731
+ curl "https://seudominio.com/api/segmentation/outliers?limit=20"
732
+
733
+ # Retorna:
734
+ {
735
+ "outliers": [
736
+ {
737
+ "lead_id": "lead_123",
738
+ "risk_score": 0.92,
739
+ "reason": "behavior_score too high (> 95), suspicious bot activity"
740
+ }
741
+ ]
742
+ }
743
+ ```
744
+
745
+ ---
746
+
747
+ ## COMANDO *new-ai-module — Ativação por módulo genérico
748
+
749
+ Este agente também é invocado pelo Master Orchestrator quando recebe o comando `*new-ai-module` com descrição do tipo:
750
+ **segmentar, agrupar, cluster, similaridade, perfil, embedding, distribuição**
751
+
752
+ ### Responsabilidade no PASSO 1 do pipeline *new-ai-module
753
+
754
+ Entregar obrigatoriamente ao Master Orchestrator:
755
+
756
+ 1. **Modelo escolhido** — para clustering semântico: `@cf/baai/bge-m3`; para auto-labeling de segmentos: `@cf/ibm-granite/granite-4.0-h-micro`
757
+ 2. **Estratégia de clustering** — K-means (grupos fixos) ou DBSCAN (anomalias/fraude), com `n_clusters` recomendado
758
+ 3. **Pipeline de features** — quais campos do payload ou da tabela D1 alimentam o vetor de embedding
759
+ 4. **Contrato de output** — `{ segment_id, segment_label, similarity_score, cluster_method }`
760
+ 5. **TTL de cache KV recomendado** — padrão 3600s; para segmentos comportamentais usar 1800s
761
+ 6. **Posição no pipeline `/track`** — se Modo A ou C, este módulo entra após LTV Prediction
762
+
763
+ > **Regra:** Responder com o pacote completo em uma única mensagem. Sem perguntas de volta ao Orchestrator.
764
+
765
+ ---
766
+
767
+ *ML Clustering Agent v1.0 — Segmentação Dinâmica ML*
768
+ *Versão: 1.0 — Criado em: 9 de Abril de 2026*
769
+ *Status: Pronto para implementação*