apphud-mcp 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,22 +1,13 @@
1
+ import { execFile } from "node:child_process";
1
2
  import { createHash } from "node:crypto";
2
3
  import { mkdirSync } from "node:fs";
3
- import { mkdir, readdir, readFile, stat, writeFile } from "node:fs/promises";
4
+ import { mkdir, readdir, readFile, writeFile } from "node:fs/promises";
4
5
  import path from "node:path";
6
+ import { promisify } from "node:util";
5
7
  import { gunzipSync } from "node:zlib";
6
- import Database from "better-sqlite3";
7
8
  import { ApphudMcpError } from "../errors/toolError.js";
8
9
  const LOCAL_ROWS_GUARD_LIMIT = 1_000_000;
9
- function interpolatePath(template, vars) {
10
- return template.replace(/\{([a-zA-Z0-9_]+)\}/g, (_, key) => vars[key] ?? "");
11
- }
12
- function normalizeToPath(baseUrl, rawPath) {
13
- if (rawPath.startsWith("http://") || rawPath.startsWith("https://")) {
14
- return rawPath;
15
- }
16
- const normalizedBase = baseUrl.replace(/\/$/, "");
17
- const normalizedPath = rawPath.startsWith("/") ? rawPath : `/${rawPath}`;
18
- return `${normalizedBase}${normalizedPath}`;
19
- }
10
+ const execFileAsync = promisify(execFile);
20
11
  function parseCsvLine(line) {
21
12
  const result = [];
22
13
  let current = "";
@@ -50,16 +41,11 @@ function parseCsv(content) {
50
41
  .map((line) => line.trimEnd())
51
42
  .filter((line) => line.length > 0);
52
43
  if (lines.length === 0) {
53
- return {
54
- headers: [],
55
- rows: [],
56
- };
44
+ return { headers: [], rows: [] };
57
45
  }
58
- const headers = parseCsvLine(lines[0] ?? "");
59
- const rows = lines.slice(1).map((line) => parseCsvLine(line));
60
46
  return {
61
- headers,
62
- rows,
47
+ headers: parseCsvLine(lines[0] ?? ""),
48
+ rows: lines.slice(1).map((line) => parseCsvLine(line)),
63
49
  };
64
50
  }
65
51
  function rowToRecord(headers, values) {
@@ -71,45 +57,6 @@ function rowToRecord(headers, values) {
71
57
  }
72
58
  return result;
73
59
  }
74
- function extractExportCandidates(payload, appId) {
75
- const candidates = [];
76
- const visited = new WeakSet();
77
- const walk = (node) => {
78
- if (Array.isArray(node)) {
79
- for (const item of node) {
80
- walk(item);
81
- }
82
- return;
83
- }
84
- if (!node || typeof node !== "object") {
85
- return;
86
- }
87
- const record = node;
88
- if (visited.has(record)) {
89
- return;
90
- }
91
- visited.add(record);
92
- const downloadUrl = (typeof record.download_url === "string" && record.download_url) ||
93
- (typeof record.downloadUrl === "string" && record.downloadUrl) ||
94
- (typeof record.url === "string" && record.url) ||
95
- (typeof record.file_url === "string" && record.file_url);
96
- const fileName = (typeof record.file_name === "string" && record.file_name) ||
97
- (typeof record.filename === "string" && record.filename) ||
98
- (typeof record.name === "string" && record.name);
99
- if (downloadUrl && fileName) {
100
- candidates.push({
101
- appId,
102
- fileName,
103
- downloadUrl,
104
- });
105
- }
106
- for (const value of Object.values(record)) {
107
- walk(value);
108
- }
109
- };
110
- walk(payload);
111
- return candidates;
112
- }
113
60
  async function listCsvGzFiles(root) {
114
61
  const entries = await readdir(root, { withFileTypes: true });
115
62
  const files = [];
@@ -128,25 +75,24 @@ async function listCsvGzFiles(root) {
128
75
  export class EtlService {
129
76
  config;
130
77
  apphudClient;
131
- db;
132
78
  timer = null;
133
79
  running = false;
80
+ stateLoaded = false;
81
+ appKeys = [];
82
+ manifest = new Map();
83
+ alerts = [];
134
84
  constructor(config, apphudClient) {
135
85
  this.config = config;
136
86
  this.apphudClient = apphudClient;
137
- const sqlitePath = path.resolve(config.sqlitePath ?? ".apphud-mcp/apphud.db");
138
- mkdirSync(path.dirname(sqlitePath), { recursive: true });
139
- this.db = new Database(sqlitePath);
140
- this.db.pragma("journal_mode = WAL");
141
- this.db.pragma("busy_timeout = 5000");
142
- this.initializeSchema();
87
+ mkdirSync(path.resolve(this.config.etlStorageDir), { recursive: true });
88
+ mkdirSync(path.resolve(this.config.etlIncomingDir), { recursive: true });
89
+ mkdirSync(path.resolve(this.stateDir()), { recursive: true });
143
90
  }
144
91
  async start() {
145
92
  if (!this.config.etlEnabled) {
146
93
  return;
147
94
  }
148
- await mkdir(path.resolve(this.config.etlStorageDir), { recursive: true });
149
- await mkdir(path.resolve(this.config.etlIncomingDir), { recursive: true });
95
+ await this.ensureStateLoaded();
150
96
  await this.runOnce();
151
97
  const intervalMs = Math.max(this.config.etlPollIntervalMinutes, 1) * 60_000;
152
98
  this.timer = setInterval(() => {
@@ -158,63 +104,75 @@ export class EtlService {
158
104
  clearInterval(this.timer);
159
105
  this.timer = null;
160
106
  }
161
- this.db.close();
107
+ await this.persistState();
162
108
  }
163
109
  async runOnce() {
164
110
  if (!this.config.etlEnabled || this.running) {
165
111
  return;
166
112
  }
113
+ await this.ensureStateLoaded();
167
114
  this.running = true;
168
115
  try {
169
- const keySync = await this.syncAppApiKeys();
170
- if (this.config.etlRemoteFetchEnabled) {
171
- await this.fetchRemoteExports(keySync);
172
- }
173
- await this.ingestIncomingFiles();
116
+ await this.syncAppApiKeys();
117
+ await this.syncFromStorage();
118
+ await this.refreshManifestFromIncoming();
174
119
  this.applyAlertTodos();
120
+ await this.persistState();
175
121
  }
176
122
  catch (error) {
177
123
  this.insertAlertTodo("ETL_RUN_FAILED", error instanceof Error ? error.message : String(error));
124
+ await this.persistState();
178
125
  }
179
126
  finally {
180
127
  this.running = false;
181
128
  }
182
129
  }
183
130
  async localStatus() {
131
+ await this.ensureStateLoaded();
184
132
  const now = new Date().toISOString();
185
- const manifest = this.db
186
- .prepare(`SELECT COUNT(*) AS total, SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) AS done, MAX(processed_at) AS last_processed
187
- FROM etl_manifest WHERE tenant_id = ?`)
188
- .get(this.config.etlTenantId);
189
- const rawRows = this.db
190
- .prepare(`SELECT COUNT(*) AS total FROM etl_raw_rows WHERE tenant_id = ?`)
191
- .get(this.config.etlTenantId);
192
- const alerts = this.db
193
- .prepare(`SELECT code, message, created_at FROM etl_alerts_todo ORDER BY id DESC LIMIT 20`)
194
- .all();
133
+ const manifestEntries = Array.from(this.manifest.values()).filter((entry) => entry.tenant_id === this.config.etlTenantId);
134
+ const doneEntries = manifestEntries.filter((entry) => entry.status === "done");
135
+ const lastProcessed = doneEntries
136
+ .map((entry) => entry.processed_at)
137
+ .filter((value) => Boolean(value))
138
+ .sort()
139
+ .at(-1) ?? null;
140
+ const rawRowsTotal = doneEntries.reduce((sum, entry) => sum + entry.row_count, 0);
195
141
  return {
196
- source: "apphud_local_etl",
142
+ source: "apphud_storage_etl",
197
143
  tenant_id: this.config.etlTenantId,
198
144
  etl_enabled: this.config.etlEnabled,
199
- remote_fetch_enabled: this.config.etlRemoteFetchEnabled,
145
+ etl_source: this.config.etlSource,
146
+ source_config: {
147
+ gcs_bucket: this.config.etlGcsBucket,
148
+ gcs_prefix: this.config.etlGcsPrefix,
149
+ s3_bucket: this.config.etlS3Bucket,
150
+ s3_prefix: this.config.etlS3Prefix,
151
+ },
200
152
  poll_interval_minutes: this.config.etlPollIntervalMinutes,
201
- sqlite_path: path.resolve(this.config.sqlitePath ?? ".apphud-mcp/apphud.db"),
202
153
  incoming_dir: path.resolve(this.config.etlIncomingDir),
203
- manifest_total: manifest.total,
204
- manifest_done: manifest.done ?? 0,
205
- raw_rows_total: rawRows.total,
206
- last_processed_at: manifest.last_processed,
207
- alerts_todo: alerts,
154
+ manifest_total: manifestEntries.length,
155
+ manifest_done: doneEntries.length,
156
+ raw_rows_total: rawRowsTotal,
157
+ last_processed_at: lastProcessed,
158
+ alerts_todo: this.alerts.slice(-20).reverse(),
208
159
  retrieved_at: now,
209
160
  };
210
161
  }
211
162
  async localAppsList(input = {}) {
212
163
  await this.ensureLocalBootstrap(false);
213
- const apps = this.db
214
- .prepare(`SELECT app_id, app_name, platform, key_source AS source, updated_at FROM etl_app_api_keys WHERE tenant_id = ? ORDER BY app_name`)
215
- .all(this.config.etlTenantId);
164
+ const apps = this.appKeys
165
+ .filter((app) => app.tenant_id === this.config.etlTenantId)
166
+ .map((app) => ({
167
+ app_id: app.app_id,
168
+ app_name: app.app_name,
169
+ platform: app.platform ?? null,
170
+ source: app.key_source,
171
+ updated_at: app.updated_at,
172
+ }))
173
+ .sort((a, b) => a.app_name.localeCompare(b.app_name));
216
174
  return {
217
- source: "apphud_local_etl",
175
+ source: "apphud_storage_etl",
218
176
  count: apps.length,
219
177
  apps,
220
178
  retrieved_at: new Date().toISOString(),
@@ -227,7 +185,7 @@ export class EtlService {
227
185
  const from = input.from ?? new Date(Date.now() - 7 * 24 * 60 * 60 * 1000).toISOString();
228
186
  const to = input.to ?? new Date().toISOString();
229
187
  const limit = Math.min(Math.max(input.limit ?? 100, 1), 500);
230
- const rows = this.loadRows(appId, from, to, input.filters);
188
+ const rows = await this.loadRows(appId, from, to, input.filters);
231
189
  const events = rows.slice(0, limit).map((row) => ({
232
190
  event_id: this.readString(row.record, ["event_id", "id", "uuid"]),
233
191
  event_type: this.readString(row.record, ["event_type", "event", "type", "name"]) ?? "unknown",
@@ -253,8 +211,8 @@ export class EtlService {
253
211
  events_count: events.length,
254
212
  next_cursor: undefined,
255
213
  has_more: rows.length > limit,
256
- source: "apphud_local_etl",
257
- source_used: "etl_raw_rows",
214
+ source: "apphud_storage_etl",
215
+ source_used: "incoming_csv_gz",
258
216
  retrieved_at: new Date().toISOString(),
259
217
  raw_payload: input.include_raw ? rows.map((row) => row.record) : undefined,
260
218
  };
@@ -267,7 +225,7 @@ export class EtlService {
267
225
  app_id: appId,
268
226
  apphud_app_id: appId,
269
227
  analytics_available: true,
270
- source: "apphud_local_etl",
228
+ source: "apphud_storage_etl",
271
229
  supported_tools: [
272
230
  "apphud_analytics_events_list_local",
273
231
  "apphud_analytics_metrics_list_local",
@@ -284,7 +242,7 @@ export class EtlService {
284
242
  supported_shapes: ["value", "timeseries", "breakdown", "raw"],
285
243
  metrics_count: metrics.length,
286
244
  metrics_sample: metrics.slice(0, 20),
287
- source_used: "etl_raw_rows",
245
+ source_used: "incoming_csv_gz",
288
246
  retrieved_at: new Date().toISOString(),
289
247
  raw_payload: input.include_raw ? metrics : undefined,
290
248
  };
@@ -298,7 +256,7 @@ export class EtlService {
298
256
  apphud_app_id: appId,
299
257
  metrics,
300
258
  count: metrics.length,
301
- source_used: "etl_raw_rows",
259
+ source_used: "incoming_csv_gz",
302
260
  retrieved_at: new Date().toISOString(),
303
261
  raw_payload: input.include_raw ? metrics : undefined,
304
262
  };
@@ -307,19 +265,17 @@ export class EtlService {
307
265
  await this.ensureLocalBootstrap(true);
308
266
  const appId = this.resolveLocalAppId(input.app_id, input.apphud_app_id);
309
267
  const metricKey = input.metric_key ?? "events_count";
310
- const from = input.from;
311
- const to = input.to;
312
- const rows = this.loadRows(appId, from, to, input.filters);
268
+ const rows = await this.loadRows(appId, input.from, input.to, input.filters);
313
269
  const value = this.computeMetric(metricKey, rows);
314
270
  return {
315
271
  app_id: appId,
316
272
  apphud_app_id: appId,
317
273
  metric_key: metricKey,
318
- from,
319
- to,
274
+ from: input.from,
275
+ to: input.to,
320
276
  value,
321
- source: "apphud_local_etl",
322
- source_used: "etl_raw_rows",
277
+ source: "apphud_storage_etl",
278
+ source_used: "incoming_csv_gz",
323
279
  warnings: [],
324
280
  retrieved_at: new Date().toISOString(),
325
281
  raw_payload: input.include_raw ? rows.map((row) => row.record) : undefined,
@@ -330,7 +286,7 @@ export class EtlService {
330
286
  const appId = this.resolveLocalAppId(input.app_id, input.apphud_app_id);
331
287
  const metricKey = input.metric_key ?? "events_count";
332
288
  const granularity = input.granularity ?? "day";
333
- const rows = this.loadRows(appId, input.from, input.to, input.filters);
289
+ const rows = await this.loadRows(appId, input.from, input.to, input.filters);
334
290
  const buckets = new Map();
335
291
  for (const row of rows) {
336
292
  const bucket = this.timeBucket(row.occurredAt, granularity);
@@ -356,8 +312,8 @@ export class EtlService {
356
312
  points_count: points.length,
357
313
  total: Number(total.toFixed(4)),
358
314
  average: points.length > 0 ? Number((total / points.length).toFixed(4)) : null,
359
- source: "apphud_local_etl",
360
- source_used: "etl_raw_rows",
315
+ source: "apphud_storage_etl",
316
+ source_used: "incoming_csv_gz",
361
317
  warnings: [],
362
318
  retrieved_at: new Date().toISOString(),
363
319
  raw_payload: input.include_raw ? rows.map((row) => row.record) : undefined,
@@ -369,7 +325,7 @@ export class EtlService {
369
325
  const metricKey = input.metric_key ?? "events_count";
370
326
  const dimension = input.dimension ?? "country";
371
327
  const limit = Math.min(Math.max(input.limit ?? 20, 1), 200);
372
- const rows = this.loadRows(appId, input.from, input.to, input.filters);
328
+ const rows = await this.loadRows(appId, input.from, input.to, input.filters);
373
329
  const groups = new Map();
374
330
  for (const row of rows) {
375
331
  const key = this.dimensionValue(row.record, dimension);
@@ -399,8 +355,8 @@ export class EtlService {
399
355
  granularity: input.granularity ?? "day",
400
356
  rows: rowsWithShare,
401
357
  rows_count: rowsWithShare.length,
402
- source: "apphud_local_etl",
403
- source_used: "etl_raw_rows",
358
+ source: "apphud_storage_etl",
359
+ source_used: "incoming_csv_gz",
404
360
  warnings: [],
405
361
  retrieved_at: new Date().toISOString(),
406
362
  raw_payload: input.include_raw ? rows.map((row) => row.record) : undefined,
@@ -408,9 +364,8 @@ export class EtlService {
408
364
  }
409
365
  async localRevenueSummary(input) {
410
366
  await this.ensureLocalBootstrap(true);
411
- const base = { ...input };
412
- const gross = await this.localMetricValue({ ...base, metric_key: "revenue_gross", include_raw: false });
413
- const refunds = await this.localMetricValue({ ...base, metric_key: "refunds", include_raw: false });
367
+ const gross = await this.localMetricValue({ ...input, metric_key: "revenue_gross", include_raw: false });
368
+ const refunds = await this.localMetricValue({ ...input, metric_key: "refunds", include_raw: false });
414
369
  const grossValue = typeof gross.value === "number" ? gross.value : 0;
415
370
  const refundValue = Math.abs(typeof refunds.value === "number" ? refunds.value : 0);
416
371
  return {
@@ -421,8 +376,8 @@ export class EtlService {
421
376
  revenue_gross: gross.value,
422
377
  refunds: refunds.value,
423
378
  net_revenue_estimate: Number((grossValue - refundValue).toFixed(4)),
424
- source: "apphud_local_etl",
425
- source_used: ["etl_raw_rows"],
379
+ source: "apphud_storage_etl",
380
+ source_used: ["incoming_csv_gz"],
426
381
  warnings: [],
427
382
  compare_prev_period: input.compare_prev_period ? { note: "TODO local compare_prev_period" } : undefined,
428
383
  retrieved_at: new Date().toISOString(),
@@ -449,8 +404,8 @@ export class EtlService {
449
404
  renewals: renewals.value,
450
405
  cancellations: cancellations.value,
451
406
  churn_rate_estimate: activePaidValue > 0 ? Number((cancellationsValue / activePaidValue).toFixed(6)) : null,
452
- source: "apphud_local_etl",
453
- source_used: ["etl_raw_rows"],
407
+ source: "apphud_storage_etl",
408
+ source_used: ["incoming_csv_gz"],
454
409
  warnings: [],
455
410
  retrieved_at: new Date().toISOString(),
456
411
  raw_payload: input.include_raw ? { activePaid, activeTrials, newSubs, renewals, cancellations } : undefined,
@@ -471,8 +426,8 @@ export class EtlService {
471
426
  trials_converted: converted.value,
472
427
  conversion_rate: startedValue > 0 ? Number((convertedValue / startedValue).toFixed(6)) : null,
473
428
  median_time_to_convert: null,
474
- source: "apphud_local_etl",
475
- source_used: ["etl_raw_rows"],
429
+ source: "apphud_storage_etl",
430
+ source_used: ["incoming_csv_gz"],
476
431
  warnings: [],
477
432
  retrieved_at: new Date().toISOString(),
478
433
  raw_payload: input.include_raw ? { started, converted } : undefined,
@@ -483,11 +438,6 @@ export class EtlService {
483
438
  const timeseries = await this.localMetricTimeseries({ ...input, metric_key: "active_subs", granularity: input.granularity ?? "week" });
484
439
  const points = timeseries.points ?? [];
485
440
  const base = points[0]?.value ?? 0;
486
- const periods = points.map((point, index) => ({
487
- period_index: index,
488
- retention_rate: base > 0 ? Number((point.value / base).toFixed(6)) : 0,
489
- users_count: Math.round(point.value),
490
- }));
491
441
  return {
492
442
  app_id: timeseries.app_id,
493
443
  apphud_app_id: timeseries.apphud_app_id,
@@ -495,10 +445,20 @@ export class EtlService {
495
445
  to: input.to,
496
446
  granularity: input.granularity ?? "week",
497
447
  cohort_by: "subscription_started",
498
- rows: [{ cohort_start_date: (input.from ?? new Date().toISOString()).slice(0, 10), users_count: base, periods }],
499
- source: "apphud_local_etl",
500
- source_used: "etl_raw_rows",
501
- warnings: ["Retention is approximated from local event stream."],
448
+ rows: [
449
+ {
450
+ cohort_start_date: (input.from ?? new Date().toISOString()).slice(0, 10),
451
+ users_count: base,
452
+ periods: points.map((point, index) => ({
453
+ period_index: index,
454
+ retention_rate: base > 0 ? Number((point.value / base).toFixed(6)) : 0,
455
+ users_count: Math.round(point.value),
456
+ })),
457
+ },
458
+ ],
459
+ source: "apphud_storage_etl",
460
+ source_used: "incoming_csv_gz",
461
+ warnings: ["Retention is approximated from storage event stream."],
502
462
  retrieved_at: new Date().toISOString(),
503
463
  raw_payload: input.include_raw ? points : undefined,
504
464
  };
@@ -507,21 +467,25 @@ export class EtlService {
507
467
  await this.ensureLocalBootstrap(true);
508
468
  const timeseries = await this.localMetricTimeseries({ ...input, metric_key: "cumulative_ltv", granularity: input.granularity ?? "week" });
509
469
  const points = timeseries.points ?? [];
510
- const periods = points.map((point, index) => ({
511
- period_index: index,
512
- ltv_value: Number(point.value.toFixed(6)),
513
- date: point.date,
514
- }));
515
470
  return {
516
471
  app_id: timeseries.app_id,
517
472
  apphud_app_id: timeseries.apphud_app_id,
518
473
  from: input.from,
519
474
  to: input.to,
520
475
  granularity: input.granularity ?? "week",
521
- rows: [{ cohort_start_date: (input.from ?? new Date().toISOString()).slice(0, 10), periods }],
522
- source: "apphud_local_etl",
523
- source_used: "etl_raw_rows",
524
- warnings: ["LTV is approximated from local event stream."],
476
+ rows: [
477
+ {
478
+ cohort_start_date: (input.from ?? new Date().toISOString()).slice(0, 10),
479
+ periods: points.map((point, index) => ({
480
+ period_index: index,
481
+ ltv_value: Number(point.value.toFixed(6)),
482
+ date: point.date,
483
+ })),
484
+ },
485
+ ],
486
+ source: "apphud_storage_etl",
487
+ source_used: "incoming_csv_gz",
488
+ warnings: ["LTV is approximated from storage event stream."],
525
489
  retrieved_at: new Date().toISOString(),
526
490
  raw_payload: input.include_raw ? points : undefined,
527
491
  };
@@ -531,7 +495,13 @@ export class EtlService {
531
495
  const query = (input.query ?? {});
532
496
  const shape = String(query.shape ?? "raw");
533
497
  if (shape === "value") {
534
- return this.localMetricValue({ ...input, metric_key: String(query.metric_key ?? "") });
498
+ return this.localMetricValue({
499
+ ...input,
500
+ metric_key: String(query.metric_key ?? ""),
501
+ from: typeof query.from === "string" ? query.from : input.from,
502
+ to: typeof query.to === "string" ? query.to : input.to,
503
+ filters: query.filters ?? input.filters,
504
+ });
535
505
  }
536
506
  if (shape === "timeseries") {
537
507
  return this.localMetricTimeseries({
@@ -540,6 +510,7 @@ export class EtlService {
540
510
  from: typeof query.from === "string" ? query.from : input.from,
541
511
  to: typeof query.to === "string" ? query.to : input.to,
542
512
  granularity: query.granularity === "week" ? "week" : "day",
513
+ filters: query.filters ?? input.filters,
543
514
  });
544
515
  }
545
516
  if (shape === "breakdown") {
@@ -550,15 +521,16 @@ export class EtlService {
550
521
  from: typeof query.from === "string" ? query.from : input.from,
551
522
  to: typeof query.to === "string" ? query.to : input.to,
552
523
  limit: typeof query.limit === "number" ? query.limit : input.limit,
524
+ filters: query.filters ?? input.filters,
553
525
  });
554
526
  }
555
527
  const appId = this.resolveLocalAppId(input.app_id, input.apphud_app_id, true);
556
- const rows = this.loadRows(appId, input.from, input.to, input.filters);
528
+ const rows = await this.loadRows(appId, input.from, input.to, input.filters);
557
529
  return {
558
530
  app_id: appId,
559
531
  apphud_app_id: appId,
560
532
  shape,
561
- source: "apphud_local_etl",
533
+ source: "apphud_storage_etl",
562
534
  status: 200,
563
535
  retrieved_at: new Date().toISOString(),
564
536
  response: input.include_raw ? rows.map((row) => row.record) : { rows_count: rows.length },
@@ -581,8 +553,8 @@ export class EtlService {
581
553
  this.localMetricValue({ app_id: appId, metric_key: "new_subscriptions", from: rangeFrom, to: rangeTo, include_raw: false }),
582
554
  this.localMetricValue({ app_id: appId, metric_key: "renewals", from: rangeFrom, to: rangeTo, include_raw: false }),
583
555
  ]);
584
- const selectedRows = this.loadRows(appId, rangeFrom, rangeTo, input.filters);
585
- const allRows = this.loadRows(appId, undefined, undefined, input.filters);
556
+ const selectedRows = await this.loadRows(appId, rangeFrom, rangeTo, input.filters);
557
+ const allRows = await this.loadRows(appId, undefined, undefined, input.filters);
586
558
  const selectedRevenueGross = typeof revenueGross.value === "number" ? revenueGross.value : 0;
587
559
  const selectedRefunds = typeof refunds.value === "number" ? refunds.value : 0;
588
560
  const selectedProceeds = Number((selectedRevenueGross - Math.abs(selectedRefunds)).toFixed(4));
@@ -591,7 +563,6 @@ export class EtlService {
591
563
  .filter((row) => ["subscription_started", "renewal", "trial_converted"].includes(this.eventType(row.record)))
592
564
  .map((row) => this.userId(row.record))
593
565
  .filter((value) => Boolean(value))).size;
594
- const refundCount = selectedRows.filter((row) => this.eventType(row.record) === "refund").length;
595
566
  const failedCharges = selectedRows.filter((row) => {
596
567
  const type = this.eventType(row.record);
597
568
  return type === "billing_issue" || type === "failed_charge";
@@ -599,14 +570,14 @@ export class EtlService {
599
570
  const newUsers = new Set(selectedRows
600
571
  .map((row) => this.userId(row.record))
601
572
  .filter((value) => Boolean(value))).size;
573
+ const trialCancellations = selectedRows.filter((row) => this.eventType(row.record) === "cancellation").length;
602
574
  const mrr = Number(selectedProceeds.toFixed(2));
603
575
  const arr = Number((mrr * 12).toFixed(2));
604
576
  const arpu = newUsers > 0 ? Number((selectedProceeds / newUsers).toFixed(2)) : null;
605
577
  const arppu = paidUsersInRange > 0 ? Number((selectedProceeds / paidUsersInRange).toFixed(2)) : null;
606
578
  const refundRate = selectedSales > 0 ? Number((Math.abs(selectedRefunds) / selectedSales).toFixed(4)) : 0;
607
- const trialCancellations = selectedRows.filter((row) => this.eventType(row.record) === "cancellation").length;
608
579
  return {
609
- source: "apphud_local_etl",
580
+ source: "apphud_storage_etl",
610
581
  app_id: appId,
611
582
  apphud_app_id: appId,
612
583
  where: {
@@ -637,8 +608,8 @@ export class EtlService {
637
608
  refunds: Math.abs(selectedRefunds),
638
609
  refund_rate: refundRate,
639
610
  failed_charges: failedCharges,
640
- arpu: arpu,
641
- arppu: arppu,
611
+ arpu,
612
+ arppu,
642
613
  prevented_refund_requests: "N/A",
643
614
  },
644
615
  events: {
@@ -659,55 +630,50 @@ export class EtlService {
659
630
  rows_in_selected_range: selectedRows.length,
660
631
  rows_total_local: allRows.length,
661
632
  },
662
- warnings: [
663
- "Local dashboard metrics are computed from ingested ETL rows and may differ from Apphud dashboard exact formulas.",
664
- ],
633
+ warnings: ["Dashboard metrics are computed from storage exports and may differ from Apphud dashboard exact formulas."],
665
634
  retrieved_at: nowIso,
666
635
  raw_payload: input.include_raw ? selectedRows.slice(0, 1000).map((row) => row.record) : undefined,
667
636
  };
668
637
  }
669
- initializeSchema() {
670
- this.db.exec(`
671
- CREATE TABLE IF NOT EXISTS etl_app_api_keys (
672
- tenant_id TEXT NOT NULL,
673
- app_id TEXT NOT NULL,
674
- app_name TEXT NOT NULL,
675
- platform TEXT,
676
- api_key TEXT NOT NULL,
677
- key_source TEXT NOT NULL,
678
- updated_at TEXT NOT NULL,
679
- PRIMARY KEY (tenant_id, app_id)
680
- );
681
-
682
- CREATE TABLE IF NOT EXISTS etl_manifest (
683
- tenant_id TEXT NOT NULL,
684
- app_id TEXT,
685
- source_file TEXT NOT NULL,
686
- checksum TEXT NOT NULL,
687
- status TEXT NOT NULL,
688
- row_count INTEGER NOT NULL DEFAULT 0,
689
- processed_at TEXT,
690
- error_message TEXT,
691
- PRIMARY KEY (tenant_id, source_file)
692
- );
693
-
694
- CREATE TABLE IF NOT EXISTS etl_raw_rows (
695
- id INTEGER PRIMARY KEY AUTOINCREMENT,
696
- tenant_id TEXT NOT NULL,
697
- app_id TEXT,
698
- source_file TEXT NOT NULL,
699
- row_index INTEGER NOT NULL,
700
- raw_json TEXT NOT NULL,
701
- ingested_at TEXT NOT NULL
702
- );
703
-
704
- CREATE TABLE IF NOT EXISTS etl_alerts_todo (
705
- id INTEGER PRIMARY KEY AUTOINCREMENT,
706
- code TEXT NOT NULL,
707
- message TEXT NOT NULL,
708
- created_at TEXT NOT NULL
709
- );
710
- `);
638
+ async ensureStateLoaded() {
639
+ if (this.stateLoaded) {
640
+ return;
641
+ }
642
+ this.appKeys = await this.readJsonFile(this.appKeysPath(), []);
643
+ const manifestRows = await this.readJsonFile(this.manifestPath(), []);
644
+ this.manifest = new Map(manifestRows.map((row) => [row.source_file, row]));
645
+ this.alerts = await this.readJsonFile(this.alertsPath(), []);
646
+ this.stateLoaded = true;
647
+ }
648
+ async persistState() {
649
+ await this.writeJsonFile(this.appKeysPath(), this.appKeys);
650
+ await this.writeJsonFile(this.manifestPath(), Array.from(this.manifest.values()));
651
+ await this.writeJsonFile(this.alertsPath(), this.alerts.slice(-1000));
652
+ }
653
+ stateDir() {
654
+ return path.join(path.resolve(this.config.etlStorageDir), "state");
655
+ }
656
+ appKeysPath() {
657
+ return path.join(this.stateDir(), "app_api_keys.json");
658
+ }
659
+ manifestPath() {
660
+ return path.join(this.stateDir(), "manifest.json");
661
+ }
662
+ alertsPath() {
663
+ return path.join(this.stateDir(), "alerts_todo.json");
664
+ }
665
+ async readJsonFile(filePath, fallback) {
666
+ try {
667
+ const raw = await readFile(filePath, "utf8");
668
+ return JSON.parse(raw);
669
+ }
670
+ catch {
671
+ return fallback;
672
+ }
673
+ }
674
+ async writeJsonFile(filePath, payload) {
675
+ await mkdir(path.dirname(filePath), { recursive: true });
676
+ await writeFile(filePath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
711
677
  }
712
678
  metricCatalog() {
713
679
  return [
@@ -731,68 +697,57 @@ export class EtlService {
731
697
  if (!this.config.etlEnabled) {
732
698
  return;
733
699
  }
734
- const keysCount = this.db
735
- .prepare(`SELECT COUNT(*) AS total FROM etl_app_api_keys WHERE tenant_id = ?`)
736
- .get(this.config.etlTenantId);
737
- const doneCount = this.db
738
- .prepare(`SELECT COUNT(*) AS total FROM etl_manifest WHERE tenant_id = ? AND status = 'done'`)
739
- .get(this.config.etlTenantId);
740
- const needsBootstrap = keysCount.total === 0 || (requireRows && doneCount.total === 0);
741
- if (!needsBootstrap) {
742
- return;
700
+ await this.ensureStateLoaded();
701
+ const keysCount = this.appKeys.filter((key) => key.tenant_id === this.config.etlTenantId).length;
702
+ const doneCount = Array.from(this.manifest.values()).filter((entry) => entry.tenant_id === this.config.etlTenantId && entry.status === "done").length;
703
+ if (keysCount === 0 || (requireRows && doneCount === 0)) {
704
+ await this.runOnce();
743
705
  }
744
- await this.runOnce();
745
706
  }
746
707
  resolveLocalAppId(appId, apphudAppId, allowEmpty = false) {
747
708
  const explicit = appId?.trim() || apphudAppId?.trim();
748
709
  if (explicit) {
749
710
  return explicit;
750
711
  }
751
- const fromKeys = this.db
752
- .prepare(`SELECT app_id FROM etl_app_api_keys WHERE tenant_id = ? ORDER BY updated_at DESC LIMIT 1`)
753
- .get(this.config.etlTenantId);
754
- if (fromKeys?.app_id) {
755
- return fromKeys.app_id;
712
+ const latestKey = this.appKeys
713
+ .filter((key) => key.tenant_id === this.config.etlTenantId)
714
+ .sort((a, b) => b.updated_at.localeCompare(a.updated_at))[0];
715
+ if (latestKey?.app_id) {
716
+ return latestKey.app_id;
756
717
  }
757
- const fromRows = this.db
758
- .prepare(`SELECT app_id FROM etl_raw_rows WHERE tenant_id = ? AND app_id IS NOT NULL ORDER BY id DESC LIMIT 1`)
759
- .get(this.config.etlTenantId);
760
- if (fromRows?.app_id) {
761
- return fromRows.app_id;
718
+ const latestManifest = Array.from(this.manifest.values())
719
+ .filter((entry) => entry.tenant_id === this.config.etlTenantId && entry.app_id)
720
+ .sort((a, b) => (b.processed_at ?? "").localeCompare(a.processed_at ?? ""))[0];
721
+ if (latestManifest?.app_id) {
722
+ return latestManifest.app_id;
762
723
  }
763
724
  if (allowEmpty) {
764
725
  return "unknown_app";
765
726
  }
766
- throw new ApphudMcpError("APP_NOT_FOUND", "No local app data found", {
727
+ throw new ApphudMcpError("APP_NOT_FOUND", "No storage data found", {
767
728
  statusCode: 404,
768
- actionHint: "Enable etl.enabled=true and etl.remote_fetch_enabled=true for automatic historical import, or place .csv.gz files into etl.incoming_dir.",
729
+ actionHint: "Set etl.enabled=true and etl.source=gcs|s3 with bucket settings, or place .csv.gz files into etl.incoming_dir.",
769
730
  });
770
731
  }
771
- loadRows(appId, from, to, filters) {
772
- const where = ["tenant_id = ?"];
773
- const params = [this.config.etlTenantId];
774
- if (appId) {
775
- where.push("app_id = ?");
776
- params.push(appId);
777
- }
778
- if (from) {
779
- where.push("ingested_at >= ?");
780
- params.push(from);
781
- }
782
- if (to) {
783
- where.push("ingested_at <= ?");
784
- params.push(to);
785
- }
786
- const whereSql = where.join(" AND ");
787
- const count = this.db
788
- .prepare(`SELECT COUNT(*) AS total FROM etl_raw_rows WHERE ${whereSql}`)
789
- .get(...params);
790
- if (count.total > LOCAL_ROWS_GUARD_LIMIT) {
791
- throw new ApphudMcpError("INVALID_PAYLOAD", "Local dataset for selected range is too large", {
732
+ async loadRows(appId, from, to, filters) {
733
+ const fromMs = from ? new Date(from).getTime() : Number.NEGATIVE_INFINITY;
734
+ const toMs = to ? new Date(to).getTime() : Number.POSITIVE_INFINITY;
735
+ const candidates = Array.from(this.manifest.values()).filter((entry) => {
736
+ if (entry.tenant_id !== this.config.etlTenantId || entry.status !== "done") {
737
+ return false;
738
+ }
739
+ if (appId && entry.app_id !== appId) {
740
+ return false;
741
+ }
742
+ return true;
743
+ });
744
+ const totalRowsHint = candidates.reduce((sum, entry) => sum + entry.row_count, 0);
745
+ if (totalRowsHint > LOCAL_ROWS_GUARD_LIMIT) {
746
+ throw new ApphudMcpError("INVALID_PAYLOAD", "Storage dataset for selected range is too large", {
792
747
  statusCode: 400,
793
- actionHint: "Narrow date range (from/to) or add stricter filters. Large local queries should be requested in smaller windows.",
748
+ actionHint: "Narrow date range (from/to) or add stricter filters.",
794
749
  details: {
795
- rows: count.total,
750
+ rows_hint: totalRowsHint,
796
751
  limit: LOCAL_ROWS_GUARD_LIMIT,
797
752
  from,
798
753
  to,
@@ -800,31 +755,31 @@ export class EtlService {
800
755
  },
801
756
  });
802
757
  }
803
- const records = this.db
804
- .prepare(`SELECT raw_json, ingested_at, app_id FROM etl_raw_rows WHERE ${whereSql} ORDER BY id DESC`)
805
- .all(...params);
806
- return records
807
- .map((row) => {
808
- const record = this.safeParseJson(row.raw_json);
809
- const occurredAt = this.detectOccurredAt(record) ?? row.ingested_at;
810
- return { occurredAt, record };
811
- })
812
- .filter((row) => this.passesFilters(row.record, filters));
813
- }
814
- safeParseJson(raw) {
815
- try {
816
- const parsed = JSON.parse(raw);
817
- if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
818
- return parsed;
758
+ const rows = [];
759
+ for (const entry of candidates) {
760
+ try {
761
+ const content = await readFile(entry.source_file);
762
+ const unzipped = gunzipSync(content).toString("utf8");
763
+ const parsed = parseCsv(unzipped);
764
+ for (const row of parsed.rows) {
765
+ const record = rowToRecord(parsed.headers, row);
766
+ const normalized = record;
767
+ const occurredAt = this.detectOccurredAt(normalized) ?? entry.processed_at ?? new Date().toISOString();
768
+ const ts = new Date(occurredAt).getTime();
769
+ if (Number.isFinite(ts) && (ts < fromMs || ts > toMs)) {
770
+ continue;
771
+ }
772
+ if (!this.passesFilters(normalized, filters)) {
773
+ continue;
774
+ }
775
+ rows.push({ occurredAt, record: normalized });
776
+ }
777
+ }
778
+ catch {
779
+ continue;
819
780
  }
820
- return {};
821
- }
822
- catch {
823
- return {};
824
781
  }
825
- }
826
- detectOccurredAt(record) {
827
- return this.readString(record, ["occurred_at", "timestamp", "time", "date", "event_date", "created_at", "purchased_at"]);
782
+ return rows.sort((a, b) => b.occurredAt.localeCompare(a.occurredAt));
828
783
  }
829
784
  readString(record, keys) {
830
785
  for (const key of keys) {
@@ -853,6 +808,9 @@ export class EtlService {
853
808
  }
854
809
  return null;
855
810
  }
811
+ detectOccurredAt(record) {
812
+ return this.readString(record, ["occurred_at", "timestamp", "time", "date", "event_date", "created_at", "purchased_at"]);
813
+ }
856
814
  eventType(record) {
857
815
  return (this.readString(record, ["event_type", "event", "type", "name"]) ?? "unknown").toLowerCase();
858
816
  }
@@ -870,8 +828,7 @@ export class EtlService {
870
828
  const distinctUsers = (types) => {
871
829
  const set = new Set();
872
830
  for (const row of rows) {
873
- const type = this.eventType(row.record);
874
- if (!types.includes(type)) {
831
+ if (!types.includes(this.eventType(row.record))) {
875
832
  continue;
876
833
  }
877
834
  const uid = this.userId(row.record);
@@ -881,27 +838,20 @@ export class EtlService {
881
838
  }
882
839
  return set.size;
883
840
  };
884
- if (lower === "events_count") {
841
+ if (lower === "events_count")
885
842
  return rows.length;
886
- }
887
- if (lower === "trials_started") {
843
+ if (lower === "trials_started")
888
844
  return countByType(["trial_started"]);
889
- }
890
- if (lower === "trials_converted") {
845
+ if (lower === "trials_converted")
891
846
  return countByType(["trial_converted"]);
892
- }
893
- if (lower === "new_subscriptions") {
847
+ if (lower === "new_subscriptions")
894
848
  return countByType(["subscription_started"]);
895
- }
896
- if (lower === "renewals") {
849
+ if (lower === "renewals")
897
850
  return countByType(["renewal"]);
898
- }
899
- if (lower === "cancellations") {
851
+ if (lower === "cancellations")
900
852
  return countByType(["cancellation", "expiration"]);
901
- }
902
- if (lower === "active_trials") {
853
+ if (lower === "active_trials")
903
854
  return distinctUsers(["trial_started"]);
904
- }
905
855
  if (lower === "active_subs" || lower === "subscribers_retention") {
906
856
  return distinctUsers(["subscription_started", "renewal", "trial_converted"]);
907
857
  }
@@ -924,12 +874,10 @@ export class EtlService {
924
874
  }
925
875
  dimensionValue(record, dimension) {
926
876
  const normalized = dimension.toLowerCase();
927
- if (normalized === "country") {
877
+ if (normalized === "country")
928
878
  return this.readString(record, ["country", "country_code"]) ?? "unknown";
929
- }
930
- if (normalized === "platform") {
879
+ if (normalized === "platform")
931
880
  return this.readString(record, ["platform", "store", "os"]) ?? "unknown";
932
- }
933
881
  if (normalized === "product" || normalized === "product_id") {
934
882
  return this.readString(record, ["product_id", "sku", "subscription_product_id"]) ?? "unknown";
935
883
  }
@@ -948,9 +896,8 @@ export class EtlService {
948
896
  if (String(actual ?? "").toLowerCase() !== rawValue.toLowerCase()) {
949
897
  return false;
950
898
  }
951
- continue;
952
899
  }
953
- if (typeof rawValue === "number" || typeof rawValue === "boolean") {
900
+ else if (typeof rawValue === "number" || typeof rawValue === "boolean") {
954
901
  if (actual !== rawValue) {
955
902
  return false;
956
903
  }
@@ -973,162 +920,124 @@ export class EtlService {
973
920
  async syncAppApiKeys() {
974
921
  const listed = await this.apphudClient.listDashboardAppApiKeys();
975
922
  const now = new Date().toISOString();
976
- const upsert = this.db.prepare(`
977
- INSERT INTO etl_app_api_keys (tenant_id, app_id, app_name, platform, api_key, key_source, updated_at)
978
- VALUES (?, ?, ?, ?, ?, ?, ?)
979
- ON CONFLICT(tenant_id, app_id) DO UPDATE SET
980
- app_name = excluded.app_name,
981
- platform = excluded.platform,
982
- api_key = excluded.api_key,
983
- key_source = excluded.key_source,
984
- updated_at = excluded.updated_at
985
- `);
986
- const tx = this.db.transaction((apps) => {
987
- for (const app of apps) {
988
- upsert.run(this.config.etlTenantId, app.app_id, app.app_name, app.platform ?? null, app.api_key, app.source, now);
989
- }
990
- });
991
- tx(listed.apps);
992
- return listed.apps;
993
- }
994
- async fetchRemoteExports(apiKeys) {
995
- for (const app of apiKeys) {
996
- const listPath = interpolatePath(this.config.apphudEtlExportsListPath, {
923
+ const map = new Map(this.appKeys.map((item) => [`${item.tenant_id}:${item.app_id}`, item]));
924
+ for (const app of listed.apps) {
925
+ map.set(`${this.config.etlTenantId}:${app.app_id}`, {
926
+ tenant_id: this.config.etlTenantId,
997
927
  app_id: app.app_id,
928
+ app_name: app.app_name,
929
+ platform: app.platform,
930
+ api_key: app.api_key,
931
+ key_source: app.source,
932
+ updated_at: now,
998
933
  });
999
- const listUrl = normalizeToPath(this.config.apphudEtlExportsApiBaseUrl, listPath);
1000
- try {
1001
- const response = await fetch(listUrl, {
1002
- method: "GET",
1003
- headers: {
1004
- Authorization: `Bearer ${app.api_key}`,
1005
- Accept: "application/json",
1006
- },
1007
- signal: AbortSignal.timeout(20_000),
1008
- });
1009
- if (!response.ok) {
1010
- this.insertAlertTodo("ETL_EXPORT_LIST_FAILED", `app=${app.app_id} status=${response.status}`);
1011
- continue;
1012
- }
1013
- const payload = (await response.json());
1014
- const exportsList = extractExportCandidates(payload, app.app_id);
1015
- for (const item of exportsList) {
1016
- await this.downloadExport(item, app.api_key);
1017
- }
1018
- }
1019
- catch (error) {
1020
- this.insertAlertTodo("ETL_EXPORT_LIST_FAILED", `app=${app.app_id} ${error instanceof Error ? error.message : String(error)}`);
1021
- }
1022
934
  }
935
+ this.appKeys = Array.from(map.values());
1023
936
  }
1024
- async downloadExport(item, apiKey) {
1025
- const now = new Date();
1026
- const destinationDir = path.resolve(this.config.etlIncomingDir, item.appId, String(now.getUTCFullYear()), String(now.getUTCMonth() + 1).padStart(2, "0"), String(now.getUTCDate()).padStart(2, "0"));
1027
- await mkdir(destinationDir, { recursive: true });
1028
- const targetPath = path.join(destinationDir, item.fileName);
1029
- try {
1030
- const existing = await stat(targetPath).catch(() => null);
1031
- if (existing && existing.isFile()) {
1032
- return;
1033
- }
1034
- const response = await fetch(normalizeToPath(this.config.apphudEtlExportsApiBaseUrl, item.downloadUrl), {
1035
- method: "GET",
1036
- headers: {
1037
- Authorization: `Bearer ${apiKey}`,
1038
- Accept: "application/octet-stream,application/gzip,application/json",
1039
- },
1040
- signal: AbortSignal.timeout(60_000),
1041
- });
1042
- if (!response.ok) {
1043
- this.insertAlertTodo("ETL_EXPORT_DOWNLOAD_FAILED", `file=${item.fileName} status=${response.status}`);
1044
- return;
1045
- }
1046
- const contentType = response.headers.get("content-type") ?? "";
1047
- if (contentType.includes("application/json")) {
1048
- const payload = (await response.json());
1049
- const nestedUrl = (typeof payload.download_url === "string" && payload.download_url) ||
1050
- (typeof payload.url === "string" && payload.url);
1051
- if (!nestedUrl) {
1052
- this.insertAlertTodo("ETL_EXPORT_DOWNLOAD_FAILED", `file=${item.fileName} missing nested download url`);
1053
- return;
1054
- }
1055
- await this.downloadExport({
1056
- ...item,
1057
- downloadUrl: nestedUrl,
1058
- }, apiKey);
1059
- return;
1060
- }
1061
- const bytes = Buffer.from(await response.arrayBuffer());
1062
- await writeFile(targetPath, bytes);
937
+ async syncFromStorage() {
938
+ if (this.config.etlSource === "none") {
939
+ return;
1063
940
  }
1064
- catch (error) {
1065
- this.insertAlertTodo("ETL_EXPORT_DOWNLOAD_FAILED", `file=${item.fileName} ${error instanceof Error ? error.message : String(error)}`);
941
+ if (this.config.etlSource === "gcs") {
942
+ await this.syncFromGcs();
943
+ return;
944
+ }
945
+ if (this.config.etlSource === "s3") {
946
+ await this.syncFromS3();
947
+ return;
1066
948
  }
1067
949
  }
1068
- async ingestIncomingFiles() {
1069
- const incomingRoot = path.resolve(this.config.etlIncomingDir);
1070
- await mkdir(incomingRoot, { recursive: true });
1071
- const files = await listCsvGzFiles(incomingRoot);
1072
- for (const filePath of files) {
1073
- await this.ingestCsvGzFile(filePath);
950
+ async syncFromGcs() {
951
+ const bucket = this.config.etlGcsBucket;
952
+ if (!bucket) {
953
+ this.insertAlertTodo("ETL_GCS_CONFIG_MISSING", "Missing ETL_GCS_BUCKET for etl.source=gcs");
954
+ return;
955
+ }
956
+ const sourceUri = `gs://${bucket}${this.normalizeStoragePrefix(this.config.etlGcsPrefix)}`;
957
+ const destination = path.resolve(this.config.etlIncomingDir);
958
+ try {
959
+ await execFileAsync("gsutil", ["-m", "rsync", "-r", sourceUri, destination], { timeout: 10 * 60 * 1000 });
960
+ }
961
+ catch (error) {
962
+ this.insertAlertTodo("ETL_GCS_SYNC_FAILED", this.stringifyExecError(error, "gsutil"));
1074
963
  }
1075
964
  }
1076
- async ingestCsvGzFile(filePath) {
1077
- const absolutePath = path.resolve(filePath);
1078
- const tenantId = this.config.etlTenantId;
1079
- const appId = this.extractAppIdFromPath(absolutePath);
1080
- const content = await readFile(absolutePath);
1081
- const checksum = createHash("sha256").update(content).digest("hex");
1082
- const now = new Date().toISOString();
1083
- const existing = this.db
1084
- .prepare(`SELECT checksum, status FROM etl_manifest WHERE tenant_id = ? AND source_file = ?`)
1085
- .get(tenantId, absolutePath);
1086
- if (existing?.checksum === checksum && existing.status === "done") {
965
+ async syncFromS3() {
966
+ const bucket = this.config.etlS3Bucket;
967
+ if (!bucket) {
968
+ this.insertAlertTodo("ETL_S3_CONFIG_MISSING", "Missing ETL_S3_BUCKET for etl.source=s3");
1087
969
  return;
1088
970
  }
1089
- const markFailed = this.db.prepare(`
1090
- INSERT INTO etl_manifest (tenant_id, app_id, source_file, checksum, status, row_count, processed_at, error_message)
1091
- VALUES (?, ?, ?, ?, 'failed', 0, ?, ?)
1092
- ON CONFLICT(tenant_id, source_file) DO UPDATE SET
1093
- checksum = excluded.checksum,
1094
- app_id = excluded.app_id,
1095
- status = 'failed',
1096
- row_count = 0,
1097
- processed_at = excluded.processed_at,
1098
- error_message = excluded.error_message
1099
- `);
971
+ const sourceUri = `s3://${bucket}${this.normalizeStoragePrefix(this.config.etlS3Prefix)}`;
972
+ const destination = path.resolve(this.config.etlIncomingDir);
1100
973
  try {
1101
- const unzipped = gunzipSync(content).toString("utf8");
1102
- const parsed = parseCsv(unzipped);
1103
- const deleteExistingRows = this.db.prepare(`DELETE FROM etl_raw_rows WHERE tenant_id = ? AND source_file = ?`);
1104
- const insertRaw = this.db.prepare(`
1105
- INSERT INTO etl_raw_rows (tenant_id, app_id, source_file, row_index, raw_json, ingested_at)
1106
- VALUES (?, ?, ?, ?, ?, ?)
1107
- `);
1108
- const upsertManifest = this.db.prepare(`
1109
- INSERT INTO etl_manifest (tenant_id, app_id, source_file, checksum, status, row_count, processed_at, error_message)
1110
- VALUES (?, ?, ?, ?, 'done', ?, ?, NULL)
1111
- ON CONFLICT(tenant_id, source_file) DO UPDATE SET
1112
- checksum = excluded.checksum,
1113
- app_id = excluded.app_id,
1114
- status = 'done',
1115
- row_count = excluded.row_count,
1116
- processed_at = excluded.processed_at,
1117
- error_message = NULL
1118
- `);
1119
- const tx = this.db.transaction(() => {
1120
- deleteExistingRows.run(tenantId, absolutePath);
1121
- parsed.rows.forEach((row, index) => {
1122
- const mapped = rowToRecord(parsed.headers, row);
1123
- insertRaw.run(tenantId, appId, absolutePath, index + 1, JSON.stringify(mapped), now);
1124
- });
1125
- upsertManifest.run(tenantId, appId, absolutePath, checksum, parsed.rows.length, now);
974
+ await execFileAsync("aws", ["s3", "sync", sourceUri, destination, "--no-progress"], {
975
+ timeout: 10 * 60 * 1000,
1126
976
  });
1127
- tx();
1128
977
  }
1129
978
  catch (error) {
1130
- markFailed.run(tenantId, appId, absolutePath, checksum, now, error instanceof Error ? error.message : String(error));
1131
- this.insertAlertTodo("ETL_INGEST_FAILED", `file=${path.basename(absolutePath)} ${error instanceof Error ? error.message : String(error)}`);
979
+ this.insertAlertTodo("ETL_S3_SYNC_FAILED", this.stringifyExecError(error, "aws"));
980
+ }
981
+ }
982
+ normalizeStoragePrefix(prefix) {
983
+ if (!prefix || prefix.trim().length === 0) {
984
+ return "";
985
+ }
986
+ const trimmed = prefix.trim().replace(/^\/+/, "").replace(/\/+$/, "");
987
+ return trimmed.length > 0 ? `/${trimmed}` : "";
988
+ }
989
+ stringifyExecError(error, command) {
990
+ const err = error;
991
+ if (err?.code === "ENOENT") {
992
+ return `${command} command not found. Install ${command} CLI and configure credentials.`;
993
+ }
994
+ const stderr = typeof err?.stderr === "string" ? err.stderr.trim() : "";
995
+ const message = typeof err?.message === "string" ? err.message : "unknown error";
996
+ return `${command} sync failed: ${stderr || message}`.slice(0, 800);
997
+ }
998
+ async refreshManifestFromIncoming() {
999
+ const incomingRoot = path.resolve(this.config.etlIncomingDir);
1000
+ await mkdir(incomingRoot, { recursive: true });
1001
+ const files = await listCsvGzFiles(incomingRoot);
1002
+ for (const filePath of files) {
1003
+ const absolutePath = path.resolve(filePath);
1004
+ const tenantId = this.config.etlTenantId;
1005
+ const appId = this.extractAppIdFromPath(absolutePath);
1006
+ const now = new Date().toISOString();
1007
+ try {
1008
+ const content = await readFile(absolutePath);
1009
+ const checksum = createHash("sha256").update(content).digest("hex");
1010
+ const existing = this.manifest.get(absolutePath);
1011
+ if (existing?.checksum === checksum && existing.status === "done") {
1012
+ continue;
1013
+ }
1014
+ const unzipped = gunzipSync(content).toString("utf8");
1015
+ const parsed = parseCsv(unzipped);
1016
+ this.manifest.set(absolutePath, {
1017
+ tenant_id: tenantId,
1018
+ app_id: appId,
1019
+ source_file: absolutePath,
1020
+ checksum,
1021
+ status: "done",
1022
+ row_count: parsed.rows.length,
1023
+ processed_at: now,
1024
+ error_message: null,
1025
+ });
1026
+ }
1027
+ catch (error) {
1028
+ const message = error instanceof Error ? error.message : String(error);
1029
+ this.manifest.set(absolutePath, {
1030
+ tenant_id: tenantId,
1031
+ app_id: appId,
1032
+ source_file: absolutePath,
1033
+ checksum: "",
1034
+ status: "failed",
1035
+ row_count: 0,
1036
+ processed_at: now,
1037
+ error_message: message,
1038
+ });
1039
+ this.insertAlertTodo("ETL_INGEST_FAILED", `file=${path.basename(absolutePath)} ${message}`);
1040
+ }
1132
1041
  }
1133
1042
  }
1134
1043
  extractAppIdFromPath(filePath) {
@@ -1137,10 +1046,10 @@ export class EtlService {
1137
1046
  return parts.length > 0 ? parts[0] ?? null : null;
1138
1047
  }
1139
1048
  applyAlertTodos() {
1140
- const latestDone = this.db
1141
- .prepare(`SELECT MAX(processed_at) AS processed_at FROM etl_manifest WHERE tenant_id = ? AND status = 'done'`)
1142
- .get(this.config.etlTenantId);
1143
- if (!latestDone.processed_at) {
1049
+ const latestDone = Array.from(this.manifest.values())
1050
+ .filter((entry) => entry.tenant_id === this.config.etlTenantId && entry.status === "done" && entry.processed_at)
1051
+ .sort((a, b) => (b.processed_at ?? "").localeCompare(a.processed_at ?? ""))[0];
1052
+ if (!latestDone?.processed_at) {
1144
1053
  return;
1145
1054
  }
1146
1055
  const ageMs = Date.now() - new Date(latestDone.processed_at).getTime();
@@ -1150,8 +1059,14 @@ export class EtlService {
1150
1059
  }
1151
1060
  }
1152
1061
  insertAlertTodo(code, message) {
1153
- this.db
1154
- .prepare(`INSERT INTO etl_alerts_todo (code, message, created_at) VALUES (?, ?, ?)`)
1155
- .run(code, message, new Date().toISOString());
1062
+ const now = new Date().toISOString();
1063
+ const lastSame = this.alerts[this.alerts.length - 1];
1064
+ if (lastSame && lastSame.code === code && lastSame.message === message) {
1065
+ return;
1066
+ }
1067
+ this.alerts.push({ code, message, created_at: now });
1068
+ if (this.alerts.length > 1000) {
1069
+ this.alerts = this.alerts.slice(this.alerts.length - 1000);
1070
+ }
1156
1071
  }
1157
1072
  }