@gscdump/engine 0.20.2 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/engine.mjs +1 -1
- package/dist/_chunks/iceberg-schema.mjs +67 -0
- package/dist/_chunks/registry.d.mts +1 -1
- package/dist/_chunks/resolver.mjs +15 -21
- package/dist/_chunks/schema.d.mts +452 -133
- package/dist/_chunks/schema.mjs +50 -24
- package/dist/_chunks/sink.d.mts +329 -0
- package/dist/_chunks/storage.d.mts +4 -4
- package/dist/adapters/duckdb-node.mjs +2 -2
- package/dist/adapters/hyparquet.mjs +5 -5
- package/dist/index.d.mts +39 -7
- package/dist/index.mjs +272 -14
- package/dist/ingest.d.mts +23 -3
- package/dist/ingest.mjs +43 -18
- package/dist/rollups.d.mts +16 -6
- package/dist/rollups.mjs +42 -35
- package/dist/schema.d.mts +2 -2
- package/dist/schema.mjs +2 -2
- package/dist/sink-node.d.mts +31 -0
- package/dist/sink-node.mjs +76 -0
- package/dist/vendor/hysnappy-purejs.d.mts +29 -0
- package/dist/vendor/hysnappy-purejs.mjs +13 -0
- package/package.json +14 -3
package/dist/rollups.mjs
CHANGED
|
@@ -41,16 +41,18 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
|
|
|
41
41
|
}
|
|
42
42
|
async function rebuildRollups(opts) {
|
|
43
43
|
const now = opts.now ?? (() => Date.now());
|
|
44
|
+
const dataEndMs = opts.dataEndDate !== void 0 ? isoDateToUtcMs(opts.dataEndDate) : null;
|
|
44
45
|
const results = [];
|
|
45
46
|
for (const def of opts.defs) {
|
|
46
47
|
const builtAt = now();
|
|
48
|
+
const windowAnchorMs = dataEndMs ?? builtAt;
|
|
47
49
|
const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
|
|
48
50
|
try {
|
|
49
51
|
const payload = await def.build({
|
|
50
52
|
engine: opts.engine,
|
|
51
53
|
ctx: opts.ctx,
|
|
52
54
|
dataSource: opts.dataSource,
|
|
53
|
-
|
|
55
|
+
windowAnchorMs,
|
|
54
56
|
...defSearchType !== void 0 ? { searchType: defSearchType } : {}
|
|
55
57
|
});
|
|
56
58
|
if (def.format === "parquet") {
|
|
@@ -115,6 +117,11 @@ async function rebuildRollups(opts) {
|
|
|
115
117
|
}
|
|
116
118
|
return results;
|
|
117
119
|
}
|
|
120
|
+
/**
 * Convert a strict `YYYY-MM-DD` calendar date into the epoch-millisecond
 * timestamp of that day's UTC midnight.
 *
 * @param {string} iso - Date in `YYYY-MM-DD` form.
 * @returns {number} Milliseconds since the Unix epoch at 00:00:00 UTC of that day.
 * @throws {Error} If `iso` is not shaped `YYYY-MM-DD`, or names a day that
 *   does not exist (e.g. `2024-02-31`, `2024-13-01`, `2024-00-10`).
 */
function isoDateToUtcMs(iso) {
  const m = /^(\d{4})-(\d{2})-(\d{2})$/.exec(iso);
  if (!m) throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got: ${iso}`);
  const year = Number(m[1]);
  const month = Number(m[2]);
  const day = Number(m[3]);
  // Start from an exact UTC midnight so only the date fields change.
  // `setUTCFullYear` is used instead of `Date.UTC` because `Date.UTC` maps
  // years 0-99 onto 1900-1999.
  const d = new Date(0);
  d.setUTCFullYear(year, month - 1, day);
  // Out-of-range months/days are silently rolled over by Date; a round-trip
  // comparison detects that and rejects the input instead of shifting it.
  if (d.getUTCFullYear() !== year || d.getUTCMonth() !== month - 1 || d.getUTCDate() !== day) {
    throw new Error(`dataEndDate must be a real calendar date (YYYY-MM-DD), got: ${iso}`);
  }
  return d.getTime();
}
|
|
118
125
|
function utcDateMinusDays(at, days) {
|
|
119
126
|
const d = new Date(at - days * MS_PER_DAY);
|
|
120
127
|
return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
@@ -262,10 +269,10 @@ const dailyTotalsRollup = {
|
|
|
262
269
|
ORDER BY date
|
|
263
270
|
`
|
|
264
271
|
});
|
|
265
|
-
const
|
|
272
|
+
const queryRows = await runWindowed({
|
|
266
273
|
engine,
|
|
267
274
|
ctx,
|
|
268
|
-
table: "
|
|
275
|
+
table: "queries",
|
|
269
276
|
...searchType !== void 0 ? { searchType } : {},
|
|
270
277
|
sqlFor: (w) => `
|
|
271
278
|
SELECT
|
|
@@ -290,14 +297,14 @@ const dailyTotalsRollup = {
|
|
|
290
297
|
cur.sum_position += Number(r.sum_position);
|
|
291
298
|
pagesByDate.set(date, cur);
|
|
292
299
|
}
|
|
293
|
-
const
|
|
294
|
-
for (const r of
|
|
300
|
+
const queryImpressionsByDate = /* @__PURE__ */ new Map();
|
|
301
|
+
for (const r of queryRows) {
|
|
295
302
|
const date = String(r.date);
|
|
296
|
-
|
|
303
|
+
queryImpressionsByDate.set(date, (queryImpressionsByDate.get(date) ?? BigInt(0)) + BigInt(r.impressions));
|
|
297
304
|
}
|
|
298
305
|
return Array.from(pagesByDate.values()).sort((a, b) => a.date < b.date ? -1 : 1).map((r) => {
|
|
299
306
|
const totalImpressions = BigInt(r.impressions);
|
|
300
|
-
const queryImpressions =
|
|
307
|
+
const queryImpressions = queryImpressionsByDate.get(String(r.date)) ?? BigInt(0);
|
|
301
308
|
const anonymized = totalImpressions === BigInt(0) ? 0 : 1 - Number(queryImpressions) / Number(totalImpressions);
|
|
302
309
|
return {
|
|
303
310
|
date: r.date,
|
|
@@ -350,13 +357,13 @@ const weeklyTotalsRollup = {
|
|
|
350
357
|
const topPages28dRollup = {
|
|
351
358
|
id: "top_pages_28d",
|
|
352
359
|
windowDays: 28,
|
|
353
|
-
async build({ engine, ctx,
|
|
354
|
-
const cutoff = utcDateMinusDays(
|
|
360
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
361
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
355
362
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
356
363
|
ctx,
|
|
357
364
|
table: "pages",
|
|
358
365
|
...searchType !== void 0 ? { searchType } : {}
|
|
359
|
-
}), cutoff, utcDateMinusDays(
|
|
366
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
360
367
|
if (partitions.length === 0) return [];
|
|
361
368
|
return (await engine.runSQL({
|
|
362
369
|
ctx,
|
|
@@ -389,13 +396,13 @@ const topPages28dRollup = {
|
|
|
389
396
|
const topCountries28dRollup = {
|
|
390
397
|
id: "top_countries_28d",
|
|
391
398
|
windowDays: 28,
|
|
392
|
-
async build({ engine, ctx,
|
|
393
|
-
const cutoff = utcDateMinusDays(
|
|
399
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
400
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
394
401
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
395
402
|
ctx,
|
|
396
403
|
table: "countries",
|
|
397
404
|
...searchType !== void 0 ? { searchType } : {}
|
|
398
|
-
}), cutoff, utcDateMinusDays(
|
|
405
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
399
406
|
if (partitions.length === 0) return [];
|
|
400
407
|
return (await engine.runSQL({
|
|
401
408
|
ctx,
|
|
@@ -428,19 +435,19 @@ const topCountries28dRollup = {
|
|
|
428
435
|
const topKeywords28dRollup = {
|
|
429
436
|
id: "top_keywords_28d",
|
|
430
437
|
windowDays: 28,
|
|
431
|
-
async build({ engine, ctx,
|
|
432
|
-
const cutoff = utcDateMinusDays(
|
|
438
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
439
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
433
440
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
434
441
|
ctx,
|
|
435
|
-
table: "
|
|
442
|
+
table: "queries",
|
|
436
443
|
...searchType !== void 0 ? { searchType } : {}
|
|
437
|
-
}), cutoff, utcDateMinusDays(
|
|
444
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
438
445
|
if (partitions.length === 0) return [];
|
|
439
446
|
return (await engine.runSQL({
|
|
440
447
|
ctx,
|
|
441
|
-
table: "
|
|
448
|
+
table: "queries",
|
|
442
449
|
fileSets: { FILES: {
|
|
443
|
-
table: "
|
|
450
|
+
table: "queries",
|
|
444
451
|
partitions
|
|
445
452
|
} },
|
|
446
453
|
...searchType !== void 0 ? { searchType } : {},
|
|
@@ -491,19 +498,19 @@ const topKeywords28dParquetRollup = {
|
|
|
491
498
|
}
|
|
492
499
|
],
|
|
493
500
|
parquetSortKey: ["clicks"],
|
|
494
|
-
async build({ engine, ctx,
|
|
495
|
-
const cutoff = utcDateMinusDays(
|
|
501
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
502
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
496
503
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
497
504
|
ctx,
|
|
498
|
-
table: "
|
|
505
|
+
table: "queries",
|
|
499
506
|
...searchType !== void 0 ? { searchType } : {}
|
|
500
|
-
}), cutoff, utcDateMinusDays(
|
|
507
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
501
508
|
if (partitions.length === 0) return [];
|
|
502
509
|
return (await engine.runSQL({
|
|
503
510
|
ctx,
|
|
504
|
-
table: "
|
|
511
|
+
table: "queries",
|
|
505
512
|
fileSets: { FILES: {
|
|
506
|
-
table: "
|
|
513
|
+
table: "queries",
|
|
507
514
|
partitions
|
|
508
515
|
} },
|
|
509
516
|
...searchType !== void 0 ? { searchType } : {},
|
|
@@ -575,7 +582,7 @@ const indexingHealthRollup = {
|
|
|
575
582
|
id: "indexing_health",
|
|
576
583
|
windowDays: 90,
|
|
577
584
|
sliceOrthogonal: true,
|
|
578
|
-
async build({ engine, ctx, dataSource,
|
|
585
|
+
async build({ engine, ctx, dataSource, windowAnchorMs }) {
|
|
579
586
|
const key = inspectionParquetKey(ctx);
|
|
580
587
|
if (!await dataSource.head?.(key)) return { days: [] };
|
|
581
588
|
const sql = `
|
|
@@ -590,7 +597,7 @@ const indexingHealthRollup = {
|
|
|
590
597
|
SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
|
|
591
598
|
SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
|
|
592
599
|
FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
|
|
593
|
-
WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(
|
|
600
|
+
WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(windowAnchorMs, 90)}'
|
|
594
601
|
GROUP BY 1
|
|
595
602
|
ORDER BY 1
|
|
596
603
|
`;
|
|
@@ -619,19 +626,19 @@ const indexPercentRollup = {
|
|
|
619
626
|
id: "index_percent",
|
|
620
627
|
windowDays: 90,
|
|
621
628
|
sliceOrthogonal: true,
|
|
622
|
-
async build({ engine, ctx, dataSource,
|
|
629
|
+
async build({ engine, ctx, dataSource, windowAnchorMs, searchType }) {
|
|
623
630
|
const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
|
|
624
631
|
if (urlsKeys.length === 0) return {
|
|
625
632
|
totalSitemapUrls: 0,
|
|
626
633
|
days: []
|
|
627
634
|
};
|
|
628
|
-
const cutoff = utcDateMinusDays(
|
|
635
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 90);
|
|
629
636
|
const factSearchType = searchType ?? "web";
|
|
630
637
|
const pagesPartitions = partitionsInRange(await engine.listPartitions({
|
|
631
638
|
ctx,
|
|
632
639
|
table: "pages",
|
|
633
640
|
searchType: factSearchType
|
|
634
|
-
}), cutoff, utcDateMinusDays(
|
|
641
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
635
642
|
const numerator = await engine.runSQL({
|
|
636
643
|
ctx,
|
|
637
644
|
table: "pages",
|
|
@@ -690,10 +697,10 @@ const sitemapHealthRollup = {
|
|
|
690
697
|
id: "sitemap_health",
|
|
691
698
|
windowDays: 90,
|
|
692
699
|
sliceOrthogonal: true,
|
|
693
|
-
async build({ dataSource, ctx,
|
|
700
|
+
async build({ dataSource, ctx, windowAnchorMs }) {
|
|
694
701
|
const index = await createSitemapStore({ dataSource }).loadIndex(ctx);
|
|
695
702
|
const records = Object.values(index.records);
|
|
696
|
-
const cutoff = utcDateMinusDays(
|
|
703
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 90);
|
|
697
704
|
const byDay = /* @__PURE__ */ new Map();
|
|
698
705
|
const feeds = [];
|
|
699
706
|
for (const r of records) {
|
|
@@ -734,10 +741,10 @@ const sitemapChanges28dRollup = {
|
|
|
734
741
|
id: "sitemap_changes_28d",
|
|
735
742
|
windowDays: 28,
|
|
736
743
|
sliceOrthogonal: true,
|
|
737
|
-
async build({ dataSource, ctx,
|
|
744
|
+
async build({ dataSource, ctx, windowAnchorMs }) {
|
|
738
745
|
const store = createSitemapStore({ dataSource });
|
|
739
|
-
const from = utcDateMinusDays(
|
|
740
|
-
const to = utcDateMinusDays(
|
|
746
|
+
const from = utcDateMinusDays(windowAnchorMs, 28);
|
|
747
|
+
const to = utcDateMinusDays(windowAnchorMs, 0);
|
|
741
748
|
const counts = /* @__PURE__ */ new Map();
|
|
742
749
|
const addedTop = [];
|
|
743
750
|
const removedTop = [];
|
package/dist/schema.d.mts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { _ as hourly_pages, a as allTables, b as
|
|
2
|
-
export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion,
|
|
1
|
+
import { _ as hourly_pages, a as allTables, b as queries, c as dimensionToColumn, d as schemaFor, f as DrizzleSchema, g as drizzleSchema, h as dates, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, s as dedupeByNaturalKey, t as ColumnDef, u as naturalKeyColumns, v as page_queries, x as search_appearance, y as pages } from "./_chunks/schema.mjs";
|
|
2
|
+
export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance };
|
package/dist/schema.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { _ as search_appearance, a as dimensionToColumn, c as schemaFor, d as
|
|
2
|
-
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion,
|
|
1
|
+
import { _ as search_appearance, a as dimensionToColumn, c as schemaFor, d as dates, f as drizzleSchema, g as queries, h as pages, i as dedupeByNaturalKey, l as TABLE_METADATA, m as page_queries, n as allTables, o as inferTable, p as hourly_pages, r as currentSchemaVersion, s as naturalKeyColumns, t as SCHEMAS, u as countries } from "./_chunks/schema.mjs";
|
|
2
|
+
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance };
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { n as LocalIcebergSinkOptions, r as Sink } from "./_chunks/sink.mjs";
|
|
2
|
+
/** S3-compatible credentials for the warehouse (POC: MinIO). */
|
|
3
|
+
interface LocalIcebergS3Config {
|
|
4
|
+
/** S3 endpoint host (POC MinIO: `localhost:9100`). */
|
|
5
|
+
endpoint: string;
|
|
6
|
+
accessKeyId: string;
|
|
7
|
+
secretAccessKey: string;
|
|
8
|
+
region?: string;
|
|
9
|
+
}
|
|
10
|
+
/** Full `LocalIcebergSink` options — extends the frozen contract options. */
|
|
11
|
+
interface LocalIcebergSinkFullOptions extends LocalIcebergSinkOptions {
|
|
12
|
+
/** S3 credentials for the warehouse. Defaults to the POC MinIO creds. */
|
|
13
|
+
s3?: LocalIcebergS3Config;
|
|
14
|
+
/** Python interpreter. Defaults to `$GSCDUMP_ICEBERG_PYTHON` then `python3`. */
|
|
15
|
+
python?: string;
|
|
16
|
+
/** Override the writer-script path. Defaults to `scripts/iceberg-writer.py`. */
|
|
17
|
+
writerScript?: string;
|
|
18
|
+
}
|
|
19
|
+
interface LocalIcebergSink extends Sink {
|
|
20
|
+
/** The catalog namespace the 5 tables live under. */
|
|
21
|
+
readonly namespace: string;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Create a `LocalIcebergSink` pointed at a local Iceberg REST catalog.
|
|
25
|
+
*
|
|
26
|
+
* Requires the POC docker stack (`poc/iceberg/docker-compose.iceberg.yml`)
|
|
27
|
+
* running and a Python env with `pyiceberg` + `pyarrow` available. Tests that
|
|
28
|
+
* use this sink must skip when the stack is unreachable.
|
|
29
|
+
*/
|
|
30
|
+
declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
|
|
31
|
+
export { type LocalIcebergS3Config, type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { i as ICEBERG_SCHEMAS } from "./_chunks/iceberg-schema.mjs";
|
|
2
|
+
import { execFile } from "node:child_process";
|
|
3
|
+
import { dirname, join } from "node:path";
|
|
4
|
+
import process from "node:process";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
/**
 * Fallback S3 settings used by `createLocalIcebergSink` when the caller does
 * not pass `options.s3`.
 *
 * Frozen because the same object is handed to every job built without an
 * explicit `s3` option; an accidental mutation by one caller must not leak
 * into later jobs.
 *
 * NOTE(review): these look like local proof-of-concept credentials only —
 * confirm they are never expected to work against a real deployment.
 */
const POC_S3 = /* @__PURE__ */ Object.freeze({
  endpoint: "localhost:9100",
  accessKeyId: "poc",
  secretAccessKey: "pocpocpoc",
  region: "us-east-1"
});
|
|
12
|
+
/**
 * Pick the path of the Python writer script.
 *
 * @param {string} [override] - Caller-supplied script path; used verbatim when truthy.
 * @returns {string} `override`, or `scripts/iceberg-writer.py` located two
 *   directories above the directory containing this module.
 */
function resolveWriterScript(override) {
  if (!override) {
    // NOTE(review): the two-levels-up hop is relative to wherever this module
    // ends up after bundling — confirm the script ships at that location.
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    return join(moduleDir, "..", "..", "scripts", "iceberg-writer.py");
  }
  return override;
}
|
|
16
|
+
/**
 * Run the writer script (`python script`) as a child process, feed it `job`
 * as JSON on stdin, and interpret its stdout as a single JSON document.
 *
 * @param {string} python - Interpreter executable to launch.
 * @param {string} script - Path of the writer script, passed as the only argument.
 * @param {unknown} job - JSON-serialisable job description written to the child's stdin.
 * @returns {Promise<any>} Resolves with the parsed stdout document.
 *   Rejects with an `Error` when the document carries an `error` field, when
 *   the process itself fails, or when stdout holds no parseable JSON; rejects
 *   with whatever `JSON.stringify` throws when `job` cannot be serialised.
 */
function runWriter(python, script, job) {
  return new Promise((resolve, reject) => {
    // Serialise before spawning: if `job` cannot be stringified (BigInt,
    // cycles) we reject without leaving a child waiting on a stdin that is
    // never closed.
    const payload = JSON.stringify(job);
    const child = execFile(python, [script], { maxBuffer: 64 * 1024 * 1024 }, (err, stdout, stderr) => {
      let parsed;
      if (stdout.trim()) {
        try {
          parsed = JSON.parse(stdout);
        } catch {
          // Deliberately ignored: unparseable output is reported below,
          // after a process-level failure has had the chance to take priority.
        }
      }
      // A structured error from the script is more specific than the exit status.
      if (parsed?.error) {
        reject(new Error(`LocalIcebergSink writer failed: ${parsed.error}`));
        return;
      }
      if (err) {
        reject(new Error(`LocalIcebergSink writer process failed (${err.message})${stderr ? `: ${stderr}` : ""}`));
        return;
      }
      if (!parsed) {
        reject(new Error(`LocalIcebergSink writer produced no parseable output: ${stdout || stderr}`));
        return;
      }
      resolve(parsed);
    });
    // If the interpreter is missing or exits before draining stdin, the write
    // fails with an 'error' event on the stream. Without a listener that event
    // is thrown as an uncaught exception; the execFile callback above already
    // reports the underlying failure, so the stream error is safe to drop.
    child.stdin?.on("error", () => {});
    child.stdin?.end(payload);
  });
}
|
|
39
|
+
/**
 * Build a sink whose `emit` hands each non-empty slice to the external writer
 * script and whose `close` reports which tables were written.
 *
 * @param {object} options - Sink options: `catalogUri`, `namespace`,
 *   `warehouse`, plus optional `s3`, `python` and `writerScript` overrides.
 * @returns {object} Sink exposing `namespace`, `capabilities`, `emit(slice, rows)`
 *   and `close()`.
 */
function createLocalIcebergSink(options) {
  // Resolve the optional overrides once, at construction time.
  const s3 = options.s3 ?? POC_S3;
  const interpreter = options.python ?? process.env.GSCDUMP_ICEBERG_PYTHON ?? "python3";
  const writerPath = resolveWriterScript(options.writerScript);
  // Tables that have had at least one successful emit.
  const writtenTables = /* @__PURE__ */ new Set();

  // Assemble the JSON job document sent to the writer for one slice.
  const describeJob = (op, slice, rows) => {
    const { table, searchType, date } = slice;
    return {
      op,
      catalogUri: options.catalogUri,
      namespace: options.namespace,
      warehouse: options.warehouse,
      s3,
      table,
      spec: ICEBERG_SCHEMAS[table],
      siteId: slice.ctx.siteId ?? "",
      searchType,
      date,
      rows
    };
  };

  return {
    namespace: options.namespace,
    capabilities: { appendOnly: true },
    async emit(slice, rows) {
      // Nothing to write: skip the child process entirely.
      if (rows.length === 0) return { rowCount: 0 };
      const job = describeJob("emit", slice, rows);
      const outcome = await runWriter(interpreter, writerPath, job);
      // Only recorded after the writer succeeded.
      writtenTables.add(slice.table);
      return { rowCount: outcome.rowCount ?? 0 };
    },
    async close() {
      const flushed = Array.from(writtenTables);
      return { flushed, failed: [] };
    }
  };
}
|
|
76
|
+
export { createLocalIcebergSink };
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure-JS drop-in replacement for `hysnappy`.
|
|
3
|
+
*
|
|
4
|
+
* `hysnappy`'s `snappyUncompressor()` eagerly compiles a WASM module
|
|
5
|
+
* (`new WebAssembly.Module(byteArray)`) — and `hyparquet-compressors`
|
|
6
|
+
* instantiates it at module top level (`SNAPPY: snappyUncompressor()`).
|
|
7
|
+
* Cloudflare's `workerd` forbids compiling WebAssembly from a runtime buffer
|
|
8
|
+
* (WASM must be a bundled module import), so any Worker bundle that imports
|
|
9
|
+
* `icebird` (→ `hyparquet-compressors`) fails to start with
|
|
10
|
+
* `CompileError: Wasm code generation disallowed by embedder`.
|
|
11
|
+
*
|
|
12
|
+
* `icebird`'s append path never actually decompresses a snappy data file — it
|
|
13
|
+
* only reads gzipped `metadata.json` and Avro manifests — so the snappy
|
|
14
|
+
* decompressor is instantiated but never invoked. This shim swaps the WASM
|
|
15
|
+
* codec for `hyparquet`'s pure-JS snappy decompressor (a vendored snappyjs),
|
|
16
|
+
* keeping the exact `hysnappy` API surface so `hyparquet-compressors` is
|
|
17
|
+
* unaware of the swap. Wired in via a build-time `hysnappy` alias.
|
|
18
|
+
*
|
|
19
|
+
* See `docs/plans/2026-05-22-icebird-ingest-writer-spike.md` (section e).
|
|
20
|
+
*/
|
|
21
|
+
/**
|
|
22
|
+
* Pure-JS stand-in for `hysnappy`'s `snappyUncompressor()`. Returns the
|
|
23
|
+
* decompressor immediately — no WASM compilation, so it is safe to call at
|
|
24
|
+
* module top level inside `workerd`.
|
|
25
|
+
*/
|
|
26
|
+
declare function snappyUncompressor(): (input: Uint8Array, outputLength: number) => Uint8Array;
|
|
27
|
+
/** Pure-JS stand-in for `hysnappy`'s `snappyUncompress(input, outputLength)`. */
|
|
28
|
+
declare function snappyUncompress(input: Uint8Array, outputLength: number): Uint8Array;
|
|
29
|
+
export { snappyUncompress, snappyUncompressor };
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { snappyUncompress as snappyUncompress$1 } from "hyparquet/src/snappy.js";
|
|
2
|
+
/**
 * Decompress a snappy buffer into a freshly allocated array.
 *
 * @param {Uint8Array} input - Snappy-compressed bytes.
 * @param {number} outputLength - Size of the buffer allocated for the result.
 * @returns {Uint8Array} The buffer the decompressor wrote into.
 */
function decode(input, outputLength) {
  const buffer = new Uint8Array(outputLength);
  // The imported decompressor fills the buffer it is handed.
  snappyUncompress$1(input, buffer);
  return buffer;
}
|
|
7
|
+
/**
 * Factory mirroring `hysnappy`'s `snappyUncompressor()`.
 *
 * @returns {(input: Uint8Array, outputLength: number) => Uint8Array} The
 *   module's `decode` function; no setup work is performed here.
 */
function snappyUncompressor() {
  const uncompress = decode;
  return uncompress;
}
|
|
10
|
+
/**
 * One-shot form mirroring `hysnappy`'s `snappyUncompress(input, outputLength)`.
 *
 * @param {Uint8Array} input - Snappy-compressed bytes.
 * @param {number} outputLength - Size of the buffer allocated for the result.
 * @returns {Uint8Array} Newly allocated buffer the decompressor wrote into.
 */
function snappyUncompress(input, outputLength) {
  const result = new Uint8Array(outputLength);
  snappyUncompress$1(input, result);
  return result;
}
|
|
13
|
+
export { snappyUncompress, snappyUncompressor };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.21.0",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -51,6 +51,11 @@
|
|
|
51
51
|
"import": "./dist/ingest.mjs",
|
|
52
52
|
"default": "./dist/ingest.mjs"
|
|
53
53
|
},
|
|
54
|
+
"./sink-node": {
|
|
55
|
+
"types": "./dist/sink-node.d.mts",
|
|
56
|
+
"import": "./dist/sink-node.mjs",
|
|
57
|
+
"default": "./dist/sink-node.mjs"
|
|
58
|
+
},
|
|
54
59
|
"./sql": {
|
|
55
60
|
"types": "./dist/sql-bind.d.mts",
|
|
56
61
|
"import": "./dist/sql-bind.mjs",
|
|
@@ -140,6 +145,11 @@
|
|
|
140
145
|
"types": "./dist/arrow-utils.d.mts",
|
|
141
146
|
"import": "./dist/arrow-utils.mjs",
|
|
142
147
|
"default": "./dist/arrow-utils.mjs"
|
|
148
|
+
},
|
|
149
|
+
"./vendor/hysnappy": {
|
|
150
|
+
"types": "./dist/vendor/hysnappy-purejs.d.mts",
|
|
151
|
+
"import": "./dist/vendor/hysnappy-purejs.mjs",
|
|
152
|
+
"default": "./dist/vendor/hysnappy-purejs.mjs"
|
|
143
153
|
}
|
|
144
154
|
},
|
|
145
155
|
"main": "./dist/index.mjs",
|
|
@@ -168,9 +178,10 @@
|
|
|
168
178
|
},
|
|
169
179
|
"dependencies": {
|
|
170
180
|
"drizzle-orm": "^0.45.2",
|
|
181
|
+
"icebird": "^0.8.5",
|
|
171
182
|
"proper-lockfile": "^4.1.2",
|
|
172
|
-
"@gscdump/contracts": "0.
|
|
173
|
-
"gscdump": "0.
|
|
183
|
+
"@gscdump/contracts": "0.21.0",
|
|
184
|
+
"gscdump": "0.21.0"
|
|
174
185
|
},
|
|
175
186
|
"devDependencies": {
|
|
176
187
|
"@duckdb/duckdb-wasm": "^1.32.0",
|