@gscdump/engine 0.24.1 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/compaction.mjs +247 -0
- package/dist/_chunks/engine.mjs +22 -4
- package/dist/_chunks/parquet-plan.mjs +3 -248
- package/dist/_chunks/resolver.mjs +3 -3
- package/dist/_chunks/{iceberg-schema.mjs → schema2.mjs} +9 -2
- package/dist/_chunks/sink.d.mts +11 -1
- package/dist/_chunks/source.mjs +1 -1
- package/dist/_chunks/storage.d.mts +24 -33
- package/dist/adapters/filesystem.mjs +1 -1
- package/dist/adapters/node.mjs +1 -1
- package/dist/adapters/r2-manifest.mjs +1 -1
- package/dist/compaction-public.d.mts +15 -0
- package/dist/compaction-public.mjs +5 -0
- package/dist/iceberg/index.d.mts +12 -0
- package/dist/iceberg/index.mjs +269 -0
- package/dist/index.d.mts +30 -29
- package/dist/index.mjs +5 -272
- package/dist/planner.mjs +2 -1
- package/dist/rollups.mjs +1 -1
- package/dist/sink-node.d.mts +1 -1
- package/dist/sink-node.mjs +1 -1
- package/package.json +13 -8
- package/dist/_chunks/{storage.mjs → layout.mjs} +11 -11
|
@@ -11,11 +11,6 @@ interface CompactionThresholds {
|
|
|
11
11
|
d7?: number;
|
|
12
12
|
d30?: number;
|
|
13
13
|
}
|
|
14
|
-
declare const RAW_DAILY_COMPACT_THRESHOLD = 7;
|
|
15
|
-
declare function countRawDailies(entries: ReadonlyArray<{
|
|
16
|
-
tier?: string | null;
|
|
17
|
-
partition: string;
|
|
18
|
-
}>): number;
|
|
19
14
|
declare function enumeratePartitions(startDate: string, endDate: string): string[];
|
|
20
15
|
/**
|
|
21
16
|
* Split manifest entries into the set worth reading (`kept`) and the set whose
|
|
@@ -59,12 +54,6 @@ declare function dedupeOverlappingTiers(entries: ManifestEntry[], queryRange?: {
|
|
|
59
54
|
start: string;
|
|
60
55
|
end: string;
|
|
61
56
|
}): ManifestEntry[];
|
|
62
|
-
/**
|
|
63
|
-
* Default `searchType` for entries written before the field landed and for
|
|
64
|
-
* sync paths that don't request a specific type. GSC's own default; the
|
|
65
|
-
* vast majority of stored data is web-search.
|
|
66
|
-
*/
|
|
67
|
-
declare const DEFAULT_SEARCH_TYPE: SearchType;
|
|
68
57
|
interface WriteCtx extends TenantCtx {
|
|
69
58
|
table: TableName;
|
|
70
59
|
date?: string;
|
|
@@ -152,18 +141,6 @@ interface ManifestEntry {
|
|
|
152
141
|
*/
|
|
153
142
|
searchType?: SearchType;
|
|
154
143
|
}
|
|
155
|
-
/**
|
|
156
|
-
* Resolve the search type for an entry, defaulting legacy entries to `web`.
|
|
157
|
-
* Use this anywhere code needs to bucket entries by searchType.
|
|
158
|
-
*/
|
|
159
|
-
declare function inferSearchType(entry: Pick<ManifestEntry, 'searchType'>): SearchType;
|
|
160
|
-
/**
|
|
161
|
-
* Infer the tier for an entry that pre-dates the `tier` field. Daily files
|
|
162
|
-
* are `raw`; monthly files are `d30`. Anything else (already migrated, or
|
|
163
|
-
* a partition shape we haven't seen) returns undefined and the caller must
|
|
164
|
-
* decide how to handle it.
|
|
165
|
-
*/
|
|
166
|
-
declare function inferLegacyTier(entry: Pick<ManifestEntry, 'partition' | 'tier'>): CompactionTier | undefined;
|
|
167
144
|
interface ListLiveFilter {
|
|
168
145
|
userId: string;
|
|
169
146
|
siteId?: string;
|
|
@@ -493,6 +470,29 @@ interface StorageEngine {
|
|
|
493
470
|
*/
|
|
494
471
|
runSQL: (opts: RunSQLOptions) => Promise<QueryResult>;
|
|
495
472
|
compactTiered: (ctx: WriteCtx, thresholds?: CompactionThresholds) => Promise<void>;
|
|
473
|
+
/**
|
|
474
|
+
* Write-time half of the manifest tier invariant: retire every live entry
|
|
475
|
+
* whose every covered day is already served by a finer-or-newer live entry.
|
|
476
|
+
*
|
|
477
|
+
* `compactTiered` retires the inputs it merges, but cannot retire a coarse
|
|
478
|
+
* partition that outlived the finer files it should have superseded (a D1→R2
|
|
479
|
+
* backfill writing coarse directly, a re-sync landing fresh dailies after a
|
|
480
|
+
* month already rolled up). Those stale overlaps make the query resolver
|
|
481
|
+
* union the same dates twice. Subsumption is evaluated per searchType, over
|
|
482
|
+
* the full live set (so a `web` monthly never cancels a `discover` weekly),
|
|
483
|
+
* then the subsumed set is retired via the manifest's `registerVersions([], …)`
|
|
484
|
+
* primitive — atomic, no inserts. Safe by construction: it only drops files
|
|
485
|
+
* whose days are already covered, so no data is lost.
|
|
486
|
+
*
|
|
487
|
+
* Reads and retires through the engine's own manifest store, so it is
|
|
488
|
+
* read-your-writes-consistent with the `compactTiered` that precedes it.
|
|
489
|
+
* Returns audit counters. Hosts running a cached manifest store must bust
|
|
490
|
+
* their cache afterwards — the engine has no knowledge of host-side caching.
|
|
491
|
+
*/
|
|
492
|
+
reconcileSubsumed: (ctx: WriteCtx) => Promise<{
|
|
493
|
+
retired: number;
|
|
494
|
+
partitions: string[];
|
|
495
|
+
}>;
|
|
496
496
|
gcOrphans: (ctx: GcCtx, graceMs: number) => Promise<{
|
|
497
497
|
deleted: number;
|
|
498
498
|
}>;
|
|
@@ -541,13 +541,4 @@ interface EngineOptions {
|
|
|
541
541
|
executor: QueryExecutor;
|
|
542
542
|
now?: () => number;
|
|
543
543
|
}
|
|
544
|
-
|
|
545
|
-
/**
|
|
546
|
-
* Hourly partition keyed by the PT calendar day (`YYYY-MM-DD`). One parquet
|
|
547
|
-
* per day holds 24 hourly buckets — read-merge-write keeps `(url, hour)`
|
|
548
|
-
* idempotency across retries. Names sort lexically alongside daily ones but
|
|
549
|
-
* never collide because of the `hourly/` prefix.
|
|
550
|
-
*/
|
|
551
|
-
declare function hourPartition(date: string): string;
|
|
552
|
-
declare function objectKey(ctx: TenantCtx, table: TableName, partition: string, version: number, searchType?: SearchType): string;
|
|
553
|
-
export { CodecCtx, CompactionThresholds, CompactionTier, DEFAULT_SEARCH_TYPE, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, RAW_DAILY_COMPACT_THRESHOLD, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, countRawDailies, dayPartition, dedupeOverlappingTiers, enumeratePartitions, hourPartition, inferLegacyTier, inferSearchType, objectKey, splitOverlappingTiers };
|
|
544
|
+
export { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, dedupeOverlappingTiers, enumeratePartitions, splitOverlappingTiers };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { inferLegacyTier, inferSearchType } from "../_chunks/storage.mjs";
|
|
1
|
+
import { inferLegacyTier, inferSearchType } from "../_chunks/layout.mjs";
|
|
2
2
|
import { dirname, join, resolve } from "node:path";
|
|
3
3
|
import { Buffer } from "node:buffer";
|
|
4
4
|
import { randomBytes } from "node:crypto";
|
package/dist/adapters/node.mjs
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "../_chunks/engine.mjs";
|
|
2
2
|
import { createNodeDuckDBHandle, resetNodeDuckDB } from "./duckdb-node.mjs";
|
|
3
3
|
import { createFilesystemDataSource, createFilesystemManifestStore } from "./filesystem.mjs";
|
|
4
|
+
import { encodeSiteId } from "gscdump";
|
|
4
5
|
import path from "node:path";
|
|
5
|
-
import { encodeSiteId } from "gscdump/tenant";
|
|
6
6
|
function createNodeHarness(opts) {
|
|
7
7
|
const dataDir = opts.dataDir;
|
|
8
8
|
const userId = opts.userId ?? "local";
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { inferLegacyTier, inferSearchType } from "../_chunks/storage.mjs";
|
|
1
|
+
import { inferLegacyTier, inferSearchType } from "../_chunks/layout.mjs";
|
|
2
2
|
const SHARD_RE = /^u_[^/]+\/manifest\/(?<siteId>[^/]+)\/(?<table>[^/]+)\/HEAD$/;
|
|
3
3
|
const CAS_BACKOFF_BASE_MS = 5;
|
|
4
4
|
const CAS_BACKOFF_CAP_MS = 250;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { dedupeOverlappingTiers, splitOverlappingTiers } from "./_chunks/storage.mjs";
|
|
2
|
+
/**
|
|
3
|
+
* Host-policy predicate: true once a table's live raw-daily count crosses the
|
|
4
|
+
* engine's daily→weekly compaction gate. Wraps the internal threshold so hosts
|
|
5
|
+
* decide "is compaction due?" without importing the constant or the counter.
|
|
6
|
+
*
|
|
7
|
+
* Pure — pass the entries the host already fetched (typically via its own cached
|
|
8
|
+
* manifest store, so the hot-path check stays on the host's cache rather than
|
|
9
|
+
* forcing an uncached read through the engine).
|
|
10
|
+
*/
|
|
11
|
+
declare function isRawDailyCompactionDue(entries: ReadonlyArray<{
|
|
12
|
+
tier?: string | null;
|
|
13
|
+
partition: string;
|
|
14
|
+
}>): boolean;
|
|
15
|
+
export { dedupeOverlappingTiers, isRawDailyCompactionDue, splitOverlappingTiers };
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { countRawDailies, dedupeOverlappingTiers, splitOverlappingTiers } from "./_chunks/compaction.mjs";
|
|
2
|
+
/**
 * Report whether the raw-daily count of the given manifest entries has passed
 * the compaction gate.
 *
 * @param entries Manifest entries; forwarded untouched to `countRawDailies`.
 * @returns `true` when `countRawDailies(entries)` is strictly greater than 7.
 */
function isRawDailyCompactionDue(entries) {
  // The gate is exclusive: exactly seven raw dailies is not yet "due".
  const rawDailyGate = 7;
  const rawDailies = countRawDailies(entries);
  return rawDailies > rawDailyGate;
}
|
|
5
|
+
export { dedupeOverlappingTiers, isRawDailyCompactionDue, splitOverlappingTiers };
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
|
|
2
|
+
type IcebergAppendSink = Sink;
|
|
3
|
+
/**
|
|
4
|
+
* Create an `IcebergAppendSink` over the R2 Data Catalog.
|
|
5
|
+
*
|
|
6
|
+
* `emit` buffers; `close()` commits one `icebergAppend()` per table touched.
|
|
7
|
+
* The catalog connection (REST context + signed S3 resolver) is established
|
|
8
|
+
* lazily on the first flush and reused — a sink that is opened and closed
|
|
9
|
+
* with no rows never touches the network.
|
|
10
|
+
*/
|
|
11
|
+
declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
|
|
12
|
+
export { type CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
|
|
2
|
+
import { icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "icebird";
|
|
3
|
+
/** Maps the column type tags used in `ICEBERG_SCHEMAS` onto Iceberg primitive type names. */
const ICEBERG_TYPE_MAP = {
  STRING: "string",
  INT: "int",
  LONG: "long",
  DOUBLE: "double",
  DATE: "date"
};

/**
 * Build the Iceberg struct schema object for `table`.
 *
 * Every column listed under `ICEBERG_SCHEMAS[table].columns` becomes one field
 * carrying its `fieldId`, `name`, `required` flag and the Iceberg type looked
 * up through `ICEBERG_TYPE_MAP`. The schema id is always 0.
 *
 * @param table Key into `ICEBERG_SCHEMAS`.
 * @returns `{ type: "struct", "schema-id": 0, fields }`.
 */
function icebergSchemaFor(table) {
  const { columns } = ICEBERG_SCHEMAS[table];
  const fields = columns.map((column) => {
    return {
      id: column.fieldId,
      name: column.name,
      required: column.required,
      type: ICEBERG_TYPE_MAP[column.type]
    };
  });
  return {
    "type": "struct",
    "schema-id": 0,
    "fields": fields
  };
}
|
|
22
|
+
/**
 * Build the Iceberg partition spec object (spec id 0) for `table`.
 *
 * Each entry of `ICEBERG_PARTITION_SPEC` is resolved against the table's
 * columns to find the source field id; partition field ids are assigned
 * sequentially starting at 1000.
 *
 * @param table Key into `ICEBERG_SCHEMAS`.
 * @returns `{ "spec-id": 0, fields }`.
 * @throws Error when a partition source column is not present on the table.
 */
function icebergPartitionSpecFor(table) {
  const columns = ICEBERG_SCHEMAS[table].columns;

  // Resolve a column name to its schema field id, failing loudly if absent.
  function sourceIdOf(columnName) {
    const match = columns.find((candidate) => candidate.name === columnName);
    if (match) return match.fieldId;
    throw new Error(`iceberg-catalog: table '${table}' has no '${columnName}' column`);
  }

  const partitionFields = [];
  ICEBERG_PARTITION_SPEC.forEach((spec, position) => {
    partitionFields.push({
      "source-id": sourceIdOf(spec.sourceColumn),
      "field-id": 1000 + position,
      "name": spec.name,
      "transform": spec.transform
    });
  });

  return {
    "spec-id": 0,
    "fields": partitionFields
  };
}
|
|
39
|
+
/**
 * Open a connection bundle for an Iceberg REST catalog.
 *
 * Connects to the REST catalog first (bearer-token auth), then builds a signed
 * S3 resolver using path-style addressing. The S3 region falls back to
 * `"auto"` when `config.s3.region` is null or undefined.
 *
 * @param config Catalog settings: `catalogUri`, `warehouse`, `catalogToken`,
 *   `namespace`, and an `s3` object with `accessKeyId`, `secretAccessKey`,
 *   `endpoint` and optional `region`.
 * @returns `{ catalog, resolver, namespace }`.
 */
async function connectIcebergCatalog(config) {
  const catalog = await restCatalogConnect({
    url: config.catalogUri,
    warehouse: config.warehouse,
    requestInit: { headers: { Authorization: `Bearer ${config.catalogToken}` } }
  });

  const resolver = s3SignedResolver({
    accessKeyId: config.s3.accessKeyId,
    secretAccessKey: config.s3.secretAccessKey,
    region: config.s3.region ?? "auto",
    endpoint: config.s3.endpoint,
    pathStyle: true
  });

  const namespace = config.namespace;
  return { catalog, resolver, namespace };
}
|
|
56
|
+
/**
 * Decide whether an error thrown by a catalog commit looks like rate limiting.
 *
 * An error qualifies when it is an object whose `status` is exactly 429, or
 * when its message (the `message` of an `Error`, otherwise `String(err)`)
 * contains, case-insensitively, a standalone `429`, the phrase
 * "too many commits", or the phrase "rate limit".
 *
 * @param err Anything caught from a commit attempt.
 * @returns `true` when the error should be treated as rate limiting.
 */
function isCommitRateLimited(err) {
  if (err && typeof err === "object" && err.status === 429) return true;
  const msg = (err instanceof Error ? err.message : String(err)).toLowerCase();
  // Require 429 to stand alone as a number. A bare substring test also fires on
  // ids, byte counts or durations that merely contain those digits (e.g.
  // "snapshot 84291337", "4290ms"), which would make unrelated failures look
  // retryable.
  const mentionsStatus429 = /(?<!\d)429(?!\d)/.test(msg);
  return mentionsStatus429 || msg.includes("too many commits") || msg.includes("rate limit");
}
|
|
61
|
+
/**
 * Default delay used between commit retries.
 *
 * @param ms Delay in milliseconds, handed straight to `setTimeout`.
 * @returns A promise that resolves once the timer fires.
 */
function defaultCommitSleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
64
|
+
/**
 * Run `icebergAppend(args)`, retrying when the commit is rate limited.
 *
 * Only errors accepted by `isCommitRateLimited` are retried; any other error,
 * and the error from the final attempt, is rethrown unchanged. Between
 * attempts the function sleeps for a random duration in
 * `[0, min(maxDelayMs, baseDelayMs * 2 ** attempt))`, where `attempt` is the
 * zero-based index of the attempt that just failed.
 *
 * @param args Argument object forwarded verbatim to `icebergAppend`.
 * @param options Optional tuning knobs:
 *   - `maxAttempts` (default 6) — total attempts; values below 1 or NaN are
 *     treated as 1 so the append is always tried at least once.
 *   - `baseDelayMs` (default 1000) and `maxDelayMs` (default 20000).
 *   - `sleep` (default `defaultCommitSleep`) and `random` (default
 *     `Math.random`), both injectable.
 * @returns Resolves with no value once an attempt succeeds.
 * @throws The last error from `icebergAppend` when retries are exhausted or
 *   the error is not a rate-limit error.
 */
async function icebergAppendRetrying(args, options = {}) {
  const requestedAttempts = options.maxAttempts ?? 6;
  // A non-positive or NaN budget used to skip the loop entirely and resolve
  // without ever appending. Clamp it so one attempt is always made.
  const maxAttempts = requestedAttempts >= 1 ? requestedAttempts : 1;
  const baseDelayMs = options.baseDelayMs ?? 1e3;
  const maxDelayMs = options.maxDelayMs ?? 2e4;
  const sleep = options.sleep ?? defaultCommitSleep;
  const random = options.random ?? Math.random;
  for (let attempt = 0; ; attempt++) {
    try {
      // try/catch (rather than folding the rejection into a value) keeps a
      // rejection whose reason is `undefined` from being read as success.
      await icebergAppend(args);
      return;
    } catch (err) {
      // `>=` instead of `===` so a fractional budget still ends in a throw
      // rather than falling out of the loop with the error swallowed.
      if (!isCommitRateLimited(err) || attempt + 1 >= maxAttempts) throw err;
    }
    const ceiling = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt);
    await sleep(Math.floor(random() * ceiling));
  }
}
|
|
78
|
+
/**
 * Best-effort creation of the connection's namespace.
 *
 * Any rejection from `restCatalogCreateNamespace` is discarded, so the call
 * resolves whether or not the namespace was created.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 */
async function ensureIcebergNamespace(conn) {
  const pending = restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace });
  const ignoreFailure = () => {};
  await pending.then(void 0, ignoreFailure);
}
|
|
81
|
+
/**
 * Create the given Iceberg tables one after another, collecting a per-table
 * outcome instead of failing fast.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 * @param tables Table names to create; defaults to `ICEBERG_TABLES`.
 * @returns One `{ table, ok: true }` or `{ table, ok: false, error }` record per
 *   table, in input order. `error` is the stringified rejection reason.
 */
async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
  const results = [];
  for (const table of tables) {
    // The request is built outside the try block so that a synchronous failure
    // while deriving the schema or partition spec still propagates to the caller.
    const request = icebergCreateTable({
      catalog: conn.catalog,
      namespace: conn.namespace,
      table,
      schema: icebergSchemaFor(table),
      partitionSpec: icebergPartitionSpecFor(table)
    });
    let outcome;
    try {
      await request;
      outcome = { table, ok: true };
    } catch (e) {
      outcome = { table, ok: false, error: String(e) };
    }
    results.push(outcome);
  }
  return results;
}
|
|
99
|
+
/**
 * List the table names in the connection's namespace, sorted ascending.
 *
 * A failed catalog listing yields an empty array rather than an error.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 * @returns Sorted table names, or `[]` when the listing request rejects.
 */
async function listIcebergTables(conn) {
  const listing = restCatalogListTables(conn.catalog, { namespace: conn.namespace });
  let tables;
  try {
    tables = await listing;
  } catch {
    return [];
  }
  const names = tables.map((t) => t.name);
  return names.sort();
}
|
|
102
|
+
/**
 * Enumerate every calendar month touched by a date range, inclusive.
 *
 * Only the year and month components of `range.start` and `range.end` are
 * read (the strings are split on `-`); the day component is ignored.
 *
 * @param range Object with `start` and `end` strings beginning `YYYY-MM`.
 * @returns Month keys formatted `YYYY-MM` in ascending order; empty when the
 *   end month precedes the start month.
 */
function monthsInRange(range) {
  const [startYear, startMonth] = range.start.split("-").map(Number);
  const [endYear, endMonth] = range.end.split("-").map(Number);
  const months = [];
  let year = startYear;
  let month = startMonth;
  for (;;) {
    const withinRange = year < endYear || (year === endYear && month <= endMonth);
    if (!withinRange) break;
    months.push(`${year}-${String(month).padStart(2, "0")}`);
    // Advance one month, rolling December over into January of the next year.
    const next = month + 1;
    if (next > 12) {
      month = 1;
      year += 1;
    } else {
      month = next;
    }
  }
  return months;
}
|
|
118
|
+
/**
 * Convert a `YYYY-MM` key into the number of whole months since January 1970.
 *
 * @param ym Month key; split on `-`, first part is the year, second the month.
 * @returns `(year - 1970) * 12 + (month - 1)`, so `"1970-01"` maps to 0.
 */
function monthsSinceEpoch(ym) {
  const parts = ym.split("-");
  const year = Number(parts[0]);
  const month = Number(parts[1]);
  const wholeYears = year - 1970;
  return wholeYears * 12 + (month - 1);
}
|
|
122
|
+
/**
 * Turn an `s3://bucket/key` URI into its object key.
 *
 * Paths that do not start with `s3://` are returned unchanged. When nothing
 * follows the bucket name (no `/` after the scheme), the remainder after the
 * scheme is returned as-is.
 *
 * @param filePath Absolute file path or `s3://` URI.
 * @returns The portion after the first `/` following the scheme.
 */
function stripBucket(filePath) {
  const scheme = "s3://";
  if (!filePath.startsWith(scheme)) return filePath;
  const withoutScheme = filePath.slice(scheme.length);
  const firstSlash = withoutScheme.indexOf("/");
  if (firstSlash < 0) return withoutScheme;
  return withoutScheme.slice(firstSlash + 1);
}
|
|
128
|
+
/**
 * List the data files of an Iceberg table that belong to one site, one search
 * type and a range of months.
 *
 * Loads the table metadata, returns `[]` when the table has no current
 * snapshot, then walks every manifest entry and keeps those whose partition
 * tuple matches `opts.siteId`, `opts.searchType` and one of the months covered
 * by `opts.range`.
 *
 * @param conn Connection bundle providing `catalog`, `namespace`, `resolver`.
 * @param opts `{ table, range: { start, end }, siteId, searchType }`.
 * @returns One `{ filePath, objectKey, bytes, rowCount }` record per kept file.
 */
async function listIcebergDataFiles(conn, opts) {
  const loaded = await restCatalogLoadTable(conn.catalog, {
    namespace: conn.namespace,
    table: opts.table
  });
  const metadata = loaded.metadata;
  if (metadata["current-snapshot-id"] == null) return [];

  // The `date_month` partition value is compared as months since 1970-01.
  const wantedMonths = new Set(monthsInRange(opts.range).map(monthsSinceEpoch));
  const manifests = await icebergManifests({
    metadata,
    resolver: conn.resolver
  });

  // True when a manifest entry should be reported to the caller.
  const isWanted = (entry) => {
    // NOTE(review): status 2 is presumably Iceberg's "deleted" marker — confirm against the spec.
    if (entry.status === 2) return false;
    const file = entry.data_file;
    // NOTE(review): content 0 is presumably "data" (as opposed to delete files) — confirm.
    if (file.content !== 0) return false;
    const partition = file.partition;
    if (partition.site_id !== opts.siteId) return false;
    if (partition.search_type !== opts.searchType) return false;
    const month = partition.date_month;
    return typeof month === "number" && wantedMonths.has(month);
  };

  const listed = [];
  for (const manifest of manifests) {
    for (const entry of manifest.entries) {
      if (!isWanted(entry)) continue;
      const file = entry.data_file;
      listed.push({
        filePath: file.file_path,
        objectKey: stripBucket(file.file_path),
        bytes: Number(file.file_size_in_bytes),
        rowCount: Number(file.record_count)
      });
    }
  }
  return listed;
}
|
|
158
|
+
/**
 * Drop (with purge requested) the given tables one after another, collecting a
 * per-table outcome instead of failing fast.
 *
 * When `tables` is null or undefined, every table currently listed in the
 * namespace is targeted; if that listing request rejects, nothing is dropped.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 * @param tables Optional explicit list of table names.
 * @returns One `{ table, ok: true }` or `{ table, ok: false, error }` record per
 *   targeted table. `error` is the stringified rejection reason.
 */
async function dropIcebergTables(conn, tables) {
  let targets = tables;
  if (targets === null || targets === undefined) {
    const listing = restCatalogListTables(conn.catalog, { namespace: conn.namespace });
    let listed;
    let listingFailed = false;
    try {
      listed = await listing;
    } catch {
      listingFailed = true;
    }
    targets = listingFailed ? [] : listed.map((t) => t.name);
  }

  const results = [];
  for (const table of targets) {
    const request = icebergDropTable({
      catalog: conn.catalog,
      namespace: conn.namespace,
      table,
      purgeRequested: true
    });
    let outcome;
    try {
      await request;
      outcome = { table, ok: true };
    } catch (e) {
      outcome = { table, ok: false, error: String(e) };
    }
    results.push(outcome);
  }
  return results;
}
|
|
176
|
+
/** Milliseconds in one day. */
const DAY_MILLIS = 24 * 60 * 60 * 1000;

/**
 * Convert a date value into a whole number of days since 1970-01-01 (UTC).
 *
 * - Strings are parsed as `${value}T00:00:00Z`, i.e. a bare calendar date
 *   interpreted at UTC midnight.
 * - `Date` instances use their epoch milliseconds, floored to the day.
 * - Anything else is returned unchanged.
 *
 * @param value String, `Date`, or any other value.
 * @returns Day count for strings and dates; the input itself otherwise.
 * @throws TypeError when a string does not parse or a `Date` is invalid.
 */
function toIcebergDate(value) {
  let epochMs;
  if (typeof value === "string") {
    epochMs = Date.parse(`${value}T00:00:00Z`);
    if (Number.isNaN(epochMs)) throw new TypeError(`toIcebergDate: invalid date string '${value}'`);
  } else if (value instanceof Date) {
    epochMs = value.getTime();
    if (Number.isNaN(epochMs)) throw new TypeError("toIcebergDate: invalid Date (NaN)");
  } else {
    return value;
  }
  return Math.floor(epochMs / DAY_MILLIS);
}
|
|
190
|
+
/**
 * Replace `bigint` values with plain numbers; pass everything else through.
 *
 * @param value Any cell value.
 * @returns `Number(value)` for bigints, otherwise `value` itself.
 */
function coerceJsonSafe(value) {
  return typeof value === "bigint" ? Number(value) : value;
}
|
|
194
|
+
/**
 * Shape ingested rows into the records handed to the Iceberg append.
 *
 * Each row is shallow-copied with bigint cells converted by `coerceJsonSafe`,
 * its `date` passed through `toIcebergDate`, and the identity columns
 * `site_id` and `search_type` stamped from the slice (overwriting any value
 * the row already carried under those names).
 *
 * @param slice Provides `ctx.siteId` (empty string when null or undefined) and
 *   `searchType`.
 * @param rows Rows to convert.
 * @returns A new array of new record objects, one per input row.
 */
function toRecords(slice, rows) {
  const siteId = slice.ctx.siteId ?? "";
  const records = [];
  for (const row of rows) {
    const record = {};
    for (const key in row) {
      record[key] = coerceJsonSafe(row[key]);
    }
    record.date = toIcebergDate(record.date);
    record.site_id = siteId;
    record.search_type = slice.searchType;
    records.push(record);
  }
  return records;
}
|
|
205
|
+
/**
 * Create an append-only sink that buffers rows per table and commits them to
 * Iceberg when closed.
 *
 * - `emit(slice, rows)` converts rows with `toRecords` and appends them to the
 *   in-memory buffer for `slice.table`; no network access happens here.
 * - `close()` connects to the catalog on first use (the connection promise is
 *   cached and reused), then issues one `icebergAppendRetrying` call per
 *   buffered table, sequentially. Buffers are cleared afterwards whether the
 *   commits succeeded or not. With nothing buffered, `close()` returns without
 *   connecting.
 *
 * @param options `{ catalog, commitRetry }` — `catalog` is passed to
 *   `connectIcebergCatalog`, `commitRetry` to `icebergAppendRetrying`.
 * @returns A sink exposing `capabilities`, `emit` and `close`. `close`
 *   resolves to `{ flushed, failed }`, where `flushed` lists table names and
 *   `failed` lists `{ table, error }` records.
 */
function createIcebergAppendSink(options) {
  let connection;
  const buffers = /* @__PURE__ */ new Map();

  // Lazily start the catalog connection, reusing the cached promise afterwards.
  function connect() {
    if (connection === null || connection === undefined) {
      connection = connectIcebergCatalog(options.catalog);
    }
    return connection;
  }

  async function emit(slice, rows) {
    if (rows.length === 0) return { rowCount: 0 };
    const records = toRecords(slice, rows);
    const existing = buffers.get(slice.table);
    if (!existing) {
      buffers.set(slice.table, records);
    } else {
      // Pushed one at a time rather than spread into a single push call.
      for (const record of records) existing.push(record);
    }
    return { rowCount: records.length };
  }

  async function close() {
    const flushed = [];
    const failed = [];
    if (buffers.size === 0) return { flushed, failed };

    const pending = connect();
    let conn;
    try {
      conn = await pending;
    } catch (err) {
      // Forget the failed connection so a later close() can try again.
      connection = void 0;
      conn = { error: String(err) };
    }

    if ("error" in conn) {
      // Without a connection, every non-empty buffer is reported as failed.
      for (const [table, records] of buffers) {
        if (records.length > 0) failed.push({ table, error: conn.error });
      }
      buffers.clear();
      return { flushed, failed };
    }

    for (const [table, records] of buffers) {
      if (records.length === 0) continue;
      const commit = icebergAppendRetrying({
        catalog: conn.catalog,
        namespace: conn.namespace,
        table,
        resolver: conn.resolver,
        records
      }, options.commitRetry);
      try {
        await commit;
      } catch (err) {
        failed.push({ table, error: String(err) });
        continue;
      }
      flushed.push(table);
    }

    buffers.clear();
    return { flushed, failed };
  }

  return {
    capabilities: { appendOnly: true },
    emit,
    close
  };
}
|
|
269
|
+
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
package/dist/index.d.mts
CHANGED
|
@@ -1,36 +1,20 @@
|
|
|
1
|
-
import { CodecCtx, CompactionThresholds, CompactionTier,
|
|
1
|
+
import { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, Row, RunSQLOptions, SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, TableName, TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, enumeratePartitions } from "./_chunks/storage.mjs";
|
|
2
2
|
import { DuckDBFactory, DuckDBHandle, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor } from "./_chunks/duckdb.mjs";
|
|
3
3
|
import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dates, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, page_queries, pages, queries } from "./_chunks/schema.mjs";
|
|
4
4
|
import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
|
|
5
|
-
import {
|
|
5
|
+
import { IcebergTableName, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult } from "./_chunks/sink.mjs";
|
|
6
6
|
import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
|
|
7
7
|
import { FILES_PLACEHOLDER, ResolvedQuery, resolveParquetSQL, substituteNamedFiles } from "./_chunks/planner.mjs";
|
|
8
8
|
import { createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "./_chunks/pg-adapter.mjs";
|
|
9
9
|
import { rebuildDailyFromHourly } from "./rollups.mjs";
|
|
10
10
|
import { ENGINE_QUERY_CAPABILITIES, createSqlQuerySource } from "./_chunks/index.mjs";
|
|
11
11
|
import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
|
|
12
|
-
import { Grain as Grain$1, Row as Row$1, TableName as TableName$1 } from "@gscdump/contracts";
|
|
12
|
+
import { Grain as Grain$1, Row as Row$1, TableName as TableName$1, TenantCtx as TenantCtx$1 } from "@gscdump/contracts";
|
|
13
|
+
import { SearchType as SearchType$1 } from "gscdump/query";
|
|
13
14
|
declare function coerceRow(row: Row$1): Row$1;
|
|
14
15
|
declare function coerceRows(rows: readonly Row$1[]): Row$1[];
|
|
15
16
|
declare const MAX_DAY_BYTES: number;
|
|
16
17
|
declare function createStorageEngine(opts: EngineOptions): StorageEngine;
|
|
17
|
-
interface GcDeps {
|
|
18
|
-
dataSource: DataSource;
|
|
19
|
-
manifestStore: ManifestStore;
|
|
20
|
-
}
|
|
21
|
-
interface GcOptions {
|
|
22
|
-
userId?: string;
|
|
23
|
-
siteId?: string;
|
|
24
|
-
/**
|
|
25
|
-
* Retention for hourly partitions (`hourly/{date}`) in milliseconds.
|
|
26
|
-
* Defaults to 90 days; entries with `createdAt < now - hourlyRetentionMs`
|
|
27
|
-
* are retired and their bytes deleted alongside ordinary orphan sweeping.
|
|
28
|
-
*/
|
|
29
|
-
hourlyRetentionMs?: number;
|
|
30
|
-
}
|
|
31
|
-
declare function gcOrphansImpl(deps: GcDeps, now: number, graceMs: number, opts?: GcOptions): Promise<{
|
|
32
|
-
deleted: number;
|
|
33
|
-
}>;
|
|
34
18
|
interface IngestAccumulatorEngine {
|
|
35
19
|
writeDay: (scope: TenantCtx & {
|
|
36
20
|
table: TableName$1;
|
|
@@ -125,16 +109,33 @@ interface CreateIngestAccumulatorOptions extends RowAccumulatorOptions {
|
|
|
125
109
|
}
|
|
126
110
|
declare function createNoopIngestAccumulator(): IngestAccumulator;
|
|
127
111
|
declare function createIngestAccumulator(opts: CreateIngestAccumulatorOptions): IngestAccumulator;
|
|
128
|
-
|
|
112
|
+
declare function dayPartition(date: string): string;
|
|
113
|
+
/**
|
|
114
|
+
* Hourly partition keyed by the PT calendar day (`YYYY-MM-DD`). One parquet
|
|
115
|
+
* per day holds 24 hourly buckets — read-merge-write keeps `(url, hour)`
|
|
116
|
+
* idempotency across retries. Names sort lexically alongside daily ones but
|
|
117
|
+
* never collide because of the `hourly/` prefix.
|
|
118
|
+
*/
|
|
119
|
+
declare function hourPartition(date: string): string;
|
|
120
|
+
/**
|
|
121
|
+
* Default `searchType` for entries written before the field landed and for
|
|
122
|
+
* sync paths that don't request a specific type. GSC's own default; the
|
|
123
|
+
* vast majority of stored data is web-search.
|
|
124
|
+
*/
|
|
125
|
+
declare const DEFAULT_SEARCH_TYPE: SearchType$1;
|
|
126
|
+
declare function objectKey(ctx: TenantCtx$1, table: TableName$1, partition: string, version: number, searchType?: SearchType$1): string;
|
|
127
|
+
/**
|
|
128
|
+
* Resolve the search type for an entry, defaulting legacy entries to `web`.
|
|
129
|
+
* Use this anywhere code needs to bucket entries by searchType.
|
|
130
|
+
*/
|
|
131
|
+
declare function inferSearchType(entry: Pick<ManifestEntry, 'searchType'>): SearchType$1;
|
|
129
132
|
/**
|
|
130
|
-
*
|
|
131
|
-
*
|
|
132
|
-
*
|
|
133
|
-
*
|
|
134
|
-
* lazily on the first flush and reused — a sink that is opened and closed
|
|
135
|
-
* with no rows never touches the network.
|
|
133
|
+
* Infer the tier for an entry that pre-dates the `tier` field. Daily files
|
|
134
|
+
* are `raw`; monthly files are `d30`. Anything else (already migrated, or
|
|
135
|
+
* a partition shape we haven't seen) returns undefined and the caller must
|
|
136
|
+
* decide how to handle it.
|
|
136
137
|
*/
|
|
137
|
-
declare function
|
|
138
|
+
declare function inferLegacyTier(entry: Pick<ManifestEntry, 'partition' | 'tier'>): CompactionTier | undefined;
|
|
138
139
|
/** A row as stored by the fake — data columns plus the injected identity columns. */
|
|
139
140
|
type StoredRow = Row & {
|
|
140
141
|
site_id: string;
|
|
@@ -181,4 +182,4 @@ declare const MIN_SYNC_IMPRESSIONS = 1;
|
|
|
181
182
|
declare const MIN_COUNTRY_IMPRESSIONS = 10;
|
|
182
183
|
declare const MAX_SITEMAP_URLS_PER_SITE = 50000;
|
|
183
184
|
declare const MAX_TRACKED_URLS_PER_SITE = 200000;
|
|
184
|
-
export { type CodecCtx, type ColumnDef, type ColumnType, type
|
|
185
|
+
export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, ENGINE_QUERY_CAPABILITIES, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, type InMemorySink, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type Sink, type SinkCapabilities, type SinkCloseResult, type SinkOptions, type SinkSlice, type SinkWriteResult, type StorageEngine, type StoredRow, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, 
createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
|