@gscdump/engine 0.27.2 → 0.28.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/THIRD-PARTY-LICENSES.md +443 -0
- package/dist/_chunks/engine.mjs +26 -12
- package/dist/_chunks/libs/chokidar.d.mts +1 -0
- package/dist/_chunks/libs/db0.d.mts +1 -0
- package/dist/_chunks/libs/denque.d.mts +1 -0
- package/dist/_chunks/libs/fzstd.mjs +545 -0
- package/dist/_chunks/libs/hyparquet-compressors.mjs +2796 -0
- package/dist/_chunks/libs/icebird.d.mts +441 -0
- package/dist/_chunks/libs/icebird.mjs +3708 -0
- package/dist/_chunks/libs/ioredis.d.mts +1 -0
- package/dist/_chunks/libs/lru-cache.d.mts +1 -0
- package/dist/_chunks/libs/unstorage.d.mts +120 -0
- package/dist/_chunks/sink.d.mts +62 -11
- package/dist/_chunks/storage.d.mts +33 -1
- package/dist/iceberg/index.d.mts +3 -2
- package/dist/iceberg/index.mjs +169 -17
- package/dist/index.d.mts +21 -2
- package/dist/index.mjs +27 -1
- package/dist/vendor/hysnappy-purejs.mjs +1 -12
- package/package.json +7 -6
- package/LICENSE +0 -21
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { };
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { };
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
type StorageValue = null | string | number | boolean | object;
|
|
2
|
+
type WatchEvent = "update" | "remove";
|
|
3
|
+
type WatchCallback = (event: WatchEvent, key: string) => any;
|
|
4
|
+
type MaybePromise<T> = T | Promise<T>;
|
|
5
|
+
type MaybeDefined<T> = T extends any ? T : any;
|
|
6
|
+
type Unwatch = () => MaybePromise<void>;
|
|
7
|
+
interface StorageMeta {
|
|
8
|
+
atime?: Date;
|
|
9
|
+
mtime?: Date;
|
|
10
|
+
ttl?: number;
|
|
11
|
+
[key: string]: StorageValue | Date | undefined;
|
|
12
|
+
}
|
|
13
|
+
type TransactionOptions = Record<string, any>;
|
|
14
|
+
type GetKeysOptions = TransactionOptions & {
|
|
15
|
+
maxDepth?: number;
|
|
16
|
+
};
|
|
17
|
+
interface DriverFlags {
|
|
18
|
+
maxDepth?: boolean;
|
|
19
|
+
ttl?: boolean;
|
|
20
|
+
}
|
|
21
|
+
interface Driver<OptionsT = any, InstanceT = any> {
|
|
22
|
+
name?: string;
|
|
23
|
+
flags?: DriverFlags;
|
|
24
|
+
options?: OptionsT;
|
|
25
|
+
getInstance?: () => InstanceT;
|
|
26
|
+
hasItem: (key: string, opts: TransactionOptions) => MaybePromise<boolean>;
|
|
27
|
+
getItem: (key: string, opts?: TransactionOptions) => MaybePromise<StorageValue>;
|
|
28
|
+
/** @experimental */
|
|
29
|
+
getItems?: (items: {
|
|
30
|
+
key: string;
|
|
31
|
+
options?: TransactionOptions;
|
|
32
|
+
}[], commonOptions?: TransactionOptions) => MaybePromise<{
|
|
33
|
+
key: string;
|
|
34
|
+
value: StorageValue;
|
|
35
|
+
}[]>;
|
|
36
|
+
/** @experimental */
|
|
37
|
+
getItemRaw?: (key: string, opts: TransactionOptions) => MaybePromise<unknown>;
|
|
38
|
+
setItem?: (key: string, value: string, opts: TransactionOptions) => MaybePromise<void>;
|
|
39
|
+
/** @experimental */
|
|
40
|
+
setItems?: (items: {
|
|
41
|
+
key: string;
|
|
42
|
+
value: string;
|
|
43
|
+
options?: TransactionOptions;
|
|
44
|
+
}[], commonOptions?: TransactionOptions) => MaybePromise<void>;
|
|
45
|
+
/** @experimental */
|
|
46
|
+
setItemRaw?: (key: string, value: any, opts: TransactionOptions) => MaybePromise<void>;
|
|
47
|
+
removeItem?: (key: string, opts: TransactionOptions) => MaybePromise<void>;
|
|
48
|
+
getMeta?: (key: string, opts: TransactionOptions) => MaybePromise<StorageMeta | null>;
|
|
49
|
+
getKeys: (base: string, opts: GetKeysOptions) => MaybePromise<string[]>;
|
|
50
|
+
clear?: (base: string, opts: TransactionOptions) => MaybePromise<void>;
|
|
51
|
+
dispose?: () => MaybePromise<void>;
|
|
52
|
+
watch?: (callback: WatchCallback) => MaybePromise<Unwatch>;
|
|
53
|
+
}
|
|
54
|
+
type StorageDefinition = {
|
|
55
|
+
items: unknown;
|
|
56
|
+
[key: string]: unknown;
|
|
57
|
+
};
|
|
58
|
+
type StorageItemMap<T> = T extends StorageDefinition ? T["items"] : T;
|
|
59
|
+
type StorageItemType<T, K> = K extends keyof StorageItemMap<T> ? StorageItemMap<T>[K] : T extends StorageDefinition ? StorageValue : T;
|
|
60
|
+
interface Storage$1<T extends StorageValue = StorageValue> {
|
|
61
|
+
hasItem<U extends Extract<T, StorageDefinition>, K extends keyof StorageItemMap<U>>(key: K, opts?: TransactionOptions): Promise<boolean>;
|
|
62
|
+
hasItem(key: string, opts?: TransactionOptions): Promise<boolean>;
|
|
63
|
+
getItem<U extends Extract<T, StorageDefinition>, K extends string & keyof StorageItemMap<U>>(key: K, ops?: TransactionOptions): Promise<StorageItemType<T, K> | null>;
|
|
64
|
+
getItem<R = StorageItemType<T, string>>(key: string, opts?: TransactionOptions): Promise<R | null>;
|
|
65
|
+
/** @experimental */
|
|
66
|
+
getItems: <U extends T>(items: (string | {
|
|
67
|
+
key: string;
|
|
68
|
+
options?: TransactionOptions;
|
|
69
|
+
})[], commonOptions?: TransactionOptions) => Promise<{
|
|
70
|
+
key: string;
|
|
71
|
+
value: U;
|
|
72
|
+
}[]>;
|
|
73
|
+
/** @experimental See https://github.com/unjs/unstorage/issues/142 */
|
|
74
|
+
getItemRaw: <T = any>(key: string, opts?: TransactionOptions) => Promise<MaybeDefined<T> | null>;
|
|
75
|
+
setItem<U extends Extract<T, StorageDefinition>, K extends keyof StorageItemMap<U>>(key: K, value: StorageItemType<T, K>, opts?: TransactionOptions): Promise<void>;
|
|
76
|
+
setItem<U extends T>(key: string, value: U, opts?: TransactionOptions): Promise<void>;
|
|
77
|
+
/** @experimental */
|
|
78
|
+
setItems: <U extends T>(items: {
|
|
79
|
+
key: string;
|
|
80
|
+
value: U;
|
|
81
|
+
options?: TransactionOptions;
|
|
82
|
+
}[], commonOptions?: TransactionOptions) => Promise<void>;
|
|
83
|
+
/** @experimental See https://github.com/unjs/unstorage/issues/142 */
|
|
84
|
+
setItemRaw: <T = any>(key: string, value: MaybeDefined<T>, opts?: TransactionOptions) => Promise<void>;
|
|
85
|
+
removeItem<U extends Extract<T, StorageDefinition>, K extends keyof StorageItemMap<U>>(key: K, opts?: (TransactionOptions & {
|
|
86
|
+
removeMeta?: boolean;
|
|
87
|
+
}) | boolean): Promise<void>;
|
|
88
|
+
removeItem(key: string, opts?: (TransactionOptions & {
|
|
89
|
+
removeMeta?: boolean;
|
|
90
|
+
}) | boolean): Promise<void>;
|
|
91
|
+
getMeta: (key: string, opts?: (TransactionOptions & {
|
|
92
|
+
nativeOnly?: boolean;
|
|
93
|
+
}) | boolean) => MaybePromise<StorageMeta>;
|
|
94
|
+
setMeta: (key: string, value: StorageMeta, opts?: TransactionOptions) => Promise<void>;
|
|
95
|
+
removeMeta: (key: string, opts?: TransactionOptions) => Promise<void>;
|
|
96
|
+
getKeys: (base?: string, opts?: GetKeysOptions) => Promise<string[]>;
|
|
97
|
+
clear: (base?: string, opts?: TransactionOptions) => Promise<void>;
|
|
98
|
+
dispose: () => Promise<void>;
|
|
99
|
+
watch: (callback: WatchCallback) => Promise<Unwatch>;
|
|
100
|
+
unwatch: () => Promise<void>;
|
|
101
|
+
mount: (base: string, driver: Driver) => Storage$1;
|
|
102
|
+
unmount: (base: string, dispose?: boolean) => Promise<void>;
|
|
103
|
+
getMount: (key?: string) => {
|
|
104
|
+
base: string;
|
|
105
|
+
driver: Driver;
|
|
106
|
+
};
|
|
107
|
+
getMounts: (base?: string, options?: {
|
|
108
|
+
parents?: boolean;
|
|
109
|
+
}) => {
|
|
110
|
+
base: string;
|
|
111
|
+
driver: Driver;
|
|
112
|
+
}[];
|
|
113
|
+
keys: Storage$1["getKeys"];
|
|
114
|
+
get: Storage$1<T>["getItem"];
|
|
115
|
+
set: Storage$1<T>["setItem"];
|
|
116
|
+
has: Storage$1<T>["hasItem"];
|
|
117
|
+
del: Storage$1<T>["removeItem"];
|
|
118
|
+
remove: Storage$1<T>["removeItem"];
|
|
119
|
+
}
|
|
120
|
+
export { Storage$1 as Storage };
|
package/dist/_chunks/sink.d.mts
CHANGED
|
@@ -1,8 +1,20 @@
|
|
|
1
|
-
import { Row as Row$1, SearchType, TenantCtx as TenantCtx$1 } from "./storage.mjs";
|
|
1
|
+
import { QueryProfiler, Row as Row$1, SearchType, TenantCtx as TenantCtx$1 } from "./storage.mjs";
|
|
2
2
|
import { EngineError } from "./errors.mjs";
|
|
3
|
+
import { Storage } from "./libs/unstorage.mjs";
|
|
4
|
+
import { cachingResolver, icebergAppend, restCatalogConnect } from "./libs/icebird.mjs";
|
|
3
5
|
import { Result } from "gscdump/result";
|
|
4
|
-
import { icebergAppend, restCatalogConnect, s3SignedResolver } from "icebird";
|
|
5
6
|
import { TableName } from "@gscdump/contracts";
|
|
7
|
+
/** Injected catalog cache: an unstorage `Storage` plus an optional defer hook. */
|
|
8
|
+
interface CatalogCache {
|
|
9
|
+
/** unstorage storage instance — the driver is the caller's choice. */
|
|
10
|
+
storage: Storage;
|
|
11
|
+
/**
|
|
12
|
+
* Optional hook to run a cache WRITE off the response critical path, e.g.
|
|
13
|
+
* Cloudflare's `ctx.waitUntil`. When omitted the writer awaits the put
|
|
14
|
+
* inline so it is never cut off when the response returns.
|
|
15
|
+
*/
|
|
16
|
+
defer?: (write: Promise<unknown>) => void;
|
|
17
|
+
}
|
|
6
18
|
/**
|
|
7
19
|
* S3-compatible credentials for the Iceberg warehouse object store (R2 in prod,
|
|
8
20
|
* MinIO in the POC). The single definition shared by every catalog/writer/sink
|
|
@@ -150,8 +162,8 @@ interface IcebergCatalogConfig {
|
|
|
150
162
|
interface IcebergConnection {
|
|
151
163
|
/** icebird REST catalog context, passed as `{ catalog }` to icebird write fns. */
|
|
152
164
|
catalog: Awaited<ReturnType<typeof restCatalogConnect>>;
|
|
153
|
-
/** icebird S3 resolver, passed as `{ resolver }` to icebird
|
|
154
|
-
resolver: ReturnType<typeof
|
|
165
|
+
/** icebird S3 resolver (caching-wrapped), passed as `{ resolver }` to icebird fns. */
|
|
166
|
+
resolver: ReturnType<typeof cachingResolver>;
|
|
155
167
|
/** The namespace the fact tables live under. */
|
|
156
168
|
namespace: string;
|
|
157
169
|
}
|
|
@@ -168,12 +180,30 @@ declare function icebergSchemaFor(table: IcebergTableName): IcebergSchema;
|
|
|
168
180
|
* {@link icebergSchemaFor}.
|
|
169
181
|
*/
|
|
170
182
|
declare function icebergPartitionSpecFor(table: IcebergTableName): IcebergPartitionSpec;
|
|
183
|
+
/** Options for {@link connectIcebergCatalog}. */
|
|
184
|
+
interface ConnectIcebergOptions {
|
|
185
|
+
/**
|
|
186
|
+
* Optional cross-isolate cache (any unstorage driver). When supplied, the
|
|
187
|
+
* `/v1/config` REST probe is served from cache on a warm catalog, removing
|
|
188
|
+
* one serial network hop from cold-isolate connects. The bearer token is
|
|
189
|
+
* NEVER cached — only the warehouse-static routing config (`url`, `prefix`,
|
|
190
|
+
* `defaults`, `overrides`) is; `requestInit` is rebuilt from `config`.
|
|
191
|
+
*/
|
|
192
|
+
cache?: CatalogCache;
|
|
193
|
+
/** Injectable clock for the cache TTL. Defaults to `Date.now`. */
|
|
194
|
+
clock?: () => number;
|
|
195
|
+
}
|
|
171
196
|
/**
|
|
172
197
|
* Connect to the R2 Data Catalog: a REST catalog context + a signed S3
|
|
173
198
|
* resolver. Runs in Node and in `workerd` — SigV4 is Web Crypto, I/O is
|
|
174
199
|
* `fetch`, no node builtins.
|
|
200
|
+
*
|
|
201
|
+
* With a `cache`, the `/v1/config` probe is skipped on a warm catalog and the
|
|
202
|
+
* context is rebuilt from the cached routing config plus the freshly-derived
|
|
203
|
+
* bearer `requestInit`. icebird reads only `url`/`prefix`/`requestInit` from
|
|
204
|
+
* the context downstream, so this is a faithful, secret-free reconstruction.
|
|
175
205
|
*/
|
|
176
|
-
declare function connectIcebergCatalog(config: IcebergCatalogConfig): Promise<IcebergConnection>;
|
|
206
|
+
declare function connectIcebergCatalog(config: IcebergCatalogConfig, opts?: ConnectIcebergOptions): Promise<IcebergConnection>;
|
|
177
207
|
/** Tunable retry policy for {@link icebergAppendRetrying}. */
|
|
178
208
|
interface CommitRetryOptions {
|
|
179
209
|
/** Total attempts, including the first. Default 6. */
|
|
@@ -266,14 +296,35 @@ interface ListIcebergDataFilesOptions {
|
|
|
266
296
|
start: string;
|
|
267
297
|
end: string;
|
|
268
298
|
};
|
|
299
|
+
/**
|
|
300
|
+
* Optional cross-isolate cache (any unstorage driver). When supplied, the
|
|
301
|
+
* snapshot pointer is cached short (so a warm catalog skips `loadTable`) and
|
|
302
|
+
* the resolved file list is cached long, content-addressed by snapshot id
|
|
303
|
+
* (so it skips the manifest walk). Omit it to read straight from the catalog.
|
|
304
|
+
*/
|
|
305
|
+
cache?: CatalogCache;
|
|
306
|
+
/** Injectable clock for the cache TTLs. Defaults to `Date.now`. */
|
|
307
|
+
clock?: () => number;
|
|
308
|
+
/**
|
|
309
|
+
* Optional read-path profiler. Emits `iceberg.snapshot` (snapshot-pointer
|
|
310
|
+
* load), `iceberg.cache` (resolved-files lookup + hit/miss), and
|
|
311
|
+
* `iceberg.walk` (manifest fetch + entry scan, with manifest/file counts) —
|
|
312
|
+
* the catalog cold-start breakdown a hosted reader wants in `Server-Timing`.
|
|
313
|
+
*/
|
|
314
|
+
profiler?: QueryProfiler;
|
|
269
315
|
}
|
|
270
316
|
/**
|
|
271
|
-
* List the parquet data files in the current snapshot of `table`, filtered
|
|
272
|
-
*
|
|
317
|
+
* List the parquet data files in the current snapshot of `table`, filtered to a
|
|
318
|
+
* single partition slice `(siteId, searchType, month(date) ∈ range)`.
|
|
273
319
|
*
|
|
274
|
-
*
|
|
275
|
-
*
|
|
276
|
-
*
|
|
320
|
+
* The shared `gsc.<table>` tables are multi-tenant, so a naive walk is O(all
|
|
321
|
+
* tenants). This prunes the manifest LIST by partition summaries before
|
|
322
|
+
* fetching any manifest's entries (see {@link buildPartitionFilter}), making
|
|
323
|
+
* the fetch count independent of tenant count, and — when an unstorage `cache`
|
|
324
|
+
* is supplied — skips the `loadTable` round-trip on a warm snapshot pointer and
|
|
325
|
+
* the manifest walk entirely on a resolved-files hit. The final entry-level
|
|
326
|
+
* partition filter is the authoritative correctness check; pruning only avoids
|
|
327
|
+
* reading manifests that cannot match.
|
|
277
328
|
*
|
|
278
329
|
* Skips deleted entries (status=2) and non-data file types (delete files).
|
|
279
330
|
* Returns object keys + bytes + rowCount so the caller can build presigned
|
|
@@ -396,4 +447,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
|
|
|
396
447
|
/** S3-compatible warehouse location (POC: MinIO). */
|
|
397
448
|
warehouse: string;
|
|
398
449
|
}
|
|
399
|
-
export { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
450
|
+
export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
@@ -72,9 +72,30 @@ interface WriteCtx extends TenantCtx {
|
|
|
72
72
|
*/
|
|
73
73
|
grain?: Grain;
|
|
74
74
|
}
|
|
75
|
+
/**
|
|
76
|
+
* A closed profiling span: a named slice of read-path work with its
|
|
77
|
+
* wall-clock cost and optional dimensional meta (file counts, row counts).
|
|
78
|
+
* Emitted by an injected {@link QueryProfiler}; see `./profile.ts`.
|
|
79
|
+
*/
|
|
80
|
+
interface QuerySpan {
|
|
81
|
+
readonly name: string;
|
|
82
|
+
readonly ms: number;
|
|
83
|
+
readonly meta?: Readonly<Record<string, string | number | boolean>>;
|
|
84
|
+
}
|
|
85
|
+
/**
|
|
86
|
+
* Injected read-path profiler. `start(name, meta)` opens a span and returns an
|
|
87
|
+
* `end` thunk to call when that work finishes (merging completion-only meta).
|
|
88
|
+
* Absent by default — every emit site optional-chains it, so an unprofiled
|
|
89
|
+
* query pays nothing. Build one with `createQueryProfiler` / `collectSpans`.
|
|
90
|
+
*/
|
|
91
|
+
interface QueryProfiler {
|
|
92
|
+
readonly start: (name: string, meta?: Record<string, string | number | boolean>) => (extra?: Record<string, string | number | boolean>) => void;
|
|
93
|
+
}
|
|
75
94
|
interface QueryCtx extends TenantCtx {
|
|
76
95
|
table?: TableName;
|
|
77
96
|
signal?: AbortSignal;
|
|
97
|
+
/** Optional read-path profiler; forwarded into `runSQL` and the executor. */
|
|
98
|
+
profiler?: QueryProfiler;
|
|
78
99
|
/**
|
|
79
100
|
* Restrict the query to a single GSC search-type partition (`web`,
|
|
80
101
|
* `discover`, etc.). Undefined preserves the cross-type union for
|
|
@@ -390,6 +411,11 @@ interface QueryExecuteOptions {
|
|
|
390
411
|
bytes?: number;
|
|
391
412
|
reason: string;
|
|
392
413
|
}) => void;
|
|
414
|
+
/**
|
|
415
|
+
* Optional profiler. An instrumented executor emits `files.register` and
|
|
416
|
+
* `query.run` spans through it; an absent profiler is a no-op skip.
|
|
417
|
+
*/
|
|
418
|
+
profiler?: QueryProfiler;
|
|
393
419
|
}
|
|
394
420
|
interface QueryExecuteResult {
|
|
395
421
|
rows: Row[];
|
|
@@ -449,6 +475,12 @@ interface RunSQLOptions {
|
|
|
449
475
|
* Undefined keeps the legacy cross-type union.
|
|
450
476
|
*/
|
|
451
477
|
searchType?: SearchType;
|
|
478
|
+
/**
|
|
479
|
+
* Optional read-path profiler. `runSQL` emits `manifest.list` +
|
|
480
|
+
* `executor.execute` spans and forwards it into the executor for the
|
|
481
|
+
* finer `files.register` / `query.run` breakdown.
|
|
482
|
+
*/
|
|
483
|
+
profiler?: QueryProfiler;
|
|
452
484
|
}
|
|
453
485
|
interface StorageEngine {
|
|
454
486
|
writeDay: (ctx: WriteCtx, rows: Row[]) => Promise<void>;
|
|
@@ -541,4 +573,4 @@ interface EngineOptions {
|
|
|
541
573
|
executor: QueryExecutor;
|
|
542
574
|
now?: () => number;
|
|
543
575
|
}
|
|
544
|
-
export { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, dedupeOverlappingTiers, enumeratePartitions, splitOverlappingTiers };
|
|
576
|
+
export { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryProfiler, QueryResult, QuerySpan, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, dedupeOverlappingTiers, enumeratePartitions, splitOverlappingTiers };
|
package/dist/iceberg/index.d.mts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
|
|
1
|
+
import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
|
|
2
|
+
import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
|
|
2
3
|
type IcebergAppendSink = Sink;
|
|
3
4
|
/**
|
|
4
5
|
* Create an `IcebergAppendSink` over the R2 Data Catalog.
|
|
@@ -9,4 +10,4 @@ type IcebergAppendSink = Sink;
|
|
|
9
10
|
* with no rows never touches the network.
|
|
10
11
|
*/
|
|
11
12
|
declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
|
|
12
|
-
export { type CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
13
|
+
export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
package/dist/iceberg/index.mjs
CHANGED
|
@@ -1,7 +1,72 @@
|
|
|
1
1
|
import { engineErrors } from "../errors.mjs";
|
|
2
2
|
import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
|
|
3
|
+
import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
|
|
3
4
|
import { err, ok } from "gscdump/result";
|
|
4
|
-
|
|
5
|
+
async function cacheGet(cache, key, now) {
|
|
6
|
+
const boxed = await cache.storage.getItem(key).catch(() => null);
|
|
7
|
+
if (!boxed || typeof boxed.exp !== "number" || boxed.exp <= now) return void 0;
|
|
8
|
+
return boxed.v;
|
|
9
|
+
}
|
|
10
|
+
function cachePut(cache, key, value, ttlMs, now) {
|
|
11
|
+
const boxed = {
|
|
12
|
+
v: value,
|
|
13
|
+
exp: now + ttlMs
|
|
14
|
+
};
|
|
15
|
+
const write = cache.storage.setItem(key, boxed, { ttl: Math.ceil(ttlMs / 1e3) }).catch(() => {});
|
|
16
|
+
if (cache.defer) {
|
|
17
|
+
cache.defer(write);
|
|
18
|
+
return Promise.resolve();
|
|
19
|
+
}
|
|
20
|
+
return write;
|
|
21
|
+
}
|
|
22
|
+
const SITE_ID_FIELD_INDEX = ICEBERG_PARTITION_SPEC.findIndex((f) => f.sourceColumn === "site_id" && f.transform === "identity");
|
|
23
|
+
const SEARCH_TYPE_FIELD_INDEX = ICEBERG_PARTITION_SPEC.findIndex((f) => f.sourceColumn === "search_type" && f.transform === "identity");
|
|
24
|
+
const DATE_MONTH_FIELD_INDEX = ICEBERG_PARTITION_SPEC.findIndex((f) => f.transform === "month");
|
|
25
|
+
function toUint8(bytes) {
|
|
26
|
+
if (bytes == null) return null;
|
|
27
|
+
return bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
|
|
28
|
+
}
|
|
29
|
+
function decodeString(bytes) {
|
|
30
|
+
const u = toUint8(bytes);
|
|
31
|
+
return u == null ? null : new TextDecoder().decode(u);
|
|
32
|
+
}
|
|
33
|
+
function decodeInt(bytes) {
|
|
34
|
+
const u = toUint8(bytes);
|
|
35
|
+
if (u == null) return null;
|
|
36
|
+
return new DataView(u.buffer, u.byteOffset, u.byteLength).getInt32(0, true);
|
|
37
|
+
}
|
|
38
|
+
function buildPartitionFilter(siteId, searchType, wantedMonths) {
|
|
39
|
+
return (partitions) => {
|
|
40
|
+
const parts = partitions;
|
|
41
|
+
if (!parts || parts.length === 0) return true;
|
|
42
|
+
const siteSummary = parts[SITE_ID_FIELD_INDEX];
|
|
43
|
+
if (siteSummary && (siteSummary.lower_bound != null || siteSummary.upper_bound != null)) {
|
|
44
|
+
const lo = decodeString(siteSummary.lower_bound);
|
|
45
|
+
const hi = decodeString(siteSummary.upper_bound);
|
|
46
|
+
if (lo != null && hi != null && (siteId < lo || siteId > hi)) return false;
|
|
47
|
+
}
|
|
48
|
+
const searchTypeSummary = parts[SEARCH_TYPE_FIELD_INDEX];
|
|
49
|
+
if (searchTypeSummary && (searchTypeSummary.lower_bound != null || searchTypeSummary.upper_bound != null)) {
|
|
50
|
+
const lo = decodeString(searchTypeSummary.lower_bound);
|
|
51
|
+
const hi = decodeString(searchTypeSummary.upper_bound);
|
|
52
|
+
if (lo != null && hi != null && (searchType < lo || searchType > hi)) return false;
|
|
53
|
+
}
|
|
54
|
+
const monthSummary = parts[DATE_MONTH_FIELD_INDEX];
|
|
55
|
+
if (monthSummary && (monthSummary.lower_bound != null || monthSummary.upper_bound != null)) {
|
|
56
|
+
const lo = decodeInt(monthSummary.lower_bound);
|
|
57
|
+
const hi = decodeInt(monthSummary.upper_bound);
|
|
58
|
+
if (lo != null && hi != null) {
|
|
59
|
+
let anyInRange = false;
|
|
60
|
+
for (const wm of wantedMonths) if (wm >= lo && wm <= hi) {
|
|
61
|
+
anyInRange = true;
|
|
62
|
+
break;
|
|
63
|
+
}
|
|
64
|
+
if (!anyInRange) return false;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
return true;
|
|
68
|
+
};
|
|
69
|
+
}
|
|
5
70
|
const ICEBERG_TYPE_MAP = {
|
|
6
71
|
STRING: "string",
|
|
7
72
|
INT: "int",
|
|
@@ -38,20 +103,51 @@ function icebergPartitionSpecFor(table) {
|
|
|
38
103
|
}))
|
|
39
104
|
};
|
|
40
105
|
}
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
106
|
+
const CATALOG_CONFIG_TTL_MS = 3600 * 1e3;
|
|
107
|
+
function catalogConfigKey(config) {
|
|
108
|
+
return `gsc-catalog-cfg\0${config.catalogUri}\0${config.warehouse}`;
|
|
109
|
+
}
|
|
110
|
+
async function connectIcebergCatalog(config, opts = {}) {
|
|
111
|
+
const now = (opts.clock ?? Date.now)();
|
|
112
|
+
const requestInit = { headers: { Authorization: `Bearer ${config.catalogToken}` } };
|
|
113
|
+
let catalog;
|
|
114
|
+
if (opts.cache) {
|
|
115
|
+
const cached = await cacheGet(opts.cache, catalogConfigKey(config), now);
|
|
116
|
+
if (cached) catalog = Object.freeze({
|
|
117
|
+
type: "rest",
|
|
118
|
+
url: cached.url,
|
|
119
|
+
prefix: cached.prefix,
|
|
120
|
+
defaults: cached.defaults,
|
|
121
|
+
overrides: cached.overrides,
|
|
122
|
+
requestInit
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
if (!catalog) {
|
|
126
|
+
catalog = await restCatalogConnect({
|
|
44
127
|
url: config.catalogUri,
|
|
45
128
|
warehouse: config.warehouse,
|
|
46
|
-
requestInit
|
|
47
|
-
})
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
129
|
+
requestInit
|
|
130
|
+
});
|
|
131
|
+
if (opts.cache) {
|
|
132
|
+
const toCache = {
|
|
133
|
+
url: catalog.url,
|
|
134
|
+
prefix: catalog.prefix,
|
|
135
|
+
defaults: catalog.defaults,
|
|
136
|
+
overrides: catalog.overrides
|
|
137
|
+
};
|
|
138
|
+
await cachePut(opts.cache, catalogConfigKey(config), toCache, CATALOG_CONFIG_TTL_MS, now);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
const resolver = cachingResolver(s3SignedResolver({
|
|
142
|
+
accessKeyId: config.s3.accessKeyId,
|
|
143
|
+
secretAccessKey: config.s3.secretAccessKey,
|
|
144
|
+
region: config.s3.region ?? "auto",
|
|
145
|
+
endpoint: config.s3.endpoint,
|
|
146
|
+
pathStyle: true
|
|
147
|
+
}));
|
|
148
|
+
return {
|
|
149
|
+
catalog,
|
|
150
|
+
resolver,
|
|
55
151
|
namespace: config.namespace
|
|
56
152
|
};
|
|
57
153
|
}
|
|
@@ -100,6 +196,14 @@ async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
|
|
|
100
196
|
async function listIcebergTables(conn) {
|
|
101
197
|
return (await restCatalogListTables(conn.catalog, { namespace: conn.namespace })).map((t) => t.name).sort();
|
|
102
198
|
}
|
|
199
|
+
const SNAPSHOT_REF_TTL_MS = 3e4;
|
|
200
|
+
const RESOLVED_FILES_TTL_MS = 1440 * 60 * 1e3;
|
|
201
|
+
function snapshotRefKey(namespace, table) {
|
|
202
|
+
return `gsc-snapref\0${namespace}\0${table}`;
|
|
203
|
+
}
|
|
204
|
+
function resolvedFilesKey(namespace, table, snapshotId, siteId, searchType, wantedMonths) {
|
|
205
|
+
return `gsc-files\0${namespace}\0${table}\0${snapshotId}\0${siteId}\0${searchType}\0${[...wantedMonths].sort((a, b) => a - b).join(",")}`;
|
|
206
|
+
}
|
|
103
207
|
function monthsInRange(range) {
|
|
104
208
|
const [sy, sm] = range.start.split("-").map(Number);
|
|
105
209
|
const [ey, em] = range.end.split("-").map(Number);
|
|
@@ -126,16 +230,56 @@ function stripBucket(filePath) {
|
|
|
126
230
|
const slash = rest.indexOf("/");
|
|
127
231
|
return slash >= 0 ? rest.slice(slash + 1) : rest;
|
|
128
232
|
}
|
|
129
|
-
async function
|
|
233
|
+
/**
 * Resolve the current snapshot id of `opts.table`, consulting `opts.cache`
 * first when one is supplied.
 *
 * @param conn - Connection object; `conn.catalog` and `conn.namespace` are read.
 * @param opts - Options; `opts.table` is required, `opts.cache` is optional.
 * @param now - Timestamp forwarded to `cacheGet` / `cachePut`.
 * @returns `{ snapshotId, metadata }`. `snapshotId` is the id as a string, or
 * `null` when the table has no current snapshot. `metadata` is the table
 * metadata returned by the catalog, or `null` when the id came from the cache
 * (the catalog was not contacted in that case).
 */
async function loadSnapshotId(conn, opts, now) {
	// Some Iceberg writers emit -1 rather than null/omitted for "no current
	// snapshot"; treat it as absent instead of handing "-1" on as a real id.
	const normalize = (id) => id === "-1" ? null : id;
	if (opts.cache) {
		const cached = await cacheGet(opts.cache, snapshotRefKey(conn.namespace, opts.table), now);
		// Only `undefined` is treated as a miss, so a stored `null` ("no snapshot") still counts as a hit.
		if (cached !== void 0) return {
			snapshotId: normalize(cached),
			metadata: null
		};
	}
	const { metadata } = await restCatalogLoadTable(conn.catalog, {
		namespace: conn.namespace,
		table: opts.table
	});
	const raw = metadata["current-snapshot-id"];
	// Stringified so the id has one representation whatever type the metadata parser produced.
	const snapshotId = raw == null ? null : normalize(String(raw));
	if (opts.cache) await cachePut(opts.cache, snapshotRefKey(conn.namespace, opts.table), snapshotId, SNAPSHOT_REF_TTL_MS, now);
	return {
		snapshotId,
		metadata
	};
}
|
|
253
|
+
async function listIcebergDataFiles(conn, opts) {
|
|
254
|
+
const profiler = opts.profiler;
|
|
255
|
+
const now = (opts.clock ?? Date.now)();
|
|
135
256
|
const wantedMonths = new Set(monthsInRange(opts.range).map(monthsSinceEpoch));
|
|
257
|
+
const endSnapshot = profiler?.start("iceberg.snapshot");
|
|
258
|
+
let { snapshotId, metadata } = await loadSnapshotId(conn, opts, now);
|
|
259
|
+
endSnapshot?.({ cached: metadata == null && snapshotId != null });
|
|
260
|
+
if (snapshotId == null) return [];
|
|
261
|
+
const filesKey = resolvedFilesKey(conn.namespace, opts.table, snapshotId, opts.siteId, opts.searchType, wantedMonths);
|
|
262
|
+
if (opts.cache) {
|
|
263
|
+
const endCache = profiler?.start("iceberg.cache");
|
|
264
|
+
const cached = await cacheGet(opts.cache, filesKey, now);
|
|
265
|
+
endCache?.({ hit: cached !== void 0 });
|
|
266
|
+
if (cached !== void 0) return cached;
|
|
267
|
+
}
|
|
268
|
+
if (!metadata) {
|
|
269
|
+
const reloaded = await loadSnapshotId(conn, {
|
|
270
|
+
...opts,
|
|
271
|
+
cache: void 0
|
|
272
|
+
}, now);
|
|
273
|
+
snapshotId = reloaded.snapshotId;
|
|
274
|
+
metadata = reloaded.metadata;
|
|
275
|
+
if (snapshotId == null || !metadata) return [];
|
|
276
|
+
}
|
|
277
|
+
const endWalk = profiler?.start("iceberg.walk");
|
|
278
|
+
const partitionFilter = buildPartitionFilter(opts.siteId, opts.searchType, wantedMonths);
|
|
136
279
|
const manifests = await icebergManifests({
|
|
137
280
|
metadata,
|
|
138
|
-
resolver: conn.resolver
|
|
281
|
+
resolver: conn.resolver,
|
|
282
|
+
partitionFilter
|
|
139
283
|
});
|
|
140
284
|
const out = [];
|
|
141
285
|
for (const m of manifests) for (const entry of m.entries) {
|
|
@@ -154,6 +298,14 @@ async function listIcebergDataFiles(conn, opts) {
|
|
|
154
298
|
rowCount: Number(df.record_count)
|
|
155
299
|
});
|
|
156
300
|
}
|
|
301
|
+
endWalk?.({
|
|
302
|
+
manifests: manifests.length,
|
|
303
|
+
files: out.length
|
|
304
|
+
});
|
|
305
|
+
if (opts.cache) {
|
|
306
|
+
const freshKey = resolvedFilesKey(conn.namespace, opts.table, snapshotId, opts.siteId, opts.searchType, wantedMonths);
|
|
307
|
+
await cachePut(opts.cache, freshKey, out, RESOLVED_FILES_TTL_MS, now);
|
|
308
|
+
}
|
|
157
309
|
return out;
|
|
158
310
|
}
|
|
159
311
|
async function dropIcebergTables(conn, tables) {
|
|
@@ -266,4 +418,4 @@ function createIcebergAppendSink(options) {
|
|
|
266
418
|
}
|
|
267
419
|
};
|
|
268
420
|
}
|
|
269
|
-
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
421
|
+
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
package/dist/index.d.mts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, Row, RunSQLOptions, SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, TableName, TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, enumeratePartitions } from "./_chunks/storage.mjs";
|
|
1
|
+
import { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryProfiler, QueryResult, QuerySpan, Row, RunSQLOptions, SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, TableName, TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, enumeratePartitions } from "./_chunks/storage.mjs";
|
|
2
2
|
import { DuckDBFactory, DuckDBHandle, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor } from "./_chunks/duckdb.mjs";
|
|
3
3
|
import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dates, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, page_queries, pages, queries } from "./_chunks/schema.mjs";
|
|
4
4
|
import { EngineError, EngineErrorKind, engineErrorToException, engineErrors, formatEngineError, isEngineError } from "./_chunks/errors.mjs";
|
|
@@ -138,6 +138,25 @@ declare function inferSearchType(entry: Pick<ManifestEntry, 'searchType'>): Sear
|
|
|
138
138
|
* decide how to handle it.
|
|
139
139
|
*/
|
|
140
140
|
declare function inferLegacyTier(entry: Pick<ManifestEntry, 'partition' | 'tier'>): CompactionTier | undefined;
|
|
141
|
+
/**
 * Create a {@link QueryProfiler} that hands every finished span to `sink`.
 *
 * Calling `start(name, meta)` captures the opening timestamp and returns an
 * `end` callback. Invoking `end(extra)` emits `{ name, ms, meta }`: `ms` is the
 * difference between the two `now()` readings, and `meta` is the open-time
 * `meta` with `extra` spread on top, so facts that are only known at
 * completion (row counts, buffered-file counts) end up on the same span.
 *
 * Supply `now` to get deterministic timings in tests; when it is omitted,
 * `Date.now` is used.
 */
declare function createQueryProfiler(sink: (span: QuerySpan) => void, now?: () => number): QueryProfiler;
|
|
151
|
+
/**
 * Convenience profiler that gathers finished spans in an array. Useful for
 * tests, the CLI `query` command, or any quick "where did the time go" probe.
 *
 * Spans are appended to the returned `spans` array when they are closed, so
 * the array is ordered by completion, not by start.
 */
declare function collectSpans(now?: () => number): {
  profiler: QueryProfiler;
  spans: QuerySpan[];
};
|
|
141
160
|
/** A row as stored by the fake — data columns plus the injected identity columns. */
|
|
142
161
|
type StoredRow = Row & {
|
|
143
162
|
site_id: string;
|
|
@@ -190,4 +209,4 @@ declare const MIN_SYNC_IMPRESSIONS = 1;
|
|
|
190
209
|
declare const MIN_COUNTRY_IMPRESSIONS = 10;
|
|
191
210
|
declare const MAX_SITEMAP_URLS_PER_SITE = 50000;
|
|
192
211
|
declare const MAX_TRACKED_URLS_PER_SITE = 200000;
|
|
193
|
-
export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, ENGINE_QUERY_CAPABILITIES, EngineError, EngineErrorKind, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, type InMemorySink, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type Sink, type SinkCapabilities, type SinkCloseResult, type SinkOptions, type SinkSlice, type SinkWriteResult, type StorageEngine, type StoredRow, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, 
createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, engineErrorToException, engineErrors, enumeratePartitions, fixedPolicy, formatEngineError, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isEngineError, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes, validateEnabledSearchTypesResult };
|
|
212
|
+
export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, ENGINE_QUERY_CAPABILITIES, EngineError, EngineErrorKind, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, type InMemorySink, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryProfiler, type QueryResult, type QuerySpan, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type Sink, type SinkCapabilities, type SinkCloseResult, type SinkOptions, type SinkSlice, type SinkWriteResult, type StorageEngine, type StoredRow, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, collectSpans, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, 
createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createQueryProfiler, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, engineErrorToException, engineErrors, enumeratePartitions, fixedPolicy, formatEngineError, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isEngineError, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes, validateEnabledSearchTypesResult };
|
package/dist/index.mjs
CHANGED
|
@@ -127,6 +127,32 @@ function createIngestAccumulator(opts) {
|
|
|
127
127
|
}
|
|
128
128
|
};
|
|
129
129
|
}
|
|
130
|
+
/**
 * Create a profiler whose closed spans are passed to `sink`.
 *
 * @param sink - Called once per `end()` invocation with `{ name, ms }`, plus
 * `meta` when either open-time `meta` or completion-time `extra` was given.
 * @param now - Clock returning a number; defaults to `Date.now`.
 * @returns An object with `start(name, meta)`, which returns the `end(extra)` callback.
 */
function createQueryProfiler(sink, now = () => Date.now()) {
	const start = (name, meta) => {
		const openedAt = now();
		return (extra) => {
			const hasMeta = Boolean(meta || extra);
			// Completion-time keys win over open-time keys on conflict.
			const combined = hasMeta ? { ...meta, ...extra } : void 0;
			const ms = now() - openedAt;
			sink(hasMeta ? { name, ms, meta: combined } : { name, ms });
		};
	};
	return { start };
}
|
|
149
|
+
/**
 * Build a profiler that appends each closed span to an array.
 *
 * @param now - Optional clock forwarded to `createQueryProfiler`.
 * @returns `{ profiler, spans }`; `spans` is the live array that the profiler pushes into.
 */
function collectSpans(now) {
	const collected = [];
	const record = (span) => collected.push(span);
	return {
		profiler: createQueryProfiler(record, now),
		spans: collected
	};
}
|
|
130
156
|
const KEY_SEP = "\0";
|
|
131
157
|
function partitionKey(slice) {
|
|
132
158
|
return [
|
|
@@ -282,4 +308,4 @@ const MIN_SYNC_IMPRESSIONS = 1;
|
|
|
282
308
|
const MIN_COUNTRY_IMPRESSIONS = 10;
|
|
283
309
|
const MAX_SITEMAP_URLS_PER_SITE = 5e4;
|
|
284
310
|
const MAX_TRACKED_URLS_PER_SITE = 2e5;
|
|
285
|
-
export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, engineErrorToException, engineErrors, enumeratePartitions, fixedPolicy, formatEngineError, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isEngineError, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes, validateEnabledSearchTypesResult };
|
|
311
|
+
export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, collectSpans, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createQueryProfiler, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, engineErrorToException, engineErrors, enumeratePartitions, fixedPolicy, formatEngineError, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isEngineError, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes, validateEnabledSearchTypesResult };
|