@malloydata/db-snowflake 0.0.375 → 0.0.376
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +29 -2
- package/dist/index.js.map +1 -1
- package/dist/snowflake_connection.d.ts +48 -13
- package/dist/snowflake_connection.js +146 -228
- package/dist/snowflake_connection.js.map +1 -1
- package/dist/snowflake_connection.spec.js +84 -14
- package/dist/snowflake_connection.spec.js.map +1 -1
- package/dist/snowflake_sample_strategy.spec.d.ts +1 -0
- package/dist/snowflake_sample_strategy.spec.js +25 -0
- package/dist/snowflake_sample_strategy.spec.js.map +1 -0
- package/dist/snowflake_variant_schema.d.ts +43 -0
- package/dist/snowflake_variant_schema.js +203 -0
- package/dist/snowflake_variant_schema.js.map +1 -0
- package/dist/snowflake_variant_schema.spec.d.ts +1 -0
- package/dist/snowflake_variant_schema.spec.js +150 -0
- package/dist/snowflake_variant_schema.spec.js.map +1 -0
- package/package.json +2 -2
- package/src/index.ts +34 -1
- package/src/snowflake_connection.spec.ts +88 -14
- package/src/snowflake_connection.ts +220 -262
- package/src/snowflake_sample_strategy.spec.ts +43 -0
- package/src/snowflake_variant_schema.spec.ts +188 -0
- package/src/snowflake_variant_schema.ts +301 -0
|
@@ -34,27 +34,68 @@ import type {
|
|
|
34
34
|
StructDef,
|
|
35
35
|
QueryRecord,
|
|
36
36
|
TestableConnection,
|
|
37
|
-
Dialect,
|
|
38
|
-
RecordDef,
|
|
39
|
-
AtomicFieldDef,
|
|
40
|
-
ArrayDef,
|
|
41
37
|
SQLSourceRequest,
|
|
42
38
|
} from '@malloydata/malloy';
|
|
43
|
-
import {
|
|
44
|
-
SnowflakeDialect,
|
|
45
|
-
TinyParser,
|
|
46
|
-
mkArrayDef,
|
|
47
|
-
sqlKey,
|
|
48
|
-
makeDigest,
|
|
49
|
-
} from '@malloydata/malloy';
|
|
39
|
+
import {SnowflakeDialect, sqlKey, makeDigest} from '@malloydata/malloy';
|
|
50
40
|
import {BaseConnection} from '@malloydata/malloy/connection';
|
|
51
41
|
|
|
52
42
|
import {SnowflakeExecutor} from './snowflake_executor';
|
|
43
|
+
import {
|
|
44
|
+
accumulateVariantPath,
|
|
45
|
+
buildTopLevelField,
|
|
46
|
+
createVariantSchemaState,
|
|
47
|
+
PathParser,
|
|
48
|
+
seedTopLevelShape,
|
|
49
|
+
} from './snowflake_variant_schema';
|
|
50
|
+
import type {NestedColumn} from './snowflake_variant_schema';
|
|
53
51
|
import type {ConnectionOptions} from 'snowflake-sdk';
|
|
54
52
|
import type {Options as PoolOptions} from 'generic-pool';
|
|
55
53
|
|
|
56
54
|
type namespace = {database: string; schema: string};
|
|
57
55
|
|
|
56
|
+
/**
 * Output of the INFORMATION_SCHEMA.TABLES probe.
 *
 * The probe yields `undefined` instead of a `TableSizeProbe` when it
 * didn't run (non-parseable name) or couldn't find numeric size info
 * (views, missing permissions).
 */
export interface TableSizeProbe {
  /** Size of the table in bytes, as reported by the probe. */
  bytes: number;
  /** Number of rows in the table, as reported by the probe. */
  rowCount: number;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Three-way tier that drives variant schema sampling. Extracted as a
 * pure function so cost-policy decisions are unit-testable.
 *
 * - `full-scan-then-sample`: the probe confirmed a small base table. One
 *   full scan catches rare fields. On failure, fall through to the
 *   sample chain rather than accept opaque variant.
 *
 * - `tablesample-only`: the probe confirmed a base table above the small
 *   threshold. TABLESAMPLE BLOCK is safe (reads a few micro-partitions).
 *   Plain LIMIT without a WHERE is unsafe on large partitioned tables,
 *   so we skip the LIMIT fallback — we'd rather degrade to variant than
 *   issue a runaway query.
 *
 * - `tablesample-then-limit`: the probe gave no size info (views, temp
 *   views, exotic names). We can't distinguish a small view from a view
 *   over a petabyte table, so we do best-effort sampling. This is the
 *   acknowledged "can't help you" case from the design doc.
 */
export type SampleStrategy =
  | 'full-scan-then-sample'
  | 'tablesample-only'
  | 'tablesample-then-limit';
|
|
89
|
+
|
|
90
|
+
/**
 * Picks the variant-schema sampling tier from a table-size probe result.
 *
 * @param probe Size information for the table, or `undefined` when no
 *   size is known.
 * @param fullScanMaxBytes Largest byte size (inclusive) that is still
 *   full-scanned.
 * @returns `'tablesample-then-limit'` when there is no probe,
 *   `'full-scan-then-sample'` when `probe.bytes <= fullScanMaxBytes`,
 *   and `'tablesample-only'` otherwise.
 */
export function pickSampleStrategy(
  probe: TableSizeProbe | undefined,
  fullScanMaxBytes: number
): SampleStrategy {
  // No size information at all: best-effort chain.
  if (probe === undefined) {
    return 'tablesample-then-limit';
  }
  // The threshold is inclusive, so a table exactly at the limit is still
  // considered small enough for a full scan.
  return probe.bytes <= fullScanMaxBytes
    ? 'full-scan-then-sample'
    : 'tablesample-only';
}
|
|
98
|
+
|
|
58
99
|
export interface SnowflakeConnectionOptions {
|
|
59
100
|
// snowflake sdk connection options
|
|
60
101
|
connOptions?: ConnectionOptions;
|
|
@@ -74,155 +115,18 @@ export interface SnowflakeConnectionOptions {
|
|
|
74
115
|
// Timeout for the variant schema sampling query (default 15 seconds)
|
|
75
116
|
schemaSampleTimeoutMs?: number;
|
|
76
117
|
|
|
77
|
-
//
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
type PathChain =
|
|
82
|
-
| {arrayRef: true; next?: PathChain}
|
|
83
|
-
| {name: string; next?: PathChain};
|
|
84
|
-
|
|
85
|
-
class SnowField {
|
|
86
|
-
constructor(
|
|
87
|
-
readonly name: string,
|
|
88
|
-
readonly type: string,
|
|
89
|
-
readonly dialect: Dialect
|
|
90
|
-
) {}
|
|
91
|
-
fieldDef(): AtomicFieldDef {
|
|
92
|
-
return {
|
|
93
|
-
...this.dialect.sqlTypeToMalloyType(this.type),
|
|
94
|
-
name: this.name,
|
|
95
|
-
};
|
|
96
|
-
}
|
|
97
|
-
walk(_path: PathChain, _fieldType: string): void {
|
|
98
|
-
throw new Error(
|
|
99
|
-
'SNOWWFLAKE SCHEMA PARSE ERROR: Should not walk through fields'
|
|
100
|
-
);
|
|
101
|
-
}
|
|
102
|
-
static make(name: string, fieldType: string, d: Dialect) {
|
|
103
|
-
if (fieldType === 'array') {
|
|
104
|
-
return new SnowArray(name, d);
|
|
105
|
-
} else if (fieldType === 'object') {
|
|
106
|
-
return new SnowObject(name, d);
|
|
107
|
-
}
|
|
108
|
-
return new SnowField(name, fieldType, d);
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
class SnowObject extends SnowField {
|
|
113
|
-
fieldMap = new Map<string, SnowField>();
|
|
114
|
-
constructor(name: string, d: Dialect) {
|
|
115
|
-
super(name, 'object', d);
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
get fields(): AtomicFieldDef[] {
|
|
119
|
-
const fields: AtomicFieldDef[] = [];
|
|
120
|
-
for (const [_, fieldObj] of this.fieldMap) {
|
|
121
|
-
fields.push(fieldObj.fieldDef());
|
|
122
|
-
}
|
|
123
|
-
return fields;
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
fieldDef(): RecordDef {
|
|
127
|
-
const rec: RecordDef = {
|
|
128
|
-
type: 'record',
|
|
129
|
-
name: this.name,
|
|
130
|
-
fields: this.fields,
|
|
131
|
-
join: 'one',
|
|
132
|
-
};
|
|
133
|
-
return rec;
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
walk(path: PathChain, fieldType: string) {
|
|
137
|
-
if ('name' in path) {
|
|
138
|
-
const field = this.fieldMap.get(path.name);
|
|
139
|
-
if (path.next) {
|
|
140
|
-
if (field instanceof SnowObject || field instanceof SnowArray) {
|
|
141
|
-
field.walk(path.next, fieldType);
|
|
142
|
-
return;
|
|
143
|
-
}
|
|
144
|
-
// Field is missing or is a scalar leaf — the variant data has
|
|
145
|
-
// inconsistent structure across rows. Degrade to opaque variant.
|
|
146
|
-
this.fieldMap.set(
|
|
147
|
-
path.name,
|
|
148
|
-
new SnowField(path.name, 'variant', this.dialect)
|
|
149
|
-
);
|
|
150
|
-
return;
|
|
151
|
-
} else {
|
|
152
|
-
if (!field) {
|
|
153
|
-
this.fieldMap.set(
|
|
154
|
-
path.name,
|
|
155
|
-
SnowField.make(path.name, fieldType, this.dialect)
|
|
156
|
-
);
|
|
157
|
-
return;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
return;
|
|
161
|
-
}
|
|
162
|
-
// Array reference in an object context — inconsistent structure.
|
|
163
|
-
// Ignore this path; the object keeps whatever fields it already has.
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
class SnowArray extends SnowField {
|
|
168
|
-
arrayOf = 'unknown';
|
|
169
|
-
objectChild?: SnowObject;
|
|
170
|
-
arrayChild?: SnowArray;
|
|
171
|
-
constructor(name: string, d: Dialect) {
|
|
172
|
-
super(name, 'array', d);
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
isArrayOf(type: string) {
|
|
176
|
-
if (this.arrayOf !== 'unknown') {
|
|
177
|
-
this.arrayOf = 'variant';
|
|
178
|
-
return;
|
|
179
|
-
}
|
|
180
|
-
this.arrayOf = type;
|
|
181
|
-
if (type === 'object') {
|
|
182
|
-
this.objectChild = new SnowObject('', this.dialect);
|
|
183
|
-
} else if (type === 'array') {
|
|
184
|
-
this.arrayChild = new SnowArray('', this.dialect);
|
|
185
|
-
}
|
|
186
|
-
}
|
|
118
|
+
// Row limit used inside the variant schema sample (default 1000). When the
|
|
119
|
+
// probe reports the table is small enough to full-scan, this limit is
|
|
120
|
+
// ignored.
|
|
121
|
+
schemaSampleRowLimit?: number;
|
|
187
122
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
this.name
|
|
193
|
-
);
|
|
194
|
-
return t;
|
|
195
|
-
}
|
|
196
|
-
if (this.arrayChild) {
|
|
197
|
-
return mkArrayDef(this.arrayChild.fieldDef(), this.name);
|
|
198
|
-
}
|
|
199
|
-
return mkArrayDef(
|
|
200
|
-
this.dialect.sqlTypeToMalloyType(this.arrayOf),
|
|
201
|
-
this.name
|
|
202
|
-
);
|
|
203
|
-
}
|
|
123
|
+
// Byte threshold at or below which variant schema inference skips sampling and
|
|
124
|
+
// full-scans the table instead (default 100 MB). A full scan catches rare
|
|
125
|
+
// fields that a sample would miss.
|
|
126
|
+
schemaSampleFullScanMaxBytes?: number;
|
|
204
127
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
if (path.next) {
|
|
208
|
-
const next = this.arrayChild || this.objectChild;
|
|
209
|
-
if (next) {
|
|
210
|
-
next.walk(path.next, fieldType);
|
|
211
|
-
return;
|
|
212
|
-
}
|
|
213
|
-
// Array elements were scalars but now we see deeper structure —
|
|
214
|
-
// inconsistent variant data. Degrade to variant array.
|
|
215
|
-
this.arrayOf = 'variant';
|
|
216
|
-
return;
|
|
217
|
-
} else {
|
|
218
|
-
this.isArrayOf(fieldType);
|
|
219
|
-
return;
|
|
220
|
-
}
|
|
221
|
-
}
|
|
222
|
-
// Name reference in an array context — inconsistent structure.
|
|
223
|
-
// Degrade to variant array.
|
|
224
|
-
this.arrayOf = 'variant';
|
|
225
|
-
}
|
|
128
|
+
// SQL statements to run when a connection is acquired from the pool
|
|
129
|
+
setupSQL?: string;
|
|
226
130
|
}
|
|
227
131
|
|
|
228
132
|
/**
|
|
@@ -248,6 +152,8 @@ export class SnowflakeConnection
|
|
|
248
152
|
private queryOptions: RunSQLOptions;
|
|
249
153
|
private timeoutMs: number;
|
|
250
154
|
private schemaSampleTimeoutMs: number;
|
|
155
|
+
private schemaSampleRowLimit: number;
|
|
156
|
+
private schemaSampleFullScanMaxBytes: number;
|
|
251
157
|
private setupSQL: string | undefined;
|
|
252
158
|
|
|
253
159
|
constructor(
|
|
@@ -271,6 +177,9 @@ export class SnowflakeConnection
|
|
|
271
177
|
this.queryOptions = options?.queryOptions ?? {};
|
|
272
178
|
this.timeoutMs = options?.timeoutMs ?? TIMEOUT_MS;
|
|
273
179
|
this.schemaSampleTimeoutMs = options?.schemaSampleTimeoutMs ?? 15_000;
|
|
180
|
+
this.schemaSampleRowLimit = options?.schemaSampleRowLimit ?? 1000;
|
|
181
|
+
this.schemaSampleFullScanMaxBytes =
|
|
182
|
+
options?.schemaSampleFullScanMaxBytes ?? 100_000_000;
|
|
274
183
|
}
|
|
275
184
|
|
|
276
185
|
get dialectName(): string {
|
|
@@ -366,7 +275,7 @@ export class SnowflakeConnection
|
|
|
366
275
|
): Promise<void> {
|
|
367
276
|
const infoQuery = `DESCRIBE TABLE ${tablePath}`;
|
|
368
277
|
const rows = await this.executor.batch(infoQuery);
|
|
369
|
-
const
|
|
278
|
+
const nestedColumns: NestedColumn[] = [];
|
|
370
279
|
const notVariant = new Map<string, boolean>();
|
|
371
280
|
for (const row of rows) {
|
|
372
281
|
// data types look like `VARCHAR(1234)` or `NUMBER(10,2)`
|
|
@@ -374,8 +283,12 @@ export class SnowflakeConnection
|
|
|
374
283
|
const baseType = fullType.split('(')[0];
|
|
375
284
|
const name = row['name'] as string;
|
|
376
285
|
|
|
377
|
-
if (
|
|
378
|
-
|
|
286
|
+
if (
|
|
287
|
+
baseType === 'variant' ||
|
|
288
|
+
baseType === 'array' ||
|
|
289
|
+
baseType === 'object'
|
|
290
|
+
) {
|
|
291
|
+
nestedColumns.push({kind: baseType, name});
|
|
379
292
|
} else {
|
|
380
293
|
notVariant.set(name, true);
|
|
381
294
|
// For NUMBER types, pass full string so dialect can inspect scale
|
|
@@ -390,74 +303,170 @@ export class SnowflakeConnection
|
|
|
390
303
|
}
|
|
391
304
|
}
|
|
392
305
|
// VARIANT, ARRAY, and OBJECT columns don't have schema in metadata —
|
|
393
|
-
// we have to sample actual data and inspect it to discover the
|
|
394
|
-
//
|
|
395
|
-
//
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
306
|
+
// we have to sample actual data and inspect it to discover the
|
|
307
|
+
// structure. Cost control happens in two places:
|
|
308
|
+
// 1. project only the nested columns (via object_construct), so
|
|
309
|
+
// bytes-on-wire are bounded by actual variant content.
|
|
310
|
+
// 2. tier the sampling strategy by probeTableSize (see
|
|
311
|
+
// pickSampleStrategy) — small base tables get a full scan;
|
|
312
|
+
// large base tables get TABLESAMPLE only (no unsafe LIMIT
|
|
313
|
+
// fallback); unknown-size sources (views, temp views) get
|
|
314
|
+
// the best-effort TABLESAMPLE→LIMIT chain.
|
|
315
|
+
if (nestedColumns.length > 0) {
|
|
316
|
+
const variantArgs = nestedColumns
|
|
317
|
+
.map(v => `'${v.name}', "${v.name}"`)
|
|
318
|
+
.join(', ');
|
|
319
|
+
// Flatten sampled rows and emit each distinct (path, type) pair.
|
|
320
|
+
// Conflicting pairs at the same path flow through to mergeShape,
|
|
321
|
+
// which collapses them to variant — that is how we honestly
|
|
322
|
+
// surface mixed-type fields to the user.
|
|
403
323
|
const makeSampleQuery = (sampleClause: string) => `
|
|
404
|
-
select
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
)
|
|
417
|
-
where type != 'null_value'
|
|
418
|
-
group BY 1
|
|
419
|
-
having count(*) <=1
|
|
420
|
-
order by path;
|
|
324
|
+
select
|
|
325
|
+
regexp_replace(path, '\\\\[[0-9]+\\\\]', '[*]') as path,
|
|
326
|
+
case
|
|
327
|
+
when typeof(value) = 'INTEGER' then 'decimal'
|
|
328
|
+
when typeof(value) = 'DOUBLE' then 'decimal'
|
|
329
|
+
else lower(typeof(value)) end as type
|
|
330
|
+
from
|
|
331
|
+
(${sampleClause})
|
|
332
|
+
,table(flatten(input => o, recursive => true)) as meta
|
|
333
|
+
where typeof(value) != 'NULL_VALUE'
|
|
334
|
+
group by 1, 2
|
|
335
|
+
order by 1;
|
|
421
336
|
`;
|
|
422
|
-
const
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
// large partitioned tables. TABLESAMPLE only works on base tables,
|
|
428
|
-
// not views, so if it fails we fall back to a plain LIMIT 100.
|
|
429
|
-
const tablesampleClause =
|
|
430
|
-
`select object_construct(${variantArgs}) o` +
|
|
431
|
-
` from ${tablePath} TABLESAMPLE BLOCK (1) limit 100`;
|
|
432
|
-
const fieldPathRows = await this.runSchemaSample(
|
|
433
|
-
makeSampleQuery(tablesampleClause),
|
|
434
|
-
makeSampleQuery(limitClause)
|
|
337
|
+
const projectVariants = `select object_construct(${variantArgs}) o`;
|
|
338
|
+
const probe = await this.probeTableSize(tablePath);
|
|
339
|
+
const strategy = pickSampleStrategy(
|
|
340
|
+
probe,
|
|
341
|
+
this.schemaSampleFullScanMaxBytes
|
|
435
342
|
);
|
|
343
|
+
const n = this.schemaSampleRowLimit;
|
|
344
|
+
let fieldPathRows: QueryRecord[] | undefined;
|
|
345
|
+
|
|
346
|
+
if (strategy === 'full-scan-then-sample') {
|
|
347
|
+
// Small base table: one full scan catches rare fields that
|
|
348
|
+
// sampling would miss. tryBatch so a failure doesn't poison
|
|
349
|
+
// the pool connection (temp views live on it). On failure we
|
|
350
|
+
// fall through to the sample path so a slow or timed-out full
|
|
351
|
+
// scan still gets partial structure.
|
|
352
|
+
fieldPathRows =
|
|
353
|
+
(await this.executor.tryBatch(
|
|
354
|
+
makeSampleQuery(`${projectVariants} from ${tablePath}`),
|
|
355
|
+
{},
|
|
356
|
+
this.schemaSampleTimeoutMs
|
|
357
|
+
)) ?? undefined;
|
|
358
|
+
}
|
|
436
359
|
|
|
437
360
|
if (fieldPathRows === undefined) {
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
361
|
+
const tablesampleQuery = makeSampleQuery(
|
|
362
|
+
`${projectVariants} from ${tablePath} TABLESAMPLE BLOCK (1) limit ${n}`
|
|
363
|
+
);
|
|
364
|
+
if (strategy === 'tablesample-only') {
|
|
365
|
+
// Known-large base table: TABLESAMPLE is safe (reads a few
|
|
366
|
+
// micro-partitions), plain LIMIT without a WHERE can be
|
|
367
|
+
// catastrophic on large partitioned tables. If TABLESAMPLE
|
|
368
|
+
// fails here we accept variant rather than risk an unbounded
|
|
369
|
+
// scan.
|
|
370
|
+
fieldPathRows =
|
|
371
|
+
(await this.executor.tryBatch(
|
|
372
|
+
tablesampleQuery,
|
|
373
|
+
{},
|
|
374
|
+
this.schemaSampleTimeoutMs
|
|
375
|
+
)) ?? undefined;
|
|
376
|
+
} else {
|
|
377
|
+
// Unknown size (view, temp view, non-parseable name) or
|
|
378
|
+
// full-scan fallback: best-effort TABLESAMPLE→LIMIT chain.
|
|
379
|
+
// The LIMIT fallback is the acknowledged "can't help" case
|
|
380
|
+
// for views over large partitioned tables.
|
|
381
|
+
fieldPathRows = await this.runSchemaSample(
|
|
382
|
+
tablesampleQuery,
|
|
383
|
+
makeSampleQuery(`${projectVariants} from ${tablePath} limit ${n}`)
|
|
384
|
+
);
|
|
441
385
|
}
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
const state = createVariantSchemaState();
|
|
389
|
+
// Snowflake nested-schema inference follows these rules:
|
|
390
|
+
// - top-level ARRAY/OBJECT from DESCRIBE are authoritative
|
|
391
|
+
// - descendant paths imply ancestor shape
|
|
392
|
+
// - conflicting shapes degrade only that prefix to variant
|
|
393
|
+
// - every top-level nested column still produces a field
|
|
394
|
+
for (const nestedColumn of nestedColumns) {
|
|
395
|
+
seedTopLevelShape(state, nestedColumn);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
if (fieldPathRows !== undefined) {
|
|
445
399
|
for (const f of fieldPathRows) {
|
|
446
400
|
const pathString = f['PATH']?.valueOf().toString();
|
|
447
401
|
const fieldType = f['TYPE']?.valueOf().toString();
|
|
448
402
|
if (pathString === undefined || fieldType === undefined) continue;
|
|
449
403
|
const pathParser = new PathParser(pathString);
|
|
450
|
-
const
|
|
451
|
-
|
|
404
|
+
const segments = pathParser.segments();
|
|
405
|
+
const topLevel = segments[0];
|
|
406
|
+
if (topLevel?.kind !== 'name' || notVariant.get(topLevel.name)) {
|
|
452
407
|
continue;
|
|
453
408
|
}
|
|
454
|
-
|
|
409
|
+
accumulateVariantPath(state, segments, fieldType);
|
|
455
410
|
}
|
|
456
|
-
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
// Always emit one field per top-level nested column from DESCRIBE, even
|
|
414
|
+
// if sampling produced no usable descendant paths.
|
|
415
|
+
for (const nestedColumn of nestedColumns) {
|
|
416
|
+
structDef.fields.push(
|
|
417
|
+
buildTopLevelField(nestedColumn, state, this.dialect)
|
|
418
|
+
);
|
|
457
419
|
}
|
|
458
420
|
}
|
|
459
421
|
}
|
|
460
422
|
|
|
423
|
+
/**
 * Cheap metadata probe: asks INFORMATION_SCHEMA.TABLES for the row count
 * and byte size of `tablePath`.
 *
 * Two-part `schema.table` names query the unqualified
 * `information_schema` (i.e. the current database's); three-part
 * `db.schema.table` names query `db.information_schema`. Every part is
 * validated against a strict unquoted-identifier regex before it is
 * interpolated into the SQL; names that don't match (quoted or exotic
 * names, single-part names) skip the probe.
 *
 * Because only unquoted identifiers pass validation, and unquoted
 * identifiers resolve to their upper-case form, the lookup compares the
 * stored schema/table names against the upper-cased parts exactly. A
 * case-insensitive comparison could match a differently-cased quoted
 * table and report the wrong table's size.
 *
 * @param tablePath Dotted table name, as interpolated into the sampling
 *   queries.
 * @returns The table's byte size and row count, or `undefined` when the
 *   name does not validate, the probe query yields no rows, or the row
 *   has no finite numeric BYTES / ROW_COUNT (the probe treats null as
 *   "unknown size").
 */
private async probeTableSize(
  tablePath: string
): Promise<TableSizeProbe | undefined> {
  const parts = tablePath.split('.');
  if (parts.length !== 2 && parts.length !== 3) return undefined;
  const identifier = /^[A-Za-z_][A-Za-z0-9_$]*$/;
  if (!parts.every(p => identifier.test(p))) return undefined;
  // Validated parts are unquoted identifiers, which resolve to upper case.
  const resolved = parts.map(p => p.toUpperCase());
  const [db, schema, table] =
    resolved.length === 3 ? resolved : [undefined, resolved[0], resolved[1]];
  const dbQualifier = db !== undefined ? `${db}.` : '';
  // NOTE(review): `by` looks like it is on Snowflake's reserved-keyword
  // list — confirm this alias parses unquoted; if not, the probe always
  // fails and every table is treated as unknown size.
  const rows = await this.executor.tryBatch(
    `select row_count as rc, bytes as by
       from ${dbQualifier}information_schema.tables
      where table_schema = '${schema}'
        and table_name = '${table}'
      limit 1`,
    {},
    this.schemaSampleTimeoutMs
  );
  if (!rows || rows.length === 0) return undefined;
  const row = rows[0];
  const bytesRaw = row['BY'] ?? row['by'];
  const rowsRaw = row['RC'] ?? row['rc'];
  // Views and external tables surface null BYTES / ROW_COUNT; treat
  // that as "unknown size" so we don't classify them as small and
  // launch a full scan against something potentially huge.
  if (bytesRaw === null || bytesRaw === undefined) return undefined;
  if (rowsRaw === null || rowsRaw === undefined) return undefined;
  const bytes = Number(bytesRaw);
  const rowCount = Number(rowsRaw);
  if (!Number.isFinite(bytes) || !Number.isFinite(rowCount)) return undefined;
  return {bytes, rowCount};
}
|
|
469
|
+
|
|
461
470
|
/**
|
|
462
471
|
* Try to run a schema sampling query, with fallback.
|
|
463
472
|
* First tries the primary query (e.g. using TABLESAMPLE for speed).
|
|
@@ -535,54 +544,3 @@ export class SnowflakeConnection
|
|
|
535
544
|
return tableName;
|
|
536
545
|
}
|
|
537
546
|
}
|
|
538
|
-
|
|
539
|
-
export class PathParser extends TinyParser {
|
|
540
|
-
constructor(pathName: string) {
|
|
541
|
-
super(pathName, {
|
|
542
|
-
quoted: /^'(\\'|[^'])*'/,
|
|
543
|
-
array_of: /^\[\*]/,
|
|
544
|
-
char: /^[[.\]]/,
|
|
545
|
-
number: /^\d+/,
|
|
546
|
-
word: /^\w+/,
|
|
547
|
-
});
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
getName() {
|
|
551
|
-
const nameStart = this.next();
|
|
552
|
-
if (nameStart.type === 'word') {
|
|
553
|
-
return nameStart.text;
|
|
554
|
-
}
|
|
555
|
-
if (nameStart.type === '[') {
|
|
556
|
-
const quotedName = this.next('quoted');
|
|
557
|
-
this.next(']');
|
|
558
|
-
return quotedName.text;
|
|
559
|
-
}
|
|
560
|
-
throw this.parseError('Expected column name');
|
|
561
|
-
}
|
|
562
|
-
|
|
563
|
-
pathChain(): PathChain {
|
|
564
|
-
const chain: PathChain = {name: this.getName()};
|
|
565
|
-
let node: PathChain = chain;
|
|
566
|
-
for (;;) {
|
|
567
|
-
const sep = this.next();
|
|
568
|
-
if (sep.type === 'eof') {
|
|
569
|
-
return chain;
|
|
570
|
-
}
|
|
571
|
-
if (sep.type === '.') {
|
|
572
|
-
node.next = {name: this.next('word').text};
|
|
573
|
-
node = node.next;
|
|
574
|
-
} else if (sep.type === 'array_of') {
|
|
575
|
-
node.next = {arrayRef: true};
|
|
576
|
-
node = node.next;
|
|
577
|
-
} else if (sep.type === '[') {
|
|
578
|
-
// Actually a dot access through a quoted name
|
|
579
|
-
const quoted = this.next('quoted');
|
|
580
|
-
node.next = {name: quoted.text};
|
|
581
|
-
node = node.next;
|
|
582
|
-
this.next(']');
|
|
583
|
-
} else {
|
|
584
|
-
throw this.parseError(`Unexpected ${sep.type}`);
|
|
585
|
-
}
|
|
586
|
-
}
|
|
587
|
-
}
|
|
588
|
-
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright Contributors to the Malloy project
|
|
3
|
+
* SPDX-License-Identifier: MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import {pickSampleStrategy} from './snowflake_connection';
|
|
7
|
+
|
|
8
|
+
// Unit tests for the pure tier-selection policy: each case pins which
// sampling strategy is chosen for a given probe result and threshold.
describe('pickSampleStrategy', () => {
  const threshold = 100_000_000;

  // Shorthand: strategy chosen for a probed table of the given size.
  const strategyFor = (bytes: number, rowCount: number, limit = threshold) =>
    pickSampleStrategy({bytes, rowCount}, limit);

  test('no probe → best-effort tablesample-then-limit', () => {
    const picked = pickSampleStrategy(undefined, threshold);
    expect(picked).toBe('tablesample-then-limit');
  });

  test('probe at or below threshold → full-scan-then-sample', () => {
    const emptyTable = strategyFor(0, 0);
    expect(emptyTable).toBe('full-scan-then-sample');

    const exactlyAtThreshold = strategyFor(threshold, 1);
    expect(exactlyAtThreshold).toBe('full-scan-then-sample');
  });

  test('probe above threshold → tablesample-only (no unsafe LIMIT fallback)', () => {
    const justOver = strategyFor(threshold + 1, 1);
    expect(justOver).toBe('tablesample-only');

    const huge = strategyFor(10_000_000_000, 1_000_000_000);
    expect(huge).toBe('tablesample-only');
  });

  test('threshold=0 forces every probed table into tablesample-only', () => {
    const picked = strategyFor(1, 1, 0);
    expect(picked).toBe('tablesample-only');
  });
});
|