@malloydata/db-snowflake 0.0.375 → 0.0.377
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +29 -2
- package/dist/index.js.map +1 -1
- package/dist/snowflake_connection.d.ts +48 -13
- package/dist/snowflake_connection.js +144 -228
- package/dist/snowflake_connection.js.map +1 -1
- package/dist/snowflake_connection.spec.js +179 -14
- package/dist/snowflake_connection.spec.js.map +1 -1
- package/dist/snowflake_sample_strategy.spec.js +97 -0
- package/dist/snowflake_sample_strategy.spec.js.map +1 -0
- package/dist/snowflake_table_name.d.ts +19 -0
- package/dist/snowflake_table_name.js +80 -0
- package/dist/snowflake_table_name.js.map +1 -0
- package/dist/snowflake_variant_schema.d.ts +43 -0
- package/dist/snowflake_variant_schema.js +203 -0
- package/dist/snowflake_variant_schema.js.map +1 -0
- package/dist/snowflake_variant_schema.spec.js +150 -0
- package/dist/snowflake_variant_schema.spec.js.map +1 -0
- package/package.json +2 -2
- package/src/index.ts +34 -1
- package/src/snowflake_connection.spec.ts +219 -15
- package/src/snowflake_connection.ts +218 -262
- package/src/snowflake_sample_strategy.spec.ts +130 -0
- package/src/snowflake_table_name.ts +94 -0
- package/src/snowflake_variant_schema.spec.ts +188 -0
- package/src/snowflake_variant_schema.ts +301 -0
- package/dist/snowflake_executor.spec.js +0 -89
- package/dist/snowflake_executor.spec.js.map +0 -1
- package/dist/snowflake_setup.spec.js +0 -76
- package/dist/snowflake_setup.spec.js.map +0 -1
- package/src/snowflake_executor.spec.ts +0 -103
- package/src/snowflake_setup.spec.ts +0 -56
- /package/dist/{snowflake_executor.spec.d.ts → snowflake_sample_strategy.spec.d.ts} +0 -0
- /package/dist/{snowflake_setup.spec.d.ts → snowflake_variant_schema.spec.d.ts} +0 -0
|
@@ -34,27 +34,69 @@ import type {
|
|
|
34
34
|
StructDef,
|
|
35
35
|
QueryRecord,
|
|
36
36
|
TestableConnection,
|
|
37
|
-
Dialect,
|
|
38
|
-
RecordDef,
|
|
39
|
-
AtomicFieldDef,
|
|
40
|
-
ArrayDef,
|
|
41
37
|
SQLSourceRequest,
|
|
42
38
|
} from '@malloydata/malloy';
|
|
43
|
-
import {
|
|
44
|
-
SnowflakeDialect,
|
|
45
|
-
TinyParser,
|
|
46
|
-
mkArrayDef,
|
|
47
|
-
sqlKey,
|
|
48
|
-
makeDigest,
|
|
49
|
-
} from '@malloydata/malloy';
|
|
39
|
+
import {SnowflakeDialect, sqlKey, makeDigest} from '@malloydata/malloy';
|
|
50
40
|
import {BaseConnection} from '@malloydata/malloy/connection';
|
|
51
41
|
|
|
52
42
|
import {SnowflakeExecutor} from './snowflake_executor';
|
|
43
|
+
import {
|
|
44
|
+
accumulateVariantPath,
|
|
45
|
+
buildTopLevelField,
|
|
46
|
+
createVariantSchemaState,
|
|
47
|
+
PathParser,
|
|
48
|
+
seedTopLevelShape,
|
|
49
|
+
} from './snowflake_variant_schema';
|
|
50
|
+
import type {NestedColumn} from './snowflake_variant_schema';
|
|
51
|
+
import {parseSnowflakeTableName} from './snowflake_table_name';
|
|
53
52
|
import type {ConnectionOptions} from 'snowflake-sdk';
|
|
54
53
|
import type {Options as PoolOptions} from 'generic-pool';
|
|
55
54
|
|
|
56
55
|
type namespace = {database: string; schema: string};
|
|
57
56
|
|
|
57
|
+
/**
|
|
58
|
+
* Output of the INFORMATION_SCHEMA.TABLES probe. Undefined when the
|
|
59
|
+
* probe didn't run (non-parseable name) or couldn't find numeric size
|
|
60
|
+
* info (views, missing permissions).
|
|
61
|
+
*/
|
|
62
|
+
export interface TableSizeProbe {
|
|
63
|
+
bytes: number;
|
|
64
|
+
rowCount: number;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Three-way tier that drives variant schema sampling. Extracted as a
|
|
69
|
+
* pure function so cost-policy decisions are unit-testable.
|
|
70
|
+
*
|
|
71
|
+
* full-scan-then-sample: probe confirmed a small base table. One
|
|
72
|
+
* full scan catches rare fields. On failure, fall through to the
|
|
73
|
+
* sample chain rather than accept opaque variant.
|
|
74
|
+
*
|
|
75
|
+
* tablesample-only: probe confirmed a base table above the small
|
|
76
|
+
* threshold. TABLESAMPLE BLOCK is safe (reads a few micro
|
|
77
|
+
* partitions). Plain LIMIT without a WHERE is unsafe on large
|
|
78
|
+
* partitioned tables, so we skip the LIMIT fallback — we'd rather
|
|
79
|
+
* degrade to variant than issue a runaway query.
|
|
80
|
+
*
|
|
81
|
+
* tablesample-then-limit: probe gave no size info (views, temp
|
|
82
|
+
* views, exotic names). We can't distinguish a small view from a
|
|
83
|
+
* view over a petabyte table, so we do best-effort sampling. This
|
|
84
|
+
* is the acknowledged "can't help you" case from the design doc.
|
|
85
|
+
*/
|
|
86
|
+
export type SampleStrategy =
|
|
87
|
+
| 'full-scan-then-sample'
|
|
88
|
+
| 'tablesample-only'
|
|
89
|
+
| 'tablesample-then-limit';
|
|
90
|
+
|
|
91
|
+
export function pickSampleStrategy(
|
|
92
|
+
probe: TableSizeProbe | undefined,
|
|
93
|
+
fullScanMaxBytes: number
|
|
94
|
+
): SampleStrategy {
|
|
95
|
+
if (probe === undefined) return 'tablesample-then-limit';
|
|
96
|
+
if (probe.bytes <= fullScanMaxBytes) return 'full-scan-then-sample';
|
|
97
|
+
return 'tablesample-only';
|
|
98
|
+
}
|
|
99
|
+
|
|
58
100
|
export interface SnowflakeConnectionOptions {
|
|
59
101
|
// snowflake sdk connection options
|
|
60
102
|
connOptions?: ConnectionOptions;
|
|
@@ -74,155 +116,18 @@ export interface SnowflakeConnectionOptions {
|
|
|
74
116
|
// Timeout in milliseconds for the variant schema sampling query (default 15 seconds)
|
|
75
117
|
schemaSampleTimeoutMs?: number;
|
|
76
118
|
|
|
77
|
-
//
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
type PathChain =
|
|
82
|
-
| {arrayRef: true; next?: PathChain}
|
|
83
|
-
| {name: string; next?: PathChain};
|
|
84
|
-
|
|
85
|
-
class SnowField {
|
|
86
|
-
constructor(
|
|
87
|
-
readonly name: string,
|
|
88
|
-
readonly type: string,
|
|
89
|
-
readonly dialect: Dialect
|
|
90
|
-
) {}
|
|
91
|
-
fieldDef(): AtomicFieldDef {
|
|
92
|
-
return {
|
|
93
|
-
...this.dialect.sqlTypeToMalloyType(this.type),
|
|
94
|
-
name: this.name,
|
|
95
|
-
};
|
|
96
|
-
}
|
|
97
|
-
walk(_path: PathChain, _fieldType: string): void {
|
|
98
|
-
throw new Error(
|
|
99
|
-
'SNOWWFLAKE SCHEMA PARSE ERROR: Should not walk through fields'
|
|
100
|
-
);
|
|
101
|
-
}
|
|
102
|
-
static make(name: string, fieldType: string, d: Dialect) {
|
|
103
|
-
if (fieldType === 'array') {
|
|
104
|
-
return new SnowArray(name, d);
|
|
105
|
-
} else if (fieldType === 'object') {
|
|
106
|
-
return new SnowObject(name, d);
|
|
107
|
-
}
|
|
108
|
-
return new SnowField(name, fieldType, d);
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
class SnowObject extends SnowField {
|
|
113
|
-
fieldMap = new Map<string, SnowField>();
|
|
114
|
-
constructor(name: string, d: Dialect) {
|
|
115
|
-
super(name, 'object', d);
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
get fields(): AtomicFieldDef[] {
|
|
119
|
-
const fields: AtomicFieldDef[] = [];
|
|
120
|
-
for (const [_, fieldObj] of this.fieldMap) {
|
|
121
|
-
fields.push(fieldObj.fieldDef());
|
|
122
|
-
}
|
|
123
|
-
return fields;
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
fieldDef(): RecordDef {
|
|
127
|
-
const rec: RecordDef = {
|
|
128
|
-
type: 'record',
|
|
129
|
-
name: this.name,
|
|
130
|
-
fields: this.fields,
|
|
131
|
-
join: 'one',
|
|
132
|
-
};
|
|
133
|
-
return rec;
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
walk(path: PathChain, fieldType: string) {
|
|
137
|
-
if ('name' in path) {
|
|
138
|
-
const field = this.fieldMap.get(path.name);
|
|
139
|
-
if (path.next) {
|
|
140
|
-
if (field instanceof SnowObject || field instanceof SnowArray) {
|
|
141
|
-
field.walk(path.next, fieldType);
|
|
142
|
-
return;
|
|
143
|
-
}
|
|
144
|
-
// Field is missing or is a scalar leaf — the variant data has
|
|
145
|
-
// inconsistent structure across rows. Degrade to opaque variant.
|
|
146
|
-
this.fieldMap.set(
|
|
147
|
-
path.name,
|
|
148
|
-
new SnowField(path.name, 'variant', this.dialect)
|
|
149
|
-
);
|
|
150
|
-
return;
|
|
151
|
-
} else {
|
|
152
|
-
if (!field) {
|
|
153
|
-
this.fieldMap.set(
|
|
154
|
-
path.name,
|
|
155
|
-
SnowField.make(path.name, fieldType, this.dialect)
|
|
156
|
-
);
|
|
157
|
-
return;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
return;
|
|
161
|
-
}
|
|
162
|
-
// Array reference in an object context — inconsistent structure.
|
|
163
|
-
// Ignore this path; the object keeps whatever fields it already has.
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
class SnowArray extends SnowField {
|
|
168
|
-
arrayOf = 'unknown';
|
|
169
|
-
objectChild?: SnowObject;
|
|
170
|
-
arrayChild?: SnowArray;
|
|
171
|
-
constructor(name: string, d: Dialect) {
|
|
172
|
-
super(name, 'array', d);
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
isArrayOf(type: string) {
|
|
176
|
-
if (this.arrayOf !== 'unknown') {
|
|
177
|
-
this.arrayOf = 'variant';
|
|
178
|
-
return;
|
|
179
|
-
}
|
|
180
|
-
this.arrayOf = type;
|
|
181
|
-
if (type === 'object') {
|
|
182
|
-
this.objectChild = new SnowObject('', this.dialect);
|
|
183
|
-
} else if (type === 'array') {
|
|
184
|
-
this.arrayChild = new SnowArray('', this.dialect);
|
|
185
|
-
}
|
|
186
|
-
}
|
|
119
|
+
// Row limit used inside the variant schema sample (default 1000). When the
|
|
120
|
+
// probe reports the table is small enough to full-scan, this limit is
|
|
121
|
+
// ignored.
|
|
122
|
+
schemaSampleRowLimit?: number;
|
|
187
123
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
this.name
|
|
193
|
-
);
|
|
194
|
-
return t;
|
|
195
|
-
}
|
|
196
|
-
if (this.arrayChild) {
|
|
197
|
-
return mkArrayDef(this.arrayChild.fieldDef(), this.name);
|
|
198
|
-
}
|
|
199
|
-
return mkArrayDef(
|
|
200
|
-
this.dialect.sqlTypeToMalloyType(this.arrayOf),
|
|
201
|
-
this.name
|
|
202
|
-
);
|
|
203
|
-
}
|
|
124
|
+
// Byte threshold below which variant schema inference skips sampling and
|
|
125
|
+
// full-scans the table instead (default 100 MB). A full scan catches rare
|
|
126
|
+
// fields that a sample would miss.
|
|
127
|
+
schemaSampleFullScanMaxBytes?: number;
|
|
204
128
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
if (path.next) {
|
|
208
|
-
const next = this.arrayChild || this.objectChild;
|
|
209
|
-
if (next) {
|
|
210
|
-
next.walk(path.next, fieldType);
|
|
211
|
-
return;
|
|
212
|
-
}
|
|
213
|
-
// Array elements were scalars but now we see deeper structure —
|
|
214
|
-
// inconsistent variant data. Degrade to variant array.
|
|
215
|
-
this.arrayOf = 'variant';
|
|
216
|
-
return;
|
|
217
|
-
} else {
|
|
218
|
-
this.isArrayOf(fieldType);
|
|
219
|
-
return;
|
|
220
|
-
}
|
|
221
|
-
}
|
|
222
|
-
// Name reference in an array context — inconsistent structure.
|
|
223
|
-
// Degrade to variant array.
|
|
224
|
-
this.arrayOf = 'variant';
|
|
225
|
-
}
|
|
129
|
+
// SQL statements to run when a connection is acquired from the pool
|
|
130
|
+
setupSQL?: string;
|
|
226
131
|
}
|
|
227
132
|
|
|
228
133
|
/**
|
|
@@ -248,6 +153,8 @@ export class SnowflakeConnection
|
|
|
248
153
|
private queryOptions: RunSQLOptions;
|
|
249
154
|
private timeoutMs: number;
|
|
250
155
|
private schemaSampleTimeoutMs: number;
|
|
156
|
+
private schemaSampleRowLimit: number;
|
|
157
|
+
private schemaSampleFullScanMaxBytes: number;
|
|
251
158
|
private setupSQL: string | undefined;
|
|
252
159
|
|
|
253
160
|
constructor(
|
|
@@ -271,6 +178,9 @@ export class SnowflakeConnection
|
|
|
271
178
|
this.queryOptions = options?.queryOptions ?? {};
|
|
272
179
|
this.timeoutMs = options?.timeoutMs ?? TIMEOUT_MS;
|
|
273
180
|
this.schemaSampleTimeoutMs = options?.schemaSampleTimeoutMs ?? 15_000;
|
|
181
|
+
this.schemaSampleRowLimit = options?.schemaSampleRowLimit ?? 1000;
|
|
182
|
+
this.schemaSampleFullScanMaxBytes =
|
|
183
|
+
options?.schemaSampleFullScanMaxBytes ?? 100_000_000;
|
|
274
184
|
}
|
|
275
185
|
|
|
276
186
|
get dialectName(): string {
|
|
@@ -366,7 +276,7 @@ export class SnowflakeConnection
|
|
|
366
276
|
): Promise<void> {
|
|
367
277
|
const infoQuery = `DESCRIBE TABLE ${tablePath}`;
|
|
368
278
|
const rows = await this.executor.batch(infoQuery);
|
|
369
|
-
const
|
|
279
|
+
const nestedColumns: NestedColumn[] = [];
|
|
370
280
|
const notVariant = new Map<string, boolean>();
|
|
371
281
|
for (const row of rows) {
|
|
372
282
|
// data types look like `VARCHAR(1234)` or `NUMBER(10,2)`
|
|
@@ -374,8 +284,12 @@ export class SnowflakeConnection
|
|
|
374
284
|
const baseType = fullType.split('(')[0];
|
|
375
285
|
const name = row['name'] as string;
|
|
376
286
|
|
|
377
|
-
if (
|
|
378
|
-
|
|
287
|
+
if (
|
|
288
|
+
baseType === 'variant' ||
|
|
289
|
+
baseType === 'array' ||
|
|
290
|
+
baseType === 'object'
|
|
291
|
+
) {
|
|
292
|
+
nestedColumns.push({kind: baseType, name});
|
|
379
293
|
} else {
|
|
380
294
|
notVariant.set(name, true);
|
|
381
295
|
// For NUMBER types, pass full string so dialect can inspect scale
|
|
@@ -390,74 +304,167 @@ export class SnowflakeConnection
|
|
|
390
304
|
}
|
|
391
305
|
}
|
|
392
306
|
// VARIANT, ARRAY, and OBJECT columns don't have schema in metadata —
|
|
393
|
-
// we have to sample actual data and inspect it to discover the
|
|
394
|
-
//
|
|
395
|
-
//
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
307
|
+
// we have to sample actual data and inspect it to discover the
|
|
308
|
+
// structure. Cost control happens in two places:
|
|
309
|
+
// 1. project only the nested columns (via object_construct), so
|
|
310
|
+
// bytes-on-wire are bounded by actual variant content.
|
|
311
|
+
// 2. tier the sampling strategy by probeTableSize (see
|
|
312
|
+
// pickSampleStrategy) — small base tables get a full scan;
|
|
313
|
+
// large base tables get TABLESAMPLE only (no unsafe LIMIT
|
|
314
|
+
// fallback); unknown-size sources (views, temp views) get
|
|
315
|
+
// the best-effort TABLESAMPLE→LIMIT chain.
|
|
316
|
+
if (nestedColumns.length > 0) {
|
|
317
|
+
const variantArgs = nestedColumns
|
|
318
|
+
.map(v => `'${v.name}', "${v.name}"`)
|
|
319
|
+
.join(', ');
|
|
320
|
+
// Flatten sampled rows and emit each distinct (path, type) pair.
|
|
321
|
+
// Conflicting pairs at the same path flow through to mergeShape,
|
|
322
|
+
// which collapses them to variant — that is how we honestly
|
|
323
|
+
// surface mixed-type fields to the user.
|
|
403
324
|
const makeSampleQuery = (sampleClause: string) => `
|
|
404
|
-
select
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
)
|
|
417
|
-
where type != 'null_value'
|
|
418
|
-
group BY 1
|
|
419
|
-
having count(*) <=1
|
|
420
|
-
order by path;
|
|
325
|
+
select
|
|
326
|
+
regexp_replace(path, '\\\\[[0-9]+\\\\]', '[*]') as path,
|
|
327
|
+
case
|
|
328
|
+
when typeof(value) = 'INTEGER' then 'decimal'
|
|
329
|
+
when typeof(value) = 'DOUBLE' then 'decimal'
|
|
330
|
+
else lower(typeof(value)) end as type
|
|
331
|
+
from
|
|
332
|
+
(${sampleClause})
|
|
333
|
+
,table(flatten(input => o, recursive => true)) as meta
|
|
334
|
+
where typeof(value) != 'NULL_VALUE'
|
|
335
|
+
group by 1, 2
|
|
336
|
+
order by 1;
|
|
421
337
|
`;
|
|
422
|
-
const
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
// large partitioned tables. TABLESAMPLE only works on base tables,
|
|
428
|
-
// not views, so if it fails we fall back to a plain LIMIT 100.
|
|
429
|
-
const tablesampleClause =
|
|
430
|
-
`select object_construct(${variantArgs}) o` +
|
|
431
|
-
` from ${tablePath} TABLESAMPLE BLOCK (1) limit 100`;
|
|
432
|
-
const fieldPathRows = await this.runSchemaSample(
|
|
433
|
-
makeSampleQuery(tablesampleClause),
|
|
434
|
-
makeSampleQuery(limitClause)
|
|
338
|
+
const projectVariants = `select object_construct(${variantArgs}) o`;
|
|
339
|
+
const probe = await this.probeTableSize(tablePath);
|
|
340
|
+
const strategy = pickSampleStrategy(
|
|
341
|
+
probe,
|
|
342
|
+
this.schemaSampleFullScanMaxBytes
|
|
435
343
|
);
|
|
344
|
+
const n = this.schemaSampleRowLimit;
|
|
345
|
+
let fieldPathRows: QueryRecord[] | undefined;
|
|
346
|
+
|
|
347
|
+
if (strategy === 'full-scan-then-sample') {
|
|
348
|
+
// Small base table: one full scan catches rare fields that
|
|
349
|
+
// sampling would miss. tryBatch so a failure doesn't poison
|
|
350
|
+
// the pool connection (temp views live on it). On failure we
|
|
351
|
+
// fall through to the sample path so a slow or timed-out full
|
|
352
|
+
// scan still gets partial structure.
|
|
353
|
+
fieldPathRows =
|
|
354
|
+
(await this.executor.tryBatch(
|
|
355
|
+
makeSampleQuery(`${projectVariants} from ${tablePath}`),
|
|
356
|
+
{},
|
|
357
|
+
this.schemaSampleTimeoutMs
|
|
358
|
+
)) ?? undefined;
|
|
359
|
+
}
|
|
436
360
|
|
|
437
361
|
if (fieldPathRows === undefined) {
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
362
|
+
const tablesampleQuery = makeSampleQuery(
|
|
363
|
+
`${projectVariants} from ${tablePath} TABLESAMPLE BLOCK (1) limit ${n}`
|
|
364
|
+
);
|
|
365
|
+
if (strategy === 'tablesample-only') {
|
|
366
|
+
// Known-large base table: TABLESAMPLE is safe (reads a few
|
|
367
|
+
// micro-partitions), plain LIMIT without a WHERE can be
|
|
368
|
+
// catastrophic on large partitioned tables. If TABLESAMPLE
|
|
369
|
+
// fails here we accept variant rather than risk an unbounded
|
|
370
|
+
// scan.
|
|
371
|
+
fieldPathRows =
|
|
372
|
+
(await this.executor.tryBatch(
|
|
373
|
+
tablesampleQuery,
|
|
374
|
+
{},
|
|
375
|
+
this.schemaSampleTimeoutMs
|
|
376
|
+
)) ?? undefined;
|
|
377
|
+
} else {
|
|
378
|
+
// Unknown size (view, temp view, non-parseable name) or
|
|
379
|
+
// full-scan fallback: best-effort TABLESAMPLE→LIMIT chain.
|
|
380
|
+
// The LIMIT fallback is the acknowledged "can't help" case
|
|
381
|
+
// for views over large partitioned tables.
|
|
382
|
+
fieldPathRows = await this.runSchemaSample(
|
|
383
|
+
tablesampleQuery,
|
|
384
|
+
makeSampleQuery(`${projectVariants} from ${tablePath} limit ${n}`)
|
|
385
|
+
);
|
|
441
386
|
}
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
const state = createVariantSchemaState();
|
|
390
|
+
// Snowflake nested-schema inference follows these rules:
|
|
391
|
+
// - top-level ARRAY/OBJECT from DESCRIBE are authoritative
|
|
392
|
+
// - descendant paths imply ancestor shape
|
|
393
|
+
// - conflicting shapes degrade only that prefix to variant
|
|
394
|
+
// - every top-level nested column still produces a field
|
|
395
|
+
for (const nestedColumn of nestedColumns) {
|
|
396
|
+
seedTopLevelShape(state, nestedColumn);
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
if (fieldPathRows !== undefined) {
|
|
445
400
|
for (const f of fieldPathRows) {
|
|
446
401
|
const pathString = f['PATH']?.valueOf().toString();
|
|
447
402
|
const fieldType = f['TYPE']?.valueOf().toString();
|
|
448
403
|
if (pathString === undefined || fieldType === undefined) continue;
|
|
449
404
|
const pathParser = new PathParser(pathString);
|
|
450
|
-
const
|
|
451
|
-
|
|
405
|
+
const segments = pathParser.segments();
|
|
406
|
+
const topLevel = segments[0];
|
|
407
|
+
if (topLevel?.kind !== 'name' || notVariant.get(topLevel.name)) {
|
|
452
408
|
continue;
|
|
453
409
|
}
|
|
454
|
-
|
|
410
|
+
accumulateVariantPath(state, segments, fieldType);
|
|
455
411
|
}
|
|
456
|
-
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
// Always emit one field per top-level nested column from DESCRIBE, even
|
|
415
|
+
// if sampling produced no usable descendant paths.
|
|
416
|
+
for (const nestedColumn of nestedColumns) {
|
|
417
|
+
structDef.fields.push(
|
|
418
|
+
buildTopLevelField(nestedColumn, state, this.dialect)
|
|
419
|
+
);
|
|
457
420
|
}
|
|
458
421
|
}
|
|
459
422
|
}
|
|
460
423
|
|
|
424
|
+
/**
|
|
425
|
+
* Cheap metadata probe: ask INFORMATION_SCHEMA.TABLES for the row count
|
|
426
|
+
* and byte size of tablePath. Returns undefined when the name doesn't
|
|
427
|
+
* parse as a two- or three-part identifier, when the probe query fails,
|
|
428
|
+
* or when the row has no numeric BYTES or ROW_COUNT (views and external tables
|
|
429
|
+
* typically report NULL).
|
|
430
|
+
*
|
|
431
|
+
* Two-part `schema.table` names use the current database's
|
|
432
|
+
* INFORMATION_SCHEMA; three-part `db.schema.table` names address
|
|
433
|
+
* INFORMATION_SCHEMA in the named database. Identifiers are parsed
|
|
434
|
+
* with Snowflake's quoting rules so bare parts case-fold to upper and
|
|
435
|
+
* quoted parts are compared verbatim against the catalog.
|
|
436
|
+
*/
|
|
437
|
+
private async probeTableSize(
|
|
438
|
+
tablePath: string
|
|
439
|
+
): Promise<TableSizeProbe | undefined> {
|
|
440
|
+
const parsed = parseSnowflakeTableName(tablePath);
|
|
441
|
+
if (parsed === undefined || parsed.schema === undefined) return undefined;
|
|
442
|
+
const quoteLit = (s: string) => s.replace(/'/g, "''");
|
|
443
|
+
const dbQualifier = parsed.database ? `${parsed.database.sql}.` : '';
|
|
444
|
+
const rows = await this.executor.tryBatch(
|
|
445
|
+
`select row_count as rc, bytes as by
|
|
446
|
+
from ${dbQualifier}information_schema.tables
|
|
447
|
+
where table_schema = '${quoteLit(parsed.schema.literal)}'
|
|
448
|
+
and table_name = '${quoteLit(parsed.table.literal)}'
|
|
449
|
+
limit 1`,
|
|
450
|
+
{},
|
|
451
|
+
this.schemaSampleTimeoutMs
|
|
452
|
+
);
|
|
453
|
+
if (!rows || rows.length === 0) return undefined;
|
|
454
|
+
const row = rows[0];
|
|
455
|
+
const bytesRaw = row['BY'] ?? row['by'];
|
|
456
|
+
const rowsRaw = row['RC'] ?? row['rc'];
|
|
457
|
+
// Views and external tables surface null BYTES / ROW_COUNT; treat
|
|
458
|
+
// that as "unknown size" so we don't classify them as small and
|
|
459
|
+
// launch a full scan against something potentially huge.
|
|
460
|
+
if (bytesRaw === null || bytesRaw === undefined) return undefined;
|
|
461
|
+
if (rowsRaw === null || rowsRaw === undefined) return undefined;
|
|
462
|
+
const bytes = Number(bytesRaw);
|
|
463
|
+
const rowCount = Number(rowsRaw);
|
|
464
|
+
if (!Number.isFinite(bytes) || !Number.isFinite(rowCount)) return undefined;
|
|
465
|
+
return {bytes, rowCount};
|
|
466
|
+
}
|
|
467
|
+
|
|
461
468
|
/**
|
|
462
469
|
* Try to run a schema sampling query, with fallback.
|
|
463
470
|
* First tries the primary query (e.g. using TABLESAMPLE for speed).
|
|
@@ -535,54 +542,3 @@ export class SnowflakeConnection
|
|
|
535
542
|
return tableName;
|
|
536
543
|
}
|
|
537
544
|
}
|
|
538
|
-
|
|
539
|
-
export class PathParser extends TinyParser {
|
|
540
|
-
constructor(pathName: string) {
|
|
541
|
-
super(pathName, {
|
|
542
|
-
quoted: /^'(\\'|[^'])*'/,
|
|
543
|
-
array_of: /^\[\*]/,
|
|
544
|
-
char: /^[[.\]]/,
|
|
545
|
-
number: /^\d+/,
|
|
546
|
-
word: /^\w+/,
|
|
547
|
-
});
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
getName() {
|
|
551
|
-
const nameStart = this.next();
|
|
552
|
-
if (nameStart.type === 'word') {
|
|
553
|
-
return nameStart.text;
|
|
554
|
-
}
|
|
555
|
-
if (nameStart.type === '[') {
|
|
556
|
-
const quotedName = this.next('quoted');
|
|
557
|
-
this.next(']');
|
|
558
|
-
return quotedName.text;
|
|
559
|
-
}
|
|
560
|
-
throw this.parseError('Expected column name');
|
|
561
|
-
}
|
|
562
|
-
|
|
563
|
-
pathChain(): PathChain {
|
|
564
|
-
const chain: PathChain = {name: this.getName()};
|
|
565
|
-
let node: PathChain = chain;
|
|
566
|
-
for (;;) {
|
|
567
|
-
const sep = this.next();
|
|
568
|
-
if (sep.type === 'eof') {
|
|
569
|
-
return chain;
|
|
570
|
-
}
|
|
571
|
-
if (sep.type === '.') {
|
|
572
|
-
node.next = {name: this.next('word').text};
|
|
573
|
-
node = node.next;
|
|
574
|
-
} else if (sep.type === 'array_of') {
|
|
575
|
-
node.next = {arrayRef: true};
|
|
576
|
-
node = node.next;
|
|
577
|
-
} else if (sep.type === '[') {
|
|
578
|
-
// Actually a dot access through a quoted name
|
|
579
|
-
const quoted = this.next('quoted');
|
|
580
|
-
node.next = {name: quoted.text};
|
|
581
|
-
node = node.next;
|
|
582
|
-
this.next(']');
|
|
583
|
-
} else {
|
|
584
|
-
throw this.parseError(`Unexpected ${sep.type}`);
|
|
585
|
-
}
|
|
586
|
-
}
|
|
587
|
-
}
|
|
588
|
-
}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright Contributors to the Malloy project
|
|
3
|
+
* SPDX-License-Identifier: MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import {pickSampleStrategy} from './snowflake_connection';
|
|
7
|
+
import {parseSnowflakeTableName} from './snowflake_table_name';
|
|
8
|
+
|
|
9
|
+
describe('pickSampleStrategy', () => {
|
|
10
|
+
const threshold = 100_000_000;
|
|
11
|
+
|
|
12
|
+
test('no probe → best-effort tablesample-then-limit', () => {
|
|
13
|
+
expect(pickSampleStrategy(undefined, threshold)).toBe(
|
|
14
|
+
'tablesample-then-limit'
|
|
15
|
+
);
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
test('probe at or below threshold → full-scan-then-sample', () => {
|
|
19
|
+
expect(pickSampleStrategy({bytes: 0, rowCount: 0}, threshold)).toBe(
|
|
20
|
+
'full-scan-then-sample'
|
|
21
|
+
);
|
|
22
|
+
expect(pickSampleStrategy({bytes: threshold, rowCount: 1}, threshold)).toBe(
|
|
23
|
+
'full-scan-then-sample'
|
|
24
|
+
);
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
test('probe above threshold → tablesample-only (no unsafe LIMIT fallback)', () => {
|
|
28
|
+
expect(
|
|
29
|
+
pickSampleStrategy({bytes: threshold + 1, rowCount: 1}, threshold)
|
|
30
|
+
).toBe('tablesample-only');
|
|
31
|
+
expect(
|
|
32
|
+
pickSampleStrategy(
|
|
33
|
+
{bytes: 10_000_000_000, rowCount: 1_000_000_000},
|
|
34
|
+
threshold
|
|
35
|
+
)
|
|
36
|
+
).toBe('tablesample-only');
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
test('threshold=0 forces every probed table into tablesample-only', () => {
|
|
40
|
+
expect(pickSampleStrategy({bytes: 1, rowCount: 1}, 0)).toBe(
|
|
41
|
+
'tablesample-only'
|
|
42
|
+
);
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
describe('parseSnowflakeTableName', () => {
|
|
47
|
+
test('single bare identifier', () => {
|
|
48
|
+
expect(parseSnowflakeTableName('aircraft')).toEqual({
|
|
49
|
+
table: {literal: 'AIRCRAFT', sql: 'AIRCRAFT', quoted: false},
|
|
50
|
+
});
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
test('two-part bare name uppercases both parts', () => {
|
|
54
|
+
expect(parseSnowflakeTableName('malloytest.aircraft')).toEqual({
|
|
55
|
+
schema: {literal: 'MALLOYTEST', sql: 'MALLOYTEST', quoted: false},
|
|
56
|
+
table: {literal: 'AIRCRAFT', sql: 'AIRCRAFT', quoted: false},
|
|
57
|
+
});
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
test('three-part bare name', () => {
|
|
61
|
+
expect(parseSnowflakeTableName('db.sch.t')).toEqual({
|
|
62
|
+
database: {literal: 'DB', sql: 'DB', quoted: false},
|
|
63
|
+
schema: {literal: 'SCH', sql: 'SCH', quoted: false},
|
|
64
|
+
table: {literal: 'T', sql: 'T', quoted: false},
|
|
65
|
+
});
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
test('quoted identifier preserves case', () => {
|
|
69
|
+
expect(parseSnowflakeTableName('"MyDb"."schema"."t"')).toEqual({
|
|
70
|
+
database: {literal: 'MyDb', sql: '"MyDb"', quoted: true},
|
|
71
|
+
schema: {literal: 'schema', sql: '"schema"', quoted: true},
|
|
72
|
+
table: {literal: 't', sql: '"t"', quoted: true},
|
|
73
|
+
});
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
test('quoted identifier allows embedded dots', () => {
|
|
77
|
+
expect(parseSnowflakeTableName('"a.b"."c.d"')).toEqual({
|
|
78
|
+
schema: {literal: 'a.b', sql: '"a.b"', quoted: true},
|
|
79
|
+
table: {literal: 'c.d', sql: '"c.d"', quoted: true},
|
|
80
|
+
});
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
test('doubled double-quote is a literal quote', () => {
|
|
84
|
+
expect(parseSnowflakeTableName('"a""b"')).toEqual({
|
|
85
|
+
table: {literal: 'a"b', sql: '"a""b"', quoted: true},
|
|
86
|
+
});
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
test('mixes quoted and bare parts', () => {
|
|
90
|
+
expect(parseSnowflakeTableName('MYDB."mixed"')).toEqual({
|
|
91
|
+
schema: {literal: 'MYDB', sql: 'MYDB', quoted: false},
|
|
92
|
+
table: {literal: 'mixed', sql: '"mixed"', quoted: true},
|
|
93
|
+
});
|
|
94
|
+
});
|
|
95
|
+
|
|
96
|
+
test('tolerates surrounding whitespace and whitespace around dots', () => {
|
|
97
|
+
expect(parseSnowflakeTableName(' sch . t ')).toEqual({
|
|
98
|
+
schema: {literal: 'SCH', sql: 'SCH', quoted: false},
|
|
99
|
+
table: {literal: 'T', sql: 'T', quoted: false},
|
|
100
|
+
});
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
test('returns undefined for empty input', () => {
|
|
104
|
+
expect(parseSnowflakeTableName('')).toBeUndefined();
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
test('returns undefined for four-part name', () => {
|
|
108
|
+
expect(parseSnowflakeTableName('a.b.c.d')).toBeUndefined();
|
|
109
|
+
});
|
|
110
|
+
|
|
111
|
+
test('returns undefined for trailing dot', () => {
|
|
112
|
+
expect(parseSnowflakeTableName('sch.')).toBeUndefined();
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
test('returns undefined for leading dot', () => {
|
|
116
|
+
expect(parseSnowflakeTableName('.t')).toBeUndefined();
|
|
117
|
+
});
|
|
118
|
+
|
|
119
|
+
test('returns undefined for unterminated quoted identifier', () => {
|
|
120
|
+
expect(parseSnowflakeTableName('"oops')).toBeUndefined();
|
|
121
|
+
});
|
|
122
|
+
|
|
123
|
+
test('returns undefined for identifier starting with a digit', () => {
|
|
124
|
+
expect(parseSnowflakeTableName('1foo')).toBeUndefined();
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
test('returns undefined for identifier containing a dash', () => {
|
|
128
|
+
expect(parseSnowflakeTableName('foo-bar')).toBeUndefined();
|
|
129
|
+
});
|
|
130
|
+
});
|