@malloy-publisher/server 0.0.151 → 0.0.153

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
+import {
+  ListBucketsCommand,
+  ListObjectsV2Command,
+  S3Client,
+} from "@aws-sdk/client-s3";
+import { Connection } from "@malloydata/malloy";
+import { components } from "../api";
+import { logger } from "../logger";
+
+type ApiTable = components["schemas"]["Table"];
+
+export type CloudStorageType = "gcs" | "s3";
+
+export interface CloudStorageCredentials {
+  type: CloudStorageType;
+  accessKeyId: string;
+  secretAccessKey: string;
+  region?: string;
+  endpoint?: string;
+  sessionToken?: string;
+}
+
+export interface CloudStorageBucket {
+  name: string;
+  creationDate?: Date;
+}
+
+export interface CloudStorageObject {
+  key: string;
+  size?: number;
+  lastModified?: Date;
+  isFolder: boolean;
+}
+
+export function gcsConnectionToCredentials(gcsConnection: {
+  keyId?: string;
+  secret?: string;
+}): CloudStorageCredentials {
+  return {
+    type: "gcs",
+    accessKeyId: gcsConnection.keyId || "",
+    secretAccessKey: gcsConnection.secret || "",
+  };
+}
+
+export function s3ConnectionToCredentials(s3Connection: {
+  accessKeyId?: string;
+  secretAccessKey?: string;
+  region?: string;
+  endpoint?: string;
+  sessionToken?: string;
+}): CloudStorageCredentials {
+  return {
+    type: "s3",
+    accessKeyId: s3Connection.accessKeyId || "",
+    secretAccessKey: s3Connection.secretAccessKey || "",
+    region: s3Connection.region,
+    endpoint: s3Connection.endpoint,
+    sessionToken: s3Connection.sessionToken,
+  };
+}
+
+function createCloudStorageClient(
+  credentials: CloudStorageCredentials,
+): S3Client {
+  const isGCS = credentials.type === "gcs";
+
+  const client = new S3Client({
+    endpoint: isGCS ? "https://storage.googleapis.com" : credentials.endpoint,
+    region: isGCS ? "auto" : credentials.region || "us-east-1",
+    credentials: {
+      accessKeyId: credentials.accessKeyId,
+      secretAccessKey: credentials.secretAccessKey,
+      sessionToken: credentials.sessionToken,
+    },
+    forcePathStyle: isGCS || !!credentials.endpoint,
+  });
+
+  if (isGCS) {
+    client.middlewareStack.add(
+      (next) => async (args) => {
+        const request = args.request as { query?: Record<string, string> };
+        if (request.query) {
+          delete request.query["x-id"];
+        }
+        return next(args);
+      },
+      { step: "build", name: "removeXIdParam" },
+    );
+  }
+
+  return client;
+}
+
+export async function listCloudBuckets(
+  credentials: CloudStorageCredentials,
+): Promise<CloudStorageBucket[]> {
+  const client = createCloudStorageClient(credentials);
+  const storageType = credentials.type.toUpperCase();
+
+  try {
+    const response = await client.send(new ListBucketsCommand({}));
+    return (response.Buckets || []).map((bucket) => ({
+      name: bucket.Name || "",
+      creationDate: bucket.CreationDate,
+    }));
+  } catch (error) {
+    logger.error(`Failed to list ${storageType} buckets`, { error });
+    throw new Error(
+      `Failed to list ${storageType} buckets: ${error instanceof Error ? error.message : String(error)}`,
+    );
+  }
+}
+
+async function listCloudObjectsInFolder(
+  credentials: CloudStorageCredentials,
+  bucket: string,
+  prefix: string = "",
+): Promise<CloudStorageObject[]> {
+  const client = createCloudStorageClient(credentials);
+  const storageType = credentials.type.toUpperCase();
+  const uri = buildCloudUri(credentials.type, bucket, prefix);
+
+  try {
+    const response = await client.send(
+      new ListObjectsV2Command({
+        Bucket: bucket,
+        Prefix: prefix,
+        Delimiter: "/",
+      }),
+    );
+
+    const objects: CloudStorageObject[] = [];
+
+    for (const folderPrefix of response.CommonPrefixes || []) {
+      if (folderPrefix.Prefix) {
+        objects.push({
+          key: folderPrefix.Prefix,
+          isFolder: true,
+        });
+      }
+    }
+
+    for (const content of response.Contents || []) {
+      if (content.Key && content.Key !== prefix) {
+        objects.push({
+          key: content.Key,
+          size: content.Size,
+          lastModified: content.LastModified,
+          isFolder: false,
+        });
+      }
+    }
+
+    return objects;
+  } catch (error) {
+    logger.error(`Failed to list ${storageType} objects`, {
+      error,
+      bucket,
+      prefix,
+    });
+    throw new Error(
+      `Failed to list objects in ${uri}: ${error instanceof Error ? error.message : String(error)}`,
+    );
+  }
+}
+
+export async function listAllCloudFiles(
+  credentials: CloudStorageCredentials,
+  bucket: string,
+  prefix: string = "",
+): Promise<CloudStorageObject[]> {
+  const allFiles: CloudStorageObject[] = [];
+
+  async function traverse(currentPrefix: string): Promise<void> {
+    const objects = await listCloudObjectsInFolder(
+      credentials,
+      bucket,
+      currentPrefix,
+    );
+
+    for (const obj of objects) {
+      if (obj.isFolder) {
+        await traverse(obj.key);
+      } else {
+        allFiles.push(obj);
+      }
+    }
+  }
+
+  await traverse(prefix);
+  return allFiles;
+}
+
+export function isDataFile(key: string): boolean {
+  const lowerKey = key.toLowerCase();
+  return (
+    lowerKey.endsWith(".csv") ||
+    lowerKey.endsWith(".parquet") ||
+    lowerKey.endsWith(".json") ||
+    lowerKey.endsWith(".jsonl") ||
+    lowerKey.endsWith(".ndjson")
+  );
+}
+
+export function getFileType(key: string): string {
+  const lowerKey = key.toLowerCase();
+  if (lowerKey.endsWith(".csv")) return "csv";
+  if (lowerKey.endsWith(".parquet")) return "parquet";
+  if (lowerKey.endsWith(".json")) return "json";
+  if (lowerKey.endsWith(".jsonl") || lowerKey.endsWith(".ndjson"))
+    return "jsonl";
+  return "unknown";
+}
+
+export function buildCloudUri(
+  type: CloudStorageType,
+  bucket: string,
+  key: string,
+): string {
+  const scheme = type === "gcs" ? "gs" : "s3";
+  return `${scheme}://${bucket}/${key}`;
+}
+
+function standardizeRunSQLResult(result: unknown): unknown[] {
+  return Array.isArray(result)
+    ? result
+    : (result as { rows?: unknown[] }).rows || [];
+}
+
+export async function getCloudTablesWithColumns(
+  malloyConnection: Connection,
+  credentials: CloudStorageCredentials,
+  bucketName: string,
+  fileKeys: string[],
+): Promise<ApiTable[]> {
+  const tables: ApiTable[] = [];
+
+  for (const fileKey of fileKeys) {
+    const uri = buildCloudUri(credentials.type, bucketName, fileKey);
+    const fileType = getFileType(fileKey);
+
+    try {
+      let describeQuery: string;
+
+      switch (fileType) {
+        case "csv":
+          describeQuery = `DESCRIBE SELECT * FROM read_csv('${uri}', auto_detect=true) LIMIT 1`;
+          break;
+        case "parquet":
+          describeQuery = `DESCRIBE SELECT * FROM read_parquet('${uri}') LIMIT 1`;
+          break;
+        case "json":
+          describeQuery = `DESCRIBE SELECT * FROM read_json('${uri}', auto_detect=true) LIMIT 1`;
+          break;
+        case "jsonl":
+          describeQuery = `DESCRIBE SELECT * FROM read_json('${uri}', format='newline_delimited', auto_detect=true) LIMIT 1`;
+          break;
+        default:
+          logger.warn(`Unsupported file type for ${fileKey}`);
+          tables.push({
+            resource: uri,
+            columns: [],
+          });
+          continue;
+      }
+
+      const result = await malloyConnection.runSQL(describeQuery);
+      const rows = standardizeRunSQLResult(result);
+      const columns = rows.map((row: unknown) => {
+        const typedRow = row as Record<string, unknown>;
+        return {
+          name: (typedRow.column_name || typedRow.name) as string,
+          type: (typedRow.column_type || typedRow.type) as string,
+        };
+      });
+
+      tables.push({
+        resource: uri,
+        columns,
+      });
+
+      logger.info(
+        `Got schema for ${credentials.type.toUpperCase()} file: ${uri}`,
+        {
+          columnCount: columns.length,
+        },
+      );
+    } catch (error) {
+      logger.warn(
+        `Failed to get schema for ${credentials.type.toUpperCase()} file: ${uri}`,
+        {
+          error,
+        },
+      );
+      tables.push({
+        resource: uri,
+        columns: [],
+      });
+    }
+  }
+
+  return tables;
+}
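
For orientation, below is a minimal sketch of how a caller might combine these new helpers end to end. It is not part of the published package: the module path, the environment-variable credentials, and the duckdbConnection argument are assumptions, and the sketch presumes the Malloy Connection passed to getCloudTablesWithColumns is backed by an engine that understands the read_csv/read_parquet/read_json table functions and can resolve s3:// URIs (the DESCRIBE statements built above are DuckDB-style SQL).

// Hypothetical usage sketch -- not part of the published package.
// Assumes these helpers live in "./cloud_storage_utils" (the diff does not
// show the file path) and that duckdbConnection can run the DuckDB-style
// DESCRIBE queries against s3:// URIs with the same credentials.
import { Connection } from "@malloydata/malloy";
import {
  getCloudTablesWithColumns,
  isDataFile,
  listAllCloudFiles,
  listCloudBuckets,
  s3ConnectionToCredentials,
} from "./cloud_storage_utils";

async function describeFirstBucket(duckdbConnection: Connection) {
  // Build credentials from an S3-style connection config (values assumed).
  const credentials = s3ConnectionToCredentials({
    accessKeyId: process.env.AWS_ACCESS_KEY_ID,
    secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
    region: "us-east-1",
  });

  // List buckets, then recursively walk every object in the first one.
  const [bucket] = await listCloudBuckets(credentials);
  if (!bucket) return [];
  const objects = await listAllCloudFiles(credentials, bucket.name);

  // Keep only CSV/Parquet/JSON/JSONL/NDJSON keys and introspect their schemas.
  const fileKeys = objects.map((obj) => obj.key).filter(isDataFile);
  return getCloudTablesWithColumns(
    duckdbConnection,
    credentials,
    bucket.name,
    fileKeys,
  );
}

Note that if the engine cannot read a given URI, getCloudTablesWithColumns logs a warning and returns that file with an empty column list rather than throwing, so the sketch still resolves.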