@oceanum/datamesh 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,458 @@
1
+ import { CachedHTTPStore } from "./zarr";
2
+ import * as zarr from "@zarrita/core";
3
+ import { Chunk, DataType, Listable, Location, TypedArray } from "@zarrita/core";
4
+ import { Mutable, AsyncReadable } from "@zarrita/storage";
5
+ import { get, set, Slice } from "@zarrita/indexing";
6
+
7
+ import { Schema } from "./datasource";
8
+
9
/** Numeric typed-array classes that can carry variable values. */
type ATypedArray =
  | Int8Array
  | Int16Array
  | Int32Array
  | Uint8Array
  | Uint16Array
  | Uint32Array
  | Float32Array
  | Float64Array;

/** A single, non-array value. */
type Scalar = string | number | boolean;

/**
 * Plain arrays nested one to four levels deep whose innermost items are
 * either scalars or typed arrays.
 */
type NDArray =
  | Scalar[]
  | Scalar[][]
  | Scalar[][][]
  | Scalar[][][][]
  | ATypedArray[]
  | ATypedArray[][]
  | ATypedArray[][][]
  | ATypedArray[][][][];

/** Any value a variable may hold: nested array, typed array or scalar. */
type Data = NDArray | ATypedArray | Scalar;
29
+
30
/**
 * Plain-object description of one variable: its metadata and, optionally,
 * its values.
 */
export type DataVariable = {
  /** Attribute map attached to the variable. */
  attrs: Record<string, unknown>;
  /** Names of the variable's dimensions, one per axis of `data`. */
  dims: string[];
  /** Name of the variable's data type, when known. */
  dtype?: string;
  /** Values of the variable, when present. */
  data?: Data;
};
51
+
52
/**
 * Reports whether a value is array-like, i.e. a plain array or an
 * ArrayBuffer view such as a typed array.
 *
 * Always returns a strict boolean; scalars and `undefined` yield `false`.
 *
 * @param data - The value to inspect.
 * @returns `true` for plain arrays and ArrayBuffer views, `false` otherwise.
 */
const isArray = (data?: Data): boolean => {
  return Array.isArray(data) || ArrayBuffer.isView(data);
};
55
+
56
/**
 * Derives the shape of a (possibly nested) array by following the first
 * element of every nesting level.
 *
 * Only the first element of each level is inspected, so ragged input is not
 * detected. Non-array input yields an empty shape.
 *
 * @param a - Scalar, typed array or nested array.
 * @returns The length found at each nesting level, outermost first.
 */
const getShape = (a: Data) => {
  const extents: number[] = [];
  let level: unknown = a;
  while (isArray(level as Data)) {
    const seq = level as ArrayLike<unknown>;
    extents.push(seq.length);
    level = seq[0];
  }
  return extents;
};
72
+
73
/**
 * Infers the zarr data type of a value from its first leaf element.
 *
 * Nested plain arrays are descended through their first element. A number
 * leaf maps to "float32", a string leaf to "int8", and a typed-array leaf to
 * the data type matching its class.
 *
 * @param data - Scalar, typed array or nested array to inspect.
 * @returns The inferred data type.
 * @throws Error if the leaf is missing (e.g. an empty array), or is of a
 *   kind that has no mapping (e.g. a boolean).
 */
const getDtype = (data: Data): DataType => {
  let leaf: unknown = data;
  while (Array.isArray(leaf)) {
    leaf = leaf[0];
  }

  if (typeof leaf === "number") return "float32";
  if (typeof leaf === "string") return "int8";

  // An empty array (or a null entry) has no leaf to inspect; fail with the
  // same kind of error as any other unsupported input instead of crashing
  // on a property access.
  if (leaf === undefined || leaf === null) {
    throw new Error("Unsupported data type: " + String(leaf));
  }

  const kind = (leaf as { constructor: { name: string } }).constructor.name;
  switch (kind) {
    case "Int8Array":
      return "int8";
    case "Int16Array":
      return "int16";
    case "Int32Array":
      return "int32";
    case "Uint8Array":
      return "uint8";
    case "Uint16Array":
      return "uint16";
    case "Uint32Array":
      return "uint32";
    case "Float32Array":
      return "float32";
    case "Float64Array":
      return "float64";
    default:
      throw new Error("Unsupported data type: " + kind);
  }
};
108
+
109
/**
 * Flattens nested data into a single one-dimensional container.
 *
 * - Non-array input (scalar or typed array) is returned untouched.
 * - Nested arrays of scalars come back as one flat plain array.
 * - Nested arrays of typed arrays are concatenated into one typed array of
 *   the same class as the first leaf; every leaf is written at a multiple of
 *   the first leaf's length.
 *
 * @param data - Scalar, typed array or nested array.
 * @returns The flattened data.
 */
const ravel = (data: Data) => {
  if (!Array.isArray(data)) return data;
  const leaves = (data as NDArray).flat(Infinity);
  const head = leaves[0];
  if (!isArray(head)) return leaves;

  // @ts-expect-error: Is array
  const width = head.length;
  // @ts-expect-error: Is array
  const packed = new head.constructor(leaves.length * width);
  let cursor = 0;
  for (const row of leaves) {
    packed.set(row, cursor);
    cursor += width;
  }
  return packed;
};
125
+
126
/**
 * Computes row-major strides, in elements, for a given shape: the last axis
 * has stride 1 and every earlier axis has the product of all later extents.
 *
 * @param shape - Extent of each axis, outermost first.
 * @returns One stride per axis, in the same order as `shape`.
 */
function get_strides(shape: readonly number[]) {
  const strides: number[] = [];
  let span = 1;
  for (let axis = shape.length - 1; axis >= 0; axis--) {
    strides.unshift(span);
    span *= shape[axis];
  }
  return strides;
}
135
+
136
/**
 * Rebuilds nested data from a flat buffer described by a shape and strides.
 *
 * The innermost axis is returned as a container of the same class as `data`;
 * every outer axis becomes a plain array. Strides are honoured on every
 * axis, including the innermost one, so buffers that are not laid out
 * row-major are read correctly.
 *
 * @param data - Flat buffer holding the values.
 * @param shape - Extent of each axis, outermost first.
 * @param stride - Step, in elements, between neighbours along each axis.
 *   A missing innermost stride is treated as 1.
 * @param offset - Index in `data` of the first element to read.
 * @returns A scalar for an empty shape, otherwise the nested data.
 */
function unravel<T extends DataType>(
  data: TypedArray<T>,
  shape: number[],
  stride: number[],
  offset = 0
): Data {
  const cells = data as unknown as ArrayLike<unknown> & {
    slice(start: number, end: number): unknown;
  };

  if (shape.length === 0) return cells[offset] as Data;

  if (shape.length === 1) {
    const count = shape[0];
    const step = stride.length > 0 ? stride[0] : 1;
    // Contiguous run: `slice` already returns a fresh copy of the same class.
    if (step === 1) return cells.slice(offset, offset + count) as Data;

    // Strided run: gather element by element into a container of the same
    // class as the source buffer.
    const Container = (
      data as unknown as {
        constructor: new (length: number) => Record<number, unknown>;
      }
    ).constructor;
    const picked = new Container(count);
    for (let i = 0; i < count; i++) {
      picked[i] = cells[offset + i * step];
    }
    return picked as unknown as Data;
  }

  const inner_shape = shape.slice(1);
  const inner_stride = stride.slice(1);
  const rows: unknown[] = new Array(shape[0]);
  for (let i = 0; i < shape[0]; i++) {
    rows[i] = unravel(data, inner_shape, inner_stride, offset + i * stride[0]);
  }
  return rows as Data;
}
163
+
164
/**
 * Expands a set of variables into one row object per combination of
 * dimension indices, appending the rows to `rows`.
 *
 * Dimensions are consumed in the key order of `dims`. While more than one
 * dimension remains, every variable whose `dims` mention the current
 * dimension is indexed along its FIRST axis and loses its first dimension
 * name; variables are therefore expected to list their dimensions in the
 * same order as `dims`. With one dimension left, each row takes element `i`
 * of every array variable and the plain value of every non-array variable.
 *
 * @param data - Variables keyed by name.
 * @param dims - Size of each remaining dimension.
 * @param rows - Accumulator that receives the generated rows.
 * @returns The same `rows` array. Nothing is added when `dims` is empty.
 */
const flatten = (
  data: Record<string, DataVariable>,
  dims: Record<string, number>,
  rows: Record<string, unknown>[]
): Record<string, unknown>[] => {
  const names = Object.keys(dims);
  if (names.length === 0) return rows;

  const lead = names[0];
  const size = dims[lead];

  if (names.length === 1) {
    // Decide once, not per row, which variables hold array data.
    const array_keys = new Set<string>();
    for (const k in data) {
      if (isArray(data[k].data)) array_keys.add(k);
    }

    for (let i = 0; i < size; i++) {
      const row: Record<string, unknown> = {};
      for (const k in data) {
        if (!array_keys.has(k)) {
          row[k] = data[k].data;
          continue;
        }
        const values = data[k].data as unknown as ArrayLike<unknown>;
        // A length-1 array still contributes its only element at index 0,
        // so a dimension of size 1 does not lose its columns.
        if (values.length > 1 || i < values.length) row[k] = values[i];
      }
      rows.push(row);
    }
    return rows;
  }

  for (let i = 0; i < size; i++) {
    const subdata: Record<string, DataVariable> = {};
    for (const k in data) {
      const variable = data[k];
      if (variable.dims.includes(lead)) {
        subdata[k] = {
          attrs: {},
          data: (variable.data as unknown as ArrayLike<unknown>)[
            i
          ] as DataVariable["data"],
          dims: variable.dims.slice(1),
        };
      } else {
        subdata[k] = variable;
      }
    }
    const subdims = { ...dims };
    delete subdims[lead];
    flatten(subdata, subdims, rows);
  }
  return rows;
};
211
+
212
/** Location within a listable, asynchronously readable store. */
export type DatameshStore = Location<Listable<AsyncReadable>>;
/** Location within a mutable store. */
export type TempStore = Location<Mutable>;

/**
 * Represents a data variable within a dataset: a zarr array together with
 * its id, dimension names and attributes.
 */
export class DataVar<
  DType extends DataType,
  S extends TempStore | DatameshStore
> {
  /** Identifier of the variable. */
  id: string;
  /** Dimension names of the variable, one per array axis. */
  dims: string[];
  /** Attributes of the variable. */
  attrs: Record<string, unknown>;
  /** The zarr array that holds the variable's values. */
  arr: S extends TempStore
    ? zarr.Array<DType, Mutable>
    : zarr.Array<DType, AsyncReadable>;

  /**
   * Creates an instance of DataVar.
   * @param id - The identifier for the data variable.
   * @param dims - The dimensions associated with the data variable.
   * @param attrs - The attributes of the data variable, represented as a record of key-value pairs.
   * @param arr - The zarr array associated with the data variable.
   */
  constructor(
    id: string,
    dims: string[],
    attrs: Record<string, unknown>,
    arr: S extends TempStore
      ? zarr.Array<DType, Mutable>
      : zarr.Array<DType, AsyncReadable>
  ) {
    this.id = id;
    this.dims = dims;
    this.attrs = attrs;
    this.arr = arr;
  }

  /**
   * Reads values from the zarr array. Every call reads through the
   * underlying array; this class keeps no copy of the result.
   *
   * @param slice - Optional per-axis selection (null, a Slice or an index).
   *   When omitted the whole array is read.
   * @returns A promise resolving to nested data when the selection yields a
   *   chunk, or to the bare value when the selection yields a scalar.
   */
  async get(
    slice?: (null | Slice | number)[] | null | undefined
  ): Promise<Data> {
    const result: Chunk<DType> | Scalar = await get(
      this.arr as zarr.Array<DType, AsyncReadable>,
      slice
    );
    // A chunk is an object carrying data/shape/stride; anything else is
    // already the selected scalar and must be returned as is.
    if (typeof result === "object" && result !== null) {
      return unravel(result.data, result.shape, result.stride);
    }
    return result as Data;
  }
}
269
+
270
/**
 * Represents a dataset with dimensions, data variables, and attributes,
 * rooted at a zarr location.
 */
export class Dataset<S extends DatameshStore | TempStore> {
  /** Size of each named dimension. */
  dims: Record<string, number>;
  /** Data variables keyed by variable id. */
  data_vars: S extends TempStore
    ? Record<string, DataVar<DataType, TempStore>>
    : Record<string, DataVar<DataType, DatameshStore>>;
  /** Dataset-level attributes. */
  attrs: Record<string, unknown>;
  /** Root zarr location of the dataset. */
  root: S;

  /**
   * Creates an instance of Dataset.
   * @param dims - The dimensions of the dataset.
   * @param data_vars - The data variables of the dataset.
   * @param attrs - The attributes of the dataset.
   * @param root - The root group of the dataset.
   */
  constructor(
    dims: Record<string, number>,
    data_vars: S extends TempStore
      ? Record<string, DataVar<DataType, TempStore>>
      : Record<string, DataVar<DataType, DatameshStore>>,
    attrs: Record<string, unknown>,
    root: S
  ) {
    this.data_vars = data_vars;
    this.dims = dims;
    this.attrs = attrs;
    this.root = root;
  }

  /**
   * Creates a Dataset instance from a Zarr store served over HTTP.
   *
   * Every array listed by the store becomes a data variable named after the
   * last segment of its path. Dimension names are read from each array's
   * `_ARRAY_DIMENSIONS` attribute and their sizes from the array shape.
   *
   * @param url - The URL of the zarr store.
   * @param authHeaders - The authentication headers.
   * @param parameters - Optional parameters for the request.
   * @param chunks - Optional chunking strategy.
   * @param downsample - Optional downsampling strategy.
   * @returns A promise that resolves to a Dataset instance.
   * @throws Error if two arrays report different sizes for the same dimension.
   */
  static async zarr(
    url: string,
    authHeaders: Record<string, string>,
    parameters?: Record<string, string | number>,
    chunks?: string,
    downsample?: Record<string, number>
  ): Promise<Dataset<DatameshStore>> {
    const store = await zarr.withConsolidated(
      new CachedHTTPStore(
        url,
        authHeaders,
        parameters,
        chunks,
        downsample,
        // True when there is no global `window`.
        // NOTE(review): presumably a "not in a browser" flag - confirm
        // against CachedHTTPStore.
        typeof window === "undefined"
      )
    );
    const root = await zarr.root(store);
    const group = await zarr.open(root, { kind: "group" });
    const data_vars = {} as Record<string, DataVar<DataType, DatameshStore>>;
    const dims = {} as Record<string, number>;
    for (const item of store.contents()) {
      if (item.kind !== "array") continue;
      const arr = await zarr.open(root.resolve(item.path), { kind: "array" });
      const array_dims = arr.attrs._ARRAY_DIMENSIONS as string[] | null;
      const vid = item.path.split("/").pop() as string;
      data_vars[vid] = new DataVar<DataType, DatameshStore>(
        vid,
        array_dims || [],
        arr.attrs as Record<string, unknown>,
        arr
      );
      if (!array_dims) continue;
      array_dims.forEach((dim: string, i: number) => {
        const n = (arr.shape as number[])[i];
        const known = dims[dim];
        // Compare against `undefined`, not truthiness, so that a dimension
        // of size 0 is still checked for consistency.
        if (known !== undefined && known !== n) {
          throw new Error(
            `Inconsistent dimension size for ${dim}: ${known} != ${n}`
          );
        }
        dims[dim] = n;
      });
    }
    return new Dataset<DatameshStore>(dims, data_vars, group.attrs, root);
  }

  /**
   * Initializes an in memory Dataset instance from a data object.
   *
   * The dimension sizes are copied, so dimensions added while assigning the
   * variables do not modify the caller's `datasource`.
   *
   * @param datasource - An object containing dimensions, data variables, and attributes.
   * @returns A promise that resolves to the populated Dataset.
   */
  static async init(datasource: Schema): Promise<Dataset<TempStore>> {
    const root = zarr.root(new Map());
    const ds = new Dataset(
      { ...datasource.dims },
      {},
      datasource.attrs || {},
      root
    );
    for (const k in datasource.data_vars) {
      const { dims, attrs, data }: DataVariable = datasource.data_vars[k];
      await ds.assign(k, dims, data as Data, attrs);
    }
    return ds;
  }

  /**
   * Converts the data variables into a dataframe format.
   *
   * @returns A promise that resolves to an array of records,
   * where each record represents a row in the dataframe.
   *
   * @remarks
   * This method reads every data variable in full, one after the other,
   * and then flattens the data into one row per combination of dimension
   * indices.
   *
   * @example
   * ```typescript
   * const dataframe = await instance.to_dataframe();
   * console.log(dataframe);
   * ```
   */
  async to_dataframe(): Promise<Record<string, unknown>[]> {
    const data = {} as Record<string, DataVariable>;
    for (const k in this.data_vars) {
      data[k] = {
        attrs: this.data_vars[k].attrs,
        dims: this.data_vars[k].dims,
      };
      data[k].data = (await this.data_vars[k].get()) as Data;
    }
    return flatten(data, { ...this.dims }, []);
  }

  /**
   * Asynchronously assigns data to a variable in the dataset.
   *
   * All dimensions are validated before any of them is recorded, so a failed
   * validation leaves the dataset's dimensions untouched.
   *
   * @param varid - The identifier for the variable.
   * @param dims - An array of dimension names corresponding to the data.
   * @param data - The data to be assigned, which can be a multi-dimensional array.
   * @param attrs - Optional. A record of attributes to be associated with the variable.
   * @param chunks - Optional. An array specifying the chunk sizes for the data.
   *   Defaults to a single chunk covering the whole array.
   * @returns A promise that resolves when the data has been successfully assigned.
   * @throws Will throw an error if the shape of the data does not match the provided dimensions.
   * @throws Will throw an error if an existing dimension size does not match the new data.
   */
  async assign(
    varid: string,
    dims: string[],
    data: Data,
    attrs?: Record<string, unknown>,
    chunks?: number[]
  ): Promise<void> {
    const shape = getShape(data);
    if (shape.length !== dims.length) {
      throw new Error("Data shape does not match dimensions");
    }

    // First pass: validate every dimension and collect the new ones, without
    // touching `this.dims` yet.
    const added: Record<string, number> = {};
    dims.forEach((dim, i) => {
      const known =
        this.dims[dim] !== undefined ? this.dims[dim] : added[dim];
      if (known === undefined) {
        added[dim] = shape[i];
      } else if (known !== shape[i]) {
        throw new Error(
          `Existing size of dimension ${dim} does not match new data`
        );
      }
    });
    // Second pass: record the new dimensions synchronously, before the first
    // await, so overlapping assign() calls validate against them.
    Object.assign(this.dims, added);

    const arr = await zarr.create(
      this.root.resolve(varid) as Location<Mutable>,
      {
        shape,
        data_type: getDtype(data),
        chunk_shape: chunks || shape,
      }
    );
    await set(
      arr,
      shape.map(() => null),
      {
        data: ravel(data),
        shape: shape,
        stride: get_strides(shape),
      }
    );
    this.data_vars[varid] = new DataVar(varid, dims, attrs || {}, arr);
  }
}
@@ -0,0 +1,139 @@
1
+ import { Geometry } from "geojson";
2
+ import dayjs from "dayjs";
3
+ import duration from "dayjs/plugin/duration";
4
+
5
+ import { DataVariable } from "./datamodel";
6
+
7
/**
 * Single-letter keys identifying the role a coordinate plays in a
 * data source.
 */
enum Coordinate {
  /** Locations assumed stationary; datasource multigeometry coordinate indexed by station coordinate. */
  Station = "s",
  Ensemble = "e",
  "Raster band" = "b",
  Category = "c",
  Quantile = "q",
  Season = "n",
  Month = "m",
  Time = "t",
  "Vertical coordinate" = "z",
  "Horizontal northerly" = "y",
  "Horizontal easterly" = "x",
  /** Abstract coordinate - a 2 or 3D geometry that defines a feature location. */
  Geometry = "g",
  /** Spectra. */
  Frequency = "f",
  /** Spectra or stats. */
  Direction = "d",
  Coordinate_i = "i",
  Coordinate_j = "j",
  Coordinate_k = "k",
}

/** Optional mapping from each coordinate key to a string. */
type Coordinates = Partial<Record<Coordinate, string>>;
33
+
34
/**
 * Describes the structure of a data source: its dimensions, variables,
 * optional coordinates and optional attributes.
 */
export type Schema = {
  /** Optional schema-level attributes. */
  attrs?: Record<string, string | number>;

  /** Size of each named dimension. */
  dims: Record<string, number>;

  /** Optional coordinate variables keyed by name. */
  coords?: Record<string, DataVariable>;

  /** Data variables keyed by name. */
  data_vars: Record<string, DataVariable>;
};
58
+
59
/**
 * Metadata record describing a data source.
 */
export type Datasource = {
  /** Unique identifier of the data source. */
  id: string;

  /** Name of the data source. */
  name: string;

  /** Optional description of the data source. */
  description?: string;

  /** Optional parameters associated with the data source. */
  parameters?: Record<string, unknown>;

  /** Optional geometry associated with the data source. */
  geom?: Geometry;

  /** Optional start time of the data source. */
  tstart?: dayjs.Dayjs;

  /** Optional end time of the data source. */
  tend?: dayjs.Dayjs;

  /** Optional forecast time period of the data source. */
  pforecast?: duration.Duration;

  /** Optional archive time period of the data source. */
  parchive?: duration.Duration;

  /** Optional tags attached to the data source. */
  tags?: string[];

  /** Optional additional information about the data source. */
  info?: Record<string, unknown>;

  /** Schema of the data source. */
  schema: Schema;

  /** Coordinate mappings of the data source. */
  coordinates: Coordinates;

  /** Optional additional details about the data source. */
  details?: string;

  /** Optional last-modified date of the data source. */
  last_modified?: Date;

  /** Name of the driver associated with the data source. */
  driver: string;
};