cradle-sdk 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cradle/sdk/types/workspace.py ADDED
@@ -0,0 +1,48 @@
+ from enum import StrEnum
+
+ from pydantic import BaseModel, Field
+
+ from .common import ArchivableResourceMixin, BaseListResponse, ErrorResponseMixin, ResourceResponse
+
+
+ class WorkspaceState(StrEnum):
+     PROVISIONING = "PROVISIONING"
+     DELETING = "DELETING"
+     READY = "READY"
+     DELETED = "DELETED"
+     FAILED = "FAILED"
+
+
+ class WorkspaceResponse(ResourceResponse, ErrorResponseMixin):
+     name: str
+     display_name: str
+     state: WorkspaceState
+     picture_file_id: int | None = Field(default=None)
+
+
+ class ProjectResponse(ResourceResponse, ArchivableResourceMixin):
+     name: str
+
+
+ class ListProjectResponse(BaseListResponse):
+     items: list[ProjectResponse]
+
+
+ class ProjectCreate(BaseModel):
+     name: str
+
+
+ class RoundResponse(ResourceResponse, ArchivableResourceMixin):
+     project_id: int
+     name: str
+     description: str | None
+
+
+ class ListRoundResponse(BaseListResponse):
+     items: list[RoundResponse]
+
+
+ class RoundCreate(BaseModel):
+     project_id: int = Field(description="The project to which the round belongs")
+     name: str = Field(description="The name of the round")
+     description: str | None = Field(default=None)
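For orientation, a minimal sketch of how the request models above might be used (editor's illustration, not code from the package; `projects_client` and `rounds_client` are hypothetical names for the SDK's resource clients, which this diff does not show):

    from cradle.sdk.types.workspace import ProjectCreate, RoundCreate

    # Build the request payloads; validation happens in pydantic.
    project_req = ProjectCreate(name="enzyme-screen")
    round_req = RoundCreate(project_id=1, name="round-1", description="pilot round")

    # Hypothetical client calls:
    # project = projects_client.create(project_req)
    # round_ = rounds_client.create(round_req)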
cradle/sdk/uploader.py ADDED
@@ -0,0 +1,445 @@
+ import csv as csv_std
+ import tempfile
+ import time
+ from abc import ABC, abstractmethod
+ from collections.abc import Callable
+ from datetime import UTC, datetime, timedelta
+ from pathlib import Path
+ from typing import Self
+
+ import openpyxl
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import pyarrow.csv as csv
+ import pyarrow.parquet as pq
+ from typing_extensions import TypeVar
+
+ from cradle.sdk.client import DataLoadClient
+ from cradle.sdk.types.common import ContextProject, ContextRound
+ from cradle.sdk.types.data import (
+     ArrayColumn,
+     ArrayType,
+     Column,
+     ColumnType,
+     DataLoadCreate,
+     DataLoadResponse,
+     DataLoadState,
+     FileFormat,
+     PrimitiveColumn,
+     PrimitiveType,
+     StructColumn,
+     StructType,
+     TableLoadConfig,
+     TypeNames,
+ )
+
+
+ class Transform(ABC):
+     """Base class for all transformations applied to the tabular data of a `File`."""
+
+     @abstractmethod
+     def apply(self, data: pa.Table) -> pa.Table:
+         """Apply the transformation to a table and return the transformed table."""
+         ...
+
+     def __call__(self, data: pa.Table) -> pa.Table:
+         return self.apply(data)
+
+
+ class File:
+     def __init__(self, data: pa.Table, source: Path | None, relative_path: Path | None = None):
+         self.data = data
+         self.source = source
+         self.relative_path = relative_path
+         self.columns: list[Column] = []
+
+         try:
+             for field in data.schema:
+                 self.columns.append(_convert_column(field.name, field.type))
+         except _NullTypeError as e:
+             raise ValueError(f"""
+                 {e}
+
+                 For CSV input files, provide an explicit column type (e.g. `convert_options=pyarrow.csv.ConvertOptions(column_types={{"c1": pa.string()}})`)
+                 or omit the affected column (via `ConvertOptions(include_columns=[...])`).
+             """) from e
+
+
+ class FileSet:
+     def __init__(self, files: list[File]):
+         if len(files) == 0:
+             raise ValueError("file list must not be empty")
+
+         for f in files[1:]:
+             if f.columns != files[0].columns:
+                 raise ValueError(
+                     f"all files must have the same schema, found {f.columns} but expected {files[0].columns}"
+                 )
+
+         self.files = files
+
+     @property
+     def columns(self) -> list[Column]:
+         return self.files[0].columns
+
+     def merge(self, other: "FileSet") -> "FileSet":
+         if self.columns != other.columns:
+             raise ValueError(f"FileSets have incompatible columns {self.columns} and {other.columns}")
+         return FileSet(self.files + other.files)
+
+     def transform(self, *transforms: Transform) -> "FileSet":
+         """Apply all transformations to all files.
+
+         Use this primarily to make the data in the files fit the schema of the target table.
+         Avoid transformations that prepare data for a specific task; those use cases should
+         instead be deferred to views over base tables in the platform.
+         """
+
+         # Iterate over transforms in the outer loop so that, for each transform,
+         # we know it succeeds for all files before proceeding. This makes
+         # debugging easier overall.
+         files = self.files[:]
+         for t in transforms:
+             files = [File(t(f.data), f.source, f.relative_path) for f in files]
+         return FileSet(files)
+
+     @classmethod
+     def _from_files(
+         cls,
+         open_fn: Callable[[Path], pa.Table],
+         directory: str | Path,
+         pattern: str | list[str],
+     ) -> Self:
+         if isinstance(directory, str):
+             directory = Path(directory)
+         if isinstance(pattern, str):
+             pattern = [pattern]
+
+         files = []
+         for p in pattern:
+             for f in directory.glob(p):
+                 relative_path = f.relative_to(directory)
+                 files.append(File(data=open_fn(f), source=f, relative_path=relative_path))
+         return cls(files)
+
+     @classmethod
+     def from_parquet(
+         cls,
+         directory: Path | str,
+         pattern: str | list[str],
+         **parquet_options,
+     ) -> Self:
+         """Create a `FileSet` from Parquet source files.
+
+         Args:
+             directory: The base directory relative to which the file path pattern is matched.
+             pattern: The glob pattern or list of patterns to match the Parquet files. The resulting paths
+                 relative to `directory` will be submitted as metadata with the uploaded files and be visible
+                 in the platform.
+             **parquet_options: Additional options to pass to `pyarrow.parquet.read_table` to control how the
+                 Parquet files are read.
+         """
+
+         def _open(file: Path) -> pa.Table:
+             return pq.read_table(file, **parquet_options)
+
+         return cls._from_files(_open, directory, pattern)
+
+     @classmethod
+     def from_csv(
+         cls,
+         directory: Path | str,
+         pattern: str | list[str],
+         **csv_options,
+     ) -> Self:
+         """Create a `FileSet` from CSV source files.
+
+         Args:
+             directory: The base directory relative to which the file path pattern is matched.
+             pattern: The glob pattern or list of patterns to match the CSV files. The resulting paths
+                 relative to `directory` will be submitted as metadata with the uploaded files and be visible
+                 in the platform.
+             **csv_options: Additional options to pass to `pyarrow.csv.read_csv` to control how the
+                 CSV files are read.
+         """
+
+         def _open(file: Path) -> pa.Table:
+             return csv.read_csv(file, **csv_options)
+
+         return cls._from_files(_open, directory, pattern)
+
+     @classmethod
+     def from_excel(
+         cls,
+         directory: Path | str,
+         pattern: str | list[str],
+         sheet: str | int,
+         **csv_options,
+     ) -> Self:
+         """Create a `FileSet` from Excel files (xlsx or xls).
+
+         Excel files are first converted to CSV as-is and then read as CSV files, as in `from_csv`.
+
+         Args:
+             directory: The base directory relative to which the file path pattern is matched.
+             pattern: The glob pattern or list of patterns to match the Excel files. The resulting paths
+                 relative to `directory` will be submitted as metadata with the uploaded files and be visible
+                 in the platform.
+             sheet: The sheet name or index to read from the Excel file. To read multiple sheets,
+                 call `from_excel` once per sheet.
+             **csv_options: Additional options to pass to `pyarrow.csv.read_csv` to control how the
+                 converted CSV files are read.
+         """
+
+         def _open(file: Path) -> pa.Table:
+             with tempfile.NamedTemporaryFile(suffix=".csv") as temp_csv:
+                 tmp_csv = Path(temp_csv.name)
+
+                 workbook = openpyxl.load_workbook(file, read_only=True, data_only=True)
+                 if isinstance(sheet, str):
+                     if sheet not in workbook.sheetnames:
+                         raise ValueError(f"Sheet '{sheet}' not found in {file}")
+                     data = workbook[sheet]
+                 else:
+                     if sheet >= len(workbook.sheetnames):
+                         raise ValueError(f"Sheet index {sheet} is out of range for {file}")
+                     data = workbook[workbook.sheetnames[sheet]]
+
+                 with tmp_csv.open("w", encoding="utf-8", newline="") as f:
+                     writer = csv_std.writer(f)
+                     for row in data.rows:
+                         writer.writerow([cell.value for cell in row])
+
+                 return csv.read_csv(tmp_csv, **csv_options)
+
+         return cls._from_files(_open, directory, pattern)
+
+
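A usage sketch for `FileSet` (editor's illustration, not part of the package source; the directory layout and column names are invented):

    from cradle.sdk.uploader import DropNullRows, FileSet, RenameColumns

    # Read all plate CSVs under data/ and normalize them to the target schema.
    files = FileSet.from_csv("data", "plates/*.csv")
    files = files.transform(
        RenameColumns({"Sequence": "sequence"}),  # align with the target table
        DropNullRows("sequence"),                 # drop rows without a sequence
    )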
+ class Uploader:
+     def __init__(self, client: DataLoadClient, context: ContextProject | ContextRound):
+         """Initialize the Uploader with a Client instance.
+
+         Args:
+             client: An authenticated Client instance.
+             context: The context of the upload.
+         """
+         self._load_id: int | None = None
+         self._client = client
+         self._context = context
+         self._tables: dict[str, FileSet] = {}
+
+     def add_files(self, table_name: str, files: FileSet) -> None:
+         """Add files to the upload for the specified table.
+
+         All files for a given table must have the same schema, which must be compatible
+         with the schema of the target table. The schema is compatible if all required
+         columns exist and have the right type.
+
+         Args:
+             table_name: The name of the table the files will be ingested into.
+             files: The files to upload.
+         """
+         existing = self._tables.get(table_name)
+         if existing is not None:
+             files = existing.merge(files)
+         self._tables[table_name] = files
+
+     def load(self) -> DataLoadResponse:
+         """Upload all files for this load.
+
+         For each added file this will upload the data in Parquet format to be ingested
+         into the target table. Additionally, the source file, if set, will be uploaded for archival purposes.
+
+         The load will be left in a pending state until `finalize` is called.
+
+         Returns:
+             DataLoadResponse: The load state retrieved from the server after all uploads completed.
+         """
+         if self._load_id is not None:
+             raise ValueError("Upload already started")
+
+         load = self._client.create(
+             DataLoadCreate(
+                 context=self._context,
+                 tables={
+                     name: TableLoadConfig(columns=files.columns, format=FileFormat.PARQUET)
+                     for name, files in self._tables.items()
+                 },
+             )
+         )
+         self._load_id = load.id
+
+         with tempfile.TemporaryDirectory(prefix="cradle-upload-") as temp_dir:
+             for ref, t in self._tables.items():
+                 base_dir = Path(temp_dir) / ref
+                 base_dir.mkdir(exist_ok=False, parents=True)
+
+                 for i, file in enumerate(t.files):
+                     description = None
+                     source_file_id = None
+                     if file.source is not None:
+                         print(f"Uploading source file {file.source}...")
+                         resp = self._client.upload_file(
+                             load_id=self._load_id,
+                             file=file.source,
+                             filepath=file.relative_path,
+                             table_reference=None,
+                         )
+                         description = f"Source file: {file.source} ({resp.id})"
+                         source_file_id = resp.id
+
+                     path = base_dir / f"{i}.parquet"
+                     print(f"Uploading file {path}...")
+                     pq.write_table(file.data, path)
+                     self._client.upload_file(
+                         load_id=self._load_id,
+                         file=path,
+                         filepath=(
+                             file.relative_path.with_suffix(".parquet") if file.relative_path is not None else None
+                         ),
+                         table_reference=ref,
+                         description=description,
+                         source_file_id=source_file_id,
+                     )
+
+         return self._client.get(load_id=self._load_id)
+
+     def finalize(self, wait: bool = True, timeout: float = 60) -> DataLoadResponse:
+         """Finalize the upload. Calls `load()` if it has not already been called.
+
+         Args:
+             wait: Whether to wait for the upload to complete.
+             timeout: Timeout in seconds for waiting for the upload to complete.
+
+         Returns:
+             DataLoadResponse: The most recent load state retrieved from the server.
+         """
+         if self._load_id is None:
+             load_id = self.load().id
+         else:
+             load_id = self._load_id
+
+         u = self._client.finalize(load_id=load_id)
+         if not wait:
+             return u
+
+         def _fn():
+             u = self._client.get(load_id=load_id)
+             if u.state == DataLoadState.FAILED:
+                 raise RuntimeError(f"Upload failed: {', '.join(u.errors)}")
+             return u.state == DataLoadState.COMPLETED, u
+
+         return _wait_for_condition(_fn, datetime.now(tz=UTC) + timedelta(seconds=timeout))
+
+
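An end-to-end sketch of the `Uploader` flow (editor's illustration; how `client` and `context` are obtained is outside this diff, and the table name is invented):

    from cradle.sdk.uploader import FileSet, Uploader

    # client: an authenticated DataLoadClient; context: a ContextProject or ContextRound.
    uploader = Uploader(client, context)
    uploader.add_files("measurements", FileSet.from_parquet("data", "*.parquet"))
    # finalize() calls load() implicitly and polls until COMPLETED, FAILED, or timeout.
    result = uploader.finalize(wait=True, timeout=120)
    print(result.state)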
+ class RenameColumns(Transform):
+     """Transform that renames columns in the table."""
+
+     def __init__(self, column_map: dict[str, str]):
+         """Args:
+             column_map: A dictionary mapping old column names to new column names.
+         """
+         self.column_map = column_map
+
+     def apply(self, data: pa.Table) -> pa.Table:
+         names = [self.column_map.get(name, name) for name in data.column_names]
+         return data.rename_columns(names)
+
+
+ class FilterRows(Transform):
+     """Filter rows in the table based on a provided condition.
+
+     Example usage: FilterRows(lambda t: pa.compute.greater(t.column("a"), 42))
+     """
+
+     def __init__(self, condition: Callable[[pa.Table], pa.Array]):
+         """Args:
+             condition: A function that computes a boolean array over all rows in the provided table.
+                 Rows for which the result is False will be dropped from the resulting table.
+         """
+         self._condition = condition
+
+     def apply(self, data: pa.Table) -> pa.Table:
+         return data.filter(self._condition(data))
+
+
+ class DropNullRows(FilterRows):
+     """Transform that removes rows with null values in the specified columns."""
+
+     def __init__(self, columns: str | list[str] | None = None):
+         """Args:
+             columns: Column or list of column names to check for nulls. If None, checks all columns.
+         """
+         if isinstance(columns, str):
+             columns = [columns]
+         self._columns = columns
+
+     def apply(self, data: pa.Table) -> pa.Table:
+         columns = self._columns
+         if columns is None:
+             columns = data.column_names
+         mask = pa.array([True] * len(data))
+
+         for col in columns:
+             if col not in data.column_names:
+                 raise ValueError(f"Column '{col}' does not exist in table")
+             mask = pc.and_(mask, pc.is_valid(data[col]))  # type: ignore[reportAttributeAccessIssue] # missing from the type stubs, but present at runtime
+
+         return data.filter(mask)
+
+
+ T = TypeVar("T")
+
+
+ def _wait_for_condition(cond_fn: Callable[[], tuple[bool, T]], deadline: datetime, interval: float = 1) -> T:
+     """Wait for the condition function to return True, failing once the deadline is exceeded. It is executed at least once."""
+
+     while True:
+         done, result = cond_fn()
+         if done:
+             return result
+         if datetime.now(tz=UTC) > deadline:
+             raise TimeoutError("Deadline exceeded")
+         time.sleep(interval)
+
+
+ class _NullTypeError(ValueError): ...
+
+
+ def _convert_type(type_: pa.DataType) -> ColumnType:
+     if pa.types.is_integer(type_):
+         return PrimitiveType(type=TypeNames.INT64)
+     if pa.types.is_floating(type_):
+         return PrimitiveType(type=TypeNames.FLOAT64)
+     if pa.types.is_boolean(type_):
+         return PrimitiveType(type=TypeNames.BOOL)
+     if pa.types.is_string(type_):
+         return PrimitiveType(type=TypeNames.STRING)
+     if pa.types.is_struct(type_):
+         return StructType(columns=[_convert_column(f.name, f.type) for f in type_.fields])
+     if pa.types.is_list(type_):
+         return ArrayType(item_type=_convert_type(type_.value_type))
+     if pa.types.is_null(type_):
+         raise _NullTypeError(f"Unsupported PyArrow type {type_}")
+     else:
+         raise ValueError(f"Unsupported PyArrow type {type_}")
+
+
+ def _convert_column(name: str, type_: pa.DataType) -> Column:
+     if pa.types.is_integer(type_):
+         return PrimitiveColumn(name=name, type=TypeNames.INT64, nullable=True)
+     if pa.types.is_floating(type_):
+         return PrimitiveColumn(name=name, type=TypeNames.FLOAT64, nullable=True)
+     if pa.types.is_boolean(type_):
+         return PrimitiveColumn(name=name, type=TypeNames.BOOL, nullable=True)
+     if pa.types.is_string(type_):
+         return PrimitiveColumn(name=name, type=TypeNames.STRING, nullable=True)
+     if pa.types.is_struct(type_):
+         columns = [_convert_column(f.name, f.type) for f in type_.fields]
+         return StructColumn(name=name, columns=columns, nullable=True)
+     if pa.types.is_list(type_):
+         return ArrayColumn(name=name, item_type=_convert_type(type_.value_type))
+     if pa.types.is_null(type_):
+         raise _NullTypeError(f"Unsupported PyArrow type {type_} for column {name}")
+     else:
+         raise ValueError(f"Unsupported PyArrow type {type_} for column {name}")
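To illustrate the schema conversion above (editor's sketch; `File` computes its `columns` via the private `_convert_column`): an Arrow int64 column maps to an INT64 primitive column, a list-of-string column to an array of STRING, and a null-typed column (what pyarrow infers for an all-empty CSV column) surfaces as the `ValueError` with the CSV hint raised in `File.__init__`:

    import pyarrow as pa
    from cradle.sdk.uploader import File

    table = pa.table({
        "count": pa.array([1, 2], type=pa.int64()),
        "tags": pa.array([["a"], []], type=pa.list_(pa.string())),
    })
    f = File(table, source=None)
    # f.columns now holds an INT64 PrimitiveColumn and an ArrayColumn of STRING items.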
cradle/sdk/utils.py ADDED
@@ -0,0 +1,40 @@
+ import json
+ import time
+ from pathlib import Path
+
+ from cradle.sdk.types.data import FileUploadResponse
+ from cradle.sdk.types.task import TaskResponse
+
+ from .client import DataLoadClient, TaskClient
+
+
+ def upload_blob_files(
+     client: DataLoadClient, load_id: int, files: list[Path], table_reference: str | None = None
+ ) -> list[FileUploadResponse]:
+     responses = []
+     for f in files:
+         response = client.upload_file(load_id=load_id, file=f, table_reference=table_reference)
+         print(f"Uploaded {f}, id={response.id}")
+         responses.append(response)
+     return responses
+
+
+ def wait_for_task(client: TaskClient, task_id: int, timeout: float = 60) -> TaskResponse:
+     start = time.monotonic()
+     while time.monotonic() - start < timeout:
+         task = client.get(task_id)
+         print(f"\r{task.state} @ {task.updated_at}", end="", flush=True)
+         if task.state == "COMPLETED":
+             if task.result is None:
+                 raise ValueError("Task result is None")
+             print()
+             print(json.dumps(task.result.model_dump(mode="json"), indent=2))
+             return task
+         if task.state == "FAILED":
+             print()
+             print(task.errors)
+             return task
+         if task.state == "CANCELLED":
+             return task
+         time.sleep(1)
+     return client.get(task_id)
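A short usage sketch for these helpers (editor's illustration; the ids are hypothetical placeholders):

    from cradle.sdk.utils import upload_blob_files, wait_for_task

    # task_client: an authenticated TaskClient; 123 is a placeholder task id.
    task = wait_for_task(task_client, task_id=123, timeout=300)
    if task.state != "COMPLETED":
        raise RuntimeError(f"task ended in state {task.state}")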
cradle_sdk-0.1.1.dist-info/METADATA ADDED
@@ -0,0 +1,17 @@
+ Metadata-Version: 2.3
+ Name: cradle-sdk
+ Version: 0.1.1
+ Summary:
+ Author: Cradle
+ Author-email: Cradle <eng@cradle.bio>
+ Requires-Dist: httpx>=0.25.0
+ Requires-Dist: pyjwt>=2.0.0
+ Requires-Dist: numpy>=1.23.0,<2.0.0
+ Requires-Dist: openpyxl>=3.1.0
+ Requires-Dist: pandas[excel]>=2.2.3
+ Requires-Dist: pyarrow>=19.0.1
+ Requires-Dist: pydantic[email]>=2.11.0,<3.0.0
+ Requires-Dist: keyring>=25.6.0
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+
cradle_sdk-0.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ cradle/sdk/__init__.py,sha256=wmR7yK3SKHqgoLv2oEIKVnns-a-imA6N0obI7G-EcoU,263
+ cradle/sdk/auth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cradle/sdk/auth/device.py,sha256=oiLHfVwN5cGu9z7vID7uzqsqq44-OIo_LirZSduj_B8,19774
+ cradle/sdk/client.py,sha256=VN1YBZxlOq177w-tkIZiv4MGT1E7xCxyrkJwj8H1mEw,37577
+ cradle/sdk/exceptions.py,sha256=kWQ3OAlkaTIJbu61kazUxxiZzPpnwbZwOA7CLCePGkg,413
+ cradle/sdk/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cradle/sdk/types/assembly.py,sha256=l38It_fDHNo0mbQHCw_DaYc03mDDzToepVnK0z3anVI,333
+ cradle/sdk/types/common.py,sha256=iSJcJfrGKWqxPORD2Joa8x7_jcYRkR8a0CuLdiNZ5vU,2277
+ cradle/sdk/types/data.py,sha256=4aGIsvXneS7ULWJSIGZG2Kle09R3WbqHNNVALgVpk2M,13979
+ cradle/sdk/types/task.py,sha256=ge6JbLoVhMCN6nhPwX7a0zVNW9vF9ih4N3F9L9sQM2k,45270
+ cradle/sdk/types/workspace.py,sha256=X9_MwykGIMwJYH0iQLBfaLyB1wjNWrtdXtNf6fYGFNg,1142
+ cradle/sdk/uploader.py,sha256=F7xNDc073lPXkC0km_9mknWUGXrwi6Y5FtGp3vv3awg,16785
+ cradle/sdk/utils.py,sha256=CyWsC29MCIBeLrzr_MT7IOGhCjbOyvjEmEH6hetRLfM,1341
+ cradle_sdk-0.1.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ cradle_sdk-0.1.1.dist-info/METADATA,sha256=TPKcF6LUhGfqg8q1YOJr4ai-hoFk5fP9j8WeECN52Ko,450
+ cradle_sdk-0.1.1.dist-info/RECORD,,
cradle_sdk-0.1.1.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: uv 0.9.17
+ Root-Is-Purelib: true
+ Tag: py3-none-any