dyff-schema 0.18.0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dyff/schema/dataset/embedding.py +4 -0
- dyff/schema/v0/r1/base.py +4 -1
- dyff/schema/v0/r1/dataset/arrow.py +23 -13
- dyff/schema/v0/r1/dataset/binary.py +5 -1
- dyff/schema/v0/r1/dataset/embedding.py +26 -0
- dyff/schema/v0/r1/platform.py +17 -1
- dyff/schema/v0/r1/requests.py +21 -0
- {dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/METADATA +1 -1
- {dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/RECORD +13 -11
- {dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/WHEEL +1 -1
- {dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/LICENSE +0 -0
- {dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/NOTICE +0 -0
- {dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/top_level.txt +0 -0
dyff/schema/v0/r1/base.py
CHANGED
@@ -547,8 +547,11 @@ def uint64(
     return type("UInt64Value", (UInt64,), namespace)


+_ListElementT = TypeVar("_ListElementT")
+
+
 def list_(
-    item_type: Type[
+    item_type: Type[_ListElementT], *, list_size: Optional[int] = None
 ) -> Type[list]:
     if list_size is None:
         return pydantic.conlist(item_type)
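For context, a minimal usage sketch (not taken from the diff) of the reworked list_ helper as a pydantic v1 field type; the TagList model and its field are hypothetical, and list_size is assumed to fix the list length, matching the fixed-length embedding helper added elsewhere in this release.

# Hypothetical usage sketch, not part of the diff. list_size is assumed to
# constrain the list to exactly that many elements.
import pydantic

from dyff.schema.v0.r1.base import DyffSchemaBaseModel, list_


class TagList(DyffSchemaBaseModel):  # hypothetical model, for illustration only
    tags: list_(str, list_size=3) = pydantic.Field(  # type: ignore[valid-type]
        description="Exactly three tags"
    )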
dyff/schema/v0/r1/dataset/arrow.py
CHANGED

@@ -8,7 +8,7 @@ import functools
 import inspect
 import typing
 import uuid
-from typing import Any, Iterable, Optional
+from typing import Any, Iterable, Literal, Optional

 import pyarrow
 import pyarrow.dataset
@@ -51,7 +51,10 @@ def make_response_schema(schema: pyarrow.Schema) -> pyarrow.Schema:
     """Given an Arrow schema, create a new one that has the extra ``ResponseItem``
     fields added."""
     response_item_schema = make_response_item_schema(schema)
-    fields =
+    fields = [
+        pyarrow.field(n, t)
+        for n, t in zip(response_item_schema.names, response_item_schema.types)
+    ]
     item_type = pyarrow.struct(fields)
     responses_type = pyarrow.list_(item_type)
     return pyarrow.schema(
@@ -65,7 +68,8 @@ def make_response_schema(schema: pyarrow.Schema) -> pyarrow.Schema:

 def encode_schema(schema: pyarrow.Schema) -> str:
     """Encode an Arrow schema as a string."""
-
+    # pyarrow.Buffer doesn't satisfy ReadableBuffer but it still works
+    return binary.encode(schema.serialize())  # type: ignore[arg-type]


 def decode_schema(schema: str) -> pyarrow.Schema:
@@ -84,7 +88,7 @@ def subset_schema(schema: pyarrow.Schema, field_names: list[str]) -> pyarrow.Sch
     return pyarrow.schema(fields)


-def arrow_type(annotation: type):
+def arrow_type(annotation: type) -> pyarrow.DataType:
     """Determine a suitable arrow type for a pydantic model field.

     Supports primitive types as well as pydantic sub-models, lists, and optional types.
@@ -130,8 +134,7 @@ def arrow_type(annotation: type):

     if issubclass(annotation, DType):
         # The dtype is in the metaclass
-        return type(annotation).dtype  # type: ignore
-        # return pyarrow.from_numpy_dtype(type(annotation).dtype)  # type: ignore
+        return pyarrow.from_numpy_dtype(type(annotation).dtype)  # type: ignore[attr-defined]

     if annotation == bool:
         return pyarrow.bool_()
@@ -246,6 +249,7 @@ def _construct_field_docs(
     if pyarrow.types.is_struct(field.type):
         children = [field.type.field(i) for i in range(field.type.num_fields)]
     elif pyarrow.types.is_list(field.type):
+        assert isinstance(field.type, pyarrow.ListType)
         children = [field.type.value_field]
     else:
         raise ValueError(f"Unsupported nested type {field.type}")
@@ -275,8 +279,10 @@ def write_dataset(
     *,
     output_path: str,
     feature_schema: pyarrow.Schema,
-    partition_schema: pyarrow.Schema = None,
-    existing_data_behavior:
+    partition_schema: Optional[pyarrow.Schema] = None,
+    existing_data_behavior: Literal[
+        "error", "overwrite_or_ignore", "delete_matching"
+    ] = "overwrite_or_ignore",
     **kwargs,
 ):
     """Creates a ``pyarrow.dataset.Dataset`` from a data generator.
@@ -291,15 +297,19 @@ def write_dataset(
         existing_data_behavior: Same as ``pyarrow.dataset.write_dataset``, but
             defaults to ``"overwrite_or_ignore"``, which is typically what we want.
     """
-    partitioning =
-        partition_schema, flavor="hive"
+    partitioning = (
+        pyarrow.dataset.partitioning(partition_schema, flavor="hive")
+        if partition_schema is not None
+        else None
     )
     pyarrow.dataset.write_dataset(
         data_generator,
         output_path,
         format="parquet",
         schema=feature_schema,
-
+        # Type annotation doesn't include PartitioningFactory even though
+        # you're clearly meant to pass the output of partitioning() here
+        partitioning=partitioning,  # type: ignore[arg-type]
         existing_data_behavior=existing_data_behavior,
         **kwargs,
     )
@@ -326,10 +336,10 @@ def batches(
     for instance in instances:
         batch.append(instance)
         if len(batch) == batch_size:
-            yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)
+            yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)  # type: ignore[attr-defined]
            batch = []
     if batch:  # Final (incomplete) batch
-        yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)
+        yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)  # type: ignore[attr-defined]


 __all__ = [
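As a sanity check on the typing changes above, a small round-trip sketch (an assumption, not shown in the diff): encode_schema serializes an Arrow schema to a base64 string via binary.encode, and decode_schema restores it.

# Round-trip sketch, assumption only; not part of the diff.
import pyarrow

from dyff.schema.v0.r1.dataset import arrow

schema = pyarrow.schema([pyarrow.field("text", pyarrow.string())])
encoded = arrow.encode_schema(schema)  # plain str, safe to store alongside metadata
assert arrow.decode_schema(encoded).equals(schema)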
dyff/schema/v0/r1/dataset/binary.py
CHANGED

@@ -3,9 +3,13 @@

 import base64
 import hashlib
+import typing

+if typing.TYPE_CHECKING:
+    from _typeshed import ReadableBuffer

-
+
+def encode(data: "ReadableBuffer") -> str:
     return base64.b64encode(data).decode("utf-8")


dyff/schema/v0/r1/dataset/embedding.py
ADDED

@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: 2024 UL Research Institutes
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Type
+
+import pydantic
+
+from ..base import DyffSchemaBaseModel, FixedWidthFloat, list_
+
+
+def embedding(
+    element_type: Type[FixedWidthFloat], size: int
+) -> Type[DyffSchemaBaseModel]:
+    """Returns a schema type representing a list of fixed-length embedding vectors."""
+
+    class _Embedding(DyffSchemaBaseModel):
+        embedding: list_(element_type, list_size=size) = pydantic.Field(  # type: ignore[valid-type]
+            description="An embedding vector"
+        )
+
+    return _Embedding
+
+
+__all__ = [
+    "embedding",
+]
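A hypothetical usage sketch for the new embedding() factory (not part of the diff). Float32 is assumed to be one of the fixed-width float types exported by dyff.schema.v0.r1.base; the diff itself only shows the FixedWidthFloat bound.

# Hypothetical usage; Float32 is an assumed export of dyff.schema.v0.r1.base.
from dyff.schema.v0.r1.base import Float32
from dyff.schema.v0.r1.dataset.embedding import embedding

Embedding384 = embedding(Float32, 384)
item = Embedding384(embedding=[0.0] * 384)  # wrong length or element type should fail validation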
dyff/schema/v0/r1/platform.py
CHANGED
@@ -179,6 +179,14 @@ def summary_maxlen() -> int:
     return 280


+def entity_id_regex() -> str:
+    """An entity ID is a 32-character HEX string.
+
+    TODO: This doesn't check whether the hex string is a valid UUID.
+    """
+    return r"^[a-f0-9]{32}$"
+
+
 class Entities(str, enum.Enum):
     """The kinds of entities in the dyff system."""

@@ -245,6 +253,9 @@ class Resources(str, enum.Enum):
         raise ValueError(f"No Resources for Entity kind: {kind}")


+EntityID: TypeAlias = pydantic.constr(regex=entity_id_regex())  # type: ignore
+
+
 class DyffModelWithID(DyffSchemaBaseModel):
     id: str = pydantic.Field(description="Unique identifier of the entity")
     account: str = pydantic.Field(description="Account that owns the entity")
@@ -418,6 +429,10 @@ class DyffEntity(Status, Labeled, SchemaVersion, DyffModelWithID):
         default=None, description="Resource creation time (assigned by system)"
     )

+    lastTransitionTime: Optional[datetime] = pydantic.Field(
+        default=None, description="Time of last (status, reason) change."
+    )
+
     @abc.abstractmethod
     def dependencies(self) -> list[str]:
         """List of IDs of resources that this resource depends on.
@@ -843,7 +858,7 @@ class DataSchema(DyffSchemaBaseModel):
     def make_output_schema(
         schema: Union[pyarrow.Schema, Type[DyffSchemaBaseModel], DyffDataSchema],
     ) -> "DataSchema":
-        """Construct a complete ``DataSchema`` for inference
+        """Construct a complete ``DataSchema`` for inference outputs.

         This function will add required special fields for input data and then
         convert the augmented schema as necessary to populate at least the
@@ -2234,6 +2249,7 @@ __all__ = [
     "DyffModelWithID",
     "DyffSchemaBaseModel",
     "Entities",
+    "EntityID",
     "Evaluation",
     "EvaluationBase",
     "ExtractorStep",
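A brief sketch (assumption, not from the diff) of the new EntityID alias in use; the LinkedResource model is hypothetical and exists only for illustration.

# Hypothetical sketch: EntityID is a pydantic constr() type constrained by
# entity_id_regex(), i.e. a 32-character lowercase hex string.
import pydantic

from dyff.schema.v0.r1.platform import EntityID


class LinkedResource(pydantic.BaseModel):  # hypothetical model, for illustration
    dataset: EntityID


LinkedResource(dataset="a" * 32)        # accepted: 32 lowercase hex characters
# LinkedResource(dataset="not-an-id")   # rejected: raises pydantic.ValidationError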
dyff/schema/v0/r1/requests.py
CHANGED
@@ -14,6 +14,7 @@ in response.

 from __future__ import annotations

+import re
 from datetime import datetime
 from typing import Optional, Union

@@ -22,6 +23,7 @@ import pydantic
 from .base import DyffBaseModel
 from .platform import (
     AnalysisBase,
+    AnalysisScope,
     DatasetBase,
     DataView,
     DocumentationBase,
@@ -70,6 +72,25 @@ class AnalysisCreateRequest(DyffEntityCreateRequest, AnalysisBase):

     method: str = pydantic.Field(description="Method ID")

+    @pydantic.validator("scope", check_fields=False)
+    def _validate_scope(cls, scope: AnalysisScope) -> AnalysisScope:
+        # TODO: This has to be a validator function because we can't apply the
+        # regex contraint to AnalysisScope, since there are already entities
+        # with invalid IDs in the data store. Fix in Schema v1.
+        uuid4 = r"^[0-9a-f]{8}[0-9a-f]{4}[4][0-9a-f]{3}[89ab][0-9a-f]{3}[0-9a-f]{12}$"
+        id_pattern = re.compile(uuid4)
+        if scope.dataset is not None and not re.match(id_pattern, scope.dataset):
+            raise ValueError("scope.dataset must be an entity ID")
+        if scope.evaluation is not None and not re.match(id_pattern, scope.evaluation):
+            raise ValueError("scope.evaluation must be an entity ID")
+        if scope.inferenceService is not None and not re.match(
+            id_pattern, scope.inferenceService
+        ):
+            raise ValueError("scope.inferenceService must be an entity ID")
+        if scope.model is not None and not re.match(id_pattern, scope.model):
+            raise ValueError("scope.model must be an entity ID")
+        return scope
+

 class DatasetCreateRequest(DyffEntityCreateRequest, DatasetBase):
     pass
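For illustration, a quick check of the UUID4 pattern used by the new scope validator (the example IDs below are made up): entity_id_regex() accepts any 32-character hex string, while this validator additionally requires the version and variant nibbles of a version-4 UUID.

# Illustration only; the example IDs are fabricated for this demonstration.
import re

uuid4 = r"^[0-9a-f]{8}[0-9a-f]{4}[4][0-9a-f]{3}[89ab][0-9a-f]{3}[0-9a-f]{12}$"
id_pattern = re.compile(uuid4)

assert id_pattern.match("0f8b1c2d3e4a4b5c8d6e7f8091a2b3c4")      # valid UUID4 hex
assert not id_pattern.match("0f8b1c2d3e4a1b5c8d6e7f8091a2b3c4")  # version nibble is not 4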
{dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/RECORD
CHANGED

@@ -15,6 +15,7 @@ dyff/schema/dataset/__init__.py,sha256=P4tOKKiOFaVeh3-Keiwpg9n7VTQUJQVOIVZhm8sdA
 dyff/schema/dataset/arrow.py,sha256=1tkgbXcvU0Wy-HM64ddaHfAKJos6FLXxGxM9l8xLcjY,129
 dyff/schema/dataset/binary.py,sha256=jm73xo-mSMzh1GuI1uUZ2JulY7h2aJqV9CklrD_wScE,130
 dyff/schema/dataset/classification.py,sha256=nXfFeuAA-wGoiatZ6KzZPSWC078a_6nrHRsKeRmjbAw,138
+dyff/schema/dataset/embedding.py,sha256=yp817Op-NsS66MQJjtJuV1r2NTBcPmWmz3BaYeAflg4,133
 dyff/schema/dataset/text.py,sha256=_mHnCM1oPIqSBqOeggCw0IUmoxmm4_J64GjF-xdOY-4,128
 dyff/schema/dataset/vision.py,sha256=D2wCN54xw8m6yYtiFYjvB15t7PUEREZYUo2A76cejv4,130
 dyff/schema/io/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
@@ -22,22 +23,23 @@ dyff/schema/io/vllm.py,sha256=2q05M_-lTzq9oywKXHPPpCFCSDVCSsRQqtmERzWTtio,123
 dyff/schema/v0/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/adapters.py,sha256=2t2oxsnGfSEDKKDIEYw4qqLXMH7qlFIwPVuLyUmbsHs,23552
-dyff/schema/v0/r1/base.py,sha256=
-dyff/schema/v0/r1/platform.py,sha256=
-dyff/schema/v0/r1/requests.py,sha256=
+dyff/schema/v0/r1/base.py,sha256=IpvlYDr6JjYo6tn8XW4C1Fpgd_uqzZGZsG_cuEn_gQs,19441
+dyff/schema/v0/r1/platform.py,sha256=c-XBKsACEmz_VptGDGCDsULGYtxDajVZ6H7mzjxXHGY,74405
+dyff/schema/v0/r1/requests.py,sha256=UC2jK_upahjemie2s_Acg9xd4NDlwlwUUAOGnpI2iS0,13058
 dyff/schema/v0/r1/test.py,sha256=X6dUyVd5svcPCI-PBMOAqEfK9jv3bRDvkQTJzwS96c0,10720
 dyff/schema/v0/r1/version.py,sha256=isKAGuGxsdru8vDaYmI4YiZdJOu_wNxXK7u6QzD6FE4,392
 dyff/schema/v0/r1/dataset/__init__.py,sha256=LbVlkO2asyGYBKk2z49xjJYTM-pu9y9e4eQDXgTDLnM,2553
-dyff/schema/v0/r1/dataset/arrow.py,sha256=
-dyff/schema/v0/r1/dataset/binary.py,sha256=
+dyff/schema/v0/r1/dataset/arrow.py,sha256=PHNtBe32e4NXNQIqUMh0SxCPzTdidVkKBaFqYr6-aFo,12857
+dyff/schema/v0/r1/dataset/binary.py,sha256=KXvn79SUt3e_ZZXrju2atT_yMFwgAkCgDYXBtfv0E_I,636
 dyff/schema/v0/r1/dataset/classification.py,sha256=pbbEXhxyZ0pgYwzaTlM8hVHPNEJDCdHKOeGowPXgWYc,311
+dyff/schema/v0/r1/dataset/embedding.py,sha256=qcHHIzpFQj9bQ2vuECO9g1EU4WT1yjvUZbTlw0qwkc8,642
 dyff/schema/v0/r1/dataset/text.py,sha256=nLIn91Zlt0tNdXUklSgjJ-kEDxoPX32ISLkiv2DzLvE,1008
 dyff/schema/v0/r1/dataset/vision.py,sha256=aIe0fbfM_g3DsrDTdg2K803YKLjZBpurM_VJcJFuZLc,369
 dyff/schema/v0/r1/io/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/io/vllm.py,sha256=CUE9y8KthtUI7sD49S875rDmPvKotSXVIRaBS79aBZs,5320
-dyff_schema-0.
-dyff_schema-0.
-dyff_schema-0.
-dyff_schema-0.
-dyff_schema-0.
-dyff_schema-0.
+dyff_schema-0.19.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+dyff_schema-0.19.0.dist-info/METADATA,sha256=r7aFHs035B-teQqNrDhlCKYNSicfqzmwXU_8ADCAigo,3482
+dyff_schema-0.19.0.dist-info/NOTICE,sha256=YONACu0s_Ui6jNi-wtEsVQbTU1JIkh8wvLH6d1-Ni_w,43
+dyff_schema-0.19.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+dyff_schema-0.19.0.dist-info/top_level.txt,sha256=9e3VVdeX73t_sUJOPQPCcGtYO1JhoErhHIi3WoWGcFI,5
+dyff_schema-0.19.0.dist-info/RECORD,,
{dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/LICENSE
File without changes

{dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/NOTICE
File without changes

{dyff_schema-0.18.0.dist-info → dyff_schema-0.19.0.dist-info}/top_level.txt
File without changes