dyff-schema 0.18.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



dyff/schema/dataset/embedding.py ADDED
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2024 UL Research Institutes
+# SPDX-License-Identifier: Apache-2.0
+
+from ..v0.r1.dataset.embedding import *
dyff/schema/v0/r1/base.py CHANGED
@@ -547,8 +547,11 @@ def uint64(
     return type("UInt64Value", (UInt64,), namespace)
 
 
+_ListElementT = TypeVar("_ListElementT")
+
+
 def list_(
-    item_type: Type[pydantic.BaseModel], *, list_size: Optional[int] = None
+    item_type: Type[_ListElementT], *, list_size: Optional[int] = None
 ) -> Type[list]:
     if list_size is None:
         return pydantic.conlist(item_type)
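
A minimal usage sketch of what the new ``_ListElementT`` TypeVar allows (import path assumed, and assuming ``list_size`` pins both the minimum and maximum length): ``list_`` now accepts arbitrary element types such as builtins, not only ``pydantic.BaseModel`` subclasses.

    import pydantic
    from dyff.schema.v0.r1.base import list_  # assumed import path

    class Point(pydantic.BaseModel):
        coords: list_(float, list_size=3)  # fixed-length list of 3 floats

    Point(coords=[1.0, 2.0, 3.0])  # validates
    # Point(coords=[1.0]) would raise pydantic.ValidationError
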
dyff/schema/v0/r1/dataset/arrow.py CHANGED
@@ -8,7 +8,7 @@ import functools
 import inspect
 import typing
 import uuid
-from typing import Any, Iterable, Optional
+from typing import Any, Iterable, Literal, Optional
 
 import pyarrow
 import pyarrow.dataset
@@ -51,7 +51,10 @@ def make_response_schema(schema: pyarrow.Schema) -> pyarrow.Schema:
     """Given an Arrow schema, create a new one that has the extra ``ResponseItem``
     fields added."""
     response_item_schema = make_response_item_schema(schema)
-    fields = list(zip(response_item_schema.names, response_item_schema.types))
+    fields = [
+        pyarrow.field(n, t)
+        for n, t in zip(response_item_schema.names, response_item_schema.types)
+    ]
     item_type = pyarrow.struct(fields)
     responses_type = pyarrow.list_(item_type)
     return pyarrow.schema(
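
For reference, a standalone sketch of the pattern the fix adopts: the struct type is built from explicit ``pyarrow.field`` objects rather than bare ``(name, type)`` tuples, which satisfies the type checker.

    import pyarrow

    # Build a struct/list type from explicit fields, as in the fix above.
    fields = [
        pyarrow.field("text", pyarrow.string()),
        pyarrow.field("score", pyarrow.float64()),
    ]
    item_type = pyarrow.struct(fields)
    responses_type = pyarrow.list_(item_type)
    print(responses_type)  # list<item: struct<text: string, score: double>>
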
@@ -65,7 +68,8 @@ def make_response_schema(schema: pyarrow.Schema) -> pyarrow.Schema:
 
 def encode_schema(schema: pyarrow.Schema) -> str:
     """Encode an Arrow schema as a string."""
-    return binary.encode(schema.serialize())
+    # pyarrow.Buffer doesn't satisfy ReadableBuffer but it still works
+    return binary.encode(schema.serialize())  # type: ignore[arg-type]
 
 
 def decode_schema(schema: str) -> pyarrow.Schema:
@@ -84,7 +88,7 @@ def subset_schema(schema: pyarrow.Schema, field_names: list[str]) -> pyarrow.Sch
     return pyarrow.schema(fields)
 
 
-def arrow_type(annotation: type):
+def arrow_type(annotation: type) -> pyarrow.DataType:
    """Determine a suitable arrow type for a pydantic model field.

    Supports primitive types as well as pydantic sub-models, lists, and optional types.
@@ -130,8 +134,7 @@ def arrow_type(annotation: type):
 
     if issubclass(annotation, DType):
         # The dtype is in the metaclass
-        return type(annotation).dtype  # type: ignore
-        # return pyarrow.from_numpy_dtype(type(annotation).dtype)  # type: ignore
+        return pyarrow.from_numpy_dtype(type(annotation).dtype)  # type: ignore[attr-defined]
 
     if annotation == bool:
         return pyarrow.bool_()
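
A standalone illustration of the restored call: ``pyarrow.from_numpy_dtype`` maps a numpy dtype to the corresponding Arrow ``DataType``, which matches the new return annotation of ``arrow_type``.

    import numpy as np
    import pyarrow

    # from_numpy_dtype converts numpy dtypes to Arrow DataTypes.
    print(pyarrow.from_numpy_dtype(np.dtype("float32")))  # float
    print(pyarrow.from_numpy_dtype(np.dtype("int64")))    # int64
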
@@ -246,6 +249,7 @@ def _construct_field_docs(
     if pyarrow.types.is_struct(field.type):
         children = [field.type.field(i) for i in range(field.type.num_fields)]
     elif pyarrow.types.is_list(field.type):
+        assert isinstance(field.type, pyarrow.ListType)
         children = [field.type.value_field]
     else:
         raise ValueError(f"Unsupported nested type {field.type}")
@@ -275,8 +279,10 @@ def write_dataset(
     *,
     output_path: str,
     feature_schema: pyarrow.Schema,
-    partition_schema: pyarrow.Schema = None,
-    existing_data_behavior: str = "overwrite_or_ignore",
+    partition_schema: Optional[pyarrow.Schema] = None,
+    existing_data_behavior: Literal[
+        "error", "overwrite_or_ignore", "delete_matching"
+    ] = "overwrite_or_ignore",
     **kwargs,
 ):
     """Creates a ``pyarrow.dataset.Dataset`` from a data generator.
@@ -291,15 +297,19 @@ def write_dataset(
       existing_data_behavior: Same as ``pyarrow.dataset.write_dataset``, but
         defaults to ``"overwrite_or_ignore"``, which is typically what we want.
     """
-    partitioning = partition_schema and pyarrow.dataset.partitioning(
-        partition_schema, flavor="hive"
+    partitioning = (
+        pyarrow.dataset.partitioning(partition_schema, flavor="hive")
+        if partition_schema is not None
+        else None
     )
     pyarrow.dataset.write_dataset(
         data_generator,
         output_path,
         format="parquet",
         schema=feature_schema,
-        partitioning=partitioning,
+        # Type annotation doesn't include PartitioningFactory even though
+        # you're clearly meant to pass the output of partitioning() here
+        partitioning=partitioning,  # type: ignore[arg-type]
         existing_data_behavior=existing_data_behavior,
         **kwargs,
     )
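
A standalone sketch of the underlying pyarrow calls (hypothetical output path and data): ``pyarrow.dataset.partitioning`` returns a ``Partitioning`` or ``PartitioningFactory`` depending on its arguments, and ``write_dataset`` accepts either at runtime even though its annotation omits the factory, hence the ignore above.

    import pyarrow
    import pyarrow.dataset

    table = pyarrow.table({"split": ["train", "test"], "x": [1, 2]})
    # Hive-style partitioning on the "split" column.
    part = pyarrow.dataset.partitioning(
        pyarrow.schema([("split", pyarrow.string())]), flavor="hive"
    )
    pyarrow.dataset.write_dataset(
        table,
        "/tmp/example_dataset",  # hypothetical output path
        format="parquet",
        partitioning=part,
        existing_data_behavior="overwrite_or_ignore",
    )
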
@@ -326,10 +336,10 @@ def batches(
     for instance in instances:
         batch.append(instance)
         if len(batch) == batch_size:
-            yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)
+            yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)  # type: ignore[attr-defined]
             batch = []
     if batch:  # Final (incomplete) batch
-        yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)
+        yield pyarrow.RecordBatch.from_pylist(batch, schema=schema)  # type: ignore[attr-defined]
 
 
 __all__ = [
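
The ignores above silence stubs that don't declare ``RecordBatch.from_pylist``, although the method exists at runtime in recent pyarrow versions. A quick check:

    import pyarrow

    schema = pyarrow.schema([("x", pyarrow.int64())])
    batch = pyarrow.RecordBatch.from_pylist([{"x": 1}, {"x": 2}], schema=schema)
    print(batch.num_rows)  # 2
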
dyff/schema/v0/r1/dataset/binary.py CHANGED
@@ -3,9 +3,13 @@
 
 import base64
 import hashlib
+import typing
 
+if typing.TYPE_CHECKING:
+    from _typeshed import ReadableBuffer
 
-def encode(data: bytes) -> str:
+
+def encode(data: "ReadableBuffer") -> str:
     return base64.b64encode(data).decode("utf-8")
 
 
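
A small sketch of what the widened annotation conveys: ``base64.b64encode`` accepts any object implementing the buffer protocol, so ``encode`` works for ``bytes``, ``memoryview``, and (at runtime) ``pyarrow.Buffer``.

    import base64

    # b64encode accepts any buffer-protocol object, not just bytes.
    print(base64.b64encode(b"hello").decode("utf-8"))              # aGVsbG8=
    print(base64.b64encode(memoryview(b"hello")).decode("utf-8"))  # aGVsbG8=
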
dyff/schema/v0/r1/dataset/embedding.py ADDED
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: 2024 UL Research Institutes
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Type
+
+import pydantic
+
+from ..base import DyffSchemaBaseModel, FixedWidthFloat, list_
+
+
+def embedding(
+    element_type: Type[FixedWidthFloat], size: int
+) -> Type[DyffSchemaBaseModel]:
+    """Returns a schema type representing a list of fixed-length embedding vectors."""
+
+    class _Embedding(DyffSchemaBaseModel):
+        embedding: list_(element_type, list_size=size) = pydantic.Field(  # type: ignore[valid-type]
+            description="An embedding vector"
+        )
+
+    return _Embedding
+
+
+__all__ = [
+    "embedding",
+]
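
A hedged usage sketch of the new helper; ``Float32`` is an assumed name for one of the ``FixedWidthFloat`` types exported by the base module, not something this diff confirms.

    from dyff.schema.v0.r1.base import Float32  # assumed FixedWidthFloat subtype
    from dyff.schema.v0.r1.dataset.embedding import embedding

    Embedding4 = embedding(Float32, size=4)
    item = Embedding4(embedding=[0.1, 0.2, 0.3, 0.4])  # validates the length
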
dyff/schema/v0/r1/platform.py CHANGED
@@ -179,6 +179,14 @@ def summary_maxlen() -> int:
     return 280
 
 
+def entity_id_regex() -> str:
+    """An entity ID is a 32-character HEX string.
+
+    TODO: This doesn't check whether the hex string is a valid UUID.
+    """
+    return r"^[a-f0-9]{32}$"
+
+
 class Entities(str, enum.Enum):
     """The kinds of entities in the dyff system."""
 
@@ -245,6 +253,9 @@ class Resources(str, enum.Enum):
         raise ValueError(f"No Resources for Entity kind: {kind}")
 
 
+EntityID: TypeAlias = pydantic.constr(regex=entity_id_regex())  # type: ignore
+
+
 class DyffModelWithID(DyffSchemaBaseModel):
     id: str = pydantic.Field(description="Unique identifier of the entity")
     account: str = pydantic.Field(description="Account that owns the entity")
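
How the new constrained-string alias behaves in practice (pydantic v1 ``constr`` semantics; import path assumed):

    import pydantic
    from dyff.schema.v0.r1.platform import EntityID  # assumed import path

    class Ref(pydantic.BaseModel):
        id: EntityID

    Ref(id="0" * 32)  # OK: 32 lowercase hex characters
    # Ref(id="not-an-id") raises pydantic.ValidationError
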
@@ -418,6 +429,10 @@ class DyffEntity(Status, Labeled, SchemaVersion, DyffModelWithID):
         default=None, description="Resource creation time (assigned by system)"
     )
 
+    lastTransitionTime: Optional[datetime] = pydantic.Field(
+        default=None, description="Time of last (status, reason) change."
+    )
+
     @abc.abstractmethod
     def dependencies(self) -> list[str]:
         """List of IDs of resources that this resource depends on.
@@ -843,7 +858,7 @@ class DataSchema(DyffSchemaBaseModel):
     def make_output_schema(
         schema: Union[pyarrow.Schema, Type[DyffSchemaBaseModel], DyffDataSchema],
     ) -> "DataSchema":
-        """Construct a complete ``DataSchema`` for inference inputs.
+        """Construct a complete ``DataSchema`` for inference outputs.
 
         This function will add required special fields for input data and then
         convert the augmented schema as necessary to populate at least the
@@ -2234,6 +2249,7 @@ __all__ = [
     "DyffModelWithID",
     "DyffSchemaBaseModel",
     "Entities",
+    "EntityID",
     "Evaluation",
     "EvaluationBase",
     "ExtractorStep",
dyff/schema/v0/r1/requests.py CHANGED
@@ -14,6 +14,7 @@ in response.
 
 from __future__ import annotations
 
+import re
 from datetime import datetime
 from typing import Optional, Union
 
@@ -22,6 +23,7 @@ import pydantic
 from .base import DyffBaseModel
 from .platform import (
     AnalysisBase,
+    AnalysisScope,
     DatasetBase,
     DataView,
     DocumentationBase,
@@ -70,6 +72,25 @@ class AnalysisCreateRequest(DyffEntityCreateRequest, AnalysisBase):
 
     method: str = pydantic.Field(description="Method ID")
 
+    @pydantic.validator("scope", check_fields=False)
+    def _validate_scope(cls, scope: AnalysisScope) -> AnalysisScope:
+        # TODO: This has to be a validator function because we can't apply the
+        # regex constraint to AnalysisScope, since there are already entities
+        # with invalid IDs in the data store. Fix in Schema v1.
+        uuid4 = r"^[0-9a-f]{8}[0-9a-f]{4}[4][0-9a-f]{3}[89ab][0-9a-f]{3}[0-9a-f]{12}$"
+        id_pattern = re.compile(uuid4)
+        if scope.dataset is not None and not re.match(id_pattern, scope.dataset):
+            raise ValueError("scope.dataset must be an entity ID")
+        if scope.evaluation is not None and not re.match(id_pattern, scope.evaluation):
+            raise ValueError("scope.evaluation must be an entity ID")
+        if scope.inferenceService is not None and not re.match(
+            id_pattern, scope.inferenceService
+        ):
+            raise ValueError("scope.inferenceService must be an entity ID")
+        if scope.model is not None and not re.match(id_pattern, scope.model):
+            raise ValueError("scope.model must be an entity ID")
+        return scope
+
 
 class DatasetCreateRequest(DyffEntityCreateRequest, DatasetBase):
     pass
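
A standalone check of the UUID4-shaped pattern used in the validator: an entity ID is expected to look like ``uuid.uuid4().hex``, which the regex matches.

    import re
    import uuid

    uuid4 = r"^[0-9a-f]{8}[0-9a-f]{4}[4][0-9a-f]{3}[89ab][0-9a-f]{3}[0-9a-f]{12}$"
    assert re.match(uuid4, uuid.uuid4().hex)  # hex form of a random UUID4
    assert not re.match(uuid4, "not-an-id")
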
dyff_schema-0.19.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dyff-schema
-Version: 0.18.0
+Version: 0.19.0
 Summary: Data models for the Dyff AI auditing platform.
 Author-email: Digital Safety Research Institute <contact@dsri.org>
 License: Apache-2.0
dyff_schema-0.19.0.dist-info/RECORD CHANGED
@@ -15,6 +15,7 @@ dyff/schema/dataset/__init__.py,sha256=P4tOKKiOFaVeh3-Keiwpg9n7VTQUJQVOIVZhm8sdA
 dyff/schema/dataset/arrow.py,sha256=1tkgbXcvU0Wy-HM64ddaHfAKJos6FLXxGxM9l8xLcjY,129
 dyff/schema/dataset/binary.py,sha256=jm73xo-mSMzh1GuI1uUZ2JulY7h2aJqV9CklrD_wScE,130
 dyff/schema/dataset/classification.py,sha256=nXfFeuAA-wGoiatZ6KzZPSWC078a_6nrHRsKeRmjbAw,138
+dyff/schema/dataset/embedding.py,sha256=yp817Op-NsS66MQJjtJuV1r2NTBcPmWmz3BaYeAflg4,133
 dyff/schema/dataset/text.py,sha256=_mHnCM1oPIqSBqOeggCw0IUmoxmm4_J64GjF-xdOY-4,128
 dyff/schema/dataset/vision.py,sha256=D2wCN54xw8m6yYtiFYjvB15t7PUEREZYUo2A76cejv4,130
 dyff/schema/io/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
@@ -22,22 +23,23 @@ dyff/schema/io/vllm.py,sha256=2q05M_-lTzq9oywKXHPPpCFCSDVCSsRQqtmERzWTtio,123
 dyff/schema/v0/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/adapters.py,sha256=2t2oxsnGfSEDKKDIEYw4qqLXMH7qlFIwPVuLyUmbsHs,23552
-dyff/schema/v0/r1/base.py,sha256=i7eOKXDGS8_J9k2aVObUTpSOnA8CAgRW7Quj1fSbyRg,19403
-dyff/schema/v0/r1/platform.py,sha256=LqYVZdeRBBCY3OZ0L3dobhUmQPzN51ik4T9BoUyQHbM,73976
-dyff/schema/v0/r1/requests.py,sha256=yvMHi4P02DIoibczP-PLMkgkh5XR_VFQNfCtFTorlYs,11891
+dyff/schema/v0/r1/base.py,sha256=IpvlYDr6JjYo6tn8XW4C1Fpgd_uqzZGZsG_cuEn_gQs,19441
+dyff/schema/v0/r1/platform.py,sha256=c-XBKsACEmz_VptGDGCDsULGYtxDajVZ6H7mzjxXHGY,74405
+dyff/schema/v0/r1/requests.py,sha256=UC2jK_upahjemie2s_Acg9xd4NDlwlwUUAOGnpI2iS0,13058
 dyff/schema/v0/r1/test.py,sha256=X6dUyVd5svcPCI-PBMOAqEfK9jv3bRDvkQTJzwS96c0,10720
 dyff/schema/v0/r1/version.py,sha256=isKAGuGxsdru8vDaYmI4YiZdJOu_wNxXK7u6QzD6FE4,392
 dyff/schema/v0/r1/dataset/__init__.py,sha256=LbVlkO2asyGYBKk2z49xjJYTM-pu9y9e4eQDXgTDLnM,2553
-dyff/schema/v0/r1/dataset/arrow.py,sha256=juJ3MbiCL54zn3dSmXVl4GBhfLJPk6Qvasb0epFZ4V0,12312
-dyff/schema/v0/r1/dataset/binary.py,sha256=MLqj_O7iJvsDiom23jxR054seJaJntc0FTTkHuHYDJg,544
+dyff/schema/v0/r1/dataset/arrow.py,sha256=PHNtBe32e4NXNQIqUMh0SxCPzTdidVkKBaFqYr6-aFo,12857
+dyff/schema/v0/r1/dataset/binary.py,sha256=KXvn79SUt3e_ZZXrju2atT_yMFwgAkCgDYXBtfv0E_I,636
 dyff/schema/v0/r1/dataset/classification.py,sha256=pbbEXhxyZ0pgYwzaTlM8hVHPNEJDCdHKOeGowPXgWYc,311
+dyff/schema/v0/r1/dataset/embedding.py,sha256=qcHHIzpFQj9bQ2vuECO9g1EU4WT1yjvUZbTlw0qwkc8,642
 dyff/schema/v0/r1/dataset/text.py,sha256=nLIn91Zlt0tNdXUklSgjJ-kEDxoPX32ISLkiv2DzLvE,1008
 dyff/schema/v0/r1/dataset/vision.py,sha256=aIe0fbfM_g3DsrDTdg2K803YKLjZBpurM_VJcJFuZLc,369
 dyff/schema/v0/r1/io/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/io/vllm.py,sha256=CUE9y8KthtUI7sD49S875rDmPvKotSXVIRaBS79aBZs,5320
-dyff_schema-0.18.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-dyff_schema-0.18.0.dist-info/METADATA,sha256=HN9-j-LkdQYLzQyKV18GoajMwOOAmchU8EQbjtaGIIc,3482
-dyff_schema-0.18.0.dist-info/NOTICE,sha256=YONACu0s_Ui6jNi-wtEsVQbTU1JIkh8wvLH6d1-Ni_w,43
-dyff_schema-0.18.0.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
-dyff_schema-0.18.0.dist-info/top_level.txt,sha256=9e3VVdeX73t_sUJOPQPCcGtYO1JhoErhHIi3WoWGcFI,5
-dyff_schema-0.18.0.dist-info/RECORD,,
+dyff_schema-0.19.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+dyff_schema-0.19.0.dist-info/METADATA,sha256=r7aFHs035B-teQqNrDhlCKYNSicfqzmwXU_8ADCAigo,3482
+dyff_schema-0.19.0.dist-info/NOTICE,sha256=YONACu0s_Ui6jNi-wtEsVQbTU1JIkh8wvLH6d1-Ni_w,43
+dyff_schema-0.19.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+dyff_schema-0.19.0.dist-info/top_level.txt,sha256=9e3VVdeX73t_sUJOPQPCcGtYO1JhoErhHIi3WoWGcFI,5
+dyff_schema-0.19.0.dist-info/RECORD,,
dyff_schema-0.19.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.2.0)
+Generator: setuptools (75.6.0)
 Root-Is-Purelib: true
 Tag: py3-none-any