PyPI - dyff-schema - Versions diffs - 0.23.0__py3-none-any.whl → 0.24.1__py3-none-any.whl - Mend

dyff-schema 0.23.0__py3-none-any.whl → 0.24.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dyff-schema might be problematic. Click here for more details.

Files changed (8) hide show

dyff/schema/v0/r1/adapters.py CHANGED Viewed

@@ -6,14 +6,21 @@ from __future__ import annotations
 import functools
 import json
-from typing import Any, Iterable, NamedTuple, Protocol, Type
+import operator
+import re
+from typing import Any, Callable, Iterable, Literal, NamedTuple, Protocol, Type
 import jsonpath_ng as jsonpath
 from jsonpath_ng.exceptions import JSONPathError
+from jsonpath_ng.ext.parser import parse as jsonpath_parse_ext
 from dyff.schema.platform import SchemaAdapter
+def _json_deep_copy(data):
+    return json.loads(json.dumps(data))
 def map_structure(fn, data):
     """Given a JSON data structure ``data``, create a new data structure instance with
     the same shape as ``data`` by applying ``fn`` to each "leaf" value in the nested
@@ -70,90 +77,346 @@ class Adapter(Protocol):
         raise NotImplementedError()
+class _Literal:
+    def __init__(self, value):
+        self.value = value
+    def __call__(self, x):
+        return self.value
+class _Func_findall:
+    def __init__(self, *, pattern: str, flags: int = 0):
+        self.pattern = pattern
+        self.flags = flags
+    def __call__(self, x) -> list[str]:
+        return re.findall(self.pattern, x, self.flags)
+class _Func_join:
+    def __init__(self, *, separator: str = ""):
+        self._separator = separator
+    def __call__(self, x: list[str]) -> str:
+        return self._separator.join(x)
+class _Func_list:
+    def __call__(self, x) -> list:
+        return list(x)
+class _Func_reduce:
+    def __call__(self, x):
+        return functools.reduce(operator.add, x)
+class _Func_search:
+    def __init__(
+        self,
+        *,
+        pattern: str,
+        flags: int = 0,
+        group: int = 0,
+        default: str | None = None,
+    ):
+        self.pattern = pattern
+        self.flags = flags
+        self.group = group
+        self.default = default
+    def __call__(self, x) -> str | None:
+        m = re.search(self.pattern, x, self.flags)
+        return self.default if m is None else m.group(self.group)
+class _Func_split:
+    def __init__(self, *, pattern: str, maxsplit: int = 0, flags: int = 0):
+        self.pattern = pattern
+        self.maxsplit = maxsplit
+        self.flags = flags
+    def __call__(self, x) -> list[str]:
+        return re.split(self.pattern, x, self.maxsplit, self.flags)
+class _Func_sub:
+    def __init__(self, *, pattern: str, repl: str, count: int = 0, flags: int = 0):
+        self.pattern = pattern
+        self.repl = repl
+        self.count = count
+        self.flags = flags
+    def __call__(self, x) -> str:
+        return re.sub(self.pattern, self.repl, x, self.count, self.flags)
+class _Value_jsonpath:
+    def __init__(self, expr, *, kind: Literal["scalar", "list"] = "scalar"):
+        self._expr: jsonpath.JSONPath = jsonpath.parse(expr)
+        self._kind = kind
+    def __call__(self, x):
+        results = self._expr.find(x)
+        if self._kind == "list":
+            return [result.value for result in results]
+        elif self._kind == "scalar":
+            if len(results) == 0:
+                raise ValueError(f"no match for '{self._expr}' in '{x}'")
+            elif len(results) > 1:
+                raise ValueError(f"multiple results for '{self._expr}' in '{x}'")
+            return results[0].value
+        else:
+            raise AssertionError(f"kind {self._kind}")
+class _Value_list:
+    def __init__(self, exprs: list[Callable]):
+        self._exprs = exprs
+    def __call__(self, x) -> list:
+        return [e(x) for e in self._exprs]
+def _maybe_value_expr(expr: dict) -> Callable | None:
+    kinds = ["$literal", "$scalar", "$list"]
+    maybe_exprs = {k: expr.get(k) for k in kinds}
+    just_exprs = [k for k in kinds if maybe_exprs[k] is not None]
+    if len(just_exprs) == 0:
+        return None
+    if len(just_exprs) > 1:
+        raise ValueError(f"must specify exactly one of {kinds}: got {just_exprs}")
+    # remove sigil
+    kind: Literal["literal", "scalar", "list"] = just_exprs[0][1:]  # type: ignore
+    value = maybe_exprs[just_exprs[0]]
+    if kind == "literal":
+        return _Literal(value)
+    op: Callable = _Literal(value)
+    if isinstance(value, str):
+        if value.startswith("$"):
+            if value.startswith("$$"):
+                # Literal string -- remove "escape" character
+                op = _Literal(value[1:])
+            else:
+                op = _Value_jsonpath(value, kind=kind)
+    elif kind == "list" and isinstance(value, list):
+        exprs = [_maybe_value_expr(e) for e in value]
+        if any(e is None for e in exprs):
+            raise ValueError("$list elements must be value expressions")
+        op = _Value_list(exprs)  # type: ignore
+    if isinstance(op, _Literal) and kind != "literal":
+        raise ValueError("must use $literal when providing a literal value")
+    return op
+class _LeafExpression:
+    FUNCTIONS = {
+        "findall": _Func_findall,
+        "join": _Func_join,
+        "list": _Func_list,
+        "reduce": _Func_reduce,
+        "search": _Func_search,
+        "split": _Func_split,
+        "sub": _Func_sub,
+    }
+    def __init__(self, pipeline: dict | list[dict]):
+        if isinstance(pipeline, dict):
+            pipeline = [pipeline]
+        self._compiled_pipeline: list[Callable] = []
+        for step in pipeline:
+            if (value_op := _maybe_value_expr(step)) is not None:
+                self._compiled_pipeline.append(value_op)
+            elif (func := step.pop("$func", None)) is not None:
+                self._compiled_pipeline.append(_LeafExpression.FUNCTIONS[func](**step))
+            else:
+                raise ValueError(f"invalid $compute step: {step}")
+    def __call__(self, x):
+        output = None
+        for i, step in enumerate(self._compiled_pipeline):
+            if i == 0:
+                output = step(x)
+            else:
+                output = step(output)
+        return output
 class TransformJSON:
-    """Transform an input JSON structure by creating a new output JSON structure where
-    all of the "leaf" values are populated by either:
+    """Create a new JSON structure where the "leaf" values are populated by the results
+    of transformation functions applied to the input.
+    The "value" for each leaf can be::
+        1. A JSON literal value, or
+        2. The result of a jsonpath query on the input structure, or
+        3. The result of a computation pipeline starting from (1) or (2).
-        1. A provided JSON literal value, or
-        2. The result of a jsonpath query on the input structure.
+    To distinguish the specifications of leaf values from the specification of
+    the output structure, we apply the following rules::
-    For example, if the ``output_structure`` parameter is::
+        1. Composite values (``list`` and ``dict``) specify the structure of
+        the output.
+        2. Scalar values are output as-is, unless they are strings containing
+        JSONPath queries.
+        3. JSONPath queries are strings beginning with '$'. They are replaced
+        by the result of the query.
+        4. A ``dict`` containing the special key ``"$compute"`` introduces a
+        "compute context", which computes a leaf value from the input data.
+        Descendants of this key have "compute context semantics", which are
+        different from the "normal" semantics.
+    For example, if the ``configuration`` is::
         {
             "id": "$.object.id",
             "name": "literal",
             "children": {"left": "$.list[0]", "right": "$.list[1]"},
+            "characters": {
+                "letters": {
+                    "$compute": [
+                        {"$scalar": "$.object.id"},
+                        {
+                            "$func": "sub",
+                            "pattern": "[^A-Za-z]",
+                            "repl": "",
+                        },
+                        {"$func": "list"}
+                    ]
+                }
+            }
         }
     and the data is::
         {
-            "object": {"id": 42, "name": "spam"},
+            "object": {"id": "abc123", "name": "spam"},
             "list": [1, 2]
         }
-    Then applying the transformer to the data will result in the new structure::
+    Then applying the transformation to the data will result in the new structure::
         {
-            "id": 42,
+            "id": "abc123",
             "name": "literal",
-            "children: {"left": 1, "right": 2}
+            "children": {"left": 1, "right": 2},
+            "characters": {
+                "letters": ["a", "b", "c"]
+            }
         }
-    A value is interpreted as a jsonpath query if it is a string that starts
-    with the '$' character. If you need a literal string that starts with
-    the '$' character, escape it with a second '$', e.g., "$$PATH" will appear
-    as the literal string "$PATH" in the output.
-    All of the jsonpath queries must return *exactly one value* when executed
-    against each input item. If not, a ``ValueError`` will be raised.
+    The ``.characters.letters`` field was derived by::
+        1. Extracting the value of the ``.object.id`` field in the input
+        2. Applying ``re.sub(r"[^A-Za-z]", "", _)`` to the result of (1)
+        3. Applying ``list(_)`` to the result of (2)
+    Notice that descendants of the ``$compute`` key no longer describe the
+    structure of the output, but instead describe steps of the computation.
+    The value of ``"$compute"`` can be either an object or a list of objects.
+    A list is interpreted as a "pipeline" where each step is applied to the
+    output of the previous step.
+    Implicit queries
+    ================
+    Outside of the ``$compute`` context, string values that start with a ``$``
+    character are interpreted as jsonpath queries. Queries in this context must
+    return **exactly one value**, otherwise a ``ValueError`` will be raised.
+    This is because when multiple values are returned, there's no way to
+    distinguish a scalar-valued query that found 1 scalar from a list-valued
+    query that found a list with 1 element. In the ``$compute`` context, you
+    can specify which semantics you want.
+    If you need a literal string that starts with the '$' character, escape it
+    with a second '$', e.g., "$$PATH" will appear as the literal string "$PATH"
+    in the output. This works for both keys and values, e.g.,
+    ``{"$$key": "$$value"}`` outputs ``{"$key": "$value"}``. All keys that
+    begin with ``$`` are reserved, and you must always escape them.
+    The $compute context
+    ====================
+    A ``$compute`` context is introduced by a ``dict`` that contains the key
+    ``{"$compute": ...}``. Semantics in the ``$compute`` context are different
+    from semantics in the "normal" context.
+    $literal vs. $scalar vs. $list
+    ------------------------------
+    Inside a ``$compute`` context, we distinguish explicitly between literal
+    values, jsonpath queries that return scalars, and jsonpath queries that
+    return lists. You specify which semantics you intend by using
+    ``{"$literal": [1, 2]}``, ``{"$scalar": "$.foo"}``, or ``{"$list": "$.foo[*]"}``.
+    Items with ``$literal`` semantics are **never** interpreted as jsonpath
+    queries, even if they start with ``$``. In the ``$literal`` context, you
+    **should not** escape the leading ``$`` character.
+    A ``$scalar`` query has the same semantics as a jsonpath query outside
+    of the ``$compute`` context, i.e., it must return exactly 1 item.
+    A ``$list`` query will return a list, which can be empty. Scalar-valued
+    queries in a ``$list`` context will return a list with 1 element.
+    $func
+    -----
+    You use blocks with a ``$func`` key to specify computation steps. The
+    available functions are: ``findall``, ``join``, ``list``, ``reduce``,
+    ``search``, ``split``, ``sub``. These behave the same way as the
+    corresponding functions from the Python standard library::
+        * ``findall``, ``search``, ``split``, and ``sub`` are from the
+        ``re`` module.
+        * ``reduce`` uses the ``+`` operator with no starting value; it will
+        raise an error if called on an empty list.
+    All of these functions take named parameters with the same names and
+    semantics as their parameters in Python.
     """
     def __init__(self, configuration: dict):
-        """
-        Parameters:
-            ``output_structure``: A JSON object where all the "leaf" values
-                are strings containing jsonpath queries.
-        """
         if configuration != json.loads(json.dumps(configuration)):
             raise ValueError("configuration is not valid JSON")
-        self.output_structure = configuration
-        try:
-            self._expressions = map_structure(
-                self._jsonpath_expr_or_literal, self.output_structure
-            )
-        except JSONPathError as ex:
-            raise ValueError(
-                "output_structure leaf values must be JSON literals or jsonpath query strings"
-            ) from ex
-    def _jsonpath_expr_or_literal(self, x):
-        if isinstance(x, str):
+        self.configuration = configuration
+        self._transformation = self._compile(self.configuration)
+    def _compile(self, x) -> Callable | list | dict:
+        if isinstance(x, dict):
+            if (compute := x.get("$compute")) is not None:
+                if len(x) != 1:
+                    raise ValueError("$compute must be the only key in the dict")
+                return _LeafExpression(compute)
+            else:
+                # Escape '$' in dict keys
+                d: dict[str, Any] = {}
+                for k, v in x.items():
+                    if k.startswith("$"):
+                        if k.startswith("$$"):
+                            k = k[1:]
+                        else:
+                            raise ValueError(
+                                f"dict key '{k}': keys beginning with '$' are reserved; use '$$' to escape"
+                            )
+                    d[k] = self._compile(v)
+                return d
+        elif isinstance(x, list):
+            return [self._compile(y) for y in x]
+        elif isinstance(x, str):
             if x.startswith("$"):
                 if x.startswith("$$"):
                     # Literal string -- remove "escape" character
-                    return x[1:]
+                    return _Literal(x[1:])
                 else:
-                    return jsonpath.parse(x)
-        return x
+                    return _Value_jsonpath(x, kind="scalar")
+        return _Literal(x)
     def __call__(self, stream: Iterable[dict]) -> Iterable[dict]:
-        def query(data, expr):
-            if not isinstance(expr, jsonpath.JSONPath):
-                # Literal
-                return expr
-            results = expr.find(data)
-            if len(results) == 0:
-                raise ValueError(f"no match for {expr}")
-            elif len(results) > 1:
-                raise ValueError(f"multiple results for {expr}")
-            return results[0].value
         for item in stream:
-            transformed = map_structure(
-                lambda expr: query(item, expr), self._expressions
-            )
-            yield transformed
+            yield map_structure(lambda compute: compute(item), self._transformation)
 class EmbedIndex:
@@ -573,13 +836,16 @@ def _test():
     print(list(transformer([data])))
     transformer = TransformJSON(
-        {"id": "$.object.id", "children": {"left": "$.list[0]", "right": "$.list[1]"}}
+        {
+            "id": "$.object.id",
+            "children": {"left": "$.list[0]", "right": "$.list[1]"},
+        }
     )
     print(
         list(
             transformer(
                 [
-                    {"object": {"id": 42, "name": "spam"}, "list": [1, 2]},
+                    {"object": {"id": "abc123", "name": "spam"}, "list": [1, 2]},
                 ]
             )
         )

dyff/schema/v0/r1/platform.py CHANGED Viewed

@@ -1370,6 +1370,52 @@ class TaskSchema(DyffSchemaBaseModel):
     objective: str
+class EvaluationClientConfiguration(DyffSchemaBaseModel):
+    badRequestPolicy: Literal["Abort", "Skip"] = pydantic.Field(
+        default="Abort",
+        description="What to do if an inference call raises a 400 Bad Request"
+        " or a similar error that indicates a problem with the input instance."
+        " Abort (default): the evaluation fails immediately."
+        " Skip: output None for the bad instance and continue.",
+    )
+    transientErrorRetryLimit: int = pydantic.Field(
+        default=120,
+        ge=0,
+        description="How many times to retry transient errors before the"
+        " evaluation fails. The count is reset after a successful inference."
+        " Note that transient errors often occur during inference service"
+        " startup. The maximum time that the evaluation will wait for a"
+        " service (re)start is (retry limit) * (retry delay).",
+    )
+    transientErrorRetryDelaySeconds: int = pydantic.Field(
+        default=30,
+        ge=1,
+        description="How long to wait before retrying a transient error."
+        " Note that transient errors often occur during inference service"
+        " startup. The maximum time that the evaluation will wait for a"
+        " service (re)start is (retry limit) * (retry delay).",
+    )
+    duplicateOutputPolicy: Literal["Deduplicate", "Error", "Ignore"] = pydantic.Field(
+        default="Deduplicate",
+        description="What to do if there are duplicate outputs."
+        " Deduplicate (default): output only one of the duplicates, chosen"
+        " arbitrarily. Error: the evaluation fails. Ignore: duplicates are"
+        " retained in the output."
+        " Setting this to Error is discouraged because duplicates can"
+        " arise in normal operation if the client restarts due to"
+        " a transient failure.",
+    )
+    missingOutputPolicy: Literal["Error", "Ignore"] = pydantic.Field(
+        default="Error",
+        description="What to do if there are missing outputs."
+        " Error (default): the evaluation fails. Ignore: no error.",
+    )
 class EvaluationBase(DyffSchemaBaseModel):
     dataset: str = pydantic.Field(description="The Dataset to evaluate on.")
@@ -1377,11 +1423,17 @@ class EvaluationBase(DyffSchemaBaseModel):
         default=1, description="Number of replications to run."
     )
+    # TODO: This should be in the client config object
     workersPerReplica: Optional[int] = pydantic.Field(
         default=None,
         description="Number of data workers per inference service replica.",
     )
+    client: EvaluationClientConfiguration = pydantic.Field(
+        default_factory=EvaluationClientConfiguration,
+        description="Configuration for the evaluation client.",
+    )
 class Evaluation(DyffEntity, EvaluationBase):
     """A description of how to run an InferenceService on a Dataset to obtain a set of

{dyff_schema-0.23.0.dist-info → dyff_schema-0.24.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: dyff-schema
-Version: 0.23.0
+Version: 0.24.1
 Summary: Data models for the Dyff AI auditing platform.
 Author-email: Digital Safety Research Institute <contact@dsri.org>
 License: Apache-2.0

{dyff_schema-0.23.0.dist-info → dyff_schema-0.24.1.dist-info}/RECORD RENAMED Viewed

@@ -22,9 +22,9 @@ dyff/schema/io/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/io/vllm.py,sha256=2q05M_-lTzq9oywKXHPPpCFCSDVCSsRQqtmERzWTtio,123
 dyff/schema/v0/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
-dyff/schema/v0/r1/adapters.py,sha256=2t2oxsnGfSEDKKDIEYw4qqLXMH7qlFIwPVuLyUmbsHs,23552
+dyff/schema/v0/r1/adapters.py,sha256=dmQS2ecgDX4ZvTMOW-6NzV_Oq_UpaiyFd7QnSNoOnK8,33057
 dyff/schema/v0/r1/base.py,sha256=IpvlYDr6JjYo6tn8XW4C1Fpgd_uqzZGZsG_cuEn_gQs,19441
-dyff/schema/v0/r1/platform.py,sha256=xaJohO5SB_Nh6kIRmbAe1gc-xV_vGoM3R7UgMJcbByM,75980
+dyff/schema/v0/r1/platform.py,sha256=KpTGCS8x1FAkfrMEgbjfDpTGgwNTF28WSuohYnROvj8,78287
 dyff/schema/v0/r1/requests.py,sha256=4TM1IKG9IP4MyprIy9E9XA_JqvkuwKAuY1ao1BbVLI0,15676
 dyff/schema/v0/r1/test.py,sha256=X6dUyVd5svcPCI-PBMOAqEfK9jv3bRDvkQTJzwS96c0,10720
 dyff/schema/v0/r1/version.py,sha256=isKAGuGxsdru8vDaYmI4YiZdJOu_wNxXK7u6QzD6FE4,392
@@ -37,9 +37,9 @@ dyff/schema/v0/r1/dataset/text.py,sha256=nLIn91Zlt0tNdXUklSgjJ-kEDxoPX32ISLkiv2D
 dyff/schema/v0/r1/dataset/vision.py,sha256=aIe0fbfM_g3DsrDTdg2K803YKLjZBpurM_VJcJFuZLc,369
 dyff/schema/v0/r1/io/__init__.py,sha256=L5y8UhRnojerPYHumsxQJRcHCNz8Hj9NM8b47mewMNs,92
 dyff/schema/v0/r1/io/vllm.py,sha256=CUE9y8KthtUI7sD49S875rDmPvKotSXVIRaBS79aBZs,5320
-dyff_schema-0.23.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-dyff_schema-0.23.0.dist-info/METADATA,sha256=D_F0_u21Cwr_Wh1n8KGZtOOKlQCYk52gkL-Z6EqbC5o,3482
-dyff_schema-0.23.0.dist-info/NOTICE,sha256=YONACu0s_Ui6jNi-wtEsVQbTU1JIkh8wvLH6d1-Ni_w,43
-dyff_schema-0.23.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-dyff_schema-0.23.0.dist-info/top_level.txt,sha256=9e3VVdeX73t_sUJOPQPCcGtYO1JhoErhHIi3WoWGcFI,5
-dyff_schema-0.23.0.dist-info/RECORD,,
+dyff_schema-0.24.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+dyff_schema-0.24.1.dist-info/METADATA,sha256=joonUGA6du1G3HJKOTBQE7HuOivJW33vIMw3zNrl3bg,3482
+dyff_schema-0.24.1.dist-info/NOTICE,sha256=YONACu0s_Ui6jNi-wtEsVQbTU1JIkh8wvLH6d1-Ni_w,43
+dyff_schema-0.24.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+dyff_schema-0.24.1.dist-info/top_level.txt,sha256=9e3VVdeX73t_sUJOPQPCcGtYO1JhoErhHIi3WoWGcFI,5
+dyff_schema-0.24.1.dist-info/RECORD,,

{dyff_schema-0.23.0.dist-info → dyff_schema-0.24.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{dyff_schema-0.23.0.dist-info → dyff_schema-0.24.1.dist-info}/NOTICE RENAMED Viewed

File without changes

{dyff_schema-0.23.0.dist-info → dyff_schema-0.24.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{dyff_schema-0.23.0.dist-info → dyff_schema-0.24.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

dyff-schema 0.23.0__py3-none-any.whl → 0.24.1__py3-none-any.whl

Potentially problematic release.

dyff-schema 0.23.0__py3-none-any.whl → 0.24.1__py3-none-any.whl