data-designer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +15 -0
- data_designer/_version.py +34 -0
- data_designer/cli/README.md +236 -0
- data_designer/cli/__init__.py +6 -0
- data_designer/cli/commands/__init__.py +2 -0
- data_designer/cli/commands/list.py +130 -0
- data_designer/cli/commands/models.py +10 -0
- data_designer/cli/commands/providers.py +11 -0
- data_designer/cli/commands/reset.py +100 -0
- data_designer/cli/controllers/__init__.py +7 -0
- data_designer/cli/controllers/model_controller.py +246 -0
- data_designer/cli/controllers/provider_controller.py +317 -0
- data_designer/cli/forms/__init__.py +20 -0
- data_designer/cli/forms/builder.py +51 -0
- data_designer/cli/forms/field.py +180 -0
- data_designer/cli/forms/form.py +59 -0
- data_designer/cli/forms/model_builder.py +125 -0
- data_designer/cli/forms/provider_builder.py +76 -0
- data_designer/cli/main.py +44 -0
- data_designer/cli/repositories/__init__.py +8 -0
- data_designer/cli/repositories/base.py +39 -0
- data_designer/cli/repositories/model_repository.py +42 -0
- data_designer/cli/repositories/provider_repository.py +43 -0
- data_designer/cli/services/__init__.py +7 -0
- data_designer/cli/services/model_service.py +116 -0
- data_designer/cli/services/provider_service.py +111 -0
- data_designer/cli/ui.py +448 -0
- data_designer/cli/utils.py +47 -0
- data_designer/config/__init__.py +2 -0
- data_designer/config/analysis/column_profilers.py +89 -0
- data_designer/config/analysis/column_statistics.py +274 -0
- data_designer/config/analysis/dataset_profiler.py +60 -0
- data_designer/config/analysis/utils/errors.py +8 -0
- data_designer/config/analysis/utils/reporting.py +188 -0
- data_designer/config/base.py +68 -0
- data_designer/config/column_configs.py +354 -0
- data_designer/config/column_types.py +168 -0
- data_designer/config/config_builder.py +660 -0
- data_designer/config/data_designer_config.py +40 -0
- data_designer/config/dataset_builders.py +11 -0
- data_designer/config/datastore.py +151 -0
- data_designer/config/default_model_settings.py +123 -0
- data_designer/config/errors.py +19 -0
- data_designer/config/interface.py +54 -0
- data_designer/config/models.py +231 -0
- data_designer/config/preview_results.py +32 -0
- data_designer/config/processors.py +41 -0
- data_designer/config/sampler_constraints.py +51 -0
- data_designer/config/sampler_params.py +604 -0
- data_designer/config/seed.py +145 -0
- data_designer/config/utils/code_lang.py +83 -0
- data_designer/config/utils/constants.py +313 -0
- data_designer/config/utils/errors.py +19 -0
- data_designer/config/utils/info.py +88 -0
- data_designer/config/utils/io_helpers.py +273 -0
- data_designer/config/utils/misc.py +81 -0
- data_designer/config/utils/numerical_helpers.py +28 -0
- data_designer/config/utils/type_helpers.py +100 -0
- data_designer/config/utils/validation.py +336 -0
- data_designer/config/utils/visualization.py +427 -0
- data_designer/config/validator_params.py +96 -0
- data_designer/engine/__init__.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +55 -0
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
- data_designer/engine/analysis/column_profilers/registry.py +20 -0
- data_designer/engine/analysis/column_statistics.py +142 -0
- data_designer/engine/analysis/dataset_profiler.py +125 -0
- data_designer/engine/analysis/errors.py +7 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
- data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
- data_designer/engine/column_generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/base.py +61 -0
- data_designer/engine/column_generators/generators/expression.py +63 -0
- data_designer/engine/column_generators/generators/llm_generators.py +172 -0
- data_designer/engine/column_generators/generators/samplers.py +75 -0
- data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
- data_designer/engine/column_generators/generators/validation.py +147 -0
- data_designer/engine/column_generators/registry.py +56 -0
- data_designer/engine/column_generators/utils/errors.py +13 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
- data_designer/engine/configurable_task.py +82 -0
- data_designer/engine/dataset_builders/artifact_storage.py +181 -0
- data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
- data_designer/engine/dataset_builders/errors.py +13 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
- data_designer/engine/dataset_builders/utils/__init__.py +2 -0
- data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
- data_designer/engine/dataset_builders/utils/dag.py +56 -0
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
- data_designer/engine/dataset_builders/utils/errors.py +13 -0
- data_designer/engine/errors.py +49 -0
- data_designer/engine/model_provider.py +75 -0
- data_designer/engine/models/__init__.py +2 -0
- data_designer/engine/models/errors.py +308 -0
- data_designer/engine/models/facade.py +225 -0
- data_designer/engine/models/litellm_overrides.py +162 -0
- data_designer/engine/models/parsers/__init__.py +2 -0
- data_designer/engine/models/parsers/errors.py +34 -0
- data_designer/engine/models/parsers/parser.py +236 -0
- data_designer/engine/models/parsers/postprocessors.py +93 -0
- data_designer/engine/models/parsers/tag_parsers.py +60 -0
- data_designer/engine/models/parsers/types.py +82 -0
- data_designer/engine/models/recipes/base.py +79 -0
- data_designer/engine/models/recipes/response_recipes.py +291 -0
- data_designer/engine/models/registry.py +118 -0
- data_designer/engine/models/usage.py +75 -0
- data_designer/engine/models/utils.py +38 -0
- data_designer/engine/processing/ginja/__init__.py +2 -0
- data_designer/engine/processing/ginja/ast.py +64 -0
- data_designer/engine/processing/ginja/environment.py +461 -0
- data_designer/engine/processing/ginja/exceptions.py +54 -0
- data_designer/engine/processing/ginja/record.py +30 -0
- data_designer/engine/processing/gsonschema/__init__.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +8 -0
- data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
- data_designer/engine/processing/gsonschema/types.py +8 -0
- data_designer/engine/processing/gsonschema/validators.py +143 -0
- data_designer/engine/processing/processors/base.py +15 -0
- data_designer/engine/processing/processors/drop_columns.py +46 -0
- data_designer/engine/processing/processors/registry.py +20 -0
- data_designer/engine/processing/utils.py +120 -0
- data_designer/engine/registry/base.py +97 -0
- data_designer/engine/registry/data_designer_registry.py +37 -0
- data_designer/engine/registry/errors.py +10 -0
- data_designer/engine/resources/managed_dataset_generator.py +35 -0
- data_designer/engine/resources/managed_dataset_repository.py +194 -0
- data_designer/engine/resources/managed_storage.py +63 -0
- data_designer/engine/resources/resource_provider.py +46 -0
- data_designer/engine/resources/seed_dataset_data_store.py +66 -0
- data_designer/engine/sampling_gen/column.py +89 -0
- data_designer/engine/sampling_gen/constraints.py +95 -0
- data_designer/engine/sampling_gen/data_sources/base.py +214 -0
- data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
- data_designer/engine/sampling_gen/entities/__init__.py +2 -0
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
- data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
- data_designer/engine/sampling_gen/entities/errors.py +8 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
- data_designer/engine/sampling_gen/entities/person.py +142 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
- data_designer/engine/sampling_gen/errors.py +24 -0
- data_designer/engine/sampling_gen/generator.py +121 -0
- data_designer/engine/sampling_gen/jinja_utils.py +60 -0
- data_designer/engine/sampling_gen/people_gen.py +203 -0
- data_designer/engine/sampling_gen/person_constants.py +54 -0
- data_designer/engine/sampling_gen/schema.py +143 -0
- data_designer/engine/sampling_gen/schema_builder.py +59 -0
- data_designer/engine/sampling_gen/utils.py +40 -0
- data_designer/engine/secret_resolver.py +80 -0
- data_designer/engine/validators/__init__.py +17 -0
- data_designer/engine/validators/base.py +36 -0
- data_designer/engine/validators/local_callable.py +34 -0
- data_designer/engine/validators/python.py +245 -0
- data_designer/engine/validators/remote.py +83 -0
- data_designer/engine/validators/sql.py +60 -0
- data_designer/errors.py +5 -0
- data_designer/essentials/__init__.py +137 -0
- data_designer/interface/__init__.py +2 -0
- data_designer/interface/data_designer.py +351 -0
- data_designer/interface/errors.py +16 -0
- data_designer/interface/results.py +55 -0
- data_designer/logging.py +161 -0
- data_designer/plugin_manager.py +83 -0
- data_designer/plugins/__init__.py +6 -0
- data_designer/plugins/errors.py +10 -0
- data_designer/plugins/plugin.py +69 -0
- data_designer/plugins/registry.py +86 -0
- data_designer-0.1.0.dist-info/METADATA +173 -0
- data_designer-0.1.0.dist-info/RECORD +177 -0
- data_designer-0.1.0.dist-info/WHEEL +4 -0
- data_designer-0.1.0.dist-info/entry_points.txt +2 -0
- data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from copy import deepcopy
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from litellm.types.router import DeploymentTypedDict, LiteLLM_Params
|
|
12
|
+
from litellm.types.utils import ModelResponse
|
|
13
|
+
|
|
14
|
+
from data_designer.config.models import ModelConfig, ModelProvider
|
|
15
|
+
from data_designer.engine.model_provider import ModelProviderRegistry
|
|
16
|
+
from data_designer.engine.models.errors import (
|
|
17
|
+
GenerationValidationFailureError,
|
|
18
|
+
catch_llm_exceptions,
|
|
19
|
+
get_exception_primary_cause,
|
|
20
|
+
)
|
|
21
|
+
from data_designer.engine.models.litellm_overrides import CustomRouter, LiteLLMRouterDefaultKwargs
|
|
22
|
+
from data_designer.engine.models.parsers.errors import ParserException
|
|
23
|
+
from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats
|
|
24
|
+
from data_designer.engine.models.utils import prompt_to_messages, str_to_message
|
|
25
|
+
from data_designer.engine.secret_resolver import SecretResolver
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ModelFacade:
    """Facade over a single configured LLM, backed by a litellm router.

    Builds one litellm deployment from ``model_config``, routes completions
    through a ``CustomRouter``, accumulates usage statistics, and layers a
    parse/correct/retry loop (``generate``) on top of raw ``completion`` calls.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        secret_resolver: SecretResolver,
        model_provider_registry: ModelProviderRegistry,
    ):
        self._model_config = model_config
        self._secret_resolver = secret_resolver
        self._model_provider_registry = model_provider_registry
        self._litellm_deployment = self._get_litellm_deployment(model_config)
        self._router = CustomRouter([self._litellm_deployment], **LiteLLMRouterDefaultKwargs().model_dump())
        self._usage_stats = ModelUsageStats()

    @property
    def model_name(self) -> str:
        """Model identifier from the configuration."""
        return self._model_config.model

    @property
    def model_provider(self) -> ModelProvider:
        """Provider entry resolved from the registry for this model."""
        return self._model_provider_registry.get_provider(self._model_config.provider)

    @property
    def model_provider_name(self) -> str:
        """Name of the resolved provider."""
        return self.model_provider.name

    @property
    def model_alias(self) -> str:
        """Alias under which this model is configured."""
        return self._model_config.alias

    @property
    def usage_stats(self) -> ModelUsageStats:
        """Token/request usage accumulated across calls on this facade."""
        return self._usage_stats

    def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = False, **kwargs) -> ModelResponse:
        """Send `messages` to the model and return the raw `ModelResponse`.

        Args:
            messages: Chat messages as role/content dicts.
            skip_usage_tracking: When True, this request is not recorded in
                `usage_stats`.
            **kwargs: Extra arguments forwarded to the litellm router.

        Raises:
            Exception: Whatever the underlying router raises; unless tracking
                is skipped, the failure is still recorded as a failed request.
        """
        logger.debug(
            f"Prompting model {self.model_name!r}...",
            extra={"model": self.model_name, "messages": messages, "sensitive": True},
        )
        response = None
        if self.model_provider.extra_body:
            # Merge provider-level extra_body settings over any caller-supplied
            # ones; provider values win on key collisions.
            kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **self.model_provider.extra_body}
        # The previous `except Exception as e: raise e` clause was a no-op
        # re-raise; plain try/finally preserves identical behavior (usage is
        # tracked on success and on failure) with the original traceback.
        try:
            response = self._router.completion(self.model_name, messages, **kwargs)
            logger.debug(
                f"Received completion from model {self.model_name!r}",
                extra={
                    "model": self.model_name,
                    "response": response,
                    "text": response.choices[0].message.content,
                    "usage": self._usage_stats.model_dump(),
                },
            )
            return response
        finally:
            if not skip_usage_tracking:
                # `response` is still None if the router call raised, which
                # _track_usage records as a failed request.
                self._track_usage(response)

    @catch_llm_exceptions
    def generate(
        self,
        prompt: str,
        *,
        parser: Callable[[str], Any],
        system_prompt: str | None = None,
        multi_modal_context: list[dict[str, Any]] | None = None,
        max_correction_steps: int = 0,
        max_conversation_restarts: int = 0,
        skip_usage_tracking: bool = False,
        purpose: str | None = None,
        **kwargs,
    ) -> tuple[Any, str | None]:
        """Generate a parsed output with correction steps.

        This generation call will attempt to generate an output which is
        valid according to the specified parser, where "valid" implies
        that the parser can process the LLM response without raising
        an exception.

        `ParserExceptions` are routed back to the LLM as new rounds in the
        conversation, where the LLM is provided its earlier response along
        with the "user" role responding with the exception string (not
        traceback). This continues for the number of rounds specified by
        `max_correction_steps`; once exhausted, the conversation may be
        restarted from scratch up to `max_conversation_restarts` times.

        Args:
            prompt (str): Task prompt.
            parser (func(str) -> Any): A function applied to the LLM response which
                processes an LLM response into some output object.
            system_prompt (str, optional): Optional system instructions. If not
                specified, no system message is provided and the model should use
                its default system prompt.
            multi_modal_context (list[dict], optional): Additional multi-modal
                context forwarded to message construction.
            max_correction_steps (int): Maximum number of correction rounds permitted
                within a single conversation. Note, many rounds can lead to increasing
                context size without necessarily improving performance -- small language
                models can enter repeated cycles which will not be solved with more steps.
                Default: `0` (no correction).
            max_conversation_restarts (int): Maximum number of full conversation
                restarts permitted if generation fails. Default: `0` (no restarts).
            skip_usage_tracking (bool): Whether to skip usage tracking. Default: `False`.
            purpose (str): The purpose of the model usage to show as context in the
                error message. It is expected to be used by the @catch_llm_exceptions
                decorator.
            **kwargs: Additional arguments to pass to the model.

        Returns:
            Tuple of the parsed output object and the model's reasoning trace
            (or None when the model reported none).

        Raises:
            GenerationValidationFailureError: If the maximum number of retries or
                correction steps are met and the last response fails generation
                validation.
        """
        output_obj = None
        curr_num_correction_steps = 0
        curr_num_restarts = 0
        curr_generation_attempt = 0
        # Upper bound on completion calls: each conversation (the original plus
        # every restart) gets the initial attempt plus max_correction_steps.
        max_generation_attempts = (max_correction_steps + 1) * (max_conversation_restarts + 1)

        starting_messages = prompt_to_messages(
            user_prompt=prompt, system_prompt=system_prompt, multi_modal_context=multi_modal_context
        )
        messages = deepcopy(starting_messages)

        while True:
            curr_generation_attempt += 1
            logger.debug(
                f"Starting generation attempt {curr_generation_attempt} of {max_generation_attempts} attempts."
            )

            completion_response = self.completion(messages, skip_usage_tracking=skip_usage_tracking, **kwargs)
            response = completion_response.choices[0].message.content or ""
            reasoning_trace = getattr(completion_response.choices[0].message, "reasoning_content", None)

            if reasoning_trace:
                ## There are generally some extra newlines with how these get parsed.
                response = response.strip()
                reasoning_trace = reasoning_trace.strip()

            curr_num_correction_steps += 1

            try:
                output_obj = parser(response)  # type: ignore - if not a string will cause a ParserException below
                break
            except ParserException as exc:
                if max_correction_steps == 0 and max_conversation_restarts == 0:
                    raise GenerationValidationFailureError(
                        "Unsuccessful generation attempt. No retries were attempted."
                    ) from exc
                if curr_num_correction_steps <= max_correction_steps:
                    ## Add turns to loop-back errors for correction
                    messages += [
                        str_to_message(content=response, role="assistant"),
                        str_to_message(content=str(get_exception_primary_cause(exc)), role="user"),
                    ]
                elif curr_num_restarts < max_conversation_restarts:
                    # Correction budget exhausted: wipe the conversation and
                    # start over from the original messages.
                    curr_num_correction_steps = 0
                    curr_num_restarts += 1
                    messages = deepcopy(starting_messages)
                else:
                    raise GenerationValidationFailureError(
                        f"Unsuccessful generation attempt despite {max_generation_attempts} attempts."
                    ) from exc
        return output_obj, reasoning_trace

    def _get_litellm_deployment(self, model_config: ModelConfig) -> DeploymentTypedDict:
        """Build the litellm deployment dict for ``model_config``.

        Resolves the provider's API key through the secret resolver; falls back
        to a placeholder because litellm requires a non-empty key even for
        providers that ignore it.
        """
        provider = self._model_provider_registry.get_provider(model_config.provider)
        api_key = None
        if provider.api_key:
            api_key = self._secret_resolver.resolve(provider.api_key)
        api_key = api_key or "not-used-but-required"

        litellm_params = LiteLLM_Params(
            model=f"{provider.provider_type}/{model_config.model}",
            api_base=provider.endpoint,
            api_key=api_key,
        )
        return {
            "model_name": model_config.model,
            "litellm_params": litellm_params.model_dump(),
        }

    def _track_usage(self, response: ModelResponse | None) -> None:
        """Record one request's usage; ``None`` counts as a failed request."""
        if response is None:
            self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1))
            return
        # NOTE(review): a response that lacks usable token counts is currently
        # not recorded as a successful request either -- confirm intended.
        if (
            response.usage is not None
            and response.usage.prompt_tokens is not None
            and response.usage.completion_tokens is not None
        ):
            self._usage_stats.extend(
                token_usage=TokenUsageStats(
                    prompt_tokens=response.usage.prompt_tokens,
                    completion_tokens=response.usage.completion_tokens,
                ),
                request_usage=RequestUsageStats(successful_requests=1, failed_requests=0),
            )
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import random
|
|
7
|
+
import threading
|
|
8
|
+
from typing import Optional, Union
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
import litellm
|
|
12
|
+
from litellm import RetryPolicy
|
|
13
|
+
from litellm.caching.in_memory_cache import InMemoryCache
|
|
14
|
+
from litellm.router import Router
|
|
15
|
+
from pydantic import BaseModel, Field
|
|
16
|
+
from typing_extensions import override
|
|
17
|
+
|
|
18
|
+
from data_designer.logging import quiet_noisy_logger
|
|
19
|
+
|
|
20
|
+
DEFAULT_MAX_CALLBACKS = 1000
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LiteLLMRouterDefaultKwargs(BaseModel):
    """Default keyword arguments passed to the litellm router.

    Dumped via ``model_dump()`` and splatted into the ``CustomRouter``
    constructor; ``initial_retry_after_s`` and ``jitter_pct`` feed the
    custom exponential-backoff override, while ``timeout`` and
    ``retry_policy`` are consumed by litellm's base ``Router``.
    """

    ## Number of seconds to wait initially after a connection
    ## failure.
    initial_retry_after_s: float = 2.0

    ## Jitter percentage added during exponential backoff to
    ## smooth repeated retries over time.
    jitter_pct: float = 0.2

    ## Maximum number of seconds to wait for an API request
    ## before letting it die. Will trigger a retry.
    timeout: float = 60.0

    ## Sets the default retry policy, including the number
    ## of retries to use in particular scenarios.
    retry_policy: RetryPolicy = Field(
        default_factory=lambda: RetryPolicy(
            RateLimitErrorRetries=3,
            TimeoutErrorRetries=3,
        )
    )
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ThreadSafeCache(InMemoryCache):
    """Lock-guarded wrapper around litellm's ``InMemoryCache``.

    Each overridden operation delegates to the base implementation while
    holding a single reentrant lock, so the cache can be shared across
    threads. An ``RLock`` (rather than a plain ``Lock``) is used --
    presumably so a guarded method can safely re-enter another guarded
    method on the same thread without deadlocking.

    NOTE(review): only the synchronous operations below are wrapped; any
    async variants on the base class remain unguarded -- confirm that is
    acceptable for the call sites.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Single reentrant lock serializing all guarded cache operations.
        self._lock = threading.RLock()

    def get_cache(self, key, **kwargs):
        with self._lock:
            return super().get_cache(key, **kwargs)

    def set_cache(self, key, value, **kwargs):
        with self._lock:
            super().set_cache(key, value, **kwargs)

    def batch_get_cache(self, keys: list, **kwargs):
        with self._lock:
            return super().batch_get_cache(keys, **kwargs)

    def delete_cache(self, key):
        with self._lock:
            super().delete_cache(key)

    def evict_cache(self):
        with self._lock:
            super().evict_cache()

    def increment_cache(self, key, value: int, **kwargs) -> int:
        with self._lock:
            return super().increment_cache(key, value, **kwargs)

    def flush_cache(self):
        with self._lock:
            super().flush_cache()
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class CustomRouter(Router):
    """litellm ``Router`` with a configurable exponential-backoff retry delay."""

    def __init__(
        self,
        *args,
        initial_retry_after_s: float,
        jitter_pct: float,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self._initial_retry_after_s = initial_retry_after_s
        self._jitter_pct = jitter_pct

    def _extract_retry_delay_from_headers(self, e: Exception) -> Optional[Union[int, float]]:
        """Return the server-requested retry delay, when present and sane.

        Most of this logic was extracted directly from the parent ``Router``'s
        ``_time_to_sleep_before_retry``. Our override of that method below
        should only affect requests where the server didn't explicitly return
        a desired retry-delay; when the server did return one, the value
        extracted here is used as-is.
        """
        response_headers: Optional[httpx.Headers] = None
        if hasattr(e, "response") and hasattr(e.response, "headers"):  # type: ignore
            response_headers = e.response.headers  # type: ignore
        if hasattr(e, "litellm_response_headers"):
            # litellm-attached headers take precedence when present.
            response_headers = e.litellm_response_headers  # type: ignore

        server_delay = litellm.utils._get_retry_after_from_exception_header(response_headers)

        # Only honor the server's value when it is positive and reasonable
        # (at most one minute); otherwise fall back to our own backoff.
        if server_delay is None or not (0 < server_delay <= 60):
            return None
        return server_delay

    @override
    def _time_to_sleep_before_retry(
        self,
        e: Exception,
        remaining_retries: int,
        num_retries: int,
        healthy_deployments: Optional[list] = None,
        all_deployments: Optional[list] = None,
    ) -> Union[int, float]:
        """Implement exponential backoff for retries.

        litellm's ``Router`` already ships a form of exponential backoff, but
        it is not customizable w.r.t. jitter and initial delay timing. This
        override plugs in our own instance-level settings while deferring to
        the existing implementation wherever possible.
        """
        # A server-provided retry delay (from response headers) wins outright.
        header_delay = self._extract_retry_delay_from_headers(e)
        if header_delay:
            return header_delay

        return self.calculate_exponential_backoff(
            initial_retry_after_s=self._initial_retry_after_s,
            current_retry=num_retries - remaining_retries,
            jitter_pct=self._jitter_pct,
        )

    @staticmethod
    def calculate_exponential_backoff(initial_retry_after_s: float, current_retry: int, jitter_pct: float) -> float:
        """Return ``initial * 2^retry`` scaled by a uniform jitter factor."""
        base_delay = initial_retry_after_s * 2.0 ** current_retry
        jitter_factor = 1.0 + random.uniform(-jitter_pct, jitter_pct)
        return base_delay * jitter_factor
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def apply_litellm_patches():
    """Apply process-wide patches and logging tweaks to litellm."""
    # Replace litellm's shared client cache with a lock-guarded variant.
    litellm.in_memory_llm_clients_cache = ThreadSafeCache()

    # Workaround for the litellm issue described in https://github.com/BerriAI/litellm/issues/9792
    litellm.litellm_core_utils.logging_callback_manager.LoggingCallbackManager.MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS

    # Silence chatty third-party loggers.
    for noisy_logger_name in ("httpx", "LiteLLM", "LiteLLM Router"):
        quiet_noisy_logger(noisy_logger_name)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ParserException(Exception):
    """Identifies errors resulting from generic parser errors.

    Attributes:
        source (str | None): The source string that the parser
            attempted to parse.
    """

    source: Optional[str]

    @staticmethod
    def _log_format(source: str) -> str:
        ## NOTE: this hook was meant to surface the offending source text in
        ## the exception message (e.g. wrapped in <source> tags) so failure
        ## cases show up in logs. That may not be desirable in all cases, so
        ## it is intentionally disabled pending later review.
        return ""

    def __init__(self, msg: Optional[str] = None, source: Optional[str] = None):
        message = msg.strip() if msg is not None else ""

        if source is not None:
            message = message + self._log_format(source)

        super().__init__(message)
        self.source = source
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from functools import reduce
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from lxml import etree
|
|
8
|
+
from lxml.etree import _Element
|
|
9
|
+
import marko
|
|
10
|
+
|
|
11
|
+
from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
|
|
12
|
+
import data_designer.engine.models.parsers.tag_parsers as tp
|
|
13
|
+
from data_designer.engine.models.parsers.types import (
|
|
14
|
+
LLMStructuredResponse,
|
|
15
|
+
PostProcessor,
|
|
16
|
+
TagParser,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Default mapping from element tag paths (dot-path notation, e.g. "pre.code"
# for a <code> nested inside a <pre>) to the tag parser applied to matching
# elements. Judging by the parser names: fenced code blocks, inline code,
# plain paragraphs/preformatted text, and a catch-all ("") that preserves
# unrecognized markup -- confirm against tag_parsers module.
DEFAULT_TAG_PARSERS = {
    "pre.code": tp.code_block_parser,
    "p.code": tp.inline_code_parser,
    "p": tp.text_parser,
    "pre": tp.text_parser,
    "": tp.text_parser_keep_markup,
}

# Post-processors applied, in order, to the parsed block list.
DEFAULT_POST_PROCESSORS = [merge_text_blocks]
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _patch_tags_before_code_fences(response: str) -> str:
|
|
31
|
+
"""Patch to add a linebreak between a tag prior to a code block.
|
|
32
|
+
|
|
33
|
+
Marko conversion of MD->HTML has a quirk. If there is a case like
|
|
34
|
+
the following, it will not convert the code block at all:
|
|
35
|
+
|
|
36
|
+
...
|
|
37
|
+
</ending_tag>
|
|
38
|
+
```syntax
|
|
39
|
+
...
|
|
40
|
+
|
|
41
|
+
We want to find these cases and simply introduce an additional
|
|
42
|
+
line break.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
return response.replace(">\n```", ">\n\n```")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LLMResponseParser:
    """
    Turns LLM output written in Markdown (plus optional custom markup) into structured data.

    The response text is first rendered to HTML-like markup, then loaded into an
    ElementTree. Each element is handed to a registered `TagParser`, producing a list
    of `BaseModel` blocks; registered post-processors then refine that list.

    ## Description

    An LLM may answer in plain Markdown or include custom markup tags prompted by the
    system or task. Tag parsers decide how each tag becomes a structured block, and
    post-processors perform follow-up transformations or aggregations on the parsed list.

    ### Tag Parsers

    A tag parser handles one markup tag (or a nested tag path, expressed in dot
    notation such as `"pre.code"`). Registering a more specific path takes priority
    over a more general one, letting downstream tasks customize how particular
    elements become `BaseModel` instances.

    ### Post-Processors

    Post-processors are callables applied, in order, to the list of parsed blocks
    after the initial parse.

    Attributes:
        tag_parsers (dict[str, TagParser]): Maps dot-notation tag paths to `TagParser` instances.
        postprocessors (list[PostProcessor]): Functions applied to the structured response after parsing.

    Example:
        ```python
        class CodeBlock(BaseModel):
            code: str
            syntax: Optional[str] = None


        class CodeBlockParser:
            def __call__(self, element: _Element) -> CodeBlock:
                # Implementation details...
                return CodeBlock(code=element.text, syntax=element.get("class"))


        parser = LLMResponseParser(
            tag_parsers={
                "pre.code": CodeBlockParser(),
            }
        )

        out = parser.parse('```json\n{"answer": 42}\n```')
        print(out.parsed)
        # Output: [CodeBlock(code='{"answer": 42}\n', syntax='json')]
        ```
    """

    tag_parsers: dict[str, TagParser]
    postprocessors: list[PostProcessor]

    def __init__(
        self,
        tag_parsers: Optional[dict[str, TagParser]] = None,
        postprocessors: Optional[list[PostProcessor]] = None,
    ):
        """
        Set up the parser with optional custom tag parsers and post-processors.

        Args:
            tag_parsers (Optional[dict[str, TagParser]]): Extra parsers keyed by tag path.
                These are merged on top of the defaults, overriding any shared keys.
            postprocessors (Optional[list[PostProcessor]]): Post-processing functions to
                apply to the structured response. When omitted, the single default
                post-processor `merge_text_blocks` is used.

        Attributes:
            tag_parsers (dict[str, TagParser]): Defaults merged with any caller-supplied parsers.
            postprocessors (list[PostProcessor]): The supplied list, or the default one.
        """
        # Start from a copy of the defaults so the shared dict is never mutated.
        self.tag_parsers = dict(DEFAULT_TAG_PARSERS)
        if tag_parsers:
            self.tag_parsers.update(tag_parsers)

        self.postprocessors = [merge_text_blocks] if postprocessors is None else postprocessors

    def lookup_parser(self, element: _Element) -> TagParser:
        """
        Find the most specific registered `TagParser` for an element's tag lineage.

        Builds the element's dot-path from root ancestor down to the element itself,
        then strips leading ancestors one at a time until the remaining path matches a
        registered parser — so specific parsers win over general ones.

        Args:
            element (_Element): The element whose parser should be resolved.

        Returns:
            TagParser: The parser registered for the best-matching tag path.

        Raises:
            KeyError: If no registered parser matches (including no "" default entry).
        """
        # iterancestors() yields nearest-first, so reverse to get root-first order.
        ancestor_tags = list(reversed([ancestor.tag for ancestor in element.iterancestors()]))
        path_parts = [*ancestor_tags, element.tag]

        # Drop the outermost ancestor until the joined path is a registered key.
        # Exhausting the list leaves "", which hits the default "" entry (if any).
        while path_parts and ".".join(path_parts) not in self.tag_parsers:
            path_parts.pop(0)

        return self.tag_parsers[".".join(path_parts)]

    def postprocess(self, structured_response: LLMStructuredResponse) -> LLMStructuredResponse:
        """
        Run every registered post-processor over the structured response, in order.

        With no post-processors registered, the input is returned untouched.

        Args:
            structured_response (LLMStructuredResponse): The freshly parsed response.

        Returns:
            LLMStructuredResponse: The response after all post-processors have run.
        """
        result = structured_response
        for step in self.postprocessors:
            result = step(result)
        return result

    def parse(self, md_response: str) -> LLMStructuredResponse:
        """
        Parse a Markdown-formatted LLM response into an `LLMStructuredResponse`.

        The Markdown (with any custom markup) is converted into an HTML tree, each
        element is dispatched to its tag parser in a depth-first walk, and the
        registered post-processors are applied to the collected blocks.

        Args:
            md_response (str): The raw Markdown response from the LLM, possibly
                containing custom markup.

        Returns:
            LLMStructuredResponse: The structured representation with parsed blocks.

        Raises:
            etree.XMLSyntaxError: If the converted markup cannot be parsed into a tree.
        """
        markup = marko.convert(_patch_tags_before_code_fences(md_response))
        result = LLMStructuredResponse(response=md_response, markup=markup)

        # Build the document tree; recover=True tolerates loosely formed markup.
        html_parser = etree.HTMLParser(recover=True, remove_blank_text=True)
        tree_root = etree.fromstring(markup, parser=html_parser)
        nodes = tree_root.iter() if tree_root is not None else []

        # Depth-first walk over the tree.
        for node in nodes:
            # The root and <body> wrappers carry no content of their own.
            if node == tree_root or node.tag == "body":
                continue

            block = self.lookup_parser(node)(node)

            # Container tags like <pre>, <ul>, and <ol> can yield whitespace-only
            # text blocks; skip those.
            is_blank_text = isinstance(block, tp.TextBlock) and not block.text.strip()
            if not is_blank_text:
                result.parsed.append(block)

            # Tails are always plain text -- inelegant, but capture them unless
            # they are pure whitespace.
            if node.tail and node.tail.strip():
                result.parsed.append(tp.TextBlock(text=node.tail))

        return self.postprocess(result)
|