data-designer 0.3.8rc1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/cli/commands/__init__.py +1 -1
- data_designer/interface/__init__.py +21 -1
- data_designer/{_version.py → interface/_version.py} +2 -2
- data_designer/interface/data_designer.py +8 -11
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
- data_designer-0.4.0.dist-info/RECORD +39 -0
- data_designer/__init__.py +0 -17
- data_designer/config/__init__.py +0 -2
- data_designer/config/analysis/__init__.py +0 -2
- data_designer/config/analysis/column_profilers.py +0 -159
- data_designer/config/analysis/column_statistics.py +0 -421
- data_designer/config/analysis/dataset_profiler.py +0 -84
- data_designer/config/analysis/utils/errors.py +0 -10
- data_designer/config/analysis/utils/reporting.py +0 -192
- data_designer/config/base.py +0 -69
- data_designer/config/column_configs.py +0 -470
- data_designer/config/column_types.py +0 -141
- data_designer/config/config_builder.py +0 -595
- data_designer/config/data_designer_config.py +0 -40
- data_designer/config/dataset_builders.py +0 -13
- data_designer/config/dataset_metadata.py +0 -18
- data_designer/config/default_model_settings.py +0 -121
- data_designer/config/errors.py +0 -24
- data_designer/config/exports.py +0 -145
- data_designer/config/interface.py +0 -55
- data_designer/config/models.py +0 -455
- data_designer/config/preview_results.py +0 -41
- data_designer/config/processors.py +0 -148
- data_designer/config/run_config.py +0 -48
- data_designer/config/sampler_constraints.py +0 -52
- data_designer/config/sampler_params.py +0 -639
- data_designer/config/seed.py +0 -116
- data_designer/config/seed_source.py +0 -84
- data_designer/config/seed_source_types.py +0 -19
- data_designer/config/utils/code_lang.py +0 -82
- data_designer/config/utils/constants.py +0 -363
- data_designer/config/utils/errors.py +0 -21
- data_designer/config/utils/info.py +0 -94
- data_designer/config/utils/io_helpers.py +0 -258
- data_designer/config/utils/misc.py +0 -78
- data_designer/config/utils/numerical_helpers.py +0 -30
- data_designer/config/utils/type_helpers.py +0 -106
- data_designer/config/utils/visualization.py +0 -482
- data_designer/config/validator_params.py +0 -94
- data_designer/engine/__init__.py +0 -2
- data_designer/engine/analysis/column_profilers/base.py +0 -49
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
- data_designer/engine/analysis/column_profilers/registry.py +0 -22
- data_designer/engine/analysis/column_statistics.py +0 -145
- data_designer/engine/analysis/dataset_profiler.py +0 -149
- data_designer/engine/analysis/errors.py +0 -9
- data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
- data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
- data_designer/engine/column_generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/base.py +0 -122
- data_designer/engine/column_generators/generators/embedding.py +0 -35
- data_designer/engine/column_generators/generators/expression.py +0 -55
- data_designer/engine/column_generators/generators/llm_completion.py +0 -113
- data_designer/engine/column_generators/generators/samplers.py +0 -69
- data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
- data_designer/engine/column_generators/generators/validation.py +0 -140
- data_designer/engine/column_generators/registry.py +0 -60
- data_designer/engine/column_generators/utils/errors.py +0 -15
- data_designer/engine/column_generators/utils/generator_classification.py +0 -43
- data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
- data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
- data_designer/engine/compiler.py +0 -97
- data_designer/engine/configurable_task.py +0 -71
- data_designer/engine/dataset_builders/artifact_storage.py +0 -283
- data_designer/engine/dataset_builders/column_wise_builder.py +0 -338
- data_designer/engine/dataset_builders/errors.py +0 -15
- data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
- data_designer/engine/dataset_builders/utils/__init__.py +0 -2
- data_designer/engine/dataset_builders/utils/concurrency.py +0 -215
- data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
- data_designer/engine/dataset_builders/utils/dag.py +0 -62
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
- data_designer/engine/dataset_builders/utils/errors.py +0 -15
- data_designer/engine/errors.py +0 -51
- data_designer/engine/model_provider.py +0 -77
- data_designer/engine/models/__init__.py +0 -2
- data_designer/engine/models/errors.py +0 -300
- data_designer/engine/models/facade.py +0 -287
- data_designer/engine/models/factory.py +0 -42
- data_designer/engine/models/litellm_overrides.py +0 -179
- data_designer/engine/models/parsers/__init__.py +0 -2
- data_designer/engine/models/parsers/errors.py +0 -34
- data_designer/engine/models/parsers/parser.py +0 -235
- data_designer/engine/models/parsers/postprocessors.py +0 -93
- data_designer/engine/models/parsers/tag_parsers.py +0 -62
- data_designer/engine/models/parsers/types.py +0 -84
- data_designer/engine/models/recipes/base.py +0 -81
- data_designer/engine/models/recipes/response_recipes.py +0 -293
- data_designer/engine/models/registry.py +0 -146
- data_designer/engine/models/telemetry.py +0 -359
- data_designer/engine/models/usage.py +0 -73
- data_designer/engine/models/utils.py +0 -38
- data_designer/engine/processing/ginja/__init__.py +0 -2
- data_designer/engine/processing/ginja/ast.py +0 -65
- data_designer/engine/processing/ginja/environment.py +0 -463
- data_designer/engine/processing/ginja/exceptions.py +0 -56
- data_designer/engine/processing/ginja/record.py +0 -32
- data_designer/engine/processing/gsonschema/__init__.py +0 -2
- data_designer/engine/processing/gsonschema/exceptions.py +0 -15
- data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
- data_designer/engine/processing/gsonschema/types.py +0 -10
- data_designer/engine/processing/gsonschema/validators.py +0 -202
- data_designer/engine/processing/processors/base.py +0 -13
- data_designer/engine/processing/processors/drop_columns.py +0 -42
- data_designer/engine/processing/processors/registry.py +0 -25
- data_designer/engine/processing/processors/schema_transform.py +0 -49
- data_designer/engine/processing/utils.py +0 -169
- data_designer/engine/registry/base.py +0 -99
- data_designer/engine/registry/data_designer_registry.py +0 -39
- data_designer/engine/registry/errors.py +0 -12
- data_designer/engine/resources/managed_dataset_generator.py +0 -39
- data_designer/engine/resources/managed_dataset_repository.py +0 -197
- data_designer/engine/resources/managed_storage.py +0 -65
- data_designer/engine/resources/resource_provider.py +0 -77
- data_designer/engine/resources/seed_reader.py +0 -154
- data_designer/engine/sampling_gen/column.py +0 -91
- data_designer/engine/sampling_gen/constraints.py +0 -100
- data_designer/engine/sampling_gen/data_sources/base.py +0 -217
- data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
- data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
- data_designer/engine/sampling_gen/entities/__init__.py +0 -2
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
- data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
- data_designer/engine/sampling_gen/entities/errors.py +0 -10
- data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
- data_designer/engine/sampling_gen/entities/person.py +0 -144
- data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
- data_designer/engine/sampling_gen/errors.py +0 -26
- data_designer/engine/sampling_gen/generator.py +0 -122
- data_designer/engine/sampling_gen/jinja_utils.py +0 -64
- data_designer/engine/sampling_gen/people_gen.py +0 -199
- data_designer/engine/sampling_gen/person_constants.py +0 -56
- data_designer/engine/sampling_gen/schema.py +0 -147
- data_designer/engine/sampling_gen/schema_builder.py +0 -61
- data_designer/engine/sampling_gen/utils.py +0 -46
- data_designer/engine/secret_resolver.py +0 -82
- data_designer/engine/validation.py +0 -367
- data_designer/engine/validators/__init__.py +0 -19
- data_designer/engine/validators/base.py +0 -38
- data_designer/engine/validators/local_callable.py +0 -39
- data_designer/engine/validators/python.py +0 -254
- data_designer/engine/validators/remote.py +0 -89
- data_designer/engine/validators/sql.py +0 -65
- data_designer/errors.py +0 -7
- data_designer/essentials/__init__.py +0 -33
- data_designer/lazy_heavy_imports.py +0 -54
- data_designer/logging.py +0 -163
- data_designer/plugin_manager.py +0 -78
- data_designer/plugins/__init__.py +0 -8
- data_designer/plugins/errors.py +0 -15
- data_designer/plugins/plugin.py +0 -141
- data_designer/plugins/registry.py +0 -88
- data_designer/plugins/testing/__init__.py +0 -10
- data_designer/plugins/testing/stubs.py +0 -116
- data_designer/plugins/testing/utils.py +0 -20
- data_designer-0.3.8rc1.dist-info/RECORD +0 -196
- data_designer-0.3.8rc1.dist-info/licenses/LICENSE +0 -201
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,179 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
"""
|
|
6
|
-
LiteLLM overrides and customizations.
|
|
7
|
-
|
|
8
|
-
Note on imports: This module uses direct (eager) imports for litellm rather than lazy loading.
|
|
9
|
-
This is intentional because:
|
|
10
|
-
|
|
11
|
-
1. Class inheritance requires base classes to be resolved at class definition time,
|
|
12
|
-
making lazy imports incompatible with our ThreadSafeCache and CustomRouter classes.
|
|
13
|
-
|
|
14
|
-
2. This module is already lazily loaded at the application level - it's only imported
|
|
15
|
-
by facade.py, which itself is imported inside the create_model_registry() factory
|
|
16
|
-
function. So litellm is only loaded when models are actually needed.
|
|
17
|
-
|
|
18
|
-
3. Attempting to use lazy imports here causes intermittent ImportErrors.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
from __future__ import annotations
|
|
22
|
-
|
|
23
|
-
import random
|
|
24
|
-
import threading
|
|
25
|
-
|
|
26
|
-
import httpx
|
|
27
|
-
import litellm
|
|
28
|
-
from litellm import RetryPolicy
|
|
29
|
-
from litellm.caching.in_memory_cache import InMemoryCache
|
|
30
|
-
from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
|
|
31
|
-
from litellm.router import Router
|
|
32
|
-
from pydantic import BaseModel, Field
|
|
33
|
-
from typing_extensions import override
|
|
34
|
-
|
|
35
|
-
from data_designer.logging import quiet_noisy_logger
|
|
36
|
-
|
|
37
|
-
DEFAULT_MAX_CALLBACKS = 1000
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class LiteLLMRouterDefaultKwargs(BaseModel):
|
|
41
|
-
## Number of seconds to wait initially after a connection
|
|
42
|
-
## failure.
|
|
43
|
-
initial_retry_after_s: float = 2.0
|
|
44
|
-
|
|
45
|
-
## Jitter percentage added during exponential backoff to
|
|
46
|
-
## smooth repeated retries over time.
|
|
47
|
-
jitter_pct: float = 0.2
|
|
48
|
-
|
|
49
|
-
## Maximum number of seconds to wait for an API request
|
|
50
|
-
## before letting it die. Will trigger a retry.
|
|
51
|
-
timeout: float = 60.0
|
|
52
|
-
|
|
53
|
-
## Sets the default retry policy, including the number
|
|
54
|
-
## of retries to use in particular scenarios.
|
|
55
|
-
retry_policy: RetryPolicy = Field(
|
|
56
|
-
default_factory=lambda: RetryPolicy(
|
|
57
|
-
RateLimitErrorRetries=3,
|
|
58
|
-
TimeoutErrorRetries=3,
|
|
59
|
-
)
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
class ThreadSafeCache(InMemoryCache):
|
|
64
|
-
def __init__(self, *args, **kwargs):
|
|
65
|
-
super().__init__(*args, **kwargs)
|
|
66
|
-
|
|
67
|
-
self._lock = threading.RLock()
|
|
68
|
-
|
|
69
|
-
def get_cache(self, key, **kwargs):
|
|
70
|
-
with self._lock:
|
|
71
|
-
return super().get_cache(key, **kwargs)
|
|
72
|
-
|
|
73
|
-
def set_cache(self, key, value, **kwargs):
|
|
74
|
-
with self._lock:
|
|
75
|
-
super().set_cache(key, value, **kwargs)
|
|
76
|
-
|
|
77
|
-
def batch_get_cache(self, keys: list, **kwargs):
|
|
78
|
-
with self._lock:
|
|
79
|
-
return super().batch_get_cache(keys, **kwargs)
|
|
80
|
-
|
|
81
|
-
def delete_cache(self, key):
|
|
82
|
-
with self._lock:
|
|
83
|
-
super().delete_cache(key)
|
|
84
|
-
|
|
85
|
-
def evict_cache(self):
|
|
86
|
-
with self._lock:
|
|
87
|
-
super().evict_cache()
|
|
88
|
-
|
|
89
|
-
def increment_cache(self, key, value: int, **kwargs) -> int:
|
|
90
|
-
with self._lock:
|
|
91
|
-
return super().increment_cache(key, value, **kwargs)
|
|
92
|
-
|
|
93
|
-
def flush_cache(self):
|
|
94
|
-
with self._lock:
|
|
95
|
-
super().flush_cache()
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
class CustomRouter(Router):
|
|
99
|
-
def __init__(
|
|
100
|
-
self,
|
|
101
|
-
*args,
|
|
102
|
-
initial_retry_after_s: float,
|
|
103
|
-
jitter_pct: float,
|
|
104
|
-
**kwargs,
|
|
105
|
-
):
|
|
106
|
-
super().__init__(*args, **kwargs)
|
|
107
|
-
self._initial_retry_after_s = initial_retry_after_s
|
|
108
|
-
self._jitter_pct = jitter_pct
|
|
109
|
-
|
|
110
|
-
def _extract_retry_delay_from_headers(self, e: Exception) -> int | float | None:
|
|
111
|
-
"""
|
|
112
|
-
Most of this code logic was extracted directly from the parent
|
|
113
|
-
`Router`'s `_time_to_sleep_before_retry` function. Our override
|
|
114
|
-
of that method below should only affect requests where the server
|
|
115
|
-
didn't explicitly return a desired retry-delay. If the server did
|
|
116
|
-
return this info, we'll simply use that retry value returned here.
|
|
117
|
-
"""
|
|
118
|
-
|
|
119
|
-
response_headers: httpx.Headers | None = None
|
|
120
|
-
if hasattr(e, "response") and hasattr(e.response, "headers"): # type: ignore
|
|
121
|
-
response_headers = e.response.headers # type: ignore
|
|
122
|
-
if hasattr(e, "litellm_response_headers"):
|
|
123
|
-
response_headers = e.litellm_response_headers # type: ignore
|
|
124
|
-
|
|
125
|
-
retry_after = litellm.utils._get_retry_after_from_exception_header(response_headers)
|
|
126
|
-
|
|
127
|
-
# If the API asks us to wait a certain amount of time (and it's a reasonable amount), just do what it says.
|
|
128
|
-
if retry_after is not None and 0 < retry_after <= 60:
|
|
129
|
-
return retry_after
|
|
130
|
-
else:
|
|
131
|
-
return None
|
|
132
|
-
|
|
133
|
-
@override
|
|
134
|
-
def _time_to_sleep_before_retry(
|
|
135
|
-
self,
|
|
136
|
-
e: Exception,
|
|
137
|
-
remaining_retries: int,
|
|
138
|
-
num_retries: int,
|
|
139
|
-
healthy_deployments: list | None = None,
|
|
140
|
-
all_deployments: list | None = None,
|
|
141
|
-
) -> int | float:
|
|
142
|
-
"""
|
|
143
|
-
Implements exponential backoff for retries.
|
|
144
|
-
|
|
145
|
-
Technically, litellm's `Router` already implements some
|
|
146
|
-
form of exponential backoff. However, that backoff
|
|
147
|
-
is not customizable w.r.t jitter and initial delay
|
|
148
|
-
timing. For that reason, we override this method to
|
|
149
|
-
utilize our own custom instance variables, deferring
|
|
150
|
-
to the existing implementation wherever we can.
|
|
151
|
-
"""
|
|
152
|
-
|
|
153
|
-
# If the response headers indicated how long we should wait,
|
|
154
|
-
# use that information.
|
|
155
|
-
if retry_after := self._extract_retry_delay_from_headers(e):
|
|
156
|
-
return retry_after
|
|
157
|
-
|
|
158
|
-
return self.calculate_exponential_backoff(
|
|
159
|
-
initial_retry_after_s=self._initial_retry_after_s,
|
|
160
|
-
current_retry=num_retries - remaining_retries,
|
|
161
|
-
jitter_pct=self._jitter_pct,
|
|
162
|
-
)
|
|
163
|
-
|
|
164
|
-
@staticmethod
|
|
165
|
-
def calculate_exponential_backoff(initial_retry_after_s: float, current_retry: int, jitter_pct: float) -> float:
|
|
166
|
-
sleep_s = initial_retry_after_s * (pow(2.0, current_retry))
|
|
167
|
-
jitter = 1.0 + random.uniform(-jitter_pct, jitter_pct)
|
|
168
|
-
return sleep_s * jitter
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def apply_litellm_patches():
|
|
172
|
-
litellm.in_memory_llm_clients_cache = ThreadSafeCache()
|
|
173
|
-
|
|
174
|
-
# Workaround for the litellm issue described in https://github.com/BerriAI/litellm/issues/9792
|
|
175
|
-
LoggingCallbackManager.MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS
|
|
176
|
-
|
|
177
|
-
quiet_noisy_logger("httpx")
|
|
178
|
-
quiet_noisy_logger("LiteLLM")
|
|
179
|
-
quiet_noisy_logger("LiteLLM Router")
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class ParserException(Exception):
|
|
8
|
-
"""Identifies errors resulting from generic parser errors.
|
|
9
|
-
|
|
10
|
-
Attributes:
|
|
11
|
-
source (str | None): The source string that the parser
|
|
12
|
-
attempted to parse.
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
source: str | None
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
def _log_format(source: str) -> str:
|
|
19
|
-
## NOTE: The point of this was to be able to report offending
|
|
20
|
-
## failure cases to the logs. This might not be what we want
|
|
21
|
-
## to do in all cases. In the meantime, this note is left
|
|
22
|
-
## for later review.
|
|
23
|
-
#
|
|
24
|
-
# return f"<source>{source}</source>"
|
|
25
|
-
return ""
|
|
26
|
-
|
|
27
|
-
def __init__(self, msg: str | None = None, source: str | None = None):
|
|
28
|
-
msg = "" if msg is None else msg.strip()
|
|
29
|
-
|
|
30
|
-
if source is not None:
|
|
31
|
-
msg += self._log_format(source)
|
|
32
|
-
|
|
33
|
-
super().__init__(msg)
|
|
34
|
-
self.source = source
|
|
@@ -1,235 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from functools import reduce
|
|
7
|
-
|
|
8
|
-
import marko
|
|
9
|
-
from lxml import etree
|
|
10
|
-
from lxml.etree import _Element
|
|
11
|
-
|
|
12
|
-
import data_designer.engine.models.parsers.tag_parsers as tp
|
|
13
|
-
from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
|
|
14
|
-
from data_designer.engine.models.parsers.types import (
|
|
15
|
-
LLMStructuredResponse,
|
|
16
|
-
PostProcessor,
|
|
17
|
-
TagParser,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
DEFAULT_TAG_PARSERS = {
|
|
21
|
-
"pre.code": tp.code_block_parser,
|
|
22
|
-
"p.code": tp.inline_code_parser,
|
|
23
|
-
"p": tp.text_parser,
|
|
24
|
-
"pre": tp.text_parser,
|
|
25
|
-
"": tp.text_parser_keep_markup,
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
DEFAULT_POST_PROCESSORS = [merge_text_blocks]
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def _patch_tags_before_code_fences(response: str) -> str:
|
|
32
|
-
"""Patch to add a linebreak between a tag prior to a code block.
|
|
33
|
-
|
|
34
|
-
Marko conversion of MD->HTML has a quirk. If there is a case like
|
|
35
|
-
the following, it will not convert the code block at all:
|
|
36
|
-
|
|
37
|
-
...
|
|
38
|
-
</ending_tag>
|
|
39
|
-
```syntax
|
|
40
|
-
...
|
|
41
|
-
|
|
42
|
-
We want to find these cases and simply introduce an additional
|
|
43
|
-
line break.
|
|
44
|
-
"""
|
|
45
|
-
|
|
46
|
-
return response.replace(">\n```", ">\n\n```")
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class LLMResponseParser:
|
|
50
|
-
"""
|
|
51
|
-
Parses Language Model (LLM) responses containing a mixture of Markdown and custom markup into structured data.
|
|
52
|
-
|
|
53
|
-
The `LLMResponseParser` class facilitates the translation of LLM-generated responses, which may include
|
|
54
|
-
Markdown and custom markup tags, into a structured format using ElementTree. It allows for customizable
|
|
55
|
-
parsing behavior through the registration of tag-specific parsers and post-processors.
|
|
56
|
-
|
|
57
|
-
## Description
|
|
58
|
-
|
|
59
|
-
The core functionality of this class enables LLMs to respond using Markdown along with any custom
|
|
60
|
-
prompted markup specified by the system or task. The parsing process involves converting the Markdown
|
|
61
|
-
and markup into an ElementTree, then processing each element using registered tag parsers to produce
|
|
62
|
-
a list of structured `BaseModel` instances. Post-processors can further refine the structured response.
|
|
63
|
-
|
|
64
|
-
### Tag Parsers
|
|
65
|
-
|
|
66
|
-
Tag parsers are responsible for handling specific markup tags within the LLM response. They can be
|
|
67
|
-
registered with the parser using dot-path notation to manage hierarchical tag structures. This allows
|
|
68
|
-
downstream tasks to customize how specific elements are processed into `BaseModel` instances.
|
|
69
|
-
|
|
70
|
-
### Post-Processors
|
|
71
|
-
|
|
72
|
-
Post-processors are functions that operate on the list of parsed blocks to perform additional
|
|
73
|
-
transformations or aggregations. They are applied after the initial parsing of the response.
|
|
74
|
-
|
|
75
|
-
Attributes:
|
|
76
|
-
tag_parsers (dict[str, TagParser]): A dictionary mapping tag paths to their corresponding `TagParser` instances.
|
|
77
|
-
postprocessors (list[PostProcessor]): A list of post-processing functions to apply to the structured response.
|
|
78
|
-
|
|
79
|
-
Example:
|
|
80
|
-
```python
|
|
81
|
-
class CodeBlock(BaseModel):
|
|
82
|
-
code: str
|
|
83
|
-
syntax: Optional[str] = None
|
|
84
|
-
|
|
85
|
-
class CodeBlockParser:
|
|
86
|
-
def __call__(self, element: _Element) -> CodeBlock:
|
|
87
|
-
# Implementation details...
|
|
88
|
-
return CodeBlock(code=element.text, syntax=element.get("class"))
|
|
89
|
-
|
|
90
|
-
parser = LLMResponseParser(
|
|
91
|
-
tag_parsers={
|
|
92
|
-
"pre.code": CodeBlockParser(),
|
|
93
|
-
}
|
|
94
|
-
)
|
|
95
|
-
|
|
96
|
-
out = parser.parse('```json\n{"answer": 42}\n```')
|
|
97
|
-
print(out.parsed)
|
|
98
|
-
# Output: [CodeBlock(code='{"answer": 42}\n', syntax='json')]
|
|
99
|
-
```
|
|
100
|
-
"""
|
|
101
|
-
|
|
102
|
-
tag_parsers: dict[str, TagParser]
|
|
103
|
-
postprocessors: list[PostProcessor]
|
|
104
|
-
|
|
105
|
-
def __init__(
|
|
106
|
-
self,
|
|
107
|
-
tag_parsers: dict[str, TagParser] | None = None,
|
|
108
|
-
postprocessors: list[PostProcessor] | None = None,
|
|
109
|
-
):
|
|
110
|
-
"""
|
|
111
|
-
Initializes the LLMResponseParser with optional tag parsers and post-processors.
|
|
112
|
-
|
|
113
|
-
Args:
|
|
114
|
-
tag_parsers (Optional[dict[str, TagParser]]): A dictionary mapping tag paths to `TagParser` instances.
|
|
115
|
-
If provided, these parsers will be merged with the default tag parsers.
|
|
116
|
-
postprocessors (Optional[list[PostProcessor]]): A list of post-processing functions to apply
|
|
117
|
-
to the structured response. If not provided, a default post-processor `merge_text_blocks`
|
|
118
|
-
is used.
|
|
119
|
-
|
|
120
|
-
Attributes:
|
|
121
|
-
tag_parsers (dict[str, TagParser]): Initialized with default tag parsers, updated with any provided.
|
|
122
|
-
postprocessors (list[PostProcessor]): Initialized with default post-processors or the provided list.
|
|
123
|
-
"""
|
|
124
|
-
self.tag_parsers = {**DEFAULT_TAG_PARSERS}
|
|
125
|
-
if tag_parsers:
|
|
126
|
-
self.tag_parsers.update(tag_parsers)
|
|
127
|
-
|
|
128
|
-
self.postprocessors = [
|
|
129
|
-
merge_text_blocks,
|
|
130
|
-
]
|
|
131
|
-
if postprocessors is not None:
|
|
132
|
-
self.postprocessors = postprocessors
|
|
133
|
-
|
|
134
|
-
def lookup_parser(self, element: _Element) -> TagParser:
|
|
135
|
-
"""
|
|
136
|
-
Resolves and retrieves the appropriate `TagParser` for a given XML element based on its tag hierarchy.
|
|
137
|
-
|
|
138
|
-
The method constructs the dot-path lineage of the element's tags, starting from the root and moving
|
|
139
|
-
towards the specific element. It then attempts to find the most specific matching `TagParser` by
|
|
140
|
-
progressively reducing the specificity of the tag path until a matching parser is found.
|
|
141
|
-
|
|
142
|
-
Args:
|
|
143
|
-
element (_Element): The XML element for which to find the corresponding `TagParser`.
|
|
144
|
-
|
|
145
|
-
Returns:
|
|
146
|
-
TagParser: The `TagParser` instance that matches the element's tag path.
|
|
147
|
-
|
|
148
|
-
Raises:
|
|
149
|
-
KeyError: If no matching `TagParser` is found for the element's tag path.
|
|
150
|
-
"""
|
|
151
|
-
# Get the dot path lineage of this tag, sans root.
|
|
152
|
-
# Note that the lineage comes back in reverse order.
|
|
153
|
-
parents = [e.tag for e in element.iterancestors()][::-1]
|
|
154
|
-
lineage = [*parents, element.tag]
|
|
155
|
-
|
|
156
|
-
# Now attempt to matchup with the tag parsers name.
|
|
157
|
-
# Starts from the full linear (most specific), and
|
|
158
|
-
# breaks on the first hit. So this should properly
|
|
159
|
-
# prioritize specific parsers over general ones.
|
|
160
|
-
while lineage:
|
|
161
|
-
tag_path = ".".join(lineage)
|
|
162
|
-
if tag_path not in self.tag_parsers:
|
|
163
|
-
lineage.pop(0)
|
|
164
|
-
else:
|
|
165
|
-
break
|
|
166
|
-
|
|
167
|
-
# Tag path can be an empty string, which hits the
|
|
168
|
-
# default parsing option specified by the "" entry
|
|
169
|
-
# of the tag parsers dict.
|
|
170
|
-
tag_path = ".".join(lineage)
|
|
171
|
-
return self.tag_parsers[tag_path]
|
|
172
|
-
|
|
173
|
-
def postprocess(self, structured_response: LLMStructuredResponse) -> LLMStructuredResponse:
|
|
174
|
-
"""
|
|
175
|
-
Applies post-processing functions to the structured response.
|
|
176
|
-
|
|
177
|
-
If no post-processors are registered, the original structured response is returned.
|
|
178
|
-
Otherwise, each post-processor is applied in sequence to transform the response.
|
|
179
|
-
|
|
180
|
-
Args:
|
|
181
|
-
structured_response (LLMStructuredResponse): The initial structured response to be post-processed.
|
|
182
|
-
|
|
183
|
-
Returns:
|
|
184
|
-
LLMStructuredResponse: The post-processed structured response.
|
|
185
|
-
"""
|
|
186
|
-
if not self.postprocessors:
|
|
187
|
-
return structured_response
|
|
188
|
-
|
|
189
|
-
return reduce(lambda acc, func: func(acc), self.postprocessors, structured_response)
|
|
190
|
-
|
|
191
|
-
def parse(self, md_response: str) -> LLMStructuredResponse:
|
|
192
|
-
"""
|
|
193
|
-
Parses a Markdown-formatted LLM response into a structured `LLMStructuredResponse`.
|
|
194
|
-
|
|
195
|
-
The parsing process involves converting the Markdown and custom markup into an XML tree,
|
|
196
|
-
iterating over each element in a depth-first traversal to apply the appropriate
|
|
197
|
-
`TagParser`, and then applying any registered post-processors to the resulting structured data.
|
|
198
|
-
|
|
199
|
-
Args:
|
|
200
|
-
md_response (str): The Markdown-formatted response from the LLM, potentially containing custom markup.
|
|
201
|
-
|
|
202
|
-
Returns:
|
|
203
|
-
LLMStructuredResponse: The structured representation of the parsed response, containing parsed blocks.
|
|
204
|
-
|
|
205
|
-
Raises:
|
|
206
|
-
etree.XMLSyntaxError: If the provided Markdown cannot be converted into a valid XML structure.
|
|
207
|
-
"""
|
|
208
|
-
response = marko.convert(_patch_tags_before_code_fences(md_response))
|
|
209
|
-
output = LLMStructuredResponse(response=md_response, markup=response)
|
|
210
|
-
|
|
211
|
-
# Generate document tree
|
|
212
|
-
parser = etree.HTMLParser(recover=True, remove_blank_text=True)
|
|
213
|
-
root = etree.fromstring(response, parser=parser)
|
|
214
|
-
tags = root.iter() if root is not None else []
|
|
215
|
-
|
|
216
|
-
# Iterate over tags, depth first
|
|
217
|
-
for element in tags:
|
|
218
|
-
if element == root or element.tag == "body":
|
|
219
|
-
continue
|
|
220
|
-
|
|
221
|
-
parsed_block = self.lookup_parser(element)(element)
|
|
222
|
-
|
|
223
|
-
# Make a quick check for dead text blocks, which
|
|
224
|
-
# can happen with container tags like <pre>, <ul>, and <ol>.
|
|
225
|
-
drop_block = isinstance(parsed_block, tp.TextBlock) and not parsed_block.text.strip()
|
|
226
|
-
|
|
227
|
-
if not drop_block:
|
|
228
|
-
output.parsed.append(parsed_block)
|
|
229
|
-
|
|
230
|
-
# Check tails -- inelegant, but they're always text.
|
|
231
|
-
# Don't add the tail if it is just blank space.
|
|
232
|
-
if element.tail and element.tail.strip():
|
|
233
|
-
output.parsed.append(tp.TextBlock(text=element.tail))
|
|
234
|
-
|
|
235
|
-
return self.postprocess(output)
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
import json_repair
|
|
7
|
-
from pydantic import BaseModel, ValidationError
|
|
8
|
-
|
|
9
|
-
from data_designer.engine.models.parsers.types import (
|
|
10
|
-
CodeBlock,
|
|
11
|
-
LLMStructuredResponse,
|
|
12
|
-
PydanticTypeBlock,
|
|
13
|
-
StructuredDataBlock,
|
|
14
|
-
TextBlock,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def merge_text_blocks(
|
|
19
|
-
structured_response: LLMStructuredResponse,
|
|
20
|
-
) -> LLMStructuredResponse:
|
|
21
|
-
processed_response = structured_response.model_copy()
|
|
22
|
-
processed_response.parsed = []
|
|
23
|
-
accumulator = None
|
|
24
|
-
for block in structured_response.parsed:
|
|
25
|
-
if isinstance(block, TextBlock):
|
|
26
|
-
if accumulator is not None:
|
|
27
|
-
accumulator = TextBlock(text=accumulator.text + block.text)
|
|
28
|
-
else:
|
|
29
|
-
accumulator = block
|
|
30
|
-
else:
|
|
31
|
-
if accumulator is not None:
|
|
32
|
-
processed_response.parsed.append(accumulator)
|
|
33
|
-
accumulator = None
|
|
34
|
-
|
|
35
|
-
processed_response.parsed.append(block)
|
|
36
|
-
|
|
37
|
-
if accumulator:
|
|
38
|
-
processed_response.parsed.append(accumulator)
|
|
39
|
-
|
|
40
|
-
return processed_response
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def deserialize_json_code(
|
|
44
|
-
structured_response: LLMStructuredResponse,
|
|
45
|
-
) -> LLMStructuredResponse:
|
|
46
|
-
processed_response = structured_response.model_copy()
|
|
47
|
-
processed_response.parsed = []
|
|
48
|
-
|
|
49
|
-
for block in structured_response.parsed:
|
|
50
|
-
if isinstance(block, CodeBlock) and block.code_lang == "json":
|
|
51
|
-
deserialized = json_repair.loads(block.code)
|
|
52
|
-
|
|
53
|
-
block = StructuredDataBlock(serialized=block.code, obj=deserialized)
|
|
54
|
-
|
|
55
|
-
processed_response.parsed.append(block)
|
|
56
|
-
else:
|
|
57
|
-
processed_response.parsed.append(block)
|
|
58
|
-
|
|
59
|
-
return processed_response
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
class RealizePydanticTypes:
    """Post-processor that upgrades structured-data blocks to typed models.

    For each ``StructuredDataBlock`` in a parsed response, every candidate
    pydantic model in ``types`` is tried in order; the last one that
    validates successfully wins. Matching blocks become
    ``PydanticTypeBlock`` instances; all other blocks pass through
    untouched, in document order.
    """

    types: list[type[BaseModel]]

    def __init__(self, types: list[type[BaseModel]]):
        self.types = types

    def _fit_types(self, obj: dict) -> BaseModel | None:
        """Return the validated model for ``obj``, or None when no type fits.

        Every candidate type is attempted; when several validate, the result
        of the last successful validation is the one returned.
        """
        fitted: BaseModel | None = None
        for candidate in self.types:
            try:
                fitted = candidate.model_validate(obj)
            except ValidationError:
                continue
        return fitted

    def __call__(self, structured_response: LLMStructuredResponse) -> LLMStructuredResponse:
        """Return a copy of the response with matching blocks realized as models."""
        result = structured_response.model_copy()
        realized: list[BaseModel] = []

        for block in structured_response.parsed:
            if isinstance(block, StructuredDataBlock):
                typed = self._fit_types(block.obj)
                if typed:
                    block = PydanticTypeBlock(serialized=block.serialized, obj=typed)
            realized.append(block)

        result.parsed = realized
        return result
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from lxml.etree import _Element
|
|
7
|
-
|
|
8
|
-
from data_designer.engine.models.parsers.types import CodeBlock, TextBlock
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def text_parser(element: _Element) -> TextBlock:
    """Wrap an element's direct text content in a TextBlock.

    A missing text node (``element.text is None``) is normalized to the
    empty string.
    """
    return TextBlock(text=element.text or "")
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def text_parser_keep_markup(element: _Element) -> TextBlock:
    """Wrap an element's text in a TextBlock, re-enclosed in its own tag.

    The element's tag is re-emitted around the text so the markup survives
    in the output; a missing text node is normalized to the empty string.
    """
    content = element.text or ""
    return TextBlock(text=f"<{element.tag}>{content}</{element.tag}>")
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def inline_code_parser(element: _Element) -> TextBlock:
    """Render an inline code element back to Markdown backtick form."""
    snippet = element.text or ""
    return TextBlock(text=f"`{snippet}`")
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def code_block_parser(element: _Element) -> CodeBlock:
    """Parse a <pre><code> element node.

    This parser handles the special case of Markdown->HTML conversion
    for fenced code blocks. These take on the form:

    ```xx
    ...
    ```

    <pre><code class="language-xx">...</code></pre>

    This parser is intended to be attached to the special case of "pre.code"
    tag hierarchies.

    Syntax Handling

    If the syntax is not specified, e.g. ``<code>...</code>`` or
    ``<code class="">...</code>``, then the syntax field is returned
    as None. However, the parser does not _enforce_ the prefix
    `language-` on the value of the class attribute.
    If it is not present, then the entire value of the class attribute
    is used as the language identifier.

    Args:
        element (lxml.etree._Element): An element of the lxml-parsed
            element tree.

    Returns:
        CodeBlock: Data structure containing both the body of the code
        as well as the specified syntax of the code block.

    """
    prefix = "language-"
    language_identifier = element.attrib.get("class", "")
    language_identifier = language_identifier.removeprefix(prefix)
    return CodeBlock(
        code=element.text.strip() if element.text else "",
        code_lang=language_identifier if language_identifier else None,
    )
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from typing import Any, Protocol, runtime_checkable
|
|
7
|
-
|
|
8
|
-
from lxml.etree import _Element
|
|
9
|
-
from pydantic import BaseModel, Field
|
|
10
|
-
from typing_extensions import Self
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class LLMStructuredResponse(BaseModel):
    """Output format for the LLM Response Parser."""

    response: str = Field(description="Raw Markdown/Markup response received from the LLM and input to the parser.")
    markup: str = Field(description="Markup/HTML resulting from running Markdown parsing on response.")
    parsed: list[BaseModel] = Field(
        default_factory=list,
        description="Structured content parsed from markup. Elements of this list are in document-order.",
    )

    def head(self, n: int) -> Self:
        """Retain only the first n elements of the parsed response."""
        out = self.model_copy()
        out.parsed = out.parsed[:n]
        return out

    def tail(self, n: int) -> Self:
        """Retain only the last n elements of the parsed response.

        ``tail(0)`` yields an empty list, mirroring ``head(0)``.
        """
        out = self.model_copy()
        # Guard n == 0 explicitly: the naive slice parsed[-0:] is the FULL
        # list, which made tail(0) return everything instead of nothing.
        out.parsed = out.parsed[-n:] if n else []
        return out

    def filter(self, block_types: list[type[BaseModel]]) -> Self:
        """Retain only parsed elements that are instances of the given types."""
        out = self.model_copy()
        out.parsed = [b for b in out.parsed if isinstance(b, tuple(block_types))]
        return out
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@runtime_checkable
class TagParser(Protocol):
    """Protocol for tag parsing implementations.

    All TagParsers are objects which can take as input an `lxml`
    element, do some computation, and return some kind of structured
    output, represented as a subclass of Pydantic `BaseModel`.
    This protocol implementation can cover both classes as well
    as curried functions as parsers (e.g. `partial`), since only the
    call signature is checked.
    """

    def __call__(self, element: _Element) -> BaseModel: ...
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
@runtime_checkable
class PostProcessor(Protocol):
    """Protocol for parsed output postprocessing implementations.

    Implementations of this protocol are used to transform the results of
    the LLM response parser while retaining the same output structure.
    This is done so that PostProcessor implementations can be chained
    together: each takes an `LLMStructuredResponse` and returns one.
    """

    def __call__(self, structured_response: LLMStructuredResponse) -> LLMStructuredResponse: ...
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class TextBlock(BaseModel):
    """A run of plain text parsed from the LLM response."""

    text: str  # The block's textual content.
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
class CodeBlock(BaseModel):
    """A code block parsed from the LLM response."""

    code: str  # Body of the code block.
    code_lang: str | None = None  # Language identifier (e.g. "json"); None when unspecified.
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class StructuredDataBlock(BaseModel):
    """A block whose serialized content has been deserialized to a Python object."""

    serialized: str  # Original serialized text (e.g. the raw JSON source).
    obj: Any  # The deserialized value.
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
class PydanticTypeBlock(BaseModel):
    """A structured-data block realized as a validated pydantic model."""

    serialized: str  # Original serialized text the model was parsed from.
    obj: BaseModel  # The validated pydantic model instance.
|