data-designer 0.3.8rc1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +8 -11
  5. {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -121
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -48
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -338
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -215
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc1.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc1.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -1,179 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
-
5
- """
6
- LiteLLM overrides and customizations.
7
-
8
- Note on imports: This module uses direct (eager) imports for litellm rather than lazy loading.
9
- This is intentional because:
10
-
11
- 1. Class inheritance requires base classes to be resolved at class definition time,
12
- making lazy imports incompatible with our ThreadSafeCache and CustomRouter classes.
13
-
14
- 2. This module is already lazily loaded at the application level - it's only imported
15
- by facade.py, which itself is imported inside the create_model_registry() factory
16
- function. So litellm is only loaded when models are actually needed.
17
-
18
- 3. Attempting to use lazy imports here causes intermittent ImportErrors.
19
- """
20
-
21
- from __future__ import annotations
22
-
23
- import random
24
- import threading
25
-
26
- import httpx
27
- import litellm
28
- from litellm import RetryPolicy
29
- from litellm.caching.in_memory_cache import InMemoryCache
30
- from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
31
- from litellm.router import Router
32
- from pydantic import BaseModel, Field
33
- from typing_extensions import override
34
-
35
- from data_designer.logging import quiet_noisy_logger
36
-
37
- DEFAULT_MAX_CALLBACKS = 1000
38
-
39
-
40
class LiteLLMRouterDefaultKwargs(BaseModel):
    """Default keyword arguments used when constructing the LiteLLM router."""

    # Seconds to wait after the first connection failure before retrying.
    initial_retry_after_s: float = 2.0

    # Fractional jitter applied during exponential backoff so repeated
    # retries are spread out over time instead of synchronizing.
    jitter_pct: float = 0.2

    # Maximum seconds to wait for an API request before giving up on it,
    # which triggers a retry.
    timeout: float = 60.0

    # Default retry policy, including how many retries to perform in
    # particular failure scenarios.
    retry_policy: RetryPolicy = Field(
        default_factory=lambda: RetryPolicy(
            RateLimitErrorRetries=3,
            TimeoutErrorRetries=3,
        )
    )
61
-
62
-
63
class ThreadSafeCache(InMemoryCache):
    """In-memory cache whose operations are serialized by a reentrant lock.

    Every public operation acquires the same ``RLock`` before delegating to
    the parent ``InMemoryCache`` implementation, so concurrent callers cannot
    interleave reads and writes.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._lock = threading.RLock()

    def get_cache(self, key, **kwargs):
        with self._lock:
            return super().get_cache(key, **kwargs)

    def batch_get_cache(self, keys: list, **kwargs):
        with self._lock:
            return super().batch_get_cache(keys, **kwargs)

    def set_cache(self, key, value, **kwargs):
        with self._lock:
            super().set_cache(key, value, **kwargs)

    def increment_cache(self, key, value: int, **kwargs) -> int:
        with self._lock:
            return super().increment_cache(key, value, **kwargs)

    def delete_cache(self, key):
        with self._lock:
            super().delete_cache(key)

    def evict_cache(self):
        with self._lock:
            super().evict_cache()

    def flush_cache(self):
        with self._lock:
            super().flush_cache()
96
-
97
-
98
class CustomRouter(Router):
    """litellm ``Router`` with configurable exponential-backoff retry timing.

    The stock ``Router`` already implements a form of exponential backoff,
    but neither the initial delay nor the jitter is customizable. This
    subclass overrides ``_time_to_sleep_before_retry`` so that server-provided
    retry delays are honored first and our own backoff schedule applies
    otherwise, deferring to the existing implementation wherever possible.
    """

    def __init__(
        self,
        *args,
        initial_retry_after_s: float,
        jitter_pct: float,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self._initial_retry_after_s = initial_retry_after_s
        self._jitter_pct = jitter_pct

    def _extract_retry_delay_from_headers(self, e: Exception) -> int | float | None:
        """Return the server-requested retry delay, if any and if reasonable.

        Mirrors the header-inspection logic extracted from the parent
        ``Router._time_to_sleep_before_retry``: when the server explicitly
        returned a desired retry delay via response headers, we simply use
        that value; otherwise ``None`` is returned so that our exponential
        backoff override takes effect.
        """
        headers: httpx.Headers | None = None
        if hasattr(e, "response") and hasattr(e.response, "headers"):  # type: ignore
            headers = e.response.headers  # type: ignore
        if hasattr(e, "litellm_response_headers"):
            headers = e.litellm_response_headers  # type: ignore

        # NOTE: private litellm helper -- same call the parent class makes.
        retry_after = litellm.utils._get_retry_after_from_exception_header(headers)

        # Only honor delays that are positive and no longer than a minute.
        if retry_after is not None and 0 < retry_after <= 60:
            return retry_after
        return None

    @override
    def _time_to_sleep_before_retry(
        self,
        e: Exception,
        remaining_retries: int,
        num_retries: int,
        healthy_deployments: list | None = None,
        all_deployments: list | None = None,
    ) -> int | float:
        """Compute the sleep before the next retry.

        A delay explicitly requested by the server (via response headers)
        takes precedence. Otherwise, exponential backoff with jitter is
        applied, driven by this instance's ``initial_retry_after_s`` and
        ``jitter_pct`` settings.
        """
        retry_after = self._extract_retry_delay_from_headers(e)
        if retry_after:
            return retry_after

        return self.calculate_exponential_backoff(
            initial_retry_after_s=self._initial_retry_after_s,
            current_retry=num_retries - remaining_retries,
            jitter_pct=self._jitter_pct,
        )

    @staticmethod
    def calculate_exponential_backoff(initial_retry_after_s: float, current_retry: int, jitter_pct: float) -> float:
        """Return ``initial * 2**retry`` scaled by a random jitter factor."""
        base_sleep_s = initial_retry_after_s * (2.0 ** current_retry)
        jitter_factor = 1.0 + random.uniform(-jitter_pct, jitter_pct)
        return base_sleep_s * jitter_factor
169
-
170
-
171
def apply_litellm_patches():
    """Apply process-wide litellm patches and quiet noisy loggers."""
    # Replace the shared client cache with the lock-guarded implementation.
    litellm.in_memory_llm_clients_cache = ThreadSafeCache()

    # Workaround for the litellm issue described in
    # https://github.com/BerriAI/litellm/issues/9792
    LoggingCallbackManager.MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS

    for logger_name in ("httpx", "LiteLLM", "LiteLLM Router"):
        quiet_noisy_logger(logger_name)
@@ -1,2 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
@@ -1,34 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
-
7
class ParserException(Exception):
    """Identifies errors resulting from generic parser failures.

    Attributes:
        source (str | None): The source string that the parser
            attempted to parse, when available.
    """

    source: str | None

    def __init__(self, msg: str | None = None, source: str | None = None):
        message = "" if msg is None else msg.strip()
        if source is not None:
            message += self._log_format(source)
        super().__init__(message)
        self.source = source

    @staticmethod
    def _log_format(source: str) -> str:
        ## NOTE: The point of this was to be able to report offending
        ## failure cases to the logs. This might not be what we want
        ## to do in all cases. In the meantime, this note is left
        ## for later review.
        #
        # return f"<source>{source}</source>"
        return ""
@@ -1,235 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from functools import reduce
7
-
8
- import marko
9
- from lxml import etree
10
- from lxml.etree import _Element
11
-
12
- import data_designer.engine.models.parsers.tag_parsers as tp
13
- from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
14
- from data_designer.engine.models.parsers.types import (
15
- LLMStructuredResponse,
16
- PostProcessor,
17
- TagParser,
18
- )
19
-
20
# Default mapping from tag dot-paths to parser callables. Lookup (see
# LLMResponseParser.lookup_parser) tries the most specific path first, so
# "pre.code" wins over "pre"; the "" entry is the catch-all fallback.
DEFAULT_TAG_PARSERS = {
    "pre.code": tp.code_block_parser,
    "p.code": tp.inline_code_parser,
    "p": tp.text_parser,
    "pre": tp.text_parser,
    "": tp.text_parser_keep_markup,
}

# Post-processors applied in order to every parsed response by default.
DEFAULT_POST_PROCESSORS = [merge_text_blocks]
29
-
30
-
31
- def _patch_tags_before_code_fences(response: str) -> str:
32
- """Patch to add a linebreak between a tag prior to a code block.
33
-
34
- Marko conversion of MD->HTML has a quirk. If there is a case like
35
- the following, it will not convert the code block at all:
36
-
37
- ...
38
- </ending_tag>
39
- ```syntax
40
- ...
41
-
42
- We want to find these cases and simply introduce an additional
43
- line break.
44
- """
45
-
46
- return response.replace(">\n```", ">\n\n```")
47
-
48
-
49
class LLMResponseParser:
    """
    Parses LLM responses containing a mixture of Markdown and custom markup into structured data.

    LLMs may respond using Markdown along with any custom markup prompted by
    the system or task. The parsing pipeline is:

    1. Convert the Markdown/markup response to HTML (via ``marko``).
    2. Walk the resulting element tree depth-first, dispatching each element
       to the most specific registered tag parser to build a list of
       structured ``BaseModel`` blocks.
    3. Apply the registered post-processors, in order, to the parsed blocks.

    Tag parsers are registered under dot-path keys (e.g. ``"pre.code"``);
    more specific tag lineages take precedence over general ones, and the
    ``""`` entry is the catch-all fallback. Post-processors are callables
    transforming one ``LLMStructuredResponse`` into another, so they chain.

    Attributes:
        tag_parsers (dict[str, TagParser]): A dictionary mapping tag paths to their corresponding `TagParser` instances.
        postprocessors (list[PostProcessor]): A list of post-processing functions to apply to the structured response.

    Example:
        ```python
        class CodeBlock(BaseModel):
            code: str
            syntax: Optional[str] = None

        class CodeBlockParser:
            def __call__(self, element: _Element) -> CodeBlock:
                return CodeBlock(code=element.text, syntax=element.get("class"))

        parser = LLMResponseParser(
            tag_parsers={
                "pre.code": CodeBlockParser(),
            }
        )

        out = parser.parse('```json\n{"answer": 42}\n```')
        print(out.parsed)
        # Output: [CodeBlock(code='{"answer": 42}\n', syntax='json')]
        ```
    """

    tag_parsers: dict[str, TagParser]
    postprocessors: list[PostProcessor]

    def __init__(
        self,
        tag_parsers: dict[str, TagParser] | None = None,
        postprocessors: list[PostProcessor] | None = None,
    ):
        """
        Initialize the parser with optional tag parsers and post-processors.

        Args:
            tag_parsers (Optional[dict[str, TagParser]]): Extra tag-path ->
                parser entries, merged over the defaults in
                ``DEFAULT_TAG_PARSERS``.
            postprocessors (Optional[list[PostProcessor]]): Post-processors
                to apply to the structured response. When None, the module
                default ``DEFAULT_POST_PROCESSORS`` is used.
        """
        self.tag_parsers = {**DEFAULT_TAG_PARSERS}
        if tag_parsers:
            self.tag_parsers.update(tag_parsers)

        # Copy in both branches so mutating this instance's list can never
        # affect the module-level default or the caller's list.
        if postprocessors is None:
            self.postprocessors = list(DEFAULT_POST_PROCESSORS)
        else:
            self.postprocessors = list(postprocessors)

    def lookup_parser(self, element: _Element) -> TagParser:
        """
        Resolve the most specific registered ``TagParser`` for an element.

        Builds the element's dot-path lineage from the root down, then trims
        leading ancestors until a registered tag path matches. An exhausted
        lineage yields the empty path "", which matches the default parser
        entry when one is registered.

        Args:
            element (_Element): The XML element to find a parser for.

        Returns:
            TagParser: The parser registered for the element's tag path.

        Raises:
            KeyError: If no matching parser (including a "" default) is registered.
        """
        # The lineage comes back in reverse order (element -> root), so flip it.
        parents = [e.tag for e in element.iterancestors()][::-1]
        lineage = [*parents, element.tag]

        # Try the full (most specific) path first, dropping the outermost
        # ancestor on each miss, so specific parsers win over general ones.
        while lineage:
            tag_path = ".".join(lineage)
            if tag_path in self.tag_parsers:
                break
            lineage.pop(0)

        # Tag path can be an empty string here, which hits the default
        # parsing option specified by the "" entry of the tag parsers dict.
        tag_path = ".".join(lineage)
        return self.tag_parsers[tag_path]

    def postprocess(self, structured_response: LLMStructuredResponse) -> LLMStructuredResponse:
        """
        Apply the registered post-processors, in order, to the response.

        Args:
            structured_response (LLMStructuredResponse): The initial structured response to be post-processed.

        Returns:
            LLMStructuredResponse: The post-processed structured response
            (the input, unchanged, when no post-processors are registered).
        """
        if not self.postprocessors:
            return structured_response

        return reduce(lambda acc, func: func(acc), self.postprocessors, structured_response)

    def parse(self, md_response: str) -> LLMStructuredResponse:
        """
        Parse a Markdown-formatted LLM response into an ``LLMStructuredResponse``.

        Converts the Markdown and custom markup into an XML tree, iterates
        over each element in a depth-first traversal applying the appropriate
        ``TagParser``, then applies any registered post-processors.

        Args:
            md_response (str): The Markdown-formatted response from the LLM, potentially containing custom markup.

        Returns:
            LLMStructuredResponse: The structured representation of the parsed response, containing parsed blocks.

        Raises:
            etree.XMLSyntaxError: If the provided Markdown cannot be converted into a valid XML structure.
        """
        markup = marko.convert(_patch_tags_before_code_fences(md_response))
        output = LLMStructuredResponse(response=md_response, markup=markup)

        # Generate document tree; recover=True tolerates malformed markup.
        html_parser = etree.HTMLParser(recover=True, remove_blank_text=True)
        root = etree.fromstring(markup, parser=html_parser)
        elements = root.iter() if root is not None else []

        # Iterate over tags, depth first.
        for element in elements:
            # Skip structural wrappers introduced by the HTML parser.
            if element == root or element.tag == "body":
                continue

            parsed_block = self.lookup_parser(element)(element)

            # Drop dead (whitespace-only) text blocks, which can happen
            # with container tags like <pre>, <ul>, and <ol>.
            drop_block = isinstance(parsed_block, tp.TextBlock) and not parsed_block.text.strip()
            if not drop_block:
                output.parsed.append(parsed_block)

            # Check tails -- inelegant, but they're always text.
            # Don't add the tail if it is just blank space.
            if element.tail and element.tail.strip():
                output.parsed.append(tp.TextBlock(text=element.tail))

        return self.postprocess(output)
@@ -1,93 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import json_repair
7
- from pydantic import BaseModel, ValidationError
8
-
9
- from data_designer.engine.models.parsers.types import (
10
- CodeBlock,
11
- LLMStructuredResponse,
12
- PydanticTypeBlock,
13
- StructuredDataBlock,
14
- TextBlock,
15
- )
16
-
17
-
18
def merge_text_blocks(
    structured_response: LLMStructuredResponse,
) -> LLMStructuredResponse:
    """Coalesce each run of adjacent TextBlocks into a single TextBlock."""
    merged = structured_response.model_copy()
    merged.parsed = []

    pending = None  # TextBlock accumulated so far, if any
    for item in structured_response.parsed:
        if isinstance(item, TextBlock):
            # Extend the pending run, or start one with this block.
            pending = item if pending is None else TextBlock(text=pending.text + item.text)
        else:
            # Non-text block: flush any pending run, then keep the block.
            if pending is not None:
                merged.parsed.append(pending)
                pending = None
            merged.parsed.append(item)

    if pending:
        merged.parsed.append(pending)

    return merged
41
-
42
-
43
def deserialize_json_code(
    structured_response: LLMStructuredResponse,
) -> LLMStructuredResponse:
    """Replace JSON CodeBlocks with StructuredDataBlocks holding the parsed object."""
    result = structured_response.model_copy()
    result.parsed = []

    for block in structured_response.parsed:
        if isinstance(block, CodeBlock) and block.code_lang == "json":
            obj = json_repair.loads(block.code)
            block = StructuredDataBlock(serialized=block.code, obj=obj)
        result.parsed.append(block)

    return result
60
-
61
-
62
class RealizePydanticTypes:
    """Post-processor that upgrades StructuredDataBlocks to PydanticTypeBlocks.

    Each structured block's object is validated against the candidate types;
    blocks that fit at least one of them are replaced by a
    ``PydanticTypeBlock`` carrying the validated model instance.
    """

    types: list[type[BaseModel]]

    def __init__(self, types: list[type[BaseModel]]):
        self.types = types

    def _fit_types(self, obj: dict) -> BaseModel | None:
        # All candidate types are tried in order; when several validate, the
        # LAST successful one wins (there is deliberately no early exit).
        fitted = None
        for candidate in self.types:
            try:
                fitted = candidate.model_validate(obj)
            except ValidationError:
                pass
        return fitted

    def __call__(self, structured_response: LLMStructuredResponse) -> LLMStructuredResponse:
        result = structured_response.model_copy()
        result.parsed = []

        for block in structured_response.parsed:
            if isinstance(block, StructuredDataBlock):
                fitted = self._fit_types(block.obj)
                if fitted:
                    result.parsed.append(PydanticTypeBlock(serialized=block.serialized, obj=fitted))
                else:
                    result.parsed.append(block)
            else:
                result.parsed.append(block)

        return result
@@ -1,62 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from lxml.etree import _Element
7
-
8
- from data_designer.engine.models.parsers.types import CodeBlock, TextBlock
9
-
10
-
11
def text_parser(element: _Element) -> TextBlock:
    """Wrap an element's text content (empty string when absent) in a TextBlock."""
    return TextBlock(text=element.text or "")
13
-
14
-
15
def text_parser_keep_markup(element: _Element) -> TextBlock:
    """Re-wrap an element's text in its own tag, keeping the markup visible."""
    inner = element.text or ""
    return TextBlock(text=f"<{element.tag}>{inner}</{element.tag}>")
18
-
19
-
20
def inline_code_parser(element: _Element) -> TextBlock:
    """Render inline code back into backtick-quoted Markdown text."""
    code = element.text or ""
    return TextBlock(text=f"`{code}`")
22
-
23
-
24
def code_block_parser(element: _Element) -> CodeBlock:
    """Parse a ``<pre><code>`` element node into a CodeBlock.

    Handles the special case of Markdown->HTML conversion for fenced code
    blocks, which take on the form::

        ```xx
        ...
        ```

        <pre><code class="language-xx">...</code></pre>

    This parser is intended to be attached to the special case of "pre.code"
    tag hierarchies.

    Syntax handling: if the syntax is not specified, e.g. ``<code>...</code>``
    or ``<code class="">...</code>``, the syntax field is returned as None.
    The parser does not _enforce_ the ``language-`` prefix on the class
    attribute's value; if the prefix is not present, the entire class value
    is used as the language identifier.

    Args:
        element (lxml.etree._Element): An element of the lxml-parsed
            element tree.

    Returns:
        CodeBlock: Data structure containing both the body of the code and
            the specified syntax of the code block, if any.
    """
    syntax = element.attrib.get("class", "").removeprefix("language-")
    body = element.text.strip() if element.text else ""
    return CodeBlock(code=body, code_lang=syntax or None)
@@ -1,84 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from typing import Any, Protocol, runtime_checkable
7
-
8
- from lxml.etree import _Element
9
- from pydantic import BaseModel, Field
10
- from typing_extensions import Self
11
-
12
-
13
class LLMStructuredResponse(BaseModel):
    """Output format for the LLM Response Parser."""

    response: str = Field(description="Raw Markdown/Markup response received from the LLM and input to the parser.")
    markup: str = Field(description="Markup/HTML resulting from running Markdown parsing on response.")
    parsed: list[BaseModel] = Field(
        default_factory=list,
        description="Structured content parsed from markup. Elements of this list are in document-order.",
    )

    def head(self, n: int) -> Self:
        """Retain only the first n elements of the parsed response."""
        out = self.model_copy()
        out.parsed = out.parsed[:n]
        return out

    def tail(self, n: int) -> Self:
        """Retain only the last n elements of the parsed response.

        ``n`` is expected to be non-negative; ``tail(0)`` yields an empty
        parsed list.
        """
        out = self.model_copy()
        # Guard n <= 0 explicitly: parsed[-0:] would return the WHOLE list
        # rather than the empty suffix.
        out.parsed = out.parsed[-n:] if n > 0 else []
        return out

    def filter(self, block_types: list[type[BaseModel]]) -> Self:
        """Retain only parsed blocks that are instances of the given types."""
        out = self.model_copy()
        out.parsed = [b for b in out.parsed if isinstance(b, tuple(block_types))]
        return out
39
-
40
-
41
@runtime_checkable
class TagParser(Protocol):
    """Protocol for tag parsing implementations.

    A TagParser is any callable that takes as input an `lxml` element,
    does some computation, and returns structured output represented as a
    subclass of Pydantic `BaseModel`. Being a Protocol, this covers both
    classes and curried functions (e.g. `partial`) used as parsers.
    """

    def __call__(self, element: _Element) -> BaseModel: ...
53
-
54
-
55
@runtime_checkable
class PostProcessor(Protocol):
    """Protocol for parsed-output postprocessing implementations.

    Implementations transform the results of the LLM response parser while
    retaining the same output structure (``LLMStructuredResponse`` in,
    ``LLMStructuredResponse`` out), so that PostProcessor implementations
    can be chained together.
    """

    def __call__(self, structured_response: LLMStructuredResponse) -> LLMStructuredResponse: ...
66
-
67
-
68
class TextBlock(BaseModel):
    """A run of plain text parsed from the response."""

    text: str
70
-
71
-
72
class CodeBlock(BaseModel):
    """A parsed code element with its optional language identifier."""

    code: str
    code_lang: str | None = None
75
-
76
-
77
class StructuredDataBlock(BaseModel):
    """Deserialized structured data alongside its original serialized form."""

    serialized: str
    obj: Any
80
-
81
-
82
class PydanticTypeBlock(BaseModel):
    """A structured block realized as a concrete Pydantic model instance."""

    serialized: str
    obj: BaseModel