data-designer 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. data_designer/_version.py +2 -2
  2. data_designer/cli/README.md +15 -1
  3. data_designer/cli/commands/download.py +56 -0
  4. data_designer/cli/commands/list.py +4 -18
  5. data_designer/cli/controllers/__init__.py +2 -1
  6. data_designer/cli/controllers/download_controller.py +217 -0
  7. data_designer/cli/controllers/model_controller.py +4 -3
  8. data_designer/cli/forms/field.py +65 -19
  9. data_designer/cli/forms/model_builder.py +251 -44
  10. data_designer/cli/main.py +11 -1
  11. data_designer/cli/repositories/persona_repository.py +88 -0
  12. data_designer/cli/services/__init__.py +2 -1
  13. data_designer/cli/services/download_service.py +97 -0
  14. data_designer/cli/ui.py +131 -0
  15. data_designer/cli/utils.py +34 -0
  16. data_designer/config/analysis/__init__.py +2 -0
  17. data_designer/config/analysis/column_profilers.py +75 -7
  18. data_designer/config/analysis/column_statistics.py +192 -48
  19. data_designer/config/analysis/dataset_profiler.py +23 -5
  20. data_designer/config/analysis/utils/reporting.py +3 -3
  21. data_designer/config/base.py +3 -3
  22. data_designer/config/column_configs.py +27 -6
  23. data_designer/config/column_types.py +24 -17
  24. data_designer/config/config_builder.py +34 -26
  25. data_designer/config/data_designer_config.py +7 -7
  26. data_designer/config/datastore.py +6 -6
  27. data_designer/config/default_model_settings.py +27 -34
  28. data_designer/config/exports.py +14 -1
  29. data_designer/config/models.py +155 -29
  30. data_designer/config/preview_results.py +5 -4
  31. data_designer/config/processors.py +109 -4
  32. data_designer/config/sampler_constraints.py +1 -2
  33. data_designer/config/sampler_params.py +31 -31
  34. data_designer/config/seed.py +1 -2
  35. data_designer/config/utils/code_lang.py +4 -5
  36. data_designer/config/utils/constants.py +31 -8
  37. data_designer/config/utils/io_helpers.py +5 -5
  38. data_designer/config/utils/misc.py +1 -4
  39. data_designer/config/utils/numerical_helpers.py +2 -2
  40. data_designer/config/utils/type_helpers.py +3 -3
  41. data_designer/config/utils/validation.py +39 -9
  42. data_designer/config/utils/visualization.py +62 -15
  43. data_designer/config/validator_params.py +4 -8
  44. data_designer/engine/analysis/column_profilers/base.py +0 -7
  45. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +2 -3
  46. data_designer/engine/analysis/column_statistics.py +16 -16
  47. data_designer/engine/analysis/dataset_profiler.py +25 -4
  48. data_designer/engine/analysis/utils/column_statistics_calculations.py +71 -49
  49. data_designer/engine/analysis/utils/judge_score_processing.py +5 -5
  50. data_designer/engine/column_generators/generators/base.py +34 -0
  51. data_designer/engine/column_generators/generators/embedding.py +45 -0
  52. data_designer/engine/column_generators/generators/{llm_generators.py → llm_completion.py} +17 -49
  53. data_designer/engine/column_generators/registry.py +4 -2
  54. data_designer/engine/column_generators/utils/judge_score_factory.py +5 -6
  55. data_designer/engine/configurable_task.py +2 -2
  56. data_designer/engine/dataset_builders/artifact_storage.py +14 -5
  57. data_designer/engine/dataset_builders/column_wise_builder.py +12 -8
  58. data_designer/engine/dataset_builders/utils/concurrency.py +6 -6
  59. data_designer/engine/models/facade.py +66 -9
  60. data_designer/engine/models/litellm_overrides.py +5 -6
  61. data_designer/engine/models/parsers/errors.py +2 -4
  62. data_designer/engine/models/parsers/parser.py +2 -3
  63. data_designer/engine/models/parsers/postprocessors.py +3 -4
  64. data_designer/engine/models/parsers/types.py +4 -4
  65. data_designer/engine/models/registry.py +20 -11
  66. data_designer/engine/models/usage.py +7 -9
  67. data_designer/engine/processing/ginja/ast.py +1 -2
  68. data_designer/engine/processing/processors/drop_columns.py +1 -1
  69. data_designer/engine/processing/processors/registry.py +3 -0
  70. data_designer/engine/processing/processors/schema_transform.py +53 -0
  71. data_designer/engine/processing/utils.py +40 -2
  72. data_designer/engine/registry/base.py +12 -12
  73. data_designer/engine/sampling_gen/constraints.py +1 -2
  74. data_designer/engine/sampling_gen/data_sources/base.py +14 -14
  75. data_designer/engine/sampling_gen/entities/phone_number.py +1 -2
  76. data_designer/engine/sampling_gen/people_gen.py +3 -7
  77. data_designer/engine/validators/base.py +2 -2
  78. data_designer/interface/data_designer.py +12 -0
  79. data_designer/interface/results.py +36 -0
  80. data_designer/logging.py +2 -2
  81. data_designer/plugin_manager.py +3 -3
  82. data_designer/plugins/plugin.py +3 -3
  83. data_designer/plugins/registry.py +2 -2
  84. {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/METADATA +9 -9
  85. {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/RECORD +88 -81
  86. {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/WHEEL +0 -0
  87. {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/entry_points.txt +0 -0
  88. {data_designer-0.1.4.dist-info → data_designer-0.2.0.dist-info}/licenses/LICENSE +0 -0
data_designer/cli/ui.py CHANGED
@@ -182,6 +182,137 @@ def select_with_arrows(
182
182
  return None
183
183
 
184
184
 
185
+ def select_multiple_with_arrows(
186
+ options: dict[str, str],
187
+ prompt_text: str,
188
+ default_keys: list[str] | None = None,
189
+ allow_empty: bool = False,
190
+ ) -> list[str] | None:
191
+ """Interactive multi-selection with arrow key navigation and space to toggle.
192
+
193
+ Uses prompt_toolkit's Application for an inline checkbox-style menu experience.
194
+
195
+ Args:
196
+ options: Dictionary of {key: display_text} options
197
+ prompt_text: Prompt to display above options
198
+ default_keys: List of keys that should be pre-selected
199
+ allow_empty: If True, allows user to submit with no selections
200
+
201
+ Returns:
202
+ List of selected keys, or None if cancelled
203
+ """
204
+ if not options:
205
+ return None
206
+
207
+ # Build list of keys and track selected state
208
+ keys = list(options.keys())
209
+ selected_set = set(default_keys) if default_keys else set()
210
+ current_index = 0
211
+
212
+ # Store result
213
+ result = {"value": None, "cancelled": False}
214
+
215
+ def get_formatted_text() -> list[tuple[str, str]]:
216
+ """Generate the formatted text for the multi-select menu."""
217
+ text = []
218
+ # Add prompt with padding
219
+ padding = " " * LEFT_PADDING
220
+ text.append(("", f"{padding}{prompt_text}\n"))
221
+
222
+ # Add options with checkboxes
223
+ for i, key in enumerate(keys):
224
+ display = options[key]
225
+ checkbox = "[✓]" if key in selected_set else "[ ]"
226
+
227
+ if i == current_index:
228
+ # Highlighted item with Nord8 color
229
+ text.append((f"fg:{NordColor.NORD8.value} bold", f"{padding} → {checkbox} {display}\n"))
230
+ else:
231
+ # Unselected item
232
+ text.append(("", f"{padding} {checkbox} {display}\n"))
233
+
234
+ # Add hint
235
+ count = len(selected_set)
236
+ text.append(
237
+ (
238
+ "fg:#666666",
239
+ f"{padding} (↑/↓: navigate, Space: toggle, Enter: confirm ({count} selected), Esc: cancel)\n",
240
+ )
241
+ )
242
+ return text
243
+
244
+ # Create key bindings
245
+ kb = KeyBindings()
246
+
247
+ @kb.add("up")
248
+ @kb.add("c-p") # Ctrl+P
249
+ def _move_up(event) -> None:
250
+ nonlocal current_index
251
+ current_index = (current_index - 1) % len(keys)
252
+
253
+ @kb.add("down")
254
+ @kb.add("c-n") # Ctrl+N
255
+ def _move_down(event) -> None:
256
+ nonlocal current_index
257
+ current_index = (current_index + 1) % len(keys)
258
+
259
+ @kb.add("c-h") # Ctrl+H as alternative
260
+ @kb.add(" ", eager=True) # Space key - eager to capture immediately
261
+ def _toggle(event) -> None:
262
+ key = keys[current_index]
263
+ if key in selected_set:
264
+ selected_set.remove(key)
265
+ else:
266
+ selected_set.add(key)
267
+
268
+ @kb.add("enter")
269
+ def _confirm(event) -> None:
270
+ if not allow_empty and not selected_set:
271
+ # Don't allow empty selection if not permitted
272
+ return
273
+ result["value"] = list(selected_set)
274
+ event.app.exit()
275
+
276
+ @kb.add("escape")
277
+ @kb.add("c-c") # Ctrl+C
278
+ def _cancel(event) -> None:
279
+ result["cancelled"] = True
280
+ event.app.exit()
281
+
282
+ # Create the application
283
+ app = Application(
284
+ layout=Layout(
285
+ HSplit(
286
+ [
287
+ Window(
288
+ content=FormattedTextControl(get_formatted_text),
289
+ dont_extend_height=True,
290
+ always_hide_cursor=True,
291
+ )
292
+ ]
293
+ )
294
+ ),
295
+ key_bindings=kb,
296
+ full_screen=False,
297
+ mouse_support=False,
298
+ )
299
+
300
+ try:
301
+ # Run the application
302
+ app.run()
303
+
304
+ # Handle the result
305
+ if result["cancelled"]:
306
+ print_warning("Cancelled")
307
+ return None
308
+ else:
309
+ return result["value"]
310
+
311
+ except (KeyboardInterrupt, EOFError):
312
+ print_warning("Cancelled")
313
+ return None
314
+
315
+
185
316
  def prompt_text_input(
186
317
  prompt_msg: str,
187
318
  default: str | None = None,
@@ -1,6 +1,40 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ import shutil
5
+ import subprocess
6
+
7
+
8
+ def check_ngc_cli_available() -> bool:
9
+ """Check if NGC CLI is installed and available.
10
+
11
+ Returns:
12
+ True if NGC CLI is in PATH and executable, False otherwise.
13
+ """
14
+ if shutil.which("ngc") is None:
15
+ return False
16
+
17
+ return get_ngc_version() is not None
18
+
19
+
20
+ def get_ngc_version() -> str | None:
21
+ """Get the NGC CLI version if available.
22
+
23
+ Returns:
24
+ NGC CLI version string if available, None otherwise.
25
+ """
26
+ try:
27
+ result = subprocess.run(
28
+ ["ngc", "--version"],
29
+ capture_output=True,
30
+ text=True,
31
+ check=True,
32
+ timeout=5,
33
+ )
34
+ return result.stdout.strip()
35
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
36
+ return None
37
+
4
38
 
5
39
  def validate_url(url: str) -> bool:
6
40
  """Validate that a string is a valid URL.
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -3,7 +3,6 @@
3
3
 
4
4
  from abc import ABC
5
5
  from enum import Enum
6
- from typing import Optional, Union
7
6
 
8
7
  from pydantic import BaseModel, Field
9
8
  from rich.panel import Panel
@@ -27,7 +26,20 @@ class ColumnProfilerType(str, Enum):
27
26
 
28
27
 
29
28
  class ColumnProfilerResults(BaseModel, ABC):
29
+ """Abstract base class for column profiler results.
30
+
31
+ Stores results from column profiling operations. Subclasses hold profiler-specific
32
+ analysis results and provide methods for generating formatted report sections for display.
33
+ """
34
+
30
35
  def create_report_section(self) -> Panel:
36
+ """Creates a Rich Panel containing the formatted profiler results for display.
37
+
38
+ Returns:
39
+ A Rich Panel containing the formatted profiler results. Default implementation
40
+ returns a "Not Implemented" message; subclasses should override to provide
41
+ specific formatting.
42
+ """
31
43
  return Panel(
32
44
  f"Report section generation not implemented for '{self.__class__.__name__}'.",
33
45
  title="Not Implemented",
@@ -37,33 +49,89 @@ class ColumnProfilerResults(BaseModel, ABC):
37
49
 
38
50
 
39
51
  class JudgeScoreProfilerConfig(ConfigBase):
52
+ """Configuration for the LLM-as-a-judge score profiler.
53
+
54
+ Attributes:
55
+ model_alias: Alias of the LLM model to use for generating score distribution summaries.
56
+ Must match a model alias defined in the Data Designer configuration.
57
+ summary_score_sample_size: Number of score samples to include when prompting the LLM
58
+ to generate summaries. Larger sample sizes provide more context but increase
59
+ token usage. Must be at least 1. Defaults to 20.
60
+ """
61
+
40
62
  model_alias: str
41
- summary_score_sample_size: Optional[int] = Field(default=20, ge=1)
63
+ summary_score_sample_size: int | None = Field(default=20, ge=1)
42
64
 
43
65
 
44
66
  class JudgeScoreSample(BaseModel):
45
- score: Union[int, str]
67
+ """Container for a single judge score and its associated reasoning.
68
+
69
+ Stores a paired score-reasoning sample extracted from an LLM-as-a-judge column.
70
+ Used when generating summaries to provide the LLM with examples of scoring patterns.
71
+
72
+ Attributes:
73
+ score: The score value assigned by the judge. Can be numeric (int) or categorical (str).
74
+ reasoning: The reasoning or explanation provided by the judge for this score.
75
+ """
76
+
77
+ score: int | str
46
78
  reasoning: str
47
79
 
48
80
 
49
81
  class JudgeScoreDistributions(BaseModel):
50
- scores: dict[str, list[Union[int, str]]]
82
+ """Container for computed distributions across all judge score dimensions.
83
+
84
+ Stores the complete distribution analysis for all score dimensions in an LLM-as-a-judge
85
+ column. Each score dimension (e.g., "relevance", "fluency") has its own distribution
86
+ computed from the generated data.
87
+
88
+ Attributes:
89
+ scores: Mapping of each score dimension name to its list of score values.
90
+ reasoning: Mapping of each score dimension name to its list of reasoning texts.
91
+ distribution_types: Mapping of each score dimension name to its classification.
92
+ distributions: Mapping of each score dimension name to its computed distribution statistics.
93
+ histograms: Mapping of each score dimension name to its histogram data.
94
+ """
95
+
96
+ scores: dict[str, list[int | str]]
51
97
  reasoning: dict[str, list[str]]
52
98
  distribution_types: dict[str, ColumnDistributionType]
53
- distributions: dict[str, Union[CategoricalDistribution, NumericalDistribution, MissingValue]]
54
- histograms: dict[str, Union[CategoricalHistogramData, MissingValue]]
99
+ distributions: dict[str, CategoricalDistribution | NumericalDistribution | MissingValue]
100
+ histograms: dict[str, CategoricalHistogramData | MissingValue]
55
101
 
56
102
 
57
103
  class JudgeScoreSummary(BaseModel):
104
+ """Container for an LLM-generated summary of a judge score dimension.
105
+
106
+ Stores the natural language summary and sample data for a single score dimension
107
+ generated by the judge score profiler. The summary is created by an LLM analyzing
108
+ the distribution and patterns in the score-reasoning pairs.
109
+
110
+ Attributes:
111
+ score_name: Name of the score dimension being summarized (e.g., "relevance", "fluency").
112
+ summary: LLM-generated natural language summary describing the scoring patterns,
113
+ distribution characteristics, and notable trends for this score dimension.
114
+ score_samples: List of score-reasoning pairs that were used to generate the summary.
115
+ These are the examples of the scoring behavior that were used to generate the summary.
116
+ """
117
+
58
118
  score_name: str
59
119
  summary: str
60
120
  score_samples: list[JudgeScoreSample]
61
121
 
62
122
 
63
123
  class JudgeScoreProfilerResults(ColumnProfilerResults):
124
+ """Container for complete judge score profiler analysis results.
125
+
126
+ Attributes:
127
+ column_name: Name of the judge column that was profiled.
128
+ summaries: Mapping of each score dimension name to its LLM-generated summary.
129
+ score_distributions: Complete distribution analysis across all score dimensions.
130
+ """
131
+
64
132
  column_name: str
65
133
  summaries: dict[str, JudgeScoreSummary]
66
- score_distributions: Union[JudgeScoreDistributions, MissingValue]
134
+ score_distributions: JudgeScoreDistributions | MissingValue
67
135
 
68
136
  def create_report_section(self) -> Panel:
69
137
  layout = Table.grid(Column(), expand=True, padding=(2, 0))
@@ -5,7 +5,7 @@ from __future__ import annotations
5
5
 
6
6
  from abc import ABC, abstractmethod
7
7
  from enum import Enum
8
- from typing import Any, Literal, Optional, Union
8
+ from typing import Any, Literal
9
9
 
10
10
  from pandas import Series
11
11
  from pydantic import BaseModel, ConfigDict, create_model, field_validator, model_validator
@@ -32,27 +32,56 @@ class ColumnDistributionType(str, Enum):
32
32
 
33
33
 
34
34
  class BaseColumnStatistics(BaseModel, ABC):
35
+ """Abstract base class for all column statistics types.
36
+
37
+ Serves as a container for computed statistics across different column types in
38
+ Data-Designer-generated datasets. Subclasses hold column-specific statistical results
39
+ and provide methods for formatting these results for display in reports.
40
+ """
41
+
35
42
  model_config = ConfigDict(use_enum_values=True)
36
43
 
37
44
  @abstractmethod
38
- def create_report_row_data(self) -> dict[str, str]: ...
45
+ def create_report_row_data(self) -> dict[str, str]:
46
+ """Creates a formatted dictionary of statistics for display in reports.
47
+
48
+ Returns:
49
+ Dictionary mapping display labels to formatted statistic values.
50
+ """
51
+ ...
39
52
 
40
53
 
41
54
  class GeneralColumnStatistics(BaseColumnStatistics):
55
+ """Container for general statistics applicable to all column types.
56
+
57
+ Holds core statistical measures that apply universally across all column types,
58
+ including null counts, unique values, and data type information. Serves as the base
59
+ for more specialized column statistics classes that store additional column-specific metrics.
60
+
61
+ Attributes:
62
+ column_name: Name of the column being analyzed.
63
+ num_records: Total number of records in the column.
64
+ num_null: Number of null/missing values in the column.
65
+ num_unique: Number of distinct values in the column. If a value is not hashable, it is converted to a string.
66
+ pyarrow_dtype: PyArrow data type of the column as a string.
67
+ simple_dtype: Simplified human-readable data type label.
68
+ column_type: Discriminator field, always "general" for this statistics type.
69
+ """
70
+
42
71
  column_name: str
43
- num_records: Union[int, MissingValue]
44
- num_null: Union[int, MissingValue]
45
- num_unique: Union[int, MissingValue]
72
+ num_records: int | MissingValue
73
+ num_null: int | MissingValue
74
+ num_unique: int | MissingValue
46
75
  pyarrow_dtype: str
47
76
  simple_dtype: str
48
77
  column_type: Literal["general"] = "general"
49
78
 
50
79
  @field_validator("num_null", "num_unique", "num_records", mode="before")
51
- def general_statistics_ensure_python_integers(cls, v: Union[int, MissingValue]) -> Union[int, MissingValue]:
80
+ def general_statistics_ensure_python_integers(cls, v: int | MissingValue) -> int | MissingValue:
52
81
  return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)
53
82
 
54
83
  @property
55
- def percent_null(self) -> Union[float, MissingValue]:
84
+ def percent_null(self) -> float | MissingValue:
56
85
  return (
57
86
  self.num_null
58
87
  if self._is_missing_value(self.num_null)
@@ -60,7 +89,7 @@ class GeneralColumnStatistics(BaseColumnStatistics):
60
89
  )
61
90
 
62
91
  @property
63
- def percent_unique(self) -> Union[float, MissingValue]:
92
+ def percent_unique(self) -> float | MissingValue:
64
93
  return (
65
94
  self.num_unique
66
95
  if self._is_missing_value(self.num_unique)
@@ -79,40 +108,55 @@ class GeneralColumnStatistics(BaseColumnStatistics):
79
108
  def create_report_row_data(self) -> dict[str, str]:
80
109
  return self._general_display_row
81
110
 
82
- def _is_missing_value(self, v: Union[float, int, MissingValue]) -> bool:
111
+ def _is_missing_value(self, v: float | int | MissingValue) -> bool:
83
112
  return v in set(MissingValue)
84
113
 
85
114
 
86
115
  class LLMTextColumnStatistics(GeneralColumnStatistics):
87
- completion_tokens_mean: Union[float, MissingValue]
88
- completion_tokens_median: Union[float, MissingValue]
89
- completion_tokens_stddev: Union[float, MissingValue]
90
- prompt_tokens_mean: Union[float, MissingValue]
91
- prompt_tokens_median: Union[float, MissingValue]
92
- prompt_tokens_stddev: Union[float, MissingValue]
116
+ """Container for statistics on LLM-generated text columns.
117
+
118
+ Inherits general statistics plus token usage metrics specific to LLM text generation.
119
+ Stores both input (prompt) and output (completion) token consumption data.
120
+
121
+ Attributes:
122
+ output_tokens_mean: Mean number of output tokens generated per record.
123
+ output_tokens_median: Median number of output tokens generated per record.
124
+ output_tokens_stddev: Standard deviation of output tokens per record.
125
+ input_tokens_mean: Mean number of input tokens used per record.
126
+ input_tokens_median: Median number of input tokens used per record.
127
+ input_tokens_stddev: Standard deviation of input tokens per record.
128
+ column_type: Discriminator field, always "llm-text" for this statistics type.
129
+ """
130
+
131
+ output_tokens_mean: float | MissingValue
132
+ output_tokens_median: float | MissingValue
133
+ output_tokens_stddev: float | MissingValue
134
+ input_tokens_mean: float | MissingValue
135
+ input_tokens_median: float | MissingValue
136
+ input_tokens_stddev: float | MissingValue
93
137
  column_type: Literal[DataDesignerColumnType.LLM_TEXT.value] = DataDesignerColumnType.LLM_TEXT.value
94
138
 
95
139
  @field_validator(
96
- "completion_tokens_mean",
97
- "completion_tokens_median",
98
- "completion_tokens_stddev",
99
- "prompt_tokens_mean",
100
- "prompt_tokens_median",
101
- "prompt_tokens_stddev",
140
+ "output_tokens_mean",
141
+ "output_tokens_median",
142
+ "output_tokens_stddev",
143
+ "input_tokens_mean",
144
+ "input_tokens_median",
145
+ "input_tokens_stddev",
102
146
  mode="before",
103
147
  )
104
- def llm_column_ensure_python_floats(cls, v: Union[float, int, MissingValue]) -> Union[float, int, MissingValue]:
148
+ def llm_column_ensure_python_floats(cls, v: float | int | MissingValue) -> float | int | MissingValue:
105
149
  return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, float)
106
150
 
107
151
  def create_report_row_data(self) -> dict[str, Any]:
108
152
  prompt_tokens_str = (
109
- f"{self.prompt_tokens_median:.1f} +/- {self.prompt_tokens_stddev:.1f}"
110
- if not self._is_missing_value(self.prompt_tokens_median)
153
+ f"{self.input_tokens_median:.1f} +/- {self.input_tokens_stddev:.1f}"
154
+ if not self._is_missing_value(self.input_tokens_median)
111
155
  else "--"
112
156
  )
113
157
  completion_tokens_str = (
114
- f"{self.completion_tokens_median:.1f} +/- {self.completion_tokens_stddev:.1f}"
115
- if not self._is_missing_value(self.completion_tokens_median)
158
+ f"{self.output_tokens_median:.1f} +/- {self.output_tokens_stddev:.1f}"
159
+ if not self._is_missing_value(self.output_tokens_median)
116
160
  else "--"
117
161
  )
118
162
  return {
@@ -123,21 +167,65 @@ class LLMTextColumnStatistics(GeneralColumnStatistics):
123
167
 
124
168
 
125
169
  class LLMCodeColumnStatistics(LLMTextColumnStatistics):
170
+ """Container for statistics on LLM-generated code columns.
171
+
172
+ Inherits all token usage metrics from LLMTextColumnStatistics. Stores
173
+ statistics from columns that generate code snippets in specific programming languages.
174
+
175
+ Attributes:
176
+ column_type: Discriminator field, always "llm-code" for this statistics type.
177
+ """
178
+
126
179
  column_type: Literal[DataDesignerColumnType.LLM_CODE.value] = DataDesignerColumnType.LLM_CODE.value
127
180
 
128
181
 
129
182
  class LLMStructuredColumnStatistics(LLMTextColumnStatistics):
183
+ """Container for statistics on LLM-generated structured JSON columns.
184
+
185
+ Inherits all token usage metrics from LLMTextColumnStatistics. Stores statistics from
186
+ columns that generate structured data conforming to JSON schemas or Pydantic models.
187
+
188
+ Attributes:
189
+ column_type: Discriminator field, always "llm-structured" for this statistics type.
190
+ """
191
+
130
192
  column_type: Literal[DataDesignerColumnType.LLM_STRUCTURED.value] = DataDesignerColumnType.LLM_STRUCTURED.value
131
193
 
132
194
 
133
195
  class LLMJudgedColumnStatistics(LLMTextColumnStatistics):
196
+ """Container for statistics on LLM-as-a-judge quality assessment columns.
197
+
198
+ Inherits all token usage metrics from LLMTextColumnStatistics. Stores statistics from
199
+ columns that evaluate and score other generated content based on defined criteria.
200
+
201
+ Attributes:
202
+ column_type: Discriminator field, always "llm-judge" for this statistics type.
203
+ """
204
+
134
205
  column_type: Literal[DataDesignerColumnType.LLM_JUDGE.value] = DataDesignerColumnType.LLM_JUDGE.value
135
206
 
136
207
 
137
208
  class SamplerColumnStatistics(GeneralColumnStatistics):
209
+ """Container for statistics on sampler-generated columns.
210
+
211
+ Inherits general statistics plus sampler-specific information including the sampler type
212
+ used and the empirical distribution of generated values. Stores both categorical and
213
+ numerical distribution results.
214
+
215
+ Attributes:
216
+ sampler_type: Type of sampler used to generate this column (e.g., "uniform", "category",
217
+ "gaussian", "person").
218
+ distribution_type: Classification of the column's distribution (categorical, numerical,
219
+ text, other, or unknown).
220
+ distribution: Empirical distribution statistics for the generated values. Can be
221
+ CategoricalDistribution (for discrete values), NumericalDistribution (for continuous
222
+ values), or MissingValue if distribution could not be computed.
223
+ column_type: Discriminator field, always "sampler" for this statistics type.
224
+ """
225
+
138
226
  sampler_type: SamplerType
139
227
  distribution_type: ColumnDistributionType
140
- distribution: Optional[Union[CategoricalDistribution, NumericalDistribution, MissingValue]]
228
+ distribution: CategoricalDistribution | NumericalDistribution | MissingValue | None
141
229
  column_type: Literal[DataDesignerColumnType.SAMPLER.value] = DataDesignerColumnType.SAMPLER.value
142
230
 
143
231
  def create_report_row_data(self) -> dict[str, str]:
@@ -148,23 +236,52 @@ class SamplerColumnStatistics(GeneralColumnStatistics):
148
236
 
149
237
 
150
238
  class SeedDatasetColumnStatistics(GeneralColumnStatistics):
239
+ """Container for statistics on columns sourced from seed datasets.
240
+
241
+ Inherits general statistics and stores statistics computed from columns that originate
242
+ from existing data provided via the seed dataset functionality.
243
+
244
+ Attributes:
245
+ column_type: Discriminator field, always "seed-dataset" for this statistics type.
246
+ """
247
+
151
248
  column_type: Literal[DataDesignerColumnType.SEED_DATASET.value] = DataDesignerColumnType.SEED_DATASET.value
152
249
 
153
250
 
154
251
  class ExpressionColumnStatistics(GeneralColumnStatistics):
252
+ """Container for statistics on expression-based derived columns.
253
+
254
+ Inherits general statistics and stores statistics computed from columns that are derived
255
+ from columns that are derived from Jinja2 expressions referencing other column values.
256
+
257
+ Attributes:
258
+ column_type: Discriminator field, always "expression" for this statistics type.
259
+ """
260
+
155
261
  column_type: Literal[DataDesignerColumnType.EXPRESSION.value] = DataDesignerColumnType.EXPRESSION.value
156
262
 
157
263
 
158
264
  class ValidationColumnStatistics(GeneralColumnStatistics):
159
- num_valid_records: Union[int, MissingValue]
265
+ """Container for statistics on validation result columns.
266
+
267
+ Inherits general statistics plus validation-specific metrics including the count and
268
+ percentage of records that passed validation. Stores results from validation logic
269
+ (Python, SQL, or remote) executed against target columns.
270
+
271
+ Attributes:
272
+ num_valid_records: Number of records that passed validation.
273
+ column_type: Discriminator field, always "validation" for this statistics type.
274
+ """
275
+
276
+ num_valid_records: int | MissingValue
160
277
  column_type: Literal[DataDesignerColumnType.VALIDATION.value] = DataDesignerColumnType.VALIDATION.value
161
278
 
162
279
  @field_validator("num_valid_records", mode="before")
163
- def code_validation_column_ensure_python_integers(cls, v: Union[int, MissingValue]) -> Union[int, MissingValue]:
280
+ def code_validation_column_ensure_python_integers(cls, v: int | MissingValue) -> int | MissingValue:
164
281
  return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)
165
282
 
166
283
  @property
167
- def percent_valid(self) -> Union[float, MissingValue]:
284
+ def percent_valid(self) -> float | MissingValue:
168
285
  return (
169
286
  self.num_valid_records
170
287
  if self._is_missing_value(self.num_valid_records)
@@ -177,7 +294,16 @@ class ValidationColumnStatistics(GeneralColumnStatistics):
177
294
 
178
295
 
179
296
  class CategoricalHistogramData(BaseModel):
180
- categories: list[Union[float, int, str]]
297
+ """Container for categorical distribution histogram data.
298
+
299
+ Stores the computed frequency distribution of categorical values.
300
+
301
+ Attributes:
302
+ categories: List of unique category values that appear in the data.
303
+ counts: List of occurrence counts for each category.
304
+ """
305
+
306
+ categories: list[float | int | str]
181
307
  counts: list[int]
182
308
 
183
309
  @model_validator(mode="after")
@@ -194,12 +320,20 @@ class CategoricalHistogramData(BaseModel):
194
320
 
195
321
 
196
322
  class CategoricalDistribution(BaseModel):
197
- most_common_value: Union[str, int]
198
- least_common_value: Union[str, int]
323
+ """Container for computed categorical distribution statistics.
324
+
325
+ Attributes:
326
+ most_common_value: The category value that appears most frequently in the data.
327
+ least_common_value: The category value that appears least frequently in the data.
328
+ histogram: Complete frequency distribution showing all categories and their counts.
329
+ """
330
+
331
+ most_common_value: str | int
332
+ least_common_value: str | int
199
333
  histogram: CategoricalHistogramData
200
334
 
201
335
  @field_validator("most_common_value", "least_common_value", mode="before")
202
- def ensure_python_types(cls, v: Union[str, int]) -> Union[str, int]:
336
+ def ensure_python_types(cls, v: str | int) -> str | int:
203
337
  return str(v) if not is_int(v) else prepare_number_for_reporting(v, int)
204
338
 
205
339
  @classmethod
@@ -213,14 +347,24 @@ class CategoricalDistribution(BaseModel):
213
347
 
214
348
 
215
349
  class NumericalDistribution(BaseModel):
216
- min: Union[float, int]
217
- max: Union[float, int]
350
+ """Container for computed numerical distribution statistics.
351
+
352
+ Attributes:
353
+ min: Minimum value in the distribution.
354
+ max: Maximum value in the distribution.
355
+ mean: Arithmetic mean (average) of all values.
356
+ stddev: Standard deviation measuring the spread of values around the mean.
357
+ median: Median value of the distribution.
358
+ """
359
+
360
+ min: float | int
361
+ max: float | int
218
362
  mean: float
219
363
  stddev: float
220
364
  median: float
221
365
 
222
366
  @field_validator("min", "max", "mean", "stddev", "median", mode="before")
223
- def ensure_python_types(cls, v: Union[float, int]) -> Union[float, int]:
367
+ def ensure_python_types(cls, v: float | int) -> float | int:
224
368
  return prepare_number_for_reporting(v, int if is_int(v) else float)
225
369
 
226
370
  @classmethod
@@ -234,17 +378,17 @@ class NumericalDistribution(BaseModel):
234
378
  )
235
379
 
236
380
 
237
- ColumnStatisticsT: TypeAlias = Union[
238
- GeneralColumnStatistics,
239
- LLMTextColumnStatistics,
240
- LLMCodeColumnStatistics,
241
- LLMStructuredColumnStatistics,
242
- LLMJudgedColumnStatistics,
243
- SamplerColumnStatistics,
244
- SeedDatasetColumnStatistics,
245
- ValidationColumnStatistics,
246
- ExpressionColumnStatistics,
247
- ]
381
+ ColumnStatisticsT: TypeAlias = (
382
+ GeneralColumnStatistics
383
+ | LLMTextColumnStatistics
384
+ | LLMCodeColumnStatistics
385
+ | LLMStructuredColumnStatistics
386
+ | LLMJudgedColumnStatistics
387
+ | SamplerColumnStatistics
388
+ | SeedDatasetColumnStatistics
389
+ | ValidationColumnStatistics
390
+ | ExpressionColumnStatistics
391
+ )
248
392
 
249
393
 
250
394
  DEFAULT_COLUMN_STATISTICS_MAP = {