data-designer 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/config/config_builder.py +24 -2
- data_designer/config/sampler_params.py +19 -0
- data_designer/engine/column_generators/generators/base.py +11 -8
- data_designer/engine/dataset_builders/column_wise_builder.py +47 -5
- data_designer/engine/models/registry.py +27 -1
- data_designer/engine/models/telemetry.py +355 -0
- data_designer/engine/validators/python.py +28 -25
- {data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/METADATA +58 -29
- {data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/RECORD +13 -12
- {data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/WHEEL +0 -0
- {data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/entry_points.txt +0 -0
- {data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/licenses/LICENSE +0 -0
data_designer/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

-__version__ = version = '0.2.0'
-__version_tuple__ = version_tuple = (0, 2, 0)
+__version__ = version = '0.2.2'
+__version_tuple__ = version_tuple = (0, 2, 2)

__commit_id__ = commit_id = None
data_designer/config/config_builder.py
CHANGED
@@ -224,6 +224,9 @@ class DataDesignerConfigBuilder:

        Returns:
            The current Data Designer config builder instance.
+
+        Raises:
+            BuilderConfigurationError: If the column name collides with an existing seed dataset column.
        """
        if column_config is None:
            if name is None or column_type is None:
@@ -240,6 +243,13 @@ class DataDesignerConfigBuilder:
                f"{', '.join([t.__name__ for t in allowed_column_configs])}"
            )

+        existing_config = self._column_configs.get(column_config.name)
+        if existing_config is not None and isinstance(existing_config, SeedDatasetColumnConfig):
+            raise BuilderConfigurationError(
+                f"🛑 Column {column_config.name!r} already exists as a seed dataset column. "
+                "Please use a different column name or update the seed dataset."
+            )
+
        self._column_configs[column_config.name] = column_config
        return self

@@ -568,7 +578,8 @@ class DataDesignerConfigBuilder:

        This method sets the seed dataset for the configuration and automatically creates
        SeedDatasetColumnConfig objects for each column found in the dataset. The column
-        names are fetched from the dataset source
+        names are fetched from the dataset source, which can be the Hugging Face Hub, the
+        NeMo Microservices Datastore, or in the case of direct library usage, a local file.

        Args:
            dataset_reference: Seed dataset reference for fetching from the datastore.
@@ -577,7 +588,18 @@ class DataDesignerConfigBuilder:

        Returns:
            The current Data Designer config builder instance.
+
+        Raises:
+            BuilderConfigurationError: If any seed dataset column name collides with an existing column.
        """
+        seed_column_names = fetch_seed_dataset_column_names(dataset_reference)
+        colliding_columns = [name for name in seed_column_names if name in self._column_configs]
+        if colliding_columns:
+            raise BuilderConfigurationError(
+                f"🛑 Seed dataset column(s) {colliding_columns} collide with existing column(s). "
+                "Please remove the conflicting columns or use a seed dataset with different column names."
+            )
+
        self._seed_config = SeedConfig(
            dataset=dataset_reference.dataset,
            sampling_strategy=sampling_strategy,
@@ -586,7 +608,7 @@ class DataDesignerConfigBuilder:
        self.set_seed_datastore_settings(
            dataset_reference.datastore_settings if hasattr(dataset_reference, "datastore_settings") else None
        )
-        for column_name in
+        for column_name in seed_column_names:
            self._column_configs[column_name] = SeedDatasetColumnConfig(name=column_name)
        return self
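
The net effect of these two guards is symmetric: a generated column cannot shadow a seed dataset column, and a seed dataset cannot be attached over existing columns. A minimal, self-contained sketch of the first guard follows; every name except `BuilderConfigurationError` and `SeedDatasetColumnConfig` is an illustrative stand-in for the builder's internals.

```python
class BuilderConfigurationError(Exception):
    """Stand-in for the package's builder error."""


class SeedDatasetColumnConfig:
    """Stand-in for the seed dataset column config type."""

    def __init__(self, name: str) -> None:
        self.name = name


# The builder keeps one dict of column configs keyed by name; here a seed
# dataset column named "city" already occupies a slot.
column_configs: dict[str, object] = {"city": SeedDatasetColumnConfig("city")}


def add_column(name: str, config: object) -> None:
    existing = column_configs.get(name)
    if existing is not None and isinstance(existing, SeedDatasetColumnConfig):
        raise BuilderConfigurationError(
            f"Column {name!r} already exists as a seed dataset column."
        )
    column_configs[name] = config


add_column("temperature", object())  # no collision: accepted
try:
    add_column("city", object())     # collides with the seed column
except BuilderConfigurationError as err:
    print(err)
```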
data_designer/config/sampler_params.py
CHANGED
@@ -522,6 +522,25 @@ class PersonSamplerParams(ConfigBase):


class PersonFromFakerSamplerParams(ConfigBase):
+    """Parameters for sampling synthetic person data with demographic attributes from Faker.
+
+    Uses the Faker library to generate random personal information. The data is basic and not demographically
+    accurate, but is useful for quick testing, prototyping, or when realistic demographic distributions are not
+    relevant for your use case. For demographically accurate person data, use the `PersonSamplerParams` sampler.
+
+    Attributes:
+        locale: Locale string determining the language and geographic region for synthetic people.
+            Can be any locale supported by Faker.
+        sex: If specified, filters to only sample people of the specified sex. Options: "Male" or
+            "Female". If None, samples both sexes.
+        city: If specified, filters to only sample people from the specified city or cities. Can be
+            a single city name (string) or a list of city names.
+        age_range: Two-element list [min_age, max_age] specifying the age range to sample from
+            (inclusive). Defaults to a standard age range. Both values must be between the minimum and
+            maximum allowed ages.
+        sampler_type: Discriminator for the sampler type. Must be `SamplerType.PERSON_FROM_FAKER`.
+    """
+
    locale: str = Field(
        default="en_US",
        description=(
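
The new docstring doubles as usage documentation. A hypothetical instantiation exercising each documented attribute is sketched below; the values are invented, and the exact defaults and validation live in the Field definitions that follow the docstring.

```python
from data_designer.config.sampler_params import PersonFromFakerSamplerParams

params = PersonFromFakerSamplerParams(
    locale="en_US",             # any locale supported by Faker
    sex="Female",               # "Male" or "Female"; None samples both sexes
    city=["Austin", "Dallas"],  # a single city name or a list of names
    age_range=[25, 64],         # inclusive [min_age, max_age]
)
```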
data_designer/engine/column_generators/generators/base.py
CHANGED
@@ -1,23 +1,27 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

+from __future__ import annotations
+
import functools
import logging
from abc import ABC, abstractmethod
-from
+from enum import Enum
+from typing import TYPE_CHECKING, overload

import pandas as pd

-from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP
-from data_designer.config.models import BaseInferenceParams, ModelConfig
-from data_designer.config.utils.type_helpers import StrEnum
from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT
-
+
+if TYPE_CHECKING:
+    from data_designer.config.models import BaseInferenceParams, ModelConfig
+    from data_designer.engine.models.facade import ModelFacade
+

logger = logging.getLogger(__name__)


-class GenerationStrategy(
+class GenerationStrategy(str, Enum):
    CELL_BY_CELL = "cell_by_cell"
    FULL_COLUMN = "full_column"

@@ -82,8 +86,7 @@ class WithModelGeneration:
        return self.model_config.inference_parameters

    def log_pre_generation(self) -> None:
-
-        logger.info(f"{emoji} Preparing {self.config.column_type} column generation")
+        logger.info(f"Preparing {self.config.column_type} column generation")
        logger.info(f" |-- column name: {self.config.name!r}")
        logger.info(f" |-- model config:\n{self.model_config.model_dump_json(indent=4)}")
        if self.model_config.provider is None:
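
Basing `GenerationStrategy` on `str, Enum` rather than the project's `StrEnum` helper preserves the property the codebase relies on: members are interchangeable with their raw string values (the stdlib's `enum.StrEnum` only arrived in Python 3.11, and the package supports 3.10). A quick standalone check of that behavior, duplicating the enum for illustration:

```python
from enum import Enum


class GenerationStrategy(str, Enum):
    CELL_BY_CELL = "cell_by_cell"
    FULL_COLUMN = "full_column"


# The str mixin makes members compare equal to plain strings,
# and construction from a raw value resolves to the member.
assert GenerationStrategy.CELL_BY_CELL == "cell_by_cell"
assert GenerationStrategy("full_column") is GenerationStrategy.FULL_COLUMN
```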
data_designer/engine/dataset_builders/column_wise_builder.py
CHANGED
@@ -1,12 +1,15 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

import functools
+import importlib.metadata
import json
import logging
import time
+import uuid
from pathlib import Path
-from typing import Callable
+from typing import TYPE_CHECKING, Callable

import pandas as pd

@@ -35,14 +38,21 @@ from data_designer.engine.dataset_builders.utils.concurrency import (
from data_designer.engine.dataset_builders.utils.dataset_batch_manager import (
    DatasetBatchManager,
)
+from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler
from data_designer.engine.processing.processors.base import Processor
from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
from data_designer.engine.resources.resource_provider import ResourceProvider

+if TYPE_CHECKING:
+    from data_designer.engine.models.usage import ModelUsageStats
+
logger = logging.getLogger(__name__)


+_CLIENT_VERSION: str = importlib.metadata.version("data_designer")
+
+
class ColumnWiseDatasetBuilder:
    def __init__(
        self,
@@ -89,11 +99,12 @@ class ColumnWiseDatasetBuilder:

        generators = self._initialize_generators()
        start_time = time.perf_counter()
+        group_id = uuid.uuid4().hex

        self.batch_manager.start(num_records=num_records, buffer_size=buffer_size)
        for batch_idx in range(self.batch_manager.num_batches):
            logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}")
-            self._run_batch(generators)
+            self._run_batch(generators, batch_mode="batch", group_id=group_id)
            df_batch = self._run_processors(
                stage=BuildStage.POST_BATCH,
                dataframe=self.batch_manager.get_current_batch(as_dataframe=True),
@@ -114,10 +125,10 @@
        self._run_model_health_check_if_needed()

        generators = self._initialize_generators()
-
+        group_id = uuid.uuid4().hex
        start_time = time.perf_counter()
        self.batch_manager.start(num_records=num_records, buffer_size=num_records)
-        self._run_batch(generators, save_partial_results=False)
+        self._run_batch(generators, batch_mode="preview", save_partial_results=False, group_id=group_id)
        dataset = self.batch_manager.get_current_batch(as_dataframe=True)
        self.batch_manager.reset()

@@ -143,7 +154,10 @@
            for config in self._column_configs
        ]

-    def _run_batch(
+    def _run_batch(
+        self, generators: list[ColumnGenerator], *, batch_mode: str, save_partial_results: bool = True, group_id: str
+    ) -> None:
+        pre_batch_snapshot = self._resource_provider.model_registry.get_model_usage_snapshot()
        for generator in generators:
            generator.log_pre_generation()
            try:
@@ -166,6 +180,12 @@
            )
            raise DatasetGenerationError(f"🛑 Failed to process {column_error_str}:\n{e}")

+        try:
+            usage_deltas = self._resource_provider.model_registry.get_usage_deltas(pre_batch_snapshot)
+            self._emit_batch_inference_events(batch_mode, usage_deltas, group_id)
+        except Exception:
+            pass
+
    def _run_from_scratch_column_generator(self, generator: ColumnGenerator) -> None:
        df = generator.generate_from_scratch(self.batch_manager.num_records_batch)
        self.batch_manager.add_records(df.to_dict(orient="records"))
@@ -289,3 +309,25 @@
            json_file_name="model_configs.json",
            configs=self._resource_provider.model_registry.model_configs.values(),
        )
+
+    def _emit_batch_inference_events(
+        self, batch_mode: str, usage_deltas: dict[str, ModelUsageStats], group_id: str
+    ) -> None:
+        if not usage_deltas:
+            return
+
+        events = [
+            InferenceEvent(
+                nemo_source=NemoSourceEnum.DATADESIGNER,
+                task=batch_mode,
+                task_status=TaskStatusEnum.SUCCESS,
+                model=model_name,
+                input_tokens=delta.token_usage.input_tokens,
+                output_tokens=delta.token_usage.output_tokens,
+            )
+            for model_name, delta in usage_deltas.items()
+        ]
+
+        with TelemetryHandler(source_client_version=_CLIENT_VERSION, session_id=group_id) as telemetry_handler:
+            for event in events:
+                telemetry_handler.enqueue(event)
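
Reduced to its essentials, the new reporting path takes a usage snapshot before each batch, computes per-model deltas afterwards, and enqueues one `InferenceEvent` per active model under a shared session id. A sketch of the emit step using only names introduced in this release (the model name and token counts are made up):

```python
import uuid

from data_designer.engine.models.telemetry import (
    InferenceEvent,
    NemoSourceEnum,
    TaskStatusEnum,
    TelemetryHandler,
)

group_id = uuid.uuid4().hex  # one session id per build/preview run, shared by its events

event = InferenceEvent(
    nemo_source=NemoSourceEnum.DATADESIGNER,
    task="preview",                      # the builder passes "batch" or "preview"
    task_status=TaskStatusEnum.SUCCESS,
    model="openai/gpt-oss-20b",          # illustrative model name
    input_tokens=250,
    output_tokens=90,
)

# Entering the context starts the flush timer; exiting flushes anything queued.
# With NEMO_TELEMETRY_ENABLED=false, enqueue() is a no-op.
with TelemetryHandler(source_client_version="0.2.2", session_id=group_id) as handler:
    handler.enqueue(event)
```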
data_designer/engine/models/registry.py
CHANGED
@@ -9,6 +9,7 @@ from data_designer.config.models import GenerationType, ModelConfig
from data_designer.engine.model_provider import ModelProvider, ModelProviderRegistry
from data_designer.engine.models.facade import ModelFacade
from data_designer.engine.models.litellm_overrides import apply_litellm_patches
+from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats
from data_designer.engine.secret_resolver import SecretResolver

logger = logging.getLogger(__name__)
@@ -25,7 +26,7 @@ class ModelRegistry:
        self._secret_resolver = secret_resolver
        self._model_provider_registry = model_provider_registry
        self._model_configs = {}
-        self._models = {}
+        self._models: dict[str, ModelFacade] = {}
        self._set_model_configs(model_configs)

    @property
@@ -69,6 +70,31 @@
            if model.usage_stats.has_usage
        }

+    def get_model_usage_snapshot(self) -> dict[str, ModelUsageStats]:
+        return {
+            model.model_name: model.usage_stats.model_copy(deep=True)
+            for model in self._models.values()
+            if model.usage_stats.has_usage
+        }
+
+    def get_usage_deltas(self, snapshot: dict[str, ModelUsageStats]) -> dict[str, ModelUsageStats]:
+        deltas = {}
+        for model_name, current in self.get_model_usage_snapshot().items():
+            prev = snapshot.get(model_name)
+            delta_input = current.token_usage.input_tokens - (prev.token_usage.input_tokens if prev else 0)
+            delta_output = current.token_usage.output_tokens - (prev.token_usage.output_tokens if prev else 0)
+            delta_successful = current.request_usage.successful_requests - (
+                prev.request_usage.successful_requests if prev else 0
+            )
+            delta_failed = current.request_usage.failed_requests - (prev.request_usage.failed_requests if prev else 0)
+
+            if delta_input > 0 or delta_output > 0 or delta_successful > 0 or delta_failed > 0:
+                deltas[model_name] = ModelUsageStats(
+                    token_usage=TokenUsageStats(input_tokens=delta_input, output_tokens=delta_output),
+                    request_usage=RequestUsageStats(successful_requests=delta_successful, failed_requests=delta_failed),
+                )
+        return deltas
+
    def get_model_provider(self, *, model_alias: str) -> ModelProvider:
        model_config = self.get_model_config(model_alias=model_alias)
        return self._model_provider_registry.get_provider(model_config.provider)
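
The delta arithmetic is plain per-counter subtraction keyed by model name; models with no activity since the snapshot are omitted from the result. The same computation in isolation, using the usage models the registry imports (all values invented for illustration):

```python
from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats

prev = ModelUsageStats(
    token_usage=TokenUsageStats(input_tokens=100, output_tokens=40),
    request_usage=RequestUsageStats(successful_requests=2, failed_requests=0),
)
current = ModelUsageStats(
    token_usage=TokenUsageStats(input_tokens=350, output_tokens=130),
    request_usage=RequestUsageStats(successful_requests=5, failed_requests=1),
)

# What the batch itself consumed: current counters minus the pre-batch snapshot.
delta = ModelUsageStats(
    token_usage=TokenUsageStats(
        input_tokens=current.token_usage.input_tokens - prev.token_usage.input_tokens,     # 250
        output_tokens=current.token_usage.output_tokens - prev.token_usage.output_tokens,  # 90
    ),
    request_usage=RequestUsageStats(
        successful_requests=current.request_usage.successful_requests
        - prev.request_usage.successful_requests,  # 3
        failed_requests=current.request_usage.failed_requests
        - prev.request_usage.failed_requests,      # 1
    ),
)
```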
data_designer/engine/models/telemetry.py
ADDED
@@ -0,0 +1,355 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Telemetry handler for NeMo products.
+
+Environment variables:
+- NEMO_TELEMETRY_ENABLED: Whether telemetry is enabled.
+- NEMO_DEPLOYMENT_TYPE: The deployment type the event came from.
+- NEMO_TELEMETRY_ENDPOINT: The endpoint to send the telemetry events to.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+import platform
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, ClassVar
+
+import httpx
+from pydantic import BaseModel, Field
+
+TELEMETRY_ENABLED = os.getenv("NEMO_TELEMETRY_ENABLED", "true").lower() in ("1", "true", "yes")
+CLIENT_ID = "184482118588404"
+NEMO_TELEMETRY_VERSION = "nemo-telemetry/1.0"
+MAX_RETRIES = 3
+NEMO_TELEMETRY_ENDPOINT = os.getenv(
+    "NEMO_TELEMETRY_ENDPOINT", "https://events.telemetry.data.nvidia.com/v1.1/events/json"
+).lower()
+CPU_ARCHITECTURE = platform.uname().machine
+
+
+class NemoSourceEnum(str, Enum):
+    INFERENCE = "inference"
+    AUDITOR = "auditor"
+    DATADESIGNER = "datadesigner"
+    EVALUATOR = "evaluator"
+    GUARDRAILS = "guardrails"
+    UNDEFINED = "undefined"
+
+
+class DeploymentTypeEnum(str, Enum):
+    LIBRARY = "library"
+    API = "api"
+    UNDEFINED = "undefined"
+
+
+_deployment_type_raw = os.getenv("NEMO_DEPLOYMENT_TYPE", "library").lower()
+try:
+    DEPLOYMENT_TYPE = DeploymentTypeEnum(_deployment_type_raw)
+except ValueError:
+    valid_values = [e.value for e in DeploymentTypeEnum]
+    raise ValueError(
+        f"Invalid NEMO_DEPLOYMENT_TYPE: {_deployment_type_raw!r}. Must be one of: {valid_values}"
+    ) from None
+
+
+class TaskStatusEnum(str, Enum):
+    SUCCESS = "success"
+    FAILURE = "failure"
+    UNDEFINED = "undefined"
+
+
+class TelemetryEvent(BaseModel):
+    _event_name: ClassVar[str]  # Subclasses must define this
+    _schema_version: ClassVar[str] = "1.3"
+
+    def __init_subclass__(cls, **kwargs: Any) -> None:
+        super().__init_subclass__(**kwargs)
+        if "_event_name" not in cls.__dict__:
+            raise TypeError(f"{cls.__name__} must define '_event_name' class variable")
+
+
+class InferenceEvent(TelemetryEvent):
+    _event_name: ClassVar[str] = "inference_event"
+
+    nemo_source: NemoSourceEnum = Field(
+        ...,
+        alias="nemoSource",
+        description="The NeMo product that created the event (i.e. data-designer).",
+    )
+    task: str = Field(
+        ...,
+        description="The type of task that was performed that generated the inference event (i.e. preview-job, batch-job).",
+    )
+    task_status: TaskStatusEnum = Field(
+        ...,
+        alias="taskStatus",
+        description="The status of the task.",
+    )
+    deployment_type: DeploymentTypeEnum = Field(
+        default=DEPLOYMENT_TYPE,
+        alias="deploymentType",
+        description="The deployment type the event came from.",
+    )
+    model: str = Field(
+        ...,
+        description="The name of the model that was used.",
+    )
+    model_group: str = Field(
+        default="undefined",
+        alias="modelGroup",
+        description="An optional identifier to group models together.",
+    )
+    input_bytes: int = Field(
+        default=-1,
+        alias="inputBytes",
+        description="Number of bytes provided as input to the model. -1 if not available.",
+        ge=-9223372036854775808,
+        le=9223372036854775807,
+    )
+    input_tokens: int = Field(
+        default=-1,
+        alias="inputTokens",
+        description="Number of tokens provided as input to the model. -1 if not available.",
+        ge=-9223372036854775808,
+        le=9223372036854775807,
+    )
+    output_bytes: int = Field(
+        default=-1,
+        alias="outputBytes",
+        description="Number of bytes returned by the model. -1 if not available.",
+        ge=-9223372036854775808,
+        le=9223372036854775807,
+    )
+    output_tokens: int = Field(
+        default=-1,
+        alias="outputTokens",
+        description="Number of tokens returned by the model. -1 if not available.",
+        ge=-9223372036854775808,
+        le=9223372036854775807,
+    )
+
+    model_config = {"populate_by_name": True}
+
+
+@dataclass
+class QueuedEvent:
+    event: TelemetryEvent
+    timestamp: datetime
+    retry_count: int = 0
+
+
+def _get_iso_timestamp(dt: datetime | None = None) -> str:
+    if dt is None:
+        dt = datetime.now(timezone.utc)
+    return dt.strftime("%Y-%m-%dT%H:%M:%S.") + f"{dt.microsecond // 1000:03d}Z"
+
+
+def build_payload(
+    events: list[QueuedEvent], *, source_client_version: str, session_id: str = "undefined"
+) -> dict[str, Any]:
+    return {
+        "browserType": "undefined",  # do not change
+        "clientId": CLIENT_ID,
+        "clientType": "Native",  # do not change
+        "clientVariant": "Release",  # do not change
+        "clientVer": source_client_version,
+        "cpuArchitecture": CPU_ARCHITECTURE,
+        "deviceGdprBehOptIn": "None",  # do not change
+        "deviceGdprFuncOptIn": "None",  # do not change
+        "deviceGdprTechOptIn": "None",  # do not change
+        "deviceId": "undefined",  # do not change
+        "deviceMake": "undefined",  # do not change
+        "deviceModel": "undefined",  # do not change
+        "deviceOS": "undefined",  # do not change
+        "deviceOSVersion": "undefined",  # do not change
+        "deviceType": "undefined",  # do not change
+        "eventProtocol": "1.6",  # do not change
+        "eventSchemaVer": events[0].event._schema_version,
+        "eventSysVer": NEMO_TELEMETRY_VERSION,
+        "externalUserId": "undefined",  # do not change
+        "gdprBehOptIn": "None",  # do not change
+        "gdprFuncOptIn": "None",  # do not change
+        "gdprTechOptIn": "None",  # do not change
+        "idpId": "undefined",  # do not change
+        "integrationId": "undefined",  # do not change
+        "productName": "undefined",  # do not change
+        "productVersion": "undefined",  # do not change
+        "sentTs": _get_iso_timestamp(),
+        "sessionId": session_id,
+        "userId": "undefined",  # do not change
+        "events": [
+            {
+                "ts": _get_iso_timestamp(queued.timestamp),
+                "parameters": queued.event.model_dump(by_alias=True),
+                "name": queued.event._event_name,
+            }
+            for queued in events
+        ],
+    }
+
+
+class TelemetryHandler:
+    """
+    Handles telemetry event batching, flushing, and retry logic for NeMo products.
+
+    Args:
+        flush_interval_seconds (float): The interval in seconds to flush the events.
+        max_queue_size (int): The maximum number of events to queue before flushing.
+        max_retries (int): The maximum number of times to retry sending an event.
+        source_client_version (str): The version of the source client. This should be the version of
+            the actual NeMo product that is sending the events, typically the same as the version of
+            a PyPi package that a user would install.
+        session_id (str): An optional session ID to associate with the events.
+            This should be a unique identifier for the session, such as a UUID.
+            It is used to group events together.
+    """
+
+    def __init__(
+        self,
+        flush_interval_seconds: float = 120.0,
+        max_queue_size: int = 50,
+        max_retries: int = MAX_RETRIES,
+        source_client_version: str = "undefined",
+        session_id: str = "undefined",
+    ):
+        self._flush_interval = flush_interval_seconds
+        self._max_queue_size = max_queue_size
+        self._max_retries = max_retries
+        self._events: list[QueuedEvent] = []
+        self._dlq: list[QueuedEvent] = []  # Dead letter queue for retry
+        self._flush_signal = asyncio.Event()
+        self._timer_task: asyncio.Task | None = None
+        self._running = False
+        self._source_client_version = source_client_version
+        self._session_id = session_id
+
+    async def astart(self) -> None:
+        if self._running:
+            return
+        self._running = True
+        self._timer_task = asyncio.create_task(self._timer_loop())
+
+    async def astop(self) -> None:
+        self._running = False
+        self._flush_signal.set()
+        if self._timer_task:
+            self._timer_task.cancel()
+            try:
+                await self._timer_task
+            except asyncio.CancelledError:
+                pass
+            self._timer_task = None
+        await self._flush_events()
+
+    async def aflush(self) -> None:
+        self._flush_signal.set()
+
+    def start(self) -> None:
+        self._run_sync(self.astart())
+
+    def stop(self) -> None:
+        self._run_sync(self.astop())
+
+    def flush(self) -> None:
+        self._flush_signal.set()
+
+    def enqueue(self, event: TelemetryEvent) -> None:
+        if not TELEMETRY_ENABLED:
+            return
+        if not isinstance(event, TelemetryEvent):
+            # Silently fail as we prioritize not disrupting upstream call sites and telemetry is best effort
+            return
+        queued = QueuedEvent(event=event, timestamp=datetime.now(timezone.utc))
+        self._events.append(queued)
+        if len(self._events) >= self._max_queue_size:
+            self._flush_signal.set()
+
+    def _run_sync(self, coro: Any) -> Any:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+        if loop and loop.is_running():
+            import concurrent.futures
+
+            with concurrent.futures.ThreadPoolExecutor() as pool:
+                future = pool.submit(asyncio.run, coro)
+                return future.result()
+        else:
+            return asyncio.run(coro)
+
+    def __enter__(self) -> TelemetryHandler:
+        self.start()
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        self.stop()
+
+    async def __aenter__(self) -> TelemetryHandler:
+        await self.astart()
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        await self.astop()
+
+    async def _timer_loop(self) -> None:
+        while self._running:
+            try:
+                await asyncio.wait_for(
+                    self._flush_signal.wait(),
+                    timeout=self._flush_interval,
+                )
+            except asyncio.TimeoutError:
+                pass
+            self._flush_signal.clear()
+            await self._flush_events()
+
+    async def _flush_events(self) -> None:
+        dlq_events, self._dlq = self._dlq, []
+        new_events, self._events = self._events, []
+        events_to_send = dlq_events + new_events
+        if events_to_send:
+            await self._send_events(events_to_send)
+
+    async def _send_events(self, events: list[QueuedEvent]) -> None:
+        async with httpx.AsyncClient() as client:
+            await self._send_events_with_client(client, events)
+
+    async def _send_events_with_client(self, client: httpx.AsyncClient, events: list[QueuedEvent]) -> None:
+        if not events:
+            return
+
+        payload = build_payload(events, source_client_version=self._source_client_version, session_id=self._session_id)
+        try:
+            response = await client.post(NEMO_TELEMETRY_ENDPOINT, json=payload)
+            # 2xx, 400, 422 are all considered complete (no retry)
+            # 400/422 indicate bad payload which retrying won't fix
+            if response.status_code in (400, 422) or response.is_success:
+                return
+            # 413 (payload too large) - split and retry
+            if response.status_code == 413:
+                if len(events) == 1:
+                    # Can't split further, drop the event
+                    return
+                mid = len(events) // 2
+                await self._send_events_with_client(client, events[:mid])
+                await self._send_events_with_client(client, events[mid:])
+                return
+            if response.status_code == 408 or response.status_code >= 500:
+                self._add_to_dlq(events)
+        except httpx.HTTPError:
+            self._add_to_dlq(events)
+
+    def _add_to_dlq(self, events: list[QueuedEvent]) -> None:
+        for queued in events:
+            queued.retry_count += 1
+            if queued.retry_count > self._max_retries:
+                continue
+            self._dlq.append(queued)
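
For reference, the envelope that `build_payload` wraps around queued events can be inspected directly. This sketch builds a single event and prints the per-event entries; all field values are illustrative.

```python
from datetime import datetime, timezone

from data_designer.engine.models.telemetry import (
    InferenceEvent,
    NemoSourceEnum,
    QueuedEvent,
    TaskStatusEnum,
    build_payload,
)

queued = QueuedEvent(
    event=InferenceEvent(
        nemo_source=NemoSourceEnum.DATADESIGNER,
        task="batch",
        task_status=TaskStatusEnum.SUCCESS,
        model="openai/gpt-oss-20b",
        input_tokens=250,
        output_tokens=90,
    ),
    timestamp=datetime.now(timezone.utc),
)

payload = build_payload([queued], source_client_version="0.2.2", session_id="demo-session")
print(payload["events"][0]["name"])        # inference_event
print(payload["events"][0]["parameters"])  # camelCase keys via the field aliases
```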
data_designer/engine/validators/python.py
CHANGED
@@ -2,8 +2,8 @@
# SPDX-License-Identifier: Apache-2.0

import ast
+import json
import logging
-import re
import subprocess
import tempfile
from collections import defaultdict
@@ -179,9 +179,8 @@ class PythonValidator(BaseValidator):
        for file in Path(codebase_path).glob("*.py"):
            processed[file.stem] = PythonLinterMessages()

-        # Run ruff linter
+        # Run ruff linter with JSON output
        ruff_bin = find_ruff_bin()
-        env = {"NO_COLOR": "1"}

        ruff_exec = subprocess.run(
            [
@@ -189,9 +188,9 @@ class PythonValidator(BaseValidator):
                "check",
                "--select",
                "E,F6,F7,F8,SIM,PLC,PLE,PLR,PLW",
+                "--output-format=json",
                codebase_path,
            ],
-            env=env,
            text=True,
            capture_output=True,
            check=False,
@@ -199,30 +198,34 @@
        )
        ruff_output = ruff_exec.stdout

-        # Parse
-
-
-
-
-        errors = re.findall(pattern, ruff_output)
+        # Parse JSON output
+        try:
+            diagnostics = json.loads(ruff_output)
+        except json.JSONDecodeError as e:
+            raise RuntimeError(f"Failed to parse ruff JSON output: {e}")

-        if
-
+        if not diagnostics:
+            return processed  # no errors or warnings

-
-
-
-
-
-
-
-
-
-
-
+        for diagnostic in diagnostics:
+            filename = diagnostic["filename"]
+            code = diagnostic["code"]
+            location = diagnostic["location"]
+            message = diagnostic["message"]
+
+            # Extract alphabetic prefix from code for type mapping
+            alpha_prefix = "".join(c for c in code if c.isalpha())
+            error_type = TYPE_FROM_SYMBOL.get(alpha_prefix, "warning")
+
+            processed[Path(filename).stem].add(
+                PythonLinterMessage(
+                    type=error_type,
+                    symbol=code,
+                    line=location["row"],
+                    column=location["column"],
+                    message=message,
                )
-
-        raise RuntimeError("ruff's output not in expected format")
+            )

        return processed

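
The validator now relies on ruff's JSON output format, in which each diagnostic is an object carrying `filename`, `code`, `message`, and a `location` with `row` and `column`. A standalone sketch of the same parsing against a fabricated diagnostic:

```python
import json

# A fabricated diagnostic in the shape ruff emits with --output-format=json.
sample = """
[
  {
    "filename": "src/example.py",
    "code": "E712",
    "message": "Avoid equality comparisons to `True`",
    "location": {"row": 3, "column": 4}
  }
]
"""

for diagnostic in json.loads(sample):
    code = diagnostic["code"]
    alpha_prefix = "".join(c for c in code if c.isalpha())  # "E712" -> "E"
    print(
        f'{diagnostic["filename"]}:{diagnostic["location"]["row"]}:'
        f'{diagnostic["location"]["column"]} {code} ({alpha_prefix}) '
        f'{diagnostic["message"]}'
    )
```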
{data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: data-designer
-Version: 0.2.0
+Version: 0.2.2
Summary: General framework for synthetic data generation
License-Expression: Apache-2.0
License-File: LICENSE
@@ -15,36 +15,34 @@ Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development
Requires-Python: >=3.10
-Requires-Dist: anyascii<1
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: httpx
-Requires-Dist:
-Requires-Dist: huggingface-hub>=0.34.4
+Requires-Dist: anyascii<1,>=0.3.3
+Requires-Dist: duckdb<2,>=1.1.3
+Requires-Dist: faker<21,>=20.1.0
+Requires-Dist: httpx-retries<1,>=0.4.2
+Requires-Dist: httpx<1,>=0.27.2
+Requires-Dist: huggingface-hub<2,>=1.0.1
Requires-Dist: jinja2<4,>=3.1.6
-Requires-Dist: json-repair
-Requires-Dist: jsonpath-rust-bindings
-Requires-Dist: litellm
-Requires-Dist: lxml
-Requires-Dist: marko
-Requires-Dist: networkx
-Requires-Dist: numpy
-Requires-Dist: pandas
-Requires-Dist: prompt-toolkit
-Requires-Dist: pyarrow
-Requires-Dist: pydantic
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: pyyaml>=6.0.1
+Requires-Dist: json-repair<1,>=0.48.0
+Requires-Dist: jsonpath-rust-bindings<2,>=1.0
+Requires-Dist: litellm<2,>=1.73.6
+Requires-Dist: lxml<7,>=6.0.2
+Requires-Dist: marko<3,>=2.1.2
+Requires-Dist: networkx<4,>=3.0
+Requires-Dist: numpy<3,>=1.23.5
+Requires-Dist: pandas<3,>=2.3.3
+Requires-Dist: prompt-toolkit<4,>=3.0.0
+Requires-Dist: pyarrow<20,>=19.0.1
+Requires-Dist: pydantic[email]<3,>=2.9.2
+Requires-Dist: pygments<3,>=2.19.2
+Requires-Dist: python-json-logger<4,>=3
+Requires-Dist: pyyaml<7,>=6.0.1
Requires-Dist: requests<3,>=2.32.2
-Requires-Dist: rich
-Requires-Dist: ruff
-Requires-Dist: scipy
-Requires-Dist: sqlfluff
-Requires-Dist: tiktoken
-Requires-Dist: typer
+Requires-Dist: rich<14,>=13.7.1
+Requires-Dist: ruff<1,>=0.14.10
+Requires-Dist: scipy<2,>=1.11.0
+Requires-Dist: sqlfluff<4,>=3.2.0
+Requires-Dist: tiktoken<1,>=0.8.0
+Requires-Dist: typer<1,>=0.12.0
Description-Content-Type: text/markdown

# 🎨 NeMo Data Designer
@@ -166,6 +164,37 @@ data-designer config list # View current settings

---

+## Telemetry
+
+Data Designer collects telemetry to help us improve the library for developers. We collect:
+
+* The names of models used
+* The count of input tokens
+* The count of output tokens
+
+**No user or device information is collected.** This data is not used to track any individual user behavior. It is used to see an aggregation of which models are the most popular for SDG. We will share this usage data with the community.
+
+Specifically, the model name defined in a `ModelConfig` object is what is collected. In the example config below:
+
+```python
+ModelConfig(
+    alias="nv-reasoning",
+    model="openai/gpt-oss-20b",
+    provider="nvidia",
+    inference_parameters=InferenceParameters(
+        temperature=0.3,
+        top_p=0.9,
+        max_tokens=4096,
+    ),
+)
+```
+
+The value `openai/gpt-oss-20b` would be collected.
+
+To disable telemetry capture, set `NEMO_TELEMETRY_ENABLED=false`.
+
+---
+

## License

Apache License 2.0 – see [LICENSE](LICENSE) for details.
{data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
-data_designer/_version.py,sha256=
+data_designer/_version.py,sha256=o3ZTescp-19Z9cvBGq9dQnbppljgzdUYUf98Nov0spY,704
data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
data_designer/logging.py,sha256=ZsruJ0tx_4NK0PIMyxCZJJ0wJugoDff9UP3PbsdEDxQ,5341
data_designer/plugin_manager.py,sha256=xaMX274gdlYLNNPIrAOmJNLaZlG_0ROJ0H29v8t2aKs,2604
@@ -37,7 +37,7 @@ data_designer/config/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMb
data_designer/config/base.py,sha256=ypam9XX6dg8Q_55su84WmVExNXsmt5jb3eeW3JLlHwc,2396
data_designer/config/column_configs.py,sha256=pjpy5z3Kk7i4WmIjOxdiW5Awpjy5CxQSy0YMy0QxtvA,18961
data_designer/config/column_types.py,sha256=EILVM42d4TMl2xbSj5htMsenJwybCHIc_G8AUXyrjWU,7197
-data_designer/config/config_builder.py,sha256=
+data_designer/config/config_builder.py,sha256=n8in3O-hR2j3wJBnZMCoT5NawlobJDWTyNZCIYSgWIo,29241
data_designer/config/data_designer_config.py,sha256=D2b4Dl8pR6kCkvPoZ3APxC9pVBqXi5EJMVK1WBZ6ni8,1886
data_designer/config/dataset_builders.py,sha256=1pNFy_pkQ5lJ6AVZ43AeTuSbz6yC_l7Ndcyp5yaT8hQ,327
data_designer/config/datastore.py,sha256=gEHR2hYlJwD_vzjuaSOMRiYjtwdQhyO9q1afZDrhBCo,7586
@@ -49,7 +49,7 @@ data_designer/config/models.py,sha256=_uLOh2TutJV3Fq_8YyAi5E7G37j47j64zcrCYnzpjb
data_designer/config/preview_results.py,sha256=bPRKX1enzNTY240ixc8jZVgO7EDHABZ1_er0TabhLZg,1345
data_designer/config/processors.py,sha256=bA6SVF1kmAJSshmWseLE6HzlEBAsH9FtUtNJk0QzJtU,5987
data_designer/config/sampler_constraints.py,sha256=Q8-JrwTD69AJy8cvs_-0yf4yOBGemLwLZNmk-8Y5wPk,1156
-data_designer/config/sampler_params.py,sha256
+data_designer/config/sampler_params.py,sha256=-MLNFDqattNWrHuWPYyGTe2YdbaGMH-JKTCzxq1ji3E,27838
data_designer/config/seed.py,sha256=n4iHDBkUlwNJSXqDu6BqD6uZZeFtLu6x1hyyOhcG9zM,5503
data_designer/config/validator_params.py,sha256=BSDNVZQvXB4hmhuX4EnJ89pR-1hdEfI_KWYO8POQlMk,3906
data_designer/config/analysis/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
@@ -84,7 +84,7 @@ data_designer/engine/analysis/utils/judge_score_processing.py,sha256=rl11e3PxAOQ
data_designer/engine/column_generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
data_designer/engine/column_generators/registry.py,sha256=Eg6tqNM7mmEPNom1fWF9S5D3qABpMennOHGEGePwJN0,3060
data_designer/engine/column_generators/generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
-data_designer/engine/column_generators/generators/base.py,sha256=
+data_designer/engine/column_generators/generators/base.py,sha256=zurwtamM2l3shLa4SLjUOE0zOTDozQ5wPGAvDkrNYqE,3231
data_designer/engine/column_generators/generators/embedding.py,sha256=xYnFWRJ2W7JuwK8CRIUhv4QiT_DCGDuQkuHFKXCxrow,1724
data_designer/engine/column_generators/generators/expression.py,sha256=7xniEj8aPscWDYLrnNbG2mF3s08C7aR8ZgNUCzr_x8g,2539
data_designer/engine/column_generators/generators/llm_completion.py,sha256=XqpXzYczbZ6efUIVuvcm2O_mXBnXCMAvcjeyaB5dIFA,5301
@@ -95,7 +95,7 @@ data_designer/engine/column_generators/utils/errors.py,sha256=ugNwaqnPdrPZI7YnKL
data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=umo8-iMWbvkAztWkB5m_pU1cY1eBpR5L2gHt_fuZPD4,2100
data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=d4tbyPsgmFDikW3nxL5is9RNaajMkoPDCrfkQkxw7rc,4760
data_designer/engine/dataset_builders/artifact_storage.py,sha256=mVCqcW8shylofi_pjYEeHUa9Mo-tjIcl4nR8D8oy2bw,8420
-data_designer/engine/dataset_builders/column_wise_builder.py,sha256=
+data_designer/engine/dataset_builders/column_wise_builder.py,sha256=h6R6YfU2sfhxewIyTaLdcgSI6FpfIouyc1qdfnWfUZ0,14801
data_designer/engine/dataset_builders/errors.py,sha256=1kChleChG4rASWIiL4Bel6Ox6aFZjQUrh5ogPt1CDWo,359
data_designer/engine/dataset_builders/multi_column_configs.py,sha256=t28fhI-WRIBohFnAJ80l5EAETEDB5rJ5RSWInMiRfyE,1619
data_designer/engine/dataset_builders/utils/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
@@ -108,7 +108,8 @@ data_designer/engine/models/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQo
data_designer/engine/models/errors.py,sha256=AQlZ-cf0IqFW-e-K9HZFH3YhXXOTLLI2eZCXr7_58Yk,12174
data_designer/engine/models/facade.py,sha256=sqTSqW7jQ1vbRc1fCOoKuhb7vTVil5Z8RqN_NBp6exY,12410
data_designer/engine/models/litellm_overrides.py,sha256=tUVB_Zvg_6VoFNTK7WEKHUjoAknMkAOvureLycMENh0,5504
-data_designer/engine/models/registry.py,sha256
+data_designer/engine/models/registry.py,sha256=-TbGhvs8WRq6f7z6cH_DDdo7uhs4Hb5qkJce_Y4UBWM,6840
+data_designer/engine/models/telemetry.py,sha256=3g4jDz8xxOOkPtIYit94c4D4mGUwgfiCDaDdnbTLhFQ,12407
data_designer/engine/models/usage.py,sha256=rObhH8X0O7L-P863Jz2EAO4FO25-6VP42HL6Tvx4lRg,2405
data_designer/engine/models/utils.py,sha256=91oPXpHsnER1rEWxIkBhphlln8VOuTOoLGJL9eyWYBo,1254
data_designer/engine/models/parsers/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
@@ -166,7 +167,7 @@ data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet,sha2
data_designer/engine/validators/__init__.py,sha256=lMouN5BTbDNi31KfTQNjV7qrL46q-ssejXNT24iDTGI,652
data_designer/engine/validators/base.py,sha256=lzO4gRoCDvh3BFP0sM4OjcwG-84qodYFW-G9NEOk3Cs,964
data_designer/engine/validators/local_callable.py,sha256=oCUXj_NRt0gVqUIh0fLrvw-iURDR6OHFrVi5GOMhXj8,1387
-data_designer/engine/validators/python.py,sha256=
+data_designer/engine/validators/python.py,sha256=jAp1u8yLjqfebh60xGapkHVjMz58WHB0QjfMc2zQCaY,7894
data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkBzzih2jiqD7gk,2929
data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
data_designer/essentials/__init__.py,sha256=eHuZFJTmeRf_b6KQZ2vZeqy1afJ7y7RMTm7q4Jrg58s,1012
@@ -178,8 +179,8 @@ data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0P
data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
data_designer/plugins/plugin.py,sha256=a2KfoCNhYa8U0uQrPSBWfuyjXOb5WeITzFRpEdZFo6s,2516
data_designer/plugins/registry.py,sha256=c0X03TnA_J60RWpxaVJEmtIXKvA9up-LznrUHXDcYxg,3012
-data_designer-0.2.
-data_designer-0.2.
-data_designer-0.2.
-data_designer-0.2.
-data_designer-0.2.
+data_designer-0.2.2.dist-info/METADATA,sha256=kcCjCe9CSOS7xenYsG6NduNpMm5ELNmRBBv3goYAqoY,7636
+data_designer-0.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+data_designer-0.2.2.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
+data_designer-0.2.2.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
+data_designer-0.2.2.dist-info/RECORD,,
{data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/WHEEL
File without changes
{data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/entry_points.txt
File without changes
{data_designer-0.2.0.dist-info → data_designer-0.2.2.dist-info}/licenses/LICENSE
File without changes