asynth 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. asynth/__init__.py +79 -0
  2. asynth/_compat.py +291 -0
  3. asynth/configs/__init__.py +15 -0
  4. asynth/configs/environment_config.py +124 -0
  5. asynth/configs/inference_config.py +14 -0
  6. asynth/configs/judge_config.py +205 -0
  7. asynth/configs/params/__init__.py +76 -0
  8. asynth/configs/params/environment_params.py +121 -0
  9. asynth/configs/params/grounding_params.py +109 -0
  10. asynth/configs/params/guided_decoding_params.py +55 -0
  11. asynth/configs/params/judge_params.py +118 -0
  12. asynth/configs/params/rule_judge_params.py +70 -0
  13. asynth/configs/params/synthesis_params.py +960 -0
  14. asynth/configs/params/tool_params.py +145 -0
  15. asynth/configs/synthesis_config.py +172 -0
  16. asynth/environments/__init__.py +16 -0
  17. asynth/environments/base_environment.py +48 -0
  18. asynth/environments/deterministic_environment.py +185 -0
  19. asynth/environments/synthetic_environment.py +483 -0
  20. asynth/environments/utils.py +36 -0
  21. asynth/inference/__init__.py +6 -0
  22. asynth/inference/litellm_engine.py +179 -0
  23. asynth/judges/__init__.py +103 -0
  24. asynth/judges/base_judge.py +449 -0
  25. asynth/judges/rule_based_judge.py +81 -0
  26. asynth/judges/rules/__init__.py +7 -0
  27. asynth/judges/rules/base_rule.py +22 -0
  28. asynth/judges/rules/regex.py +63 -0
  29. asynth/judges/simple_judge.py +204 -0
  30. asynth/judges/templates/code/code_quality.yaml +36 -0
  31. asynth/judges/templates/code/correctness.yaml +36 -0
  32. asynth/judges/templates/code/maintainability.yaml +38 -0
  33. asynth/judges/templates/code/performance.yaml +38 -0
  34. asynth/judges/templates/code/security.yaml +38 -0
  35. asynth/judges/templates/doc_qa/completeness.yaml +38 -0
  36. asynth/judges/templates/doc_qa/groundedness.yaml +53 -0
  37. asynth/judges/templates/doc_qa/relevance.yaml +37 -0
  38. asynth/judges/templates/generic/format_compliance.yaml +32 -0
  39. asynth/judges/templates/generic/instruction_following.yaml +29 -0
  40. asynth/judges/templates/generic/safety.yaml +31 -0
  41. asynth/judges/templates/generic/topic_adherence.yaml +29 -0
  42. asynth/judges/templates/generic/truthfulness.yaml +29 -0
  43. asynth/judges/templates/rule_based/regex_match_phone.yaml +34 -0
  44. asynth/judges/templates/rule_based/regex_no_error_keywords.yaml +28 -0
  45. asynth/py.typed +0 -0
  46. asynth/synthesis/__init__.py +42 -0
  47. asynth/synthesis/attribute_formatter.py +142 -0
  48. asynth/synthesis/attribute_synthesizer.py +433 -0
  49. asynth/synthesis/attribute_transformation.py +147 -0
  50. asynth/synthesis/conversation_synthesizer.py +1023 -0
  51. asynth/synthesis/data_synthesizer.py +46 -0
  52. asynth/synthesis/dataset_ingestion.py +238 -0
  53. asynth/synthesis/dataset_planner.py +410 -0
  54. asynth/synthesis/document_ingestion.py +194 -0
  55. asynth/synthesis/quality_checker.py +235 -0
  56. asynth/synthesis/synthesis_pipeline.py +288 -0
  57. asynth/synthesis/tool_router.py +172 -0
  58. asynth/types/__init__.py +44 -0
  59. asynth/types/conversation.py +646 -0
  60. asynth/types/tool_call.py +180 -0
  61. asynth/utils/__init__.py +6 -0
  62. asynth/utils/placeholders.py +86 -0
  63. asynth-0.1.0.dist-info/METADATA +121 -0
  64. asynth-0.1.0.dist-info/RECORD +66 -0
  65. asynth-0.1.0.dist-info/WHEEL +4 -0
  66. asynth-0.1.0.dist-info/licenses/LICENSE +201 -0
asynth/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright 2026 Amortized AI — Licensed under Apache-2.0
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from importlib.metadata import version
5
+
6
+ from asynth.configs import LiteLLMInferenceConfig, SynthesisConfig
7
+ from asynth.configs.judge_config import JudgeConfig
8
+ from asynth.configs.params.synthesis_params import (
9
+ DatasetSource,
10
+ DocumentSource,
11
+ ExampleSource,
12
+ GeneralSynthesisParams,
13
+ GeneratedAttribute,
14
+ MultiTurnAttribute,
15
+ SampledAttribute,
16
+ SampledAttributeValue,
17
+ TextMessage,
18
+ TransformedAttribute,
19
+ )
20
+ from asynth.judges import RuleBasedJudge, SimpleJudge, create_judge, judge
21
+ from asynth.synthesis.synthesis_pipeline import SynthesisPipeline
22
+ from asynth.types.conversation import Conversation, Message, Role
23
+
24
+ __version__ = version("asynth")
25
+
26
+ __all__ = [
27
+ "Conversation",
28
+ "DatasetSource",
29
+ "DocumentSource",
30
+ "ExampleSource",
31
+ "GeneralSynthesisParams",
32
+ "GeneratedAttribute",
33
+ "JudgeConfig",
34
+ "LiteLLMInferenceConfig",
35
+ "Message",
36
+ "MultiTurnAttribute",
37
+ "Role",
38
+ "RuleBasedJudge",
39
+ "SampledAttribute",
40
+ "SampledAttributeValue",
41
+ "SimpleJudge",
42
+ "SynthesisConfig",
43
+ "SynthesisPipeline",
44
+ "TextMessage",
45
+ "TransformedAttribute",
46
+ "__version__",
47
+ "create_judge",
48
+ "judge",
49
+ "synthesize",
50
+ ]
51
+
52
+
53
def synthesize(config: SynthesisConfig) -> list[dict]:
    """Build a :class:`SynthesisPipeline` for ``config`` and run it end to end.

    Args:
        config: Synthesis configuration handed straight to the pipeline.

    Returns:
        One dict per sample. Keys are derived from configured attributes:

        - ``SampledAttribute.id`` → str: the sampled value name.
        - ``GeneratedAttribute.id`` → str: LLM response (postprocessed if
          configured).
        - ``MultiTurnAttribute.id`` → dict: ``Conversation.to_dict()`` with
          ``messages`` and ``metadata`` keys.
        - ``MultiTurnAttribute.id + "_plan"`` → str: conversation plan
          (present only when ``conversation_planner`` is set).
        - ``TransformedAttribute.id`` → str | list | dict: depends on
          ``TransformationStrategy.type``.

        If ``passthrough_attributes`` is set, only those keys are retained.

    Note:
        The pipeline object is discarded once this function returns. To read
        the quality report after synthesis, drive :class:`SynthesisPipeline`
        yourself::

            pipeline = SynthesisPipeline(config)
            results = pipeline.synthesize()
            report = pipeline.quality_report
    """
    return SynthesisPipeline(config).synthesize()
asynth/_compat.py ADDED
@@ -0,0 +1,291 @@
1
+ # Copyright 2026 Amortized AI — Licensed under Apache-2.0
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Inlined utilities from oumi.utils.placeholders, oumi.utils.str_utils,
5
+ and oumi.utils.io_utils to avoid depending on the full Oumi package."""
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import re
11
+ from collections.abc import Mapping
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import jsonlines
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # From oumi/utils/placeholders.py
19
+ # ---------------------------------------------------------------------------
20
+
21
+
22
class _DictWrapper:
    """Wrapper that allows dict keys to be accessed as attributes in format strings.

    Enables {item.field} syntax where item is a dictionary with a 'field' key.
    """

    def __init__(self, data: dict):
        """Initialize with a dictionary.

        Args:
            data: Dictionary to wrap for attribute-style access.
        """
        self._data = data

    def __getattr__(self, key: str):
        """Support attribute-style access: item.field.

        Only invoked when normal attribute lookup fails, so real attributes
        (such as ``_data``) never reach this method once they are set.

        Args:
            key: Dictionary key to access.

        Returns:
            Value at the specified key.

        Raises:
            AttributeError: If key is not in dictionary, or if the wrapper has
                not been initialized yet.
        """
        # Fetch the backing dict without going through __getattr__ again.
        # ``copy``/``pickle`` create the instance with ``__new__`` and probe it
        # with ``hasattr`` before ``_data`` exists; a plain ``self._data`` here
        # would recurse until RecursionError instead of reporting "no attribute".
        try:
            data = object.__getattribute__(self, "_data")
        except AttributeError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{key}'"
            ) from None

        try:
            return data[key]
        except KeyError as e:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{key}'"
            ) from e

    def __getitem__(self, key):
        """Support dict-style access: item['field'].

        Args:
            key: Dictionary key to access.

        Returns:
            Value at the specified key.

        Raises:
            KeyError: If key is not in dictionary.
        """
        return self._data[key]
65
+
66
+
67
class IndexableValue:
    """List-of-dicts wrapper that makes ``{examples[0].field}`` usable in templates.

    ``str.format_map`` passes non-numeric bracket contents as strings, so
    ``__getitem__`` accepts both ints and strings that parse as ints.
    """

    def __init__(self, items: list[dict]):
        """Store the list to expose through indexed access.

        Args:
            items: List of dictionaries to wrap.
        """
        self._items = items

    def __getitem__(self, index: int | str):
        """Return the item at ``index`` wrapped for attribute-style access.

        Args:
            index: Position to read, as an int or a string holding an int.
                Negative values count from the end.

        Returns:
            The dictionary at that position, wrapped in ``_DictWrapper``.

        Raises:
            TypeError: If ``index`` is neither an int nor an int-like string.
            IndexError: If the position falls outside the list.
        """
        if isinstance(index, str):
            # Bracket contents can arrive as text; coerce them to a position.
            try:
                position = int(index)
            except ValueError as e:
                raise TypeError(
                    "Index must be integer or string representation of integer, "
                    f"got string '{index}'"
                ) from e
        elif isinstance(index, int):
            position = index
        else:
            raise TypeError(f"Index must be integer, got {type(index).__name__}")

        size = len(self._items)
        if position < 0:
            # Same convention as list: -1 is the last element.
            position += size
        if not 0 <= position < size:
            raise IndexError("Index out of range")

        return _DictWrapper(self._items[position])

    def __len__(self) -> int:
        """Return how many items are wrapped."""
        return len(self._items)
118
+
119
+
120
class SafeDict(dict):
    """Mapping for ``str.format_map`` that tracks placeholders with no value.

    Every key that is looked up but absent is recorded in
    ``placeholder_names``. Depending on ``missing_values_allowed`` the lookup
    then either echoes the placeholder back or raises.
    """

    def __init__(self, missing_values_allowed: bool, *args, **kwargs):
        """Create the mapping.

        Args:
            missing_values_allowed: When true, a missing key resolves to its
                own ``{key}`` text; when false, it raises ``ValueError``.
            *args: Forwarded to ``dict``.
            **kwargs: Forwarded to ``dict``.
        """
        super().__init__(*args, **kwargs)
        self.missing_values_allowed = missing_values_allowed
        self.placeholder_names = set()

    def __missing__(self, key: str) -> str:
        """Record a missing key, then echo it back or raise.

        Raises:
            ValueError: If missing values are not allowed.
        """
        self.placeholder_names.add(key)
        if not self.missing_values_allowed:
            raise ValueError(f"Missing value for placeholder: {key}")
        return "{" + key + "}"

    def __getitem__(self, key):
        """Look up ``key``, wrapping lists of dicts for bracket access.

        Args:
            key: Dictionary key to access.

        Returns:
            The stored value. A non-empty list whose first element is a dict is
            returned as an ``IndexableValue`` so ``{examples[0].field}`` works.
        """
        value = super().__getitem__(key)

        if not isinstance(value, list):
            return value
        if len(value) == 0 or not isinstance(value[0], dict):
            return value
        return IndexableValue(value)
151
+
152
+
153
def resolve_placeholders(
    text: str,
    values_dict: Mapping[str, object],
    missing_values_allowed: bool = False,
) -> str:
    """Fill the ``{variable}`` placeholders in ``text`` from ``values_dict``.

    Args:
        text: Template using ``str.format`` placeholder syntax.
        values_dict: Values keyed by placeholder name.
        missing_values_allowed: If true, placeholders with no value are left
            in the output as ``{name}``; otherwise ``SafeDict`` raises
            ``ValueError`` for them.

    Returns:
        The formatted text.
    """
    lookup = SafeDict(missing_values_allowed, values_dict)
    return text.format_map(lookup)
160
+
161
+
162
def get_placeholders(text: str) -> set[str]:
    """Extract placeholder variable names from text with {variable} syntax.

    The template is parsed, not rendered. Rendering against stub values fails
    for any placeholder that does more than a plain lookup: ``{item.field}``
    and ``{examples[0].field}`` hit ``AttributeError`` on the stub string, and
    ``{price:.2f}`` hits ``ValueError`` because the stub is not a number.

    Args:
        text: Template using ``str.format`` placeholder syntax.

    Returns:
        The top-level variable names referenced by the template. Attribute and
        index suffixes are stripped (``{examples[0].field}`` yields
        ``"examples"``); names nested inside a format spec are included.

    Raises:
        ValueError: If the template is malformed or uses positional fields
            (``{}`` or ``{0}``), which ``str.format_map`` cannot resolve.
    """
    from string import Formatter

    formatter = Formatter()
    names: set[str] = set()
    pending = [text]
    while pending:
        template = pending.pop()
        for _, field_name, format_spec, _ in formatter.parse(template):
            if field_name is None:
                # Pure literal segment: no replacement field here.
                continue

            # The looked-up key is everything before the first "." or "[".
            base = field_name
            for separator in (".", "["):
                base = base.partition(separator)[0]

            if not base or base.isdecimal():
                raise ValueError("Format string contains positional fields")
            names.add(base)

            if format_spec:
                # A spec may embed further fields, e.g. "{value:>{width}}".
                pending.append(format_spec)
    return names
167
+
168
+
169
+ # ---------------------------------------------------------------------------
170
+ # From oumi/utils/str_utils.py
171
+ # ---------------------------------------------------------------------------
172
+
173
+
174
def extract_json(text: str, expected_type: type | None = list) -> dict | list | None:
    """Extract a JSON object or array from text that may contain surrounding prose.

    Extraction strategy (first match wins):

    1. Code-fenced JSON (```json ... ``` or ``` ... ```). Only the first
       fenced block is considered.
    2. Raw delimiters -- for each delimiter pair allowed by ``expected_type``,
       takes the span from the *first* opening delimiter (``[`` or ``{``) to
       the *last* matching closing delimiter (``]`` or ``}``) and attempts
       ``json.loads`` on that slice. When both pairs are allowed, the pair
       whose opening delimiter appears earliest is tried first, so an object
       that contains an array is returned whole, not as its inner array.

    Because step 2 uses the outermost delimiter span, the input text should
    contain at most **one** JSON structure of the expected type. If multiple
    JSON blocks or stray brackets appear in the surrounding prose, parsing
    may fail or return an unexpected result.

    Args:
        text: The text to extract JSON from (e.g. LLM output that wraps
            a JSON payload in natural-language prose).
        expected_type: The expected Python type of the parsed result
            (``list``, ``dict``, or ``None`` to accept either).

    Returns:
        The parsed JSON value if extraction and type-checking succeed,
        otherwise ``None``.
    """

    def _matches_expected(value: object) -> bool:
        if expected_type is None:
            # "Either" means dict or list; a bare scalar is not a payload.
            return isinstance(value, (dict, list))
        return isinstance(value, expected_type)

    def _parse(candidate: str):
        """Return the parsed candidate if it is valid JSON of the right type."""
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        return parsed if _matches_expected(parsed) else None

    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
    if fenced:
        result = _parse(fenced.group(1))
        if result is not None:
            return result

    spans = []
    for open_char, close_char, container in (("[", "]", list), ("{", "}", dict)):
        if expected_type not in (container, None):
            continue
        start = text.find(open_char)
        end = text.rfind(close_char)
        if start != -1 and end > start:
            spans.append((start, end))

    # Outermost structure first: the earliest opening delimiter wins.
    spans.sort()
    for start, end in spans:
        result = _parse(text[start : end + 1])
        if result is not None:
            return result
    return None
230
+
231
+
232
+ # ---------------------------------------------------------------------------
233
+ # From oumi/utils/io_utils.py
234
+ # ---------------------------------------------------------------------------
235
+
236
+
237
def load_xlsx_all_sheets(filename: str | Path) -> Any:
    """Read every sheet of an XLSX workbook into one concatenated DataFrame.

    Args:
        filename: Path to the XLSX file.

    Returns:
        pd.DataFrame: Rows of all sheets stacked in sheet order with a fresh
        index, or an empty DataFrame when the workbook has no sheets.

    Raises:
        ImportError: If openpyxl is not installed.
        FileNotFoundError: If the file doesn't exist.
    """
    # Imported lazily so pandas/openpyxl are only needed by callers of this
    # function.
    import pandas as pd

    try:
        import openpyxl  # noqa: F401
    except ImportError:
        raise ImportError(
            "openpyxl is not installed. Please install it with "
            "`pip install asynth[docs]` or `pip install openpyxl`."
        ) from None

    workbook_path = Path(filename)
    if not workbook_path.exists():
        raise FileNotFoundError(f"The file {filename} does not exist.")

    # sheet_name=None asks pandas for a {sheet name: DataFrame} mapping.
    frames_by_sheet = pd.read_excel(
        workbook_path, sheet_name=None, engine="openpyxl"
    )
    if frames_by_sheet:
        return pd.concat(frames_by_sheet.values(), ignore_index=True)
    return pd.DataFrame()
273
+
274
+
275
def save_jsonlines(filename: str | Path, data: list[dict[str, Any]]) -> None:
    """Write ``data`` to ``filename`` as JSON Lines, one object per line.

    Args:
        filename: Path to the jsonlines file to be created or overwritten.
        data: A list of dictionaries to be saved as JSON objects.

    Raises:
        IOError: If there's an error writing to the file. The underlying
            ``OSError`` is chained as the cause.
    """
    target = Path(filename)

    try:
        with jsonlines.open(target, mode="w") as sink:
            sink.write_all(data)
    except OSError as err:
        raise OSError(f"Error writing to file {filename}") from err
asynth/configs/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright 2026 Amortized AI — Licensed under Apache-2.0
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from asynth.configs.environment_config import EnvironmentConfig
5
+ from asynth.configs.inference_config import LiteLLMInferenceConfig
6
+ from asynth.configs.judge_config import JudgeConfig
7
+ from asynth.configs.synthesis_config import SynthesisConfig, SynthesisStrategy
8
+
9
+ __all__ = [
10
+ "EnvironmentConfig",
11
+ "JudgeConfig",
12
+ "LiteLLMInferenceConfig",
13
+ "SynthesisConfig",
14
+ "SynthesisStrategy",
15
+ ]
asynth/configs/environment_config.py ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright 2026 Amortized AI — Licensed under Apache-2.0
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Configuration for agentic environments."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass, field
9
+
10
+ from asynth.configs.params.environment_params import EnvironmentParams
11
+ from asynth.configs.params.tool_params import ToolParams
12
+
13
+
14
@dataclass
class EnvironmentConfig:
    """Top-level config for environment-first tool definitions.

    Holds a list of environments, each owning its tools, and enforces that
    environment ids and tool ids are unique across the whole config.
    """

    environments: list[EnvironmentParams] = field(default_factory=list)
    """Reusable environments and their owned tools."""

    def __post_init__(self) -> None:
        """Coerce raw dicts into EnvironmentParams and check global uniqueness.

        Raises:
            ValueError: If two environments share an id, or if a tool id
                appears more than once across all environments.
        """
        self.environments = [
            env if isinstance(env, EnvironmentParams) else EnvironmentParams(**env)
            for env in self.environments
        ]

        env_ids: set[str] = set()
        tool_ids: set[str] = set()

        for environment in self.environments:
            if environment.id in env_ids:
                raise ValueError(
                    f"EnvironmentConfig.environments contains duplicate "
                    f"environment id '{environment.id}'."
                )
            env_ids.add(environment.id)

            # Tool ids are checked globally, not per environment.
            for tool in environment.tools:
                if tool.id in tool_ids:
                    raise ValueError(
                        f"EnvironmentConfig.environments contains duplicate "
                        f"tool id '{tool.id}'."
                    )
                tool_ids.add(tool.id)

    def finalize_and_validate(self) -> None:
        """Validate every environment in the list."""
        for environment in self.environments:
            environment.finalize_and_validate()

    @property
    def all_tools(self) -> list[ToolParams]:
        """Flatten all tools across environments, in declaration order."""
        return [tool for environment in self.environments for tool in environment.tools]

    @property
    def tool_environment_map(self) -> dict[str, str]:
        """Map each tool id to the id of the environment that owns it."""
        return {
            tool.id: environment.id
            for environment in self.environments
            for tool in environment.tools
        }

    def get_environment(self, environment_id: str) -> EnvironmentParams | None:
        """Look up an environment by id, returning ``None`` if absent."""
        return next(
            (
                environment
                for environment in self.environments
                if environment.id == environment_id
            ),
            None,
        )

    def get_tool(self, tool_id: str) -> ToolParams | None:
        """Look up a tool by id, returning ``None`` if absent."""
        # Walk the environments lazily; no need to build the flat tool list.
        for environment in self.environments:
            for tool in environment.tools:
                if tool.id == tool_id:
                    return tool
        return None

    def resolve_tools(
        self,
        environment_ids: list[str] | None = None,
        tool_ids: list[str] | None = None,
    ) -> list[ToolParams]:
        """Resolve tools from selected environments and optional tool ids.

        Args:
            environment_ids: Environments to draw tools from. ``None`` or an
                empty list selects every environment.
            tool_ids: If given and non-empty, only these tools are kept. Each
                must belong to one of the selected environments.

        Returns:
            The matching tools in declaration order (the order of
            ``environments``, then of each environment's ``tools``), regardless
            of the order of the id arguments.

        Raises:
            ValueError: If any environment_id or tool_id is not found.
        """
        all_env_ids = {env.id for env in self.environments}

        if environment_ids:
            selected_env_ids = set(environment_ids)
            unknown_envs = selected_env_ids - all_env_ids
            if unknown_envs:
                raise ValueError(
                    f"Unknown environment id(s): {sorted(unknown_envs)}. "
                    f"Defined: {sorted(all_env_ids)}"
                )
        else:
            selected_env_ids = all_env_ids

        # The id set is built once above, so each membership test is O(1).
        tools = [
            tool
            for environment in self.environments
            if environment.id in selected_env_ids
            for tool in environment.tools
        ]

        if tool_ids:
            allowed_tool_ids = set(tool_ids)
            available_tool_ids = {tool.id for tool in tools}
            unknown_tools = allowed_tool_ids - available_tool_ids
            if unknown_tools:
                raise ValueError(
                    f"Unknown tool id(s): {sorted(unknown_tools)}. "
                    f"Available in selected environments: "
                    f"{sorted(available_tool_ids)}"
                )
            tools = [tool for tool in tools if tool.id in allowed_tool_ids]

        return tools
asynth/configs/inference_config.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright 2026 Amortized AI — Licensed under Apache-2.0
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Inference configuration for asynth.
5
+
6
+ Replaces Oumi's InferenceConfig which pulled in model loading,
7
+ generation params, and engine type registry. asynth uses LiteLLM only.
8
+ """
9
+
10
+ from asynth.inference.litellm_engine import LiteLLMInferenceConfig
11
+
12
+ InferenceConfig = LiteLLMInferenceConfig
13
+
14
+ __all__ = ["InferenceConfig", "LiteLLMInferenceConfig"]