udiagent 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- udiagent/__init__.py +41 -0
- udiagent/_compat.py +17 -0
- udiagent/agent.py +95 -0
- udiagent/benchmark/__init__.py +6 -0
- udiagent/benchmark/convert.py +211 -0
- udiagent/benchmark/runner.py +693 -0
- udiagent/benchmark/sample.py +296 -0
- udiagent/data/UDIGrammarSchema.json +1642 -0
- udiagent/data/UDIGrammarSchema_spec_string.json +28 -0
- udiagent/data/skills/clarify_variable.md +34 -0
- udiagent/data/skills/free_text_explain.md +55 -0
- udiagent/data/skills/generate.md +37 -0
- udiagent/data/skills/orchestrate.md +30 -0
- udiagent/data/skills/rebuff.md +30 -0
- udiagent/data/skills/template_visualizations.json +877 -0
- udiagent/data/skills/validate.md +18 -0
- udiagent/generate_tools.py +551 -0
- udiagent/generated_vis_tools.py +1522 -0
- udiagent/grammar.py +34 -0
- udiagent/messages.py +100 -0
- udiagent/orchestrator.py +400 -0
- udiagent/schema.py +135 -0
- udiagent/server/__init__.py +10 -0
- udiagent/server/app.py +170 -0
- udiagent/server/auth.py +27 -0
- udiagent/server/config.py +30 -0
- udiagent/server/models.py +16 -0
- udiagent/skills.py +82 -0
- udiagent/structured_functions.py +217 -0
- udiagent/tools.py +241 -0
- udiagent/vis_generate.py +622 -0
- udiagent-0.2.0.dist-info/METADATA +235 -0
- udiagent-0.2.0.dist-info/RECORD +35 -0
- udiagent-0.2.0.dist-info/WHEEL +4 -0
- udiagent-0.2.0.dist-info/licenses/LICENSE +21 -0
udiagent/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""UDIAgent — LLM-powered data visualization orchestration library."""
|
|
2
|
+
|
|
3
|
+
from udiagent.agent import UDIAgent
|
|
4
|
+
from udiagent.orchestrator import Orchestrator, OrchestratorResult
|
|
5
|
+
from udiagent.skills import Skill, load_skills, render_template
|
|
6
|
+
from udiagent.grammar import load_grammar
|
|
7
|
+
from udiagent.vis_generate import generate_vis_spec
|
|
8
|
+
from udiagent.schema import (
|
|
9
|
+
parse_schema_from_dict,
|
|
10
|
+
simplify_data_domains,
|
|
11
|
+
simplify_data_schema,
|
|
12
|
+
)
|
|
13
|
+
from udiagent.messages import split_tool_calls, normalize_tool_calls
|
|
14
|
+
from udiagent.structured_functions import (
|
|
15
|
+
validate_structured_text,
|
|
16
|
+
segment_structured_text,
|
|
17
|
+
get_function_signatures,
|
|
18
|
+
export_registry_json,
|
|
19
|
+
)
|
|
20
|
+
from udiagent.tools import ORCHESTRATOR_TOOLS
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"UDIAgent",
|
|
24
|
+
"Orchestrator",
|
|
25
|
+
"OrchestratorResult",
|
|
26
|
+
"Skill",
|
|
27
|
+
"load_grammar",
|
|
28
|
+
"load_skills",
|
|
29
|
+
"generate_vis_spec",
|
|
30
|
+
"parse_schema_from_dict",
|
|
31
|
+
"simplify_data_domains",
|
|
32
|
+
"simplify_data_schema",
|
|
33
|
+
"split_tool_calls",
|
|
34
|
+
"normalize_tool_calls",
|
|
35
|
+
"validate_structured_text",
|
|
36
|
+
"segment_structured_text",
|
|
37
|
+
"get_function_signatures",
|
|
38
|
+
"export_registry_json",
|
|
39
|
+
"ORCHESTRATOR_TOOLS",
|
|
40
|
+
"render_template",
|
|
41
|
+
]
|
udiagent/_compat.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Optional dependency handling."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_openai_class():
    """Return the OpenAI client class to use.

    The class exported by ``langfuse.openai`` is returned when that import
    succeeds; if it raises ``ImportError`` the plain ``openai.OpenAI`` class
    is imported and returned instead.
    """
    try:
        from langfuse.openai import OpenAI as client_cls
    except ImportError:
        # langfuse is optional: fall back to the stock client.
        from openai import OpenAI as client_cls
    return client_cls
|
udiagent/agent.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""UDIAgent — OpenAI client wrapper."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from functools import lru_cache
|
|
6
|
+
|
|
7
|
+
from udiagent._compat import get_openai_class
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
_OpenAI = get_openai_class()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@lru_cache(maxsize=128)
def _make_openai_client(api_key: str):
    """Build an OpenAI client for *api_key*, memoized per key.

    The ``lru_cache`` keeps up to 128 clients, so repeated calls with the same
    key return the same client object instead of constructing a new one.
    """
    client = _OpenAI(api_key=api_key)
    return client
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class UDIAgent:
|
|
21
|
+
"""UDIAgent for requesting UDI grammar via OpenAI."""
|
|
22
|
+
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
gpt_model_name: str,
|
|
26
|
+
openai_api_key: str | None = None,
|
|
27
|
+
):
|
|
28
|
+
self.gpt_model_name = gpt_model_name
|
|
29
|
+
self._init_server_model_connection(openai_api_key)
|
|
30
|
+
|
|
31
|
+
def _init_server_model_connection(self, openai_api_key: str | None = None):
|
|
32
|
+
"""Instantiate the OpenAI client for GPT-based features.
|
|
33
|
+
|
|
34
|
+
Uses the explicitly provided *openai_api_key* if given.
|
|
35
|
+
"""
|
|
36
|
+
if openai_api_key is None:
|
|
37
|
+
logger.info(
|
|
38
|
+
"No OpenAI API key provided; GPT-based features will require per-request keys."
|
|
39
|
+
)
|
|
40
|
+
self.gpt_model = None
|
|
41
|
+
else:
|
|
42
|
+
logger.info(
|
|
43
|
+
"OpenAI API key provided; GPT-based features will use this key by default."
|
|
44
|
+
)
|
|
45
|
+
self.gpt_model = _OpenAI(api_key=openai_api_key)
|
|
46
|
+
|
|
47
|
+
def _get_gpt_client(self, openai_api_key: str | None = None):
|
|
48
|
+
"""Return a per-request OpenAI client if a custom key is provided, otherwise the default."""
|
|
49
|
+
if openai_api_key:
|
|
50
|
+
return _make_openai_client(openai_api_key)
|
|
51
|
+
if self.gpt_model is None:
|
|
52
|
+
raise RuntimeError(
|
|
53
|
+
"No OpenAI API key available. Provide openai_api_key to UDIAgent() "
|
|
54
|
+
"or pass a per-request key."
|
|
55
|
+
)
|
|
56
|
+
return self.gpt_model
|
|
57
|
+
|
|
58
|
+
def gpt_completions_guided_json(
|
|
59
|
+
self,
|
|
60
|
+
messages: list[dict],
|
|
61
|
+
json_schema: str,
|
|
62
|
+
n=1,
|
|
63
|
+
openai_api_key: str | None = None,
|
|
64
|
+
):
|
|
65
|
+
# Normalize schema to dict
|
|
66
|
+
if isinstance(json_schema, str):
|
|
67
|
+
try:
|
|
68
|
+
schema_obj = json.loads(json_schema)
|
|
69
|
+
except json.JSONDecodeError as e:
|
|
70
|
+
raise ValueError(f"json_schema must be a valid JSON string: {e}")
|
|
71
|
+
else:
|
|
72
|
+
schema_obj = json_schema
|
|
73
|
+
|
|
74
|
+
# Wrap for Structured Outputs (required shape)
|
|
75
|
+
schema_wrapper = {
|
|
76
|
+
"name": "GuidedJSON",
|
|
77
|
+
"schema": schema_obj,
|
|
78
|
+
"strict": True,
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
client = self._get_gpt_client(openai_api_key)
|
|
82
|
+
resp = client.chat.completions.create(
|
|
83
|
+
model=self.gpt_model_name,
|
|
84
|
+
messages=messages,
|
|
85
|
+
response_format={
|
|
86
|
+
"type": "json_schema",
|
|
87
|
+
"json_schema": schema_wrapper,
|
|
88
|
+
},
|
|
89
|
+
n=n,
|
|
90
|
+
temperature=0.0,
|
|
91
|
+
max_completion_tokens=16_384,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
outputs = [json.loads(choice.message.content) for choice in resp.choices]
|
|
95
|
+
return outputs
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Convert the large benchmark_dqvis.json (or .json.gz) into a compact JSONL layout.
|
|
3
|
+
|
|
4
|
+
Output structure under <output-dir>/benchmark_dqvis/:
|
|
5
|
+
schemas.json — {dataset_key: {dataSchema, dataDomains}} for each unique dataset
|
|
6
|
+
full.jsonl — one compact JSON object per line (no schema/domains duplication)
|
|
7
|
+
small.jsonl — ~100 items (stratified sample)
|
|
8
|
+
medium.jsonl — ~1000 items (stratified sample)
|
|
9
|
+
large.jsonl — ~10000 items (stratified sample)
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python -m udiagent.benchmark.convert --input data/benchmark_dqvis.json.gz
|
|
13
|
+
python -m udiagent.benchmark.convert --input data/benchmark_dqvis.json --output-dir data
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import gzip
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import random
|
|
21
|
+
|
|
22
|
+
import ijson
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def open_input(path):
    """Open a JSON file for binary reading, transparently handling .gz compression.

    Args:
        path: a ``str`` or ``os.PathLike`` (e.g. ``pathlib.Path``) location.
            A name ending in ``.gz`` is opened through ``gzip``.

    Returns:
        A binary file object; the caller is responsible for closing it
        (typically via ``with``).
    """
    # os.fspath lets pathlib.Path arguments work; str paths pass through unchanged.
    opener = gzip.open if os.fspath(path).endswith(".gz") else open
    return opener(path, "rb")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_dataset_key(data_schema_str):
    """Decode a dataSchema JSON string and return its dataset key.

    The value stored under ``udi:name`` is returned when that key is present;
    otherwise the value under ``name`` (or ``None`` if neither exists).
    """
    parsed = json.loads(data_schema_str)
    if "udi:name" in parsed:
        return parsed["udi:name"]
    return parsed.get("name")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def strip_overlapping_fields(data_schema_str):
    """Return the dataSchema JSON string with ``udi:overlapping_fields`` removed.

    Every field of every resource is visited; resources without a ``schema``
    or ``fields`` entry are skipped. The result is re-serialized with compact
    separators.
    """
    document = json.loads(data_schema_str)
    every_field = (
        field_def
        for resource in document.get("resources", [])
        for field_def in resource.get("schema", {}).get("fields", [])
    )
    for field_def in every_field:
        # pop with a default: fields lacking the key are left untouched.
        field_def.pop("udi:overlapping_fields", None)
    return json.dumps(document, separators=(",", ":"))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def pass1_extract_schemas(input_path):
    """Pass 1: stream every item and collect one schema entry per dataset.

    For the first item seen of each dataset key, the cleaned ``dataSchema``
    (overlapping fields stripped) and the ``dataDomains`` string are stored.
    Later items with the same key must carry byte-identical raw
    ``dataSchema`` and ``dataDomains`` strings.

    Returns:
        ``(schemas, count)`` — the per-dataset mapping and the number of
        items scanned.

    Raises:
        ValueError: if a later item disagrees with the first one recorded
            for its dataset key.
    """
    cleaned = {}  # cleaned schemas for output
    originals = {}  # raw strings for consistency checking
    seen = 0
    with open_input(input_path) as stream:
        for seen, record in enumerate(ijson.items(stream, "item"), start=1):
            payload = record["input"]
            schema_text = payload["dataSchema"]
            domains_text = payload["dataDomains"]
            dataset = extract_dataset_key(schema_text)

            if dataset in cleaned:
                # Verify consistency against raw (pre-cleaning) values
                first_seen = originals[dataset]
                if first_seen["dataSchema"] != schema_text:
                    raise ValueError(f"dataSchema mismatch for {dataset} at item {seen}")
                if first_seen["dataDomains"] != domains_text:
                    raise ValueError(f"dataDomains mismatch for {dataset} at item {seen}")
            else:
                originals[dataset] = {
                    "dataSchema": schema_text,
                    "dataDomains": domains_text,
                }
                cleaned[dataset] = {
                    "dataSchema": strip_overlapping_fields(schema_text),
                    "dataDomains": domains_text,
                }
                print(f" Found dataset: {dataset}")

            if seen % 5000 == 0:
                print(f" Pass 1: {seen} items scanned ...")

    print(f" Pass 1 complete: {seen} items, {len(cleaned)} unique datasets")
    return cleaned, seen
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def pass2_write_jsonl(input_path, output_path, schemas):
    """Pass 2: stream items again and write compact JSONL.

    Each output line is ``{"input": ..., "expected": ...}`` where ``input``
    has ``dataSchema``/``dataDomains`` removed and a ``dataset_key`` added.

    Args:
        input_path: benchmark JSON file (``.gz`` supported via ``open_input``).
        output_path: destination JSONL file; overwritten if it exists.
        schemas: per-dataset schema mapping from pass 1. It is accepted for
            interface symmetry but not consulted here.

    Returns:
        The number of items written.
    """
    count = 0
    with open_input(input_path) as f_in, open(output_path, "w") as f_out:
        # use_float=True: by default ijson yields decimal.Decimal for
        # non-integer numbers, which json.dumps cannot serialize.
        for item in ijson.items(f_in, "item", use_float=True):
            count += 1
            ds_str = item["input"]["dataSchema"]
            key = extract_dataset_key(ds_str)

            # Build compact item
            compact_input = {
                k: v
                for k, v in item["input"].items()
                if k not in ("dataSchema", "dataDomains")
            }
            compact_input["dataset_key"] = key

            compact_item = {"input": compact_input, "expected": item["expected"]}
            f_out.write(json.dumps(compact_item, separators=(",", ":")) + "\n")

            if count % 5000 == 0:
                print(f" Pass 2: {count} items written ...")

    print(f" Pass 2 complete: {count} items written to {output_path}")
    return count
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def pass3_generate_subsets(full_jsonl_path, output_dir, subsets):
    """Pass 3: read full.jsonl and write stratified random subsets.

    Lines are grouped by ``input.dataset_key``; each dataset contributes a
    share proportional to its size (at least one line). The selection is then
    topped up or trimmed at random to hit the target size, and written in the
    original line order to ``<output_dir>/<subset_name>.jsonl``.

    Args:
        full_jsonl_path: path of the JSONL file produced by pass 2.
        output_dir: directory receiving the subset files.
        subsets: mapping of subset name to target item count; targets larger
            than the file are capped at the total number of lines.
    """
    # Load all lines grouped by dataset_key
    raw_lines = []
    positions_by_key = {}
    with open(full_jsonl_path, "r") as source:
        for position, text in enumerate(source):
            raw_lines.append(text)
            record = json.loads(text)
            positions_by_key.setdefault(record["input"]["dataset_key"], []).append(position)

    total = len(raw_lines)
    print(f" Pass 3: {total} total items across {len(positions_by_key)} datasets")

    # One seeded generator shared by all subsets, consumed in mapping order.
    rng = random.Random(42)

    for subset_name, requested in subsets.items():
        quota = min(requested, total)
        chosen = set()

        # Stratified sampling: proportional to dataset size
        for positions in positions_by_key.values():
            share = len(positions) / total
            take = min(max(1, round(share * quota)), len(positions))
            chosen.update(rng.sample(positions, take))

        # If we have too few, add random ones; if too many, trim
        shortfall = quota - len(chosen)
        if shortfall > 0:
            pool = list(set(range(total)) - chosen)
            chosen.update(rng.sample(pool, min(shortfall, len(pool))))
        elif shortfall < 0:
            chosen = set(rng.sample(list(chosen), quota))

        # Write in original order
        ordered = sorted(chosen)
        with open(os.path.join(output_dir, f"{subset_name}.jsonl"), "w") as sink:
            sink.writelines(raw_lines[position] for position in ordered)

        print(f" {subset_name}.jsonl: {len(ordered)} items")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def main():
    """Command-line entry point: run the three conversion passes.

    Reads ``--input`` and ``--output-dir``, creates the ``benchmark_dqvis``
    subdirectory, writes ``schemas.json`` and ``full.jsonl``, then generates
    the ``small``/``medium``/``large`` subsets.
    """
    cli = argparse.ArgumentParser(
        description="Convert benchmark JSON to compact JSONL format"
    )
    cli.add_argument(
        "--input",
        default="data/benchmark_dqvis.json",
        help="Path to the input benchmark JSON file (supports .gz)",
    )
    cli.add_argument(
        "--output-dir",
        default="data",
        help="Parent directory for output (default: data). Creates benchmark_dqvis/ subdirectory.",
    )
    options = cli.parse_args()

    target_dir = os.path.join(options.output_dir, "benchmark_dqvis")
    os.makedirs(target_dir, exist_ok=True)
    schemas_file = os.path.join(target_dir, "schemas.json")
    full_file = os.path.join(target_dir, "full.jsonl")

    # Pass 1: Extract unique schemas
    print("Pass 1: Extracting unique schemas ...")
    dataset_schemas, _ = pass1_extract_schemas(options.input)

    # Write schemas.json
    with open(schemas_file, "w") as sink:
        json.dump(dataset_schemas, sink, separators=(",", ":"))
    schemas_kb = os.path.getsize(schemas_file) / 1024
    print(f" schemas.json: {schemas_kb:.0f} KB ({len(dataset_schemas)} datasets)")

    # Pass 2: Write compact JSONL
    print("Pass 2: Writing compact JSONL ...")
    pass2_write_jsonl(options.input, full_file, dataset_schemas)
    full_mb = os.path.getsize(full_file) / 1024 / 1024
    print(f" full.jsonl: {full_mb:.0f} MB")

    # Pass 3: Generate subsets
    print("Pass 3: Generating subsets ...")
    subset_sizes = {"small": 100, "medium": 1000, "large": 10000}
    pass3_generate_subsets(full_file, target_dir, subset_sizes)

    print("\nDone! Output directory:", target_dir)


if __name__ == "__main__":
    main()
|