udiagent 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
udiagent/__init__.py ADDED
@@ -0,0 +1,41 @@
1
+ """UDIAgent — LLM-powered data visualization orchestration library."""
2
+
3
+ from udiagent.agent import UDIAgent
4
+ from udiagent.orchestrator import Orchestrator, OrchestratorResult
5
+ from udiagent.skills import Skill, load_skills, render_template
6
+ from udiagent.grammar import load_grammar
7
+ from udiagent.vis_generate import generate_vis_spec
8
+ from udiagent.schema import (
9
+ parse_schema_from_dict,
10
+ simplify_data_domains,
11
+ simplify_data_schema,
12
+ )
13
+ from udiagent.messages import split_tool_calls, normalize_tool_calls
14
+ from udiagent.structured_functions import (
15
+ validate_structured_text,
16
+ segment_structured_text,
17
+ get_function_signatures,
18
+ export_registry_json,
19
+ )
20
+ from udiagent.tools import ORCHESTRATOR_TOOLS
21
+
22
# Names re-exported as the package's public API (what ``from udiagent import *``
# exposes). Order is kept as originally declared.
__all__ = [
    # Agent and orchestration
    "UDIAgent",
    "Orchestrator",
    "OrchestratorResult",
    # Skills, grammar and spec generation
    "Skill",
    "load_grammar",
    "load_skills",
    "generate_vis_spec",
    # Schema helpers
    "parse_schema_from_dict",
    "simplify_data_domains",
    "simplify_data_schema",
    # Message / tool-call helpers
    "split_tool_calls",
    "normalize_tool_calls",
    # Structured-function helpers
    "validate_structured_text",
    "segment_structured_text",
    "get_function_signatures",
    "export_registry_json",
    # Tool definitions and templating
    "ORCHESTRATOR_TOOLS",
    "render_template",
]
udiagent/_compat.py ADDED
@@ -0,0 +1,17 @@
1
+ """Optional dependency handling."""
2
+
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
def get_openai_class():
    """Pick the OpenAI client class to use.

    The langfuse-wrapped ``OpenAI`` class is returned when ``langfuse.openai``
    can be imported; otherwise the plain ``openai.OpenAI`` class is returned.
    An ``ImportError`` from the fallback import propagates to the caller.
    """
    try:
        from langfuse.openai import OpenAI as client_class
    except ImportError:
        # langfuse is optional: fall back to the stock client.
        from openai import OpenAI as client_class

    return client_class
udiagent/agent.py ADDED
@@ -0,0 +1,95 @@
1
+ """UDIAgent — OpenAI client wrapper."""
2
+
3
+ import json
4
+ import logging
5
+ from functools import lru_cache
6
+
7
+ from udiagent._compat import get_openai_class
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ _OpenAI = get_openai_class()
12
+
13
+
14
@lru_cache(maxsize=128)
def _make_openai_client(api_key: str):
    """Return the OpenAI client bound to *api_key*, creating it on first use.

    Clients are memoized per key (at most 128 distinct keys are retained), so
    repeated requests with the same key reuse one client object — and with it
    the client's connection pool — instead of constructing a new one each time.
    """
    client = _OpenAI(api_key=api_key)
    return client
18
+
19
+
20
+ class UDIAgent:
21
+ """UDIAgent for requesting UDI grammar via OpenAI."""
22
+
23
+ def __init__(
24
+ self,
25
+ gpt_model_name: str,
26
+ openai_api_key: str | None = None,
27
+ ):
28
+ self.gpt_model_name = gpt_model_name
29
+ self._init_server_model_connection(openai_api_key)
30
+
31
+ def _init_server_model_connection(self, openai_api_key: str | None = None):
32
+ """Instantiate the OpenAI client for GPT-based features.
33
+
34
+ Uses the explicitly provided *openai_api_key* if given.
35
+ """
36
+ if openai_api_key is None:
37
+ logger.info(
38
+ "No OpenAI API key provided; GPT-based features will require per-request keys."
39
+ )
40
+ self.gpt_model = None
41
+ else:
42
+ logger.info(
43
+ "OpenAI API key provided; GPT-based features will use this key by default."
44
+ )
45
+ self.gpt_model = _OpenAI(api_key=openai_api_key)
46
+
47
+ def _get_gpt_client(self, openai_api_key: str | None = None):
48
+ """Return a per-request OpenAI client if a custom key is provided, otherwise the default."""
49
+ if openai_api_key:
50
+ return _make_openai_client(openai_api_key)
51
+ if self.gpt_model is None:
52
+ raise RuntimeError(
53
+ "No OpenAI API key available. Provide openai_api_key to UDIAgent() "
54
+ "or pass a per-request key."
55
+ )
56
+ return self.gpt_model
57
+
58
+ def gpt_completions_guided_json(
59
+ self,
60
+ messages: list[dict],
61
+ json_schema: str,
62
+ n=1,
63
+ openai_api_key: str | None = None,
64
+ ):
65
+ # Normalize schema to dict
66
+ if isinstance(json_schema, str):
67
+ try:
68
+ schema_obj = json.loads(json_schema)
69
+ except json.JSONDecodeError as e:
70
+ raise ValueError(f"json_schema must be a valid JSON string: {e}")
71
+ else:
72
+ schema_obj = json_schema
73
+
74
+ # Wrap for Structured Outputs (required shape)
75
+ schema_wrapper = {
76
+ "name": "GuidedJSON",
77
+ "schema": schema_obj,
78
+ "strict": True,
79
+ }
80
+
81
+ client = self._get_gpt_client(openai_api_key)
82
+ resp = client.chat.completions.create(
83
+ model=self.gpt_model_name,
84
+ messages=messages,
85
+ response_format={
86
+ "type": "json_schema",
87
+ "json_schema": schema_wrapper,
88
+ },
89
+ n=n,
90
+ temperature=0.0,
91
+ max_completion_tokens=16_384,
92
+ )
93
+
94
+ outputs = [json.loads(choice.message.content) for choice in resp.choices]
95
+ return outputs
@@ -0,0 +1,6 @@
1
+ """UDIAgent benchmark utilities.
2
+
3
+ Requires the ``[benchmark]`` optional dependency group::
4
+
5
+ pip install udiagent[benchmark]
6
+ """
@@ -0,0 +1,211 @@
1
+ """
2
+ Convert the large benchmark_dqvis.json (or .json.gz) into a compact JSONL layout.
3
+
4
+ Output structure under <output-dir>/benchmark_dqvis/:
5
+ schemas.json — {dataset_key: {dataSchema, dataDomains}} for each unique dataset
6
+ full.jsonl — one compact JSON object per line (no schema/domains duplication)
7
+ small.jsonl — ~100 items (stratified sample)
8
+ medium.jsonl — ~1000 items (stratified sample)
9
+ large.jsonl — ~10000 items (stratified sample)
10
+
11
+ Usage:
12
+ python src/convert_benchmark_to_jsonl.py --input data/benchmark_dqvis.json.gz
13
+ python src/convert_benchmark_to_jsonl.py --input data/benchmark_dqvis.json --output-dir data
14
+ """
15
+
16
+ import argparse
17
+ import gzip
18
+ import json
19
+ import os
20
+ import random
21
+
22
+ import ijson
23
+
24
+
25
def open_input(path):
    """Open a JSON file for binary reading, transparently handling .gz compression.

    Args:
        path: Location of the file, as a ``str`` or any ``os.PathLike``
            (e.g. ``pathlib.Path``).

    Returns:
        A binary file object; a ``gzip`` reader when the name ends in ``.gz``,
        otherwise a plain file opened with mode ``"rb"``.
    """
    # os.fspath yields the plain string form, so the suffix test also works
    # for pathlib.Path objects, which have no .endswith().
    if os.fspath(path).endswith(".gz"):
        return gzip.open(path, "rb")
    return open(path, "rb")
30
+
31
+
32
def extract_dataset_key(data_schema_str):
    """Return the dataset identifier stored in a dataSchema JSON string.

    The ``udi:name`` entry wins whenever that key is present; otherwise the
    ``name`` entry is used, and ``None`` is returned when neither key exists.
    """
    parsed = json.loads(data_schema_str)
    if "udi:name" in parsed:
        return parsed["udi:name"]
    return parsed.get("name")
36
+
37
+
38
def strip_overlapping_fields(data_schema_str):
    """Drop ``udi:overlapping_fields`` from every field of a dataSchema JSON string.

    The string is parsed, each entry under ``resources[*].schema.fields`` has
    the key removed when present, and the result is re-serialized as compact
    JSON (no whitespace after separators).
    """
    document = json.loads(data_schema_str)
    for entry in document.get("resources", []):
        table_schema = entry.get("schema", {})
        for column in table_schema.get("fields", []):
            column.pop("udi:overlapping_fields", None)
    compact = json.dumps(document, separators=(",", ":"))
    return compact
45
+
46
+
47
def pass1_extract_schemas(input_path):
    """Pass 1: stream every item and gather one schema entry per dataset.

    For the first item of each dataset key, the dataSchema (with overlapping
    fields stripped) and the dataDomains string are recorded. Later items of
    the same dataset must carry exactly the same raw dataSchema and
    dataDomains strings.

    Returns:
        A ``(schemas, count)`` tuple: the per-dataset mapping and the number
        of items scanned.

    Raises:
        ValueError: If an item's raw dataSchema or dataDomains differs from
            the first one seen for its dataset.
    """
    cleaned = {}    # what gets written out, keyed by dataset
    originals = {}  # untouched strings, kept only for the consistency check
    scanned = 0
    with open_input(input_path) as stream:
        for scanned, record in enumerate(ijson.items(stream, "item"), start=1):
            payload = record["input"]
            schema_text = payload["dataSchema"]
            domains_text = payload["dataDomains"]
            dataset = extract_dataset_key(schema_text)

            first_seen = originals.get(dataset)
            if first_seen is None:
                originals[dataset] = {
                    "dataSchema": schema_text,
                    "dataDomains": domains_text,
                }
                cleaned[dataset] = {
                    "dataSchema": strip_overlapping_fields(schema_text),
                    "dataDomains": domains_text,
                }
                print(f" Found dataset: {dataset}")
            elif first_seen["dataSchema"] != schema_text:
                # Compare against the raw (pre-cleaning) string.
                raise ValueError(f"dataSchema mismatch for {dataset} at item {scanned}")
            elif first_seen["dataDomains"] != domains_text:
                raise ValueError(f"dataDomains mismatch for {dataset} at item {scanned}")

            if scanned % 5000 == 0:
                print(f" Pass 1: {scanned} items scanned ...")

    print(f" Pass 1 complete: {scanned} items, {len(cleaned)} unique datasets")
    return cleaned, scanned
81
+
82
+
83
def pass2_write_jsonl(input_path, output_path, schemas):
    """Pass 2: stream items again and write compact JSONL.

    Each output line is one item whose ``input`` has ``dataSchema`` and
    ``dataDomains`` removed and a ``dataset_key`` added in their place.

    Args:
        input_path: Benchmark JSON file to read (``.gz`` supported via
            ``open_input``).
        output_path: Destination JSONL file; overwritten if it exists.
        schemas: Not used by this pass; kept so existing callers keep working.

    Returns:
        The number of items written.
    """
    from decimal import Decimal

    def _encode_extra(value):
        # ijson parses non-integer numbers as decimal.Decimal by default, which
        # json.dumps rejects; emit them as ordinary JSON numbers instead.
        if isinstance(value, Decimal):
            return float(value)
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    count = 0
    with open_input(input_path) as f_in, open(output_path, "w") as f_out:
        for item in ijson.items(f_in, "item"):
            count += 1
            ds_str = item["input"]["dataSchema"]
            key = extract_dataset_key(ds_str)

            # Build compact item
            compact_input = {
                k: v
                for k, v in item["input"].items()
                if k not in ("dataSchema", "dataDomains")
            }
            compact_input["dataset_key"] = key

            compact_item = {"input": compact_input, "expected": item["expected"]}
            f_out.write(
                json.dumps(compact_item, separators=(",", ":"), default=_encode_extra)
                + "\n"
            )

            if count % 5000 == 0:
                print(f" Pass 2: {count} items written ...")

    print(f" Pass 2 complete: {count} items written to {output_path}")
    return count
111
+
112
+
113
def pass3_generate_subsets(full_jsonl_path, output_dir, subsets):
    """Pass 3: read full.jsonl and write stratified random subsets.

    Every dataset contributes a share of each subset proportional to its size
    (at least one item, at most all of its items). The selection is then
    topped up or trimmed at random to hit the requested size, and the chosen
    lines are written in their original file order to
    ``<output_dir>/<subset_name>.jsonl``. Sampling uses a generator seeded
    with 42.

    Args:
        full_jsonl_path: Path of the JSONL file produced by pass 2.
        output_dir: Directory receiving the subset files.
        subsets: Mapping of subset name to requested item count.
    """
    # Keep every raw line plus, per dataset, the positions of its lines.
    rows_by_dataset = {}
    lines = []
    with open(full_jsonl_path, "r") as src:
        for position, raw in enumerate(src):
            lines.append(raw)
            dataset = json.loads(raw)["input"]["dataset_key"]
            rows_by_dataset.setdefault(dataset, []).append(position)

    total = len(lines)
    print(f" Pass 3: {total} total items across {len(rows_by_dataset)} datasets")

    rng = random.Random(42)

    for subset_name, requested in subsets.items():
        quota = min(requested, total)
        chosen = set()

        # Proportional draw from each dataset.
        for positions in rows_by_dataset.values():
            share = len(positions) / total
            take = min(max(1, round(share * quota)), len(positions))
            chosen.update(rng.sample(positions, take))

        # Rounding can leave the selection short or long; correct it.
        shortfall = quota - len(chosen)
        if shortfall > 0:
            unused = list(set(range(total)) - chosen)
            chosen.update(rng.sample(unused, min(shortfall, len(unused))))
        elif shortfall < 0:
            chosen = set(rng.sample(list(chosen), quota))

        # Emit in the order the lines appear in full.jsonl.
        ordered = sorted(chosen)
        destination = os.path.join(output_dir, f"{subset_name}.jsonl")
        with open(destination, "w") as sink:
            sink.writelines(lines[position] for position in ordered)

        print(f" {subset_name}.jsonl: {len(ordered)} items")
162
+
163
+
164
def main():
    """Command-line entry point: run the three conversion passes in order.

    Reads ``--input`` and ``--output-dir``, creates the ``benchmark_dqvis``
    directory under the output directory, then writes ``schemas.json``,
    ``full.jsonl`` and the ``small`` / ``medium`` / ``large`` subset files
    there, printing progress along the way.
    """
    cli = argparse.ArgumentParser(
        description="Convert benchmark JSON to compact JSONL format"
    )
    cli.add_argument(
        "--input",
        default="data/benchmark_dqvis.json",
        help="Path to the input benchmark JSON file (supports .gz)",
    )
    cli.add_argument(
        "--output-dir",
        default="data",
        help="Parent directory for output (default: data). Creates benchmark_dqvis/ subdirectory.",
    )
    options = cli.parse_args()

    # Everything is written beneath <output-dir>/benchmark_dqvis/.
    target_dir = os.path.join(options.output_dir, "benchmark_dqvis")
    os.makedirs(target_dir, exist_ok=True)
    schemas_file = os.path.join(target_dir, "schemas.json")
    full_file = os.path.join(target_dir, "full.jsonl")

    # Pass 1: one schema entry per dataset.
    print("Pass 1: Extracting unique schemas ...")
    schemas, _item_total = pass1_extract_schemas(options.input)

    with open(schemas_file, "w") as handle:
        json.dump(schemas, handle, separators=(",", ":"))
    schemas_bytes = os.path.getsize(schemas_file)
    print(f" schemas.json: {schemas_bytes / 1024:.0f} KB ({len(schemas)} datasets)")

    # Pass 2: the full compact JSONL.
    print("Pass 2: Writing compact JSONL ...")
    pass2_write_jsonl(options.input, full_file, schemas)
    full_bytes = os.path.getsize(full_file)
    print(f" full.jsonl: {full_bytes / 1024 / 1024:.0f} MB")

    # Pass 3: fixed-size stratified subsets.
    print("Pass 3: Generating subsets ...")
    subset_sizes = {"small": 100, "medium": 1000, "large": 10000}
    pass3_generate_subsets(full_file, target_dir, subset_sizes)

    print("\nDone! Output directory:", target_dir)
208
+
209
+
210
+ if __name__ == "__main__":
211
+ main()