openai-gabriel 1.0.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gabriel/__init__.py +61 -0
- gabriel/_version.py +1 -0
- gabriel/api.py +2284 -0
- gabriel/cli/__main__.py +60 -0
- gabriel/core/__init__.py +7 -0
- gabriel/core/llm_client.py +34 -0
- gabriel/core/pipeline.py +18 -0
- gabriel/core/prompt_template.py +152 -0
- gabriel/prompts/__init__.py +1 -0
- gabriel/prompts/bucket_prompt.jinja2 +113 -0
- gabriel/prompts/classification_prompt.jinja2 +50 -0
- gabriel/prompts/codify_prompt.jinja2 +95 -0
- gabriel/prompts/comparison_prompt.jinja2 +60 -0
- gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
- gabriel/prompts/deidentification_prompt.jinja2 +112 -0
- gabriel/prompts/extraction_prompt.jinja2 +61 -0
- gabriel/prompts/filter_prompt.jinja2 +31 -0
- gabriel/prompts/ideation_prompt.jinja2 +80 -0
- gabriel/prompts/merge_prompt.jinja2 +47 -0
- gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
- gabriel/prompts/rankings_prompt.jinja2 +49 -0
- gabriel/prompts/ratings_prompt.jinja2 +50 -0
- gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
- gabriel/prompts/seed.jinja2 +43 -0
- gabriel/prompts/snippets.jinja2 +117 -0
- gabriel/tasks/__init__.py +63 -0
- gabriel/tasks/_attribute_utils.py +69 -0
- gabriel/tasks/bucket.py +432 -0
- gabriel/tasks/classify.py +562 -0
- gabriel/tasks/codify.py +1033 -0
- gabriel/tasks/compare.py +235 -0
- gabriel/tasks/debias.py +1460 -0
- gabriel/tasks/deduplicate.py +341 -0
- gabriel/tasks/deidentify.py +316 -0
- gabriel/tasks/discover.py +524 -0
- gabriel/tasks/extract.py +455 -0
- gabriel/tasks/filter.py +169 -0
- gabriel/tasks/ideate.py +782 -0
- gabriel/tasks/merge.py +464 -0
- gabriel/tasks/paraphrase.py +531 -0
- gabriel/tasks/rank.py +2041 -0
- gabriel/tasks/rate.py +347 -0
- gabriel/tasks/seed.py +465 -0
- gabriel/tasks/whatever.py +344 -0
- gabriel/utils/__init__.py +64 -0
- gabriel/utils/audio_utils.py +42 -0
- gabriel/utils/file_utils.py +464 -0
- gabriel/utils/image_utils.py +22 -0
- gabriel/utils/jinja.py +31 -0
- gabriel/utils/logging.py +86 -0
- gabriel/utils/mapmaker.py +304 -0
- gabriel/utils/media_utils.py +78 -0
- gabriel/utils/modality_utils.py +148 -0
- gabriel/utils/openai_utils.py +5470 -0
- gabriel/utils/parsing.py +282 -0
- gabriel/utils/passage_viewer.py +2557 -0
- gabriel/utils/pdf_utils.py +20 -0
- gabriel/utils/plot_utils.py +2881 -0
- gabriel/utils/prompt_utils.py +42 -0
- gabriel/utils/word_matching.py +158 -0
- openai_gabriel-1.0.1.dist-info/METADATA +443 -0
- openai_gabriel-1.0.1.dist-info/RECORD +67 -0
- openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
- openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
- openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
- openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
- openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/tasks/whatever.py
@@ -0,0 +1,344 @@
"""Lightweight runner for arbitrary prompts via :func:`get_all_responses`."""

from __future__ import annotations

import hashlib
import os
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Union

import pandas as pd

from ..utils import load_audio_inputs, load_image_inputs
from ..utils.openai_utils import get_all_responses, response_to_text
from ..utils.parsing import safe_json


@dataclass
class WhateverConfig:
    """Configuration for :class:`Whatever`."""

    save_dir: str = "whatever"
    file_name: str = "custom_prompt_responses.csv"
    model: str = "gpt-5-mini"
    json_mode: bool = False
    web_search: Optional[bool] = None
    web_search_filters: Optional[Dict[str, Any]] = None
    search_context_size: str = "medium"
    n_parallels: int = 650
    use_dummy: bool = False
    reasoning_effort: Optional[str] = None
    reasoning_summary: Optional[str] = None


class Whatever:
    """Prepare prompts and dispatch them through :func:`get_all_responses`."""

    def __init__(self, cfg: WhateverConfig) -> None:
        expanded = os.path.expandvars(os.path.expanduser(cfg.save_dir))
        os.makedirs(expanded, exist_ok=True)
        cfg.save_dir = expanded
        self.cfg = cfg

    # ------------------------------------------------------------------
    @staticmethod
    def _generate_identifiers(
        prompts: List[str], provided: Optional[List[str]] = None
    ) -> List[str]:
        if provided is not None:
            if len(provided) != len(prompts):
                raise ValueError("Length of identifiers must match number of prompts")
            ids = [str(i) for i in provided]
            if len(set(ids)) != len(ids):
                raise ValueError("Identifiers must be unique")
            return ids

        counts: Dict[str, int] = {}
        generated: List[str] = []
        for prompt in prompts:
            key = hashlib.sha1(prompt.encode("utf-8")).hexdigest()[:8]
            idx = counts.get(key, 0)
            counts[key] = idx + 1
            ident = key if idx == 0 else f"{key}-{idx}"
            generated.append(ident)
        return generated

    # ------------------------------------------------------------------
    @staticmethod
    def _is_missing(value: Any) -> bool:
        if value is None:
            return True
        try:
            res = pd.isna(value)
        except Exception:
            return False
        try:
            return bool(res)
        except Exception:
            return False

    # ------------------------------------------------------------------
    @staticmethod
    def _coerce_domains(value: Any) -> List[str]:
        if Whatever._is_missing(value):
            return []
        if isinstance(value, str):
            return [part.strip() for part in value.split(",") if part.strip()]
        if isinstance(value, dict):
            return []
        if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
            domains = [str(item).strip() for item in value if str(item).strip()]
            return domains
        text = str(value).strip()
        return [text] if text else []

    # ------------------------------------------------------------------
    @staticmethod
    def extract_json(
        df: pd.DataFrame,
        *,
        text_column: str = "Response",
        target_column: str = "Response JSON",
    ) -> pd.DataFrame:
        """Return a copy of ``df`` with parsed JSON objects in ``target_column``."""

        if text_column not in df.columns:
            raise ValueError(
                f"Column '{text_column}' not found in DataFrame; available columns: {list(df.columns)}"
            )

        out = df.copy()

        def _parse(value: Any) -> Optional[Union[dict, list]]:
            text = response_to_text(value)
            if not text:
                return None
            parsed = safe_json(text)
            return parsed if parsed else None

        out[target_column] = out[text_column].apply(_parse)
        return out

    async def run(
        self,
        prompts: Optional[Union[str, List[str], pd.DataFrame]] = None,
        *,
        df: Optional[pd.DataFrame] = None,
        identifiers: Optional[List[str]] = None,
        column_name: Optional[str] = None,
        identifier_column: Optional[str] = None,
        image_column: Optional[str] = None,
        audio_column: Optional[str] = None,
        prompt_images: Optional[Dict[str, List[str]]] = None,
        prompt_audio: Optional[Dict[str, List[Dict[str, str]]]] = None,
        web_search_filters: Optional[Dict[str, Any]] = None,
        reset_files: bool = False,
        return_original_columns: bool = True,
        drop_prompts: bool = True,
        parse_json: Optional[bool] = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Normalise inputs and call :func:`get_all_responses`.

        Parameters
        ----------
        parse_json:
            When ``True`` (the default while ``json_mode`` is enabled) the
            returned DataFrame will include a ``Response JSON`` column with the
            parsed structure for each response. Set to ``False`` to skip the
            extra parsing step and keep the raw JSON text in the ``Response``
            column.
        return_original_columns:
            When ``True`` and ``df`` is provided, merge response columns back
            onto the original DataFrame using the same identifier strategy.
        drop_prompts:
            When ``True`` and merging back onto ``df``, drop the prompt column
            before saving/returning the merged DataFrame.
        """

        filters_spec: Dict[str, Any] = dict(
            web_search_filters
            if web_search_filters is not None
            else (self.cfg.web_search_filters or {})
        )

        df_filters: Optional[Dict[str, Dict[str, Any]]] = None
        global_filters: Optional[Dict[str, Any]] = filters_spec or None

        source_data = df if df is not None else prompts
        if source_data is None:
            raise ValueError("Either prompts or df must be provided to Whatever.run")

        if isinstance(source_data, pd.DataFrame):
            if column_name is None:
                raise ValueError("column_name must be provided when passing a DataFrame")
            if column_name not in source_data.columns:
                raise ValueError(f"Column '{column_name}' not found in DataFrame")
            df_input = source_data.reset_index(drop=True)
            df_proc = df_input.copy()
            prompt_series = df_input[column_name]
            prompt_values = [
                "" if self._is_missing(val) else str(val)
                for val in prompt_series.tolist()
            ]
            if identifier_column is not None:
                if identifier_column not in df_input.columns:
                    raise ValueError(
                        f"Identifier column '{identifier_column}' not found in DataFrame"
                    )
                identifiers_list = [str(i) for i in df_input[identifier_column].tolist()]
                if len(set(identifiers_list)) != len(identifiers_list):
                    raise ValueError("identifier_column must contain unique values")
            else:
                identifiers_list = self._generate_identifiers(prompt_values)

            df_proc["_gid"] = identifiers_list

            image_map: Dict[str, List[str]] = {}
            if image_column is not None:
                if image_column not in df_input.columns:
                    raise ValueError(
                        f"Image column '{image_column}' not found in DataFrame"
                    )
                for ident, cell in zip(identifiers_list, df_input[image_column]):
                    imgs = load_image_inputs(cell)
                    if imgs:
                        image_map[str(ident)] = imgs

            audio_map: Dict[str, List[Dict[str, str]]] = {}
            if audio_column is not None:
                if audio_column not in df_input.columns:
                    raise ValueError(
                        f"Audio column '{audio_column}' not found in DataFrame"
                    )
                for ident, cell in zip(identifiers_list, df_input[audio_column]):
                    auds = load_audio_inputs(cell)
                    if auds:
                        audio_map[str(ident)] = auds

            column_filters: Dict[str, str] = {}
            base_filters: Dict[str, Any] = {}
            for key, spec in filters_spec.items():
                if isinstance(spec, str) and spec in df_input.columns:
                    column_filters[key] = spec
                elif key == "allowed_domains" and isinstance(spec, str) and spec in df_input.columns:
                    column_filters[key] = spec
                else:
                    base_filters[key] = spec

            per_prompt_filters: Dict[str, Dict[str, Any]] = {}
            if column_filters:
                for idx, ident in enumerate(identifiers_list):
                    row = df_input.iloc[idx]
                    row_filters: Dict[str, Any] = {}
                    for key, col in column_filters.items():
                        value = row.get(col)
                        if self._is_missing(value):
                            continue
                        if key == "allowed_domains":
                            domains = self._coerce_domains(value)
                            if domains:
                                row_filters[key] = domains
                        else:
                            text = str(value).strip()
                            if text:
                                row_filters[key] = text
                    if row_filters:
                        per_prompt_filters[str(ident)] = row_filters
                df_filters = per_prompt_filters or None
                global_filters = base_filters or None

            prompts_list = prompt_values
            df_source = df_proc
        else:
            if isinstance(source_data, str):
                prompts_list = [source_data]
            else:
                prompts_list = [str(p) for p in source_data]
            identifiers_list = self._generate_identifiers(
                prompts_list, identifiers
            )
            image_map = {}
            audio_map = {}
            df_source = None

        if prompt_images:
            if not isinstance(prompt_images, dict):
                raise TypeError("prompt_images must be a mapping of identifier to images")
            for key, val in prompt_images.items():
                if val:
                    image_map[str(key)] = val

        if prompt_audio:
            if not isinstance(prompt_audio, dict):
                raise TypeError("prompt_audio must be a mapping of identifier to audio payloads")
            for key, val in prompt_audio.items():
                if val:
                    audio_map[str(key)] = val

        images_payload = image_map or None
        audio_payload = audio_map or None

        save_path = kwargs.pop(
            "save_path", os.path.join(self.cfg.save_dir, self.cfg.file_name)
        )

        web_search_flag = (
            self.cfg.web_search
            if self.cfg.web_search is not None
            else bool(global_filters or df_filters)
        )

        df_resp = await get_all_responses(
            prompts=prompts_list,
            identifiers=identifiers_list,
            prompt_images=images_payload,
            prompt_audio=audio_payload,
            prompt_web_search_filters=df_filters,
            save_path=save_path,
            model=self.cfg.model,
            json_mode=self.cfg.json_mode,
            web_search=web_search_flag,
            web_search_filters=global_filters,
            search_context_size=self.cfg.search_context_size,
            n_parallels=self.cfg.n_parallels,
            use_dummy=self.cfg.use_dummy,
            reset_files=reset_files,
            reasoning_effort=self.cfg.reasoning_effort,
            reasoning_summary=self.cfg.reasoning_summary,
            **kwargs,
        )
        if not isinstance(df_resp, pd.DataFrame):
            raise RuntimeError("get_all_responses returned no DataFrame")

        df_clean = df_resp.copy()
        if "Response" in df_clean.columns:
            df_clean["Response"] = df_clean["Response"].apply(response_to_text)

        if self.cfg.json_mode:
            auto_parse = parse_json if parse_json is not None else True
            if auto_parse:
                df_clean = self.extract_json(
                    df_clean,
                    text_column="Response",
                    target_column="Response JSON",
                )
                print(
                    "[Whatever] Parsed JSON output is available in the 'Response JSON' column."
                )
            else:
                print(
                    "[Whatever] JSON responses are stored as text in the 'Response' column. "
                    "Call `Whatever.extract_json(df)` to parse them into structured objects."
                )

        if df_source is not None and return_original_columns:
            merged = df_source.merge(
                df_clean, left_on="_gid", right_on="Identifier", how="left"
            ).drop(columns=["_gid"])
            if drop_prompts and column_name and column_name in merged.columns:
                merged = merged.drop(columns=[column_name])
            merged.to_csv(save_path, index=False)
            return merged

        return df_clean
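The run() coroutine above normalises string, list, or DataFrame input, builds identifier and media maps, then awaits get_all_responses and merges the responses back onto the source rows. For orientation, a minimal usage sketch (illustrative only, not part of the packaged files): it assumes an asyncio entry point, made-up prompt text and column names, that use_dummy=True stands in for real API calls, and that get_all_responses yields the Identifier and Response columns the merge logic expects.

import asyncio

import pandas as pd

from gabriel.tasks.whatever import Whatever, WhateverConfig

# Hypothetical input: one prompt per row in a "prompt" column.
df = pd.DataFrame({"prompt": ["Summarise this passage.", "List three key risks."]})

cfg = WhateverConfig(save_dir="whatever_demo", model="gpt-5-mini", use_dummy=True)
task = Whatever(cfg)

# run() is async; with a DataFrame, column_name selects the prompt column.
# By default the responses are merged back onto df, the prompt column is dropped,
# and the merged frame is written to <save_dir>/<file_name>.
result = asyncio.run(task.run(df=df, column_name="prompt"))
print(result[["Identifier", "Response"]].head())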
gabriel/utils/__init__.py
@@ -0,0 +1,64 @@
"""Utility helpers for GABRIEL."""

from .openai_utils import (
    get_response,
    get_all_responses,
    get_embedding,
    get_all_embeddings,
    DummyResponseSpec,
)
from .image_utils import encode_image
from .audio_utils import encode_audio
from .media_utils import load_image_inputs, load_audio_inputs, load_pdf_inputs
from .pdf_utils import encode_pdf
from .logging import get_logger, set_log_level
from .mapmaker import MapMaker, create_county_choropleth
from .parsing import safe_json, safest_json, clean_json_df
from .jinja import shuffled, shuffled_dict, get_env
from .passage_viewer import PassageViewer, view
from .word_matching import (
    normalize_text_aggressive,
    normalize_text_generous,
    normalize_whitespace,
    letters_only,
    robust_find_improved,
    strict_find,
)
from .prompt_utils import swap_circle_square
from .modality_utils import warn_if_modality_mismatch
from .file_utils import load

__all__ = [
    "get_response",
    "get_all_responses",
    "get_embedding",
    "get_all_embeddings",
    "DummyResponseSpec",
    "get_logger",
    "set_log_level",
    "MapMaker",
    "create_county_choropleth",
    "safe_json",
    "safest_json",
    "clean_json_df",
    "encode_image",
    "encode_audio",
    "encode_pdf",
    "load_image_inputs",
    "load_audio_inputs",
    "load_pdf_inputs",
    "shuffled",
    "shuffled_dict",
    "get_env",
    "normalize_text_aggressive",
    "normalize_text_generous",
    "normalize_whitespace",
    "letters_only",
    "robust_find_improved",
    "strict_find",
    "PassageViewer",
    "view",
    "swap_circle_square",
    "warn_if_modality_mismatch",
    "load",
]
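Because this __init__ re-exports the helpers listed in __all__, callers can presumably import them straight from gabriel.utils rather than from the individual submodules. A short sketch (the JSON string is illustrative, and safe_json appears to return the parsed object when the text is valid JSON, based on its use in whatever.py above):

from gabriel.utils import safe_json, load_image_inputs, encode_audio

# Parse a JSON string into a Python object; falsy/None on failure.
parsed = safe_json('{"answer": 42}')
print(parsed)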
gabriel/utils/audio_utils.py
@@ -0,0 +1,42 @@
"""Utility helpers for working with audio files."""

from __future__ import annotations

import base64
from pathlib import Path
from typing import Dict, Optional


def encode_audio(audio_path: str) -> Optional[Dict[str, str]]:
    """Return the audio at ``audio_path`` as a dict suitable for the OpenAI API.

    The returned dictionary has two keys:

    ``data``
        Base64 encoded contents of the file.

    ``format``
        The lowercase file extension (e.g. ``"mp3"``, ``"wav"``). If the
        extension cannot be determined ``"wav"`` is used as a fallback.

    Parameters
    ----------
    audio_path:
        Path to the audio file to encode.

    Returns
    -------
    dict or None
        A mapping with ``data`` and ``format`` keys, or ``None`` if reading the
        file fails.
    """

    try:
        path = Path(audio_path)
        with path.open("rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        ext = path.suffix.lstrip(".").lower() or "wav"
        return {"data": b64, "format": ext}
    except Exception:
        return None
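A small sketch of encode_audio in use; "clip.mp3" is a hypothetical local file, and as shown above the helper returns None instead of raising when the file cannot be read.

from gabriel.utils.audio_utils import encode_audio

payload = encode_audio("clip.mp3")  # hypothetical path
if payload is None:
    print("Could not read the audio file.")
else:
    # payload["data"] holds the base64-encoded bytes; payload["format"] is "mp3" here.
    print(payload["format"], len(payload["data"]))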