openai_gabriel-1.0.1-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Files changed (67)
  1. gabriel/__init__.py +61 -0
  2. gabriel/_version.py +1 -0
  3. gabriel/api.py +2284 -0
  4. gabriel/cli/__main__.py +60 -0
  5. gabriel/core/__init__.py +7 -0
  6. gabriel/core/llm_client.py +34 -0
  7. gabriel/core/pipeline.py +18 -0
  8. gabriel/core/prompt_template.py +152 -0
  9. gabriel/prompts/__init__.py +1 -0
  10. gabriel/prompts/bucket_prompt.jinja2 +113 -0
  11. gabriel/prompts/classification_prompt.jinja2 +50 -0
  12. gabriel/prompts/codify_prompt.jinja2 +95 -0
  13. gabriel/prompts/comparison_prompt.jinja2 +60 -0
  14. gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
  15. gabriel/prompts/deidentification_prompt.jinja2 +112 -0
  16. gabriel/prompts/extraction_prompt.jinja2 +61 -0
  17. gabriel/prompts/filter_prompt.jinja2 +31 -0
  18. gabriel/prompts/ideation_prompt.jinja2 +80 -0
  19. gabriel/prompts/merge_prompt.jinja2 +47 -0
  20. gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
  21. gabriel/prompts/rankings_prompt.jinja2 +49 -0
  22. gabriel/prompts/ratings_prompt.jinja2 +50 -0
  23. gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
  24. gabriel/prompts/seed.jinja2 +43 -0
  25. gabriel/prompts/snippets.jinja2 +117 -0
  26. gabriel/tasks/__init__.py +63 -0
  27. gabriel/tasks/_attribute_utils.py +69 -0
  28. gabriel/tasks/bucket.py +432 -0
  29. gabriel/tasks/classify.py +562 -0
  30. gabriel/tasks/codify.py +1033 -0
  31. gabriel/tasks/compare.py +235 -0
  32. gabriel/tasks/debias.py +1460 -0
  33. gabriel/tasks/deduplicate.py +341 -0
  34. gabriel/tasks/deidentify.py +316 -0
  35. gabriel/tasks/discover.py +524 -0
  36. gabriel/tasks/extract.py +455 -0
  37. gabriel/tasks/filter.py +169 -0
  38. gabriel/tasks/ideate.py +782 -0
  39. gabriel/tasks/merge.py +464 -0
  40. gabriel/tasks/paraphrase.py +531 -0
  41. gabriel/tasks/rank.py +2041 -0
  42. gabriel/tasks/rate.py +347 -0
  43. gabriel/tasks/seed.py +465 -0
  44. gabriel/tasks/whatever.py +344 -0
  45. gabriel/utils/__init__.py +64 -0
  46. gabriel/utils/audio_utils.py +42 -0
  47. gabriel/utils/file_utils.py +464 -0
  48. gabriel/utils/image_utils.py +22 -0
  49. gabriel/utils/jinja.py +31 -0
  50. gabriel/utils/logging.py +86 -0
  51. gabriel/utils/mapmaker.py +304 -0
  52. gabriel/utils/media_utils.py +78 -0
  53. gabriel/utils/modality_utils.py +148 -0
  54. gabriel/utils/openai_utils.py +5470 -0
  55. gabriel/utils/parsing.py +282 -0
  56. gabriel/utils/passage_viewer.py +2557 -0
  57. gabriel/utils/pdf_utils.py +20 -0
  58. gabriel/utils/plot_utils.py +2881 -0
  59. gabriel/utils/prompt_utils.py +42 -0
  60. gabriel/utils/word_matching.py +158 -0
  61. openai_gabriel-1.0.1.dist-info/METADATA +443 -0
  62. openai_gabriel-1.0.1.dist-info/RECORD +67 -0
  63. openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
  64. openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
  65. openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
  66. openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
  67. openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/tasks/whatever.py
@@ -0,0 +1,344 @@
+ """Lightweight runner for arbitrary prompts via :func:`get_all_responses`."""
+
+ from __future__ import annotations
+
+ import hashlib
+ import os
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterable, List, Optional, Union
+
+ import pandas as pd
+
+ from ..utils import load_audio_inputs, load_image_inputs
+ from ..utils.openai_utils import get_all_responses, response_to_text
+ from ..utils.parsing import safe_json
+
+
+ @dataclass
+ class WhateverConfig:
+     """Configuration for :class:`Whatever`."""
+
+     save_dir: str = "whatever"
+     file_name: str = "custom_prompt_responses.csv"
+     model: str = "gpt-5-mini"
+     json_mode: bool = False
+     web_search: Optional[bool] = None
+     web_search_filters: Optional[Dict[str, Any]] = None
+     search_context_size: str = "medium"
+     n_parallels: int = 650
+     use_dummy: bool = False
+     reasoning_effort: Optional[str] = None
+     reasoning_summary: Optional[str] = None
+
+
+ class Whatever:
+     """Prepare prompts and dispatch them through :func:`get_all_responses`."""
+
+     def __init__(self, cfg: WhateverConfig) -> None:
+         expanded = os.path.expandvars(os.path.expanduser(cfg.save_dir))
+         os.makedirs(expanded, exist_ok=True)
+         cfg.save_dir = expanded
+         self.cfg = cfg
+
+     # ------------------------------------------------------------------
+     @staticmethod
+     def _generate_identifiers(
+         prompts: List[str], provided: Optional[List[str]] = None
+     ) -> List[str]:
+         if provided is not None:
+             if len(provided) != len(prompts):
+                 raise ValueError("Length of identifiers must match number of prompts")
+             ids = [str(i) for i in provided]
+             if len(set(ids)) != len(ids):
+                 raise ValueError("Identifiers must be unique")
+             return ids
+
+         counts: Dict[str, int] = {}
+         generated: List[str] = []
+         for prompt in prompts:
+             key = hashlib.sha1(prompt.encode("utf-8")).hexdigest()[:8]
+             idx = counts.get(key, 0)
+             counts[key] = idx + 1
+             ident = key if idx == 0 else f"{key}-{idx}"
+             generated.append(ident)
+         return generated
+
+     # ------------------------------------------------------------------
+     @staticmethod
+     def _is_missing(value: Any) -> bool:
+         if value is None:
+             return True
+         try:
+             res = pd.isna(value)
+         except Exception:
+             return False
+         try:
+             return bool(res)
+         except Exception:
+             return False
+
+     # ------------------------------------------------------------------
+     @staticmethod
+     def _coerce_domains(value: Any) -> List[str]:
+         if Whatever._is_missing(value):
+             return []
+         if isinstance(value, str):
+             return [part.strip() for part in value.split(",") if part.strip()]
+         if isinstance(value, dict):
+             return []
+         if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
+             domains = [str(item).strip() for item in value if str(item).strip()]
+             return domains
+         text = str(value).strip()
+         return [text] if text else []
+
+     # ------------------------------------------------------------------
+     @staticmethod
+     def extract_json(
+         df: pd.DataFrame,
+         *,
+         text_column: str = "Response",
+         target_column: str = "Response JSON",
+     ) -> pd.DataFrame:
+         """Return a copy of ``df`` with parsed JSON objects in ``target_column``."""
+
+         if text_column not in df.columns:
+             raise ValueError(
+                 f"Column '{text_column}' not found in DataFrame; available columns: {list(df.columns)}"
+             )
+
+         out = df.copy()
+
+         def _parse(value: Any) -> Optional[Union[dict, list]]:
+             text = response_to_text(value)
+             if not text:
+                 return None
+             parsed = safe_json(text)
+             return parsed if parsed else None
+
+         out[target_column] = out[text_column].apply(_parse)
+         return out
+
+     async def run(
+         self,
+         prompts: Optional[Union[str, List[str], pd.DataFrame]] = None,
+         *,
+         df: Optional[pd.DataFrame] = None,
+         identifiers: Optional[List[str]] = None,
+         column_name: Optional[str] = None,
+         identifier_column: Optional[str] = None,
+         image_column: Optional[str] = None,
+         audio_column: Optional[str] = None,
+         prompt_images: Optional[Dict[str, List[str]]] = None,
+         prompt_audio: Optional[Dict[str, List[Dict[str, str]]]] = None,
+         web_search_filters: Optional[Dict[str, Any]] = None,
+         reset_files: bool = False,
+         return_original_columns: bool = True,
+         drop_prompts: bool = True,
+         parse_json: Optional[bool] = None,
+         **kwargs: Any,
+     ) -> pd.DataFrame:
+         """Normalise inputs and call :func:`get_all_responses`.
+
+         Parameters
+         ----------
+         parse_json:
+             When ``True`` (the default while ``json_mode`` is enabled) the
+             returned DataFrame will include a ``Response JSON`` column with the
+             parsed structure for each response. Set to ``False`` to skip the
+             extra parsing step and keep the raw JSON text in the ``Response``
+             column.
+         return_original_columns:
+             When ``True`` and ``df`` is provided, merge response columns back
+             onto the original DataFrame using the same identifier strategy.
+         drop_prompts:
+             When ``True`` and merging back onto ``df``, drop the prompt column
+             before saving/returning the merged DataFrame.
+         """
+
+         filters_spec: Dict[str, Any] = dict(
+             web_search_filters
+             if web_search_filters is not None
+             else (self.cfg.web_search_filters or {})
+         )
+
+         df_filters: Optional[Dict[str, Dict[str, Any]]] = None
+         global_filters: Optional[Dict[str, Any]] = filters_spec or None
+
+         source_data = df if df is not None else prompts
+         if source_data is None:
+             raise ValueError("Either prompts or df must be provided to Whatever.run")
+
+         if isinstance(source_data, pd.DataFrame):
+             if column_name is None:
+                 raise ValueError("column_name must be provided when passing a DataFrame")
+             if column_name not in source_data.columns:
+                 raise ValueError(f"Column '{column_name}' not found in DataFrame")
+             df_input = source_data.reset_index(drop=True)
+             df_proc = df_input.copy()
+             prompt_series = df_input[column_name]
+             prompt_values = [
+                 "" if self._is_missing(val) else str(val)
+                 for val in prompt_series.tolist()
+             ]
+             if identifier_column is not None:
+                 if identifier_column not in df_input.columns:
+                     raise ValueError(
+                         f"Identifier column '{identifier_column}' not found in DataFrame"
+                     )
+                 identifiers_list = [str(i) for i in df_input[identifier_column].tolist()]
+                 if len(set(identifiers_list)) != len(identifiers_list):
+                     raise ValueError("identifier_column must contain unique values")
+             else:
+                 identifiers_list = self._generate_identifiers(prompt_values)
+
+             df_proc["_gid"] = identifiers_list
+
+             image_map: Dict[str, List[str]] = {}
+             if image_column is not None:
+                 if image_column not in df_input.columns:
+                     raise ValueError(
+                         f"Image column '{image_column}' not found in DataFrame"
+                     )
+                 for ident, cell in zip(identifiers_list, df_input[image_column]):
+                     imgs = load_image_inputs(cell)
+                     if imgs:
+                         image_map[str(ident)] = imgs
+
+             audio_map: Dict[str, List[Dict[str, str]]] = {}
+             if audio_column is not None:
+                 if audio_column not in df_input.columns:
+                     raise ValueError(
+                         f"Audio column '{audio_column}' not found in DataFrame"
+                     )
+                 for ident, cell in zip(identifiers_list, df_input[audio_column]):
+                     auds = load_audio_inputs(cell)
+                     if auds:
+                         audio_map[str(ident)] = auds
+
+             column_filters: Dict[str, str] = {}
+             base_filters: Dict[str, Any] = {}
+             for key, spec in filters_spec.items():
+                 if isinstance(spec, str) and spec in df_input.columns:
+                     column_filters[key] = spec
+                 elif key == "allowed_domains" and isinstance(spec, str) and spec in df_input.columns:
+                     column_filters[key] = spec
+                 else:
+                     base_filters[key] = spec
+
+             per_prompt_filters: Dict[str, Dict[str, Any]] = {}
+             if column_filters:
+                 for idx, ident in enumerate(identifiers_list):
+                     row = df_input.iloc[idx]
+                     row_filters: Dict[str, Any] = {}
+                     for key, col in column_filters.items():
+                         value = row.get(col)
+                         if self._is_missing(value):
+                             continue
+                         if key == "allowed_domains":
+                             domains = self._coerce_domains(value)
+                             if domains:
+                                 row_filters[key] = domains
+                         else:
+                             text = str(value).strip()
+                             if text:
+                                 row_filters[key] = text
+                     if row_filters:
+                         per_prompt_filters[str(ident)] = row_filters
+             df_filters = per_prompt_filters or None
+             global_filters = base_filters or None
+
+             prompts_list = prompt_values
+             df_source = df_proc
+         else:
+             if isinstance(source_data, str):
+                 prompts_list = [source_data]
+             else:
+                 prompts_list = [str(p) for p in source_data]
+             identifiers_list = self._generate_identifiers(
+                 prompts_list, identifiers
+             )
+             image_map = {}
+             audio_map = {}
+             df_source = None
+
+         if prompt_images:
+             if not isinstance(prompt_images, dict):
+                 raise TypeError("prompt_images must be a mapping of identifier to images")
+             for key, val in prompt_images.items():
+                 if val:
+                     image_map[str(key)] = val
+
+         if prompt_audio:
+             if not isinstance(prompt_audio, dict):
+                 raise TypeError("prompt_audio must be a mapping of identifier to audio payloads")
+             for key, val in prompt_audio.items():
+                 if val:
+                     audio_map[str(key)] = val
+
+         images_payload = image_map or None
+         audio_payload = audio_map or None
+
+         save_path = kwargs.pop(
+             "save_path", os.path.join(self.cfg.save_dir, self.cfg.file_name)
+         )
+
+         web_search_flag = (
+             self.cfg.web_search
+             if self.cfg.web_search is not None
+             else bool(global_filters or df_filters)
+         )
+
+         df_resp = await get_all_responses(
+             prompts=prompts_list,
+             identifiers=identifiers_list,
+             prompt_images=images_payload,
+             prompt_audio=audio_payload,
+             prompt_web_search_filters=df_filters,
+             save_path=save_path,
+             model=self.cfg.model,
+             json_mode=self.cfg.json_mode,
+             web_search=web_search_flag,
+             web_search_filters=global_filters,
+             search_context_size=self.cfg.search_context_size,
+             n_parallels=self.cfg.n_parallels,
+             use_dummy=self.cfg.use_dummy,
+             reset_files=reset_files,
+             reasoning_effort=self.cfg.reasoning_effort,
+             reasoning_summary=self.cfg.reasoning_summary,
+             **kwargs,
+         )
+         if not isinstance(df_resp, pd.DataFrame):
+             raise RuntimeError("get_all_responses returned no DataFrame")
+
+         df_clean = df_resp.copy()
+         if "Response" in df_clean.columns:
+             df_clean["Response"] = df_clean["Response"].apply(response_to_text)
+
+         if self.cfg.json_mode:
+             auto_parse = parse_json if parse_json is not None else True
+             if auto_parse:
+                 df_clean = self.extract_json(
+                     df_clean,
+                     text_column="Response",
+                     target_column="Response JSON",
+                 )
+                 print(
+                     "[Whatever] Parsed JSON output is available in the 'Response JSON' column."
+                 )
+             else:
+                 print(
+                     "[Whatever] JSON responses are stored as text in the 'Response' column. "
+                     "Call `Whatever.extract_json(df)` to parse them into structured objects."
+                 )
+
+         if df_source is not None and return_original_columns:
+             merged = df_source.merge(
+                 df_clean, left_on="_gid", right_on="Identifier", how="left"
+             ).drop(columns=["_gid"])
+             if drop_prompts and column_name and column_name in merged.columns:
+                 merged = merged.drop(columns=[column_name])
+             merged.to_csv(save_path, index=False)
+             return merged
+
+         return df_clean
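Editor's note: the snippet below is a hedged usage sketch of the new Whatever runner, not code from the package. The class names, config fields, and run() keywords come from the hunk above; the DataFrame contents, the save directory, and the assumption that use_dummy=True stands in for real API calls are illustrative only.

# Hedged usage sketch (not part of the diff): drive Whatever.run with a
# DataFrame input. Column names and data are made up for illustration.
import asyncio

import pandas as pd

from gabriel.tasks.whatever import Whatever, WhateverConfig

df = pd.DataFrame(
    {
        "qid": ["q1", "q2"],
        "question": [
            "Summarise this release in one sentence.",
            "List the utilities exported by gabriel.utils.",
        ],
    }
)

cfg = WhateverConfig(save_dir="whatever_demo", json_mode=True, use_dummy=True)
runner = Whatever(cfg)

# run() is a coroutine, so drive it with asyncio.run(). With json_mode=True and
# the default parse_json, the result should carry a "Response JSON" column
# alongside "Response", merged back onto the original columns via "qid".
result = asyncio.run(
    runner.run(df=df, column_name="question", identifier_column="qid")
)
print(result.columns.tolist())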
gabriel/utils/__init__.py
@@ -0,0 +1,64 @@
+ """Utility helpers for GABRIEL."""
+
+ from .openai_utils import (
+     get_response,
+     get_all_responses,
+     get_embedding,
+     get_all_embeddings,
+     DummyResponseSpec,
+ )
+ from .image_utils import encode_image
+ from .audio_utils import encode_audio
+ from .media_utils import load_image_inputs, load_audio_inputs, load_pdf_inputs
+ from .pdf_utils import encode_pdf
+ from .logging import get_logger, set_log_level
+ from .mapmaker import MapMaker, create_county_choropleth
+ from .parsing import safe_json, safest_json, clean_json_df
+ from .jinja import shuffled, shuffled_dict, get_env
+ from .passage_viewer import PassageViewer, view
+ from .word_matching import (
+     normalize_text_aggressive,
+     normalize_text_generous,
+     normalize_whitespace,
+     letters_only,
+     robust_find_improved,
+     strict_find,
+ )
+ from .prompt_utils import swap_circle_square
+ from .modality_utils import warn_if_modality_mismatch
+ from .file_utils import load
+
+ __all__ = [
+     "get_response",
+     "get_all_responses",
+     "get_embedding",
+     "get_all_embeddings",
+     "DummyResponseSpec",
+     "get_logger",
+     "set_log_level",
+     "MapMaker",
+     "create_county_choropleth",
+     "safe_json",
+     "safest_json",
+     "clean_json_df",
+     "encode_image",
+     "encode_audio",
+     "encode_pdf",
+     "load_image_inputs",
+     "load_audio_inputs",
+     "load_pdf_inputs",
+     "shuffled",
+     "shuffled_dict",
+     "get_env",
+     "normalize_text_aggressive",
+     "normalize_text_generous",
+     "normalize_whitespace",
+     "letters_only",
+     "robust_find_improved",
+     "strict_find",
+     "PassageViewer",
+     "view",
+     "swap_circle_square",
+     "warn_if_modality_mismatch",
+     "load",
+ ]
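Editor's note: a minimal sketch of the flat import surface this __init__ exposes, for orientation only. The JSON string is made up, and the assumption that safe_json returns the parsed object (or a falsy value on failure) is inferred from its use in Whatever.extract_json above.

# Illustrative only: helpers are re-exported at the package level, so callers
# can import them from gabriel.utils instead of the individual submodules.
from gabriel.utils import safe_json, encode_audio, get_all_responses  # noqa: F401

parsed = safe_json('{"verdict": "keep", "confidence": 0.9}')
print(parsed["verdict"] if parsed else "unparseable")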
gabriel/utils/audio_utils.py
@@ -0,0 +1,42 @@
+ """Utility helpers for working with audio files."""
+
+ from __future__ import annotations
+
+ import base64
+ from pathlib import Path
+ from typing import Dict, Optional
+
+
+ def encode_audio(audio_path: str) -> Optional[Dict[str, str]]:
+     """Return the audio at ``audio_path`` as a dict suitable for the OpenAI API.
+
+     The returned dictionary has two keys:
+
+     ``data``
+         Base64 encoded contents of the file.
+
+     ``format``
+         The lowercase file extension (e.g. ``"mp3"``, ``"wav"``). If the
+         extension cannot be determined ``"wav"`` is used as a fallback.
+
+     Parameters
+     ----------
+     audio_path:
+         Path to the audio file to encode.
+
+     Returns
+     -------
+     dict or None
+         A mapping with ``data`` and ``format`` keys, or ``None`` if reading the
+         file fails.
+     """
+
+     try:
+         path = Path(audio_path)
+         with path.open("rb") as f:
+             b64 = base64.b64encode(f.read()).decode("utf-8")
+         ext = path.suffix.lstrip(".").lower() or "wav"
+         return {"data": b64, "format": ext}
+     except Exception:
+         return None
+
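Editor's note: a small sketch of how encode_audio's return value could feed Whatever.run(prompt_audio=...). The file path and identifier are placeholders, and the payload shape is inferred from the prompt_audio type hint (Dict[str, List[Dict[str, str]]]) in the hunk above rather than stated by the package.

# Hedged sketch: build an audio payload from a local file. "clip.mp3" and the
# identifier "my-prompt" are placeholders; nothing here is prescribed code.
from gabriel.utils import encode_audio

payload = encode_audio("clip.mp3")
if payload is None:
    print("could not read the audio file")
else:
    print(payload["format"])             # e.g. "mp3"
    print(len(payload["data"]))          # size of the base64-encoded data
    prompt_audio = {"my-prompt": [payload]}  # shape matching the type hint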