openai-gabriel 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gabriel/__init__.py +61 -0
- gabriel/_version.py +1 -0
- gabriel/api.py +2284 -0
- gabriel/cli/__main__.py +60 -0
- gabriel/core/__init__.py +7 -0
- gabriel/core/llm_client.py +34 -0
- gabriel/core/pipeline.py +18 -0
- gabriel/core/prompt_template.py +152 -0
- gabriel/prompts/__init__.py +1 -0
- gabriel/prompts/bucket_prompt.jinja2 +113 -0
- gabriel/prompts/classification_prompt.jinja2 +50 -0
- gabriel/prompts/codify_prompt.jinja2 +95 -0
- gabriel/prompts/comparison_prompt.jinja2 +60 -0
- gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
- gabriel/prompts/deidentification_prompt.jinja2 +112 -0
- gabriel/prompts/extraction_prompt.jinja2 +61 -0
- gabriel/prompts/filter_prompt.jinja2 +31 -0
- gabriel/prompts/ideation_prompt.jinja2 +80 -0
- gabriel/prompts/merge_prompt.jinja2 +47 -0
- gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
- gabriel/prompts/rankings_prompt.jinja2 +49 -0
- gabriel/prompts/ratings_prompt.jinja2 +50 -0
- gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
- gabriel/prompts/seed.jinja2 +43 -0
- gabriel/prompts/snippets.jinja2 +117 -0
- gabriel/tasks/__init__.py +63 -0
- gabriel/tasks/_attribute_utils.py +69 -0
- gabriel/tasks/bucket.py +432 -0
- gabriel/tasks/classify.py +562 -0
- gabriel/tasks/codify.py +1033 -0
- gabriel/tasks/compare.py +235 -0
- gabriel/tasks/debias.py +1460 -0
- gabriel/tasks/deduplicate.py +341 -0
- gabriel/tasks/deidentify.py +316 -0
- gabriel/tasks/discover.py +524 -0
- gabriel/tasks/extract.py +455 -0
- gabriel/tasks/filter.py +169 -0
- gabriel/tasks/ideate.py +782 -0
- gabriel/tasks/merge.py +464 -0
- gabriel/tasks/paraphrase.py +531 -0
- gabriel/tasks/rank.py +2041 -0
- gabriel/tasks/rate.py +347 -0
- gabriel/tasks/seed.py +465 -0
- gabriel/tasks/whatever.py +344 -0
- gabriel/utils/__init__.py +64 -0
- gabriel/utils/audio_utils.py +42 -0
- gabriel/utils/file_utils.py +464 -0
- gabriel/utils/image_utils.py +22 -0
- gabriel/utils/jinja.py +31 -0
- gabriel/utils/logging.py +86 -0
- gabriel/utils/mapmaker.py +304 -0
- gabriel/utils/media_utils.py +78 -0
- gabriel/utils/modality_utils.py +148 -0
- gabriel/utils/openai_utils.py +5470 -0
- gabriel/utils/parsing.py +282 -0
- gabriel/utils/passage_viewer.py +2557 -0
- gabriel/utils/pdf_utils.py +20 -0
- gabriel/utils/plot_utils.py +2881 -0
- gabriel/utils/prompt_utils.py +42 -0
- gabriel/utils/word_matching.py +158 -0
- openai_gabriel-1.0.1.dist-info/METADATA +443 -0
- openai_gabriel-1.0.1.dist-info/RECORD +67 -0
- openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
- openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
- openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
- openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
- openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/api.py
ADDED
@@ -0,0 +1,2284 @@
import os
import warnings
import pandas as pd
from typing import Awaitable, Callable, Dict, Optional, Union, Any, List, Mapping, Sequence

from .tasks import (
    Rate,
    RateConfig,
    Classify,
    ClassifyConfig,
    Rank,
    RankConfig,
    Deidentifier,
    DeidentifyConfig,
    Codify,
    CodifyConfig,
    Extract,
    ExtractConfig,
    Paraphrase,
    ParaphraseConfig,
    Compare,
    CompareConfig,
    Merge,
    MergeConfig,
    Deduplicate,
    DeduplicateConfig,
    Bucket,
    BucketConfig,
    Discover,
    DiscoverConfig,
    Seed,
    SeedConfig,
    Filter,
    FilterConfig,
    Whatever,
    WhateverConfig,
    Ideate,
    IdeateConfig,
)
from .utils.openai_utils import get_all_responses
from .utils.passage_viewer import view as _view_passages
from .tasks.debias import (
    DebiasConfig,
    DebiasPipeline,
    DebiasResult,
    MeasurementMode,
    RemovalMethod,
)

__all__ = [
    "rate",
    "extract",
    "seed",
    "classify",
    "ideate",
    "id8",
    "deidentify",
    "rank",
    "codify",
    "paraphrase",
    "compare",
    "bucket",
    "discover",
    "deduplicate",
    "merge",
    "filter",
    "debias",
    "whatever",
    "view",
]

async def rate(
    df: pd.DataFrame,
    column_name: str,
    *,
    attributes: Dict[str, str],
    save_dir: str,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    n_runs: int = 1,
    n_attributes_per_run: int = 8,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "ratings.csv",
    modality: str = "text",
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    search_context_size: str = "medium",
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Asks GPT to score each text / image / audio / pdf / item on natural language attributes. Output = 0-100 rating.

    Example Use
    -----------
    Measure "populist rhetoric" in a speech; "toxicity" of tweets; "luxury" in ad images.

    Parameters
    ----------
    df:
        Source DataFrame containing the passages to rate.
    column_name:
        Column in ``df`` that holds the passages (text, image, audio, or PDF
        references depending on ``modality``).
    attributes:
        Mapping of attribute names to natural-language descriptions that the
        model should evaluate on a 0–100 scale.
    save_dir:
        Directory where raw responses and the aggregated ratings CSV are
        written. Created if it does not exist.
    additional_instructions:
        Optional extra guidance injected into the prompt template.
    model:
        Model name passed through to the OpenAI Responses API.
    n_parallels:
        Maximum number of concurrent requests to issue.
    n_runs:
        Number of repeat rating passes to perform for each passage.
    n_attributes_per_run:
        Maximum number of attributes to include in a single prompt. Attributes
        are split across prompts when this limit is exceeded.
    reset_files:
        When ``True`` existing outputs in ``save_dir`` are ignored and
        regenerated.
    use_dummy:
        If ``True`` use deterministic dummy responses for offline testing.
    file_name:
        Basename (without the automatic ``_raw_responses`` suffix) for saved
        artifacts.
    modality:
        One of ``"text"``, ``"entity"``, ``"web"``, ``"image"``, ``"audio"``, or ``"pdf"``
        to control how inputs are packaged into prompts.
    reasoning_effort, reasoning_summary:
        Optional OpenAI metadata that tunes reasoning depth and summary capture.
    search_context_size:
        Size hint forwarded to web-search capable models.
    template_path:
        Override the default rating prompt template with a custom Jinja2 file.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional overrides applied to :class:`gabriel.tasks.rate.RateConfig`.

    Returns
    -------
    pandas.DataFrame
        Input DataFrame with one column per attribute containing the mean score
        across runs.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = RateConfig(
        attributes=attributes,
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        n_runs=n_runs,
        n_attributes_per_run=n_attributes_per_run,
        use_dummy=use_dummy,
        additional_instructions=additional_instructions,
        modality=modality,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        search_context_size=search_context_size,
        **cfg_kwargs,
    )
    return await Rate(cfg, template_path=template_path).run(
        df,
        column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )

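A minimal usage sketch for rate() (illustrative, not part of api.py): the DataFrame, attribute definition, and save_dir below are made up, and use_dummy=True exercises the offline path the docstring describes.

import asyncio
import pandas as pd
from gabriel.api import rate

df = pd.DataFrame({"text": ["Cut taxes for the people!", "A quiet budget update."]})

ratings = asyncio.run(
    rate(
        df,
        "text",
        attributes={"populist rhetoric": "Appeals to 'the people' against elites."},
        save_dir="~/gabriel_runs/populism",  # expanded by the function itself
        use_dummy=True,  # deterministic dummy responses; drop to call the real model
    )
)
print(ratings.head())
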
async def extract(
    df: pd.DataFrame,
    column_name: str,
    *,
    attributes: Dict[str, str],
    save_dir: str,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    n_runs: int = 1,
    n_attributes_per_run: int = 8,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "extraction.csv",
    modality: str = "entity",
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    types: Optional[Dict[str, Any]] = None,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Structured fact extraction on each item. Output = string / numeric values.

    Example Use
    -----------
    For each product, provide the "company", "CEO", and "year of invention".

    Parameters
    ----------
    df:
        Source DataFrame containing the passages to parse.
    column_name:
        Column in ``df`` with the content to extract from.
    attributes:
        Mapping of field names to descriptions of what should be extracted.
    save_dir:
        Directory where extraction outputs will be written. Created if absent.
    additional_instructions:
        Optional extra guidance injected into the extraction prompt.
    model:
        Model used for extraction via the OpenAI Responses API.
    n_parallels:
        Maximum number of concurrent extraction calls.
    n_runs:
        Number of extraction passes to perform; results are averaged when
        applicable.
    n_attributes_per_run:
        Maximum number of attributes to include in each prompt. Attributes are
        split into multiple prompts when this threshold is exceeded.
    reset_files:
        When ``True`` forces regeneration of outputs in ``save_dir``.
    use_dummy:
        If ``True`` return deterministic dummy outputs instead of real API
        calls.
    file_name:
        CSV name used when saving extraction results.
    modality:
        Indicates whether the content is ``"entity"`` text or another modality
        supported by the templates.
    reasoning_effort, reasoning_summary:
        Optional OpenAI metadata for reasoning depth and summarisation.
    types:
        Optional mapping of attribute names to explicit Python types for
        stronger downstream typing.
    template_path:
        Custom Jinja2 template path to override the default extraction prompt.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional overrides forwarded to :class:`gabriel.tasks.extract.ExtractConfig`.

    Returns
    -------
    pandas.DataFrame
        The original DataFrame augmented with one column per requested
        attribute.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = ExtractConfig(
        attributes=attributes,
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        n_runs=n_runs,
        n_attributes_per_run=n_attributes_per_run,
        use_dummy=use_dummy,
        additional_instructions=additional_instructions,
        modality=modality,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        **cfg_kwargs,
    )
    return await Extract(cfg, template_path=template_path).run(
        df,
        column_name,
        reset_files=reset_files,
        types=types,
    )

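A short extract() sketch (illustrative only; the product list, field descriptions, and save_dir are assumptions based on the docstring's example):

import asyncio
import pandas as pd
from gabriel.api import extract

products = pd.DataFrame({"product": ["Walkman", "Post-it Note"]})

facts = asyncio.run(
    extract(
        products,
        "product",
        attributes={
            "company": "Company best known for selling the product",
            "year of invention": "Year the product was invented",
        },
        types={"year of invention": int},  # optional type hint for downstream typing
        save_dir="~/gabriel_runs/products",
        use_dummy=True,
    )
)
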
async def seed(
    instructions: str,
    *,
    save_dir: str,
    file_name: str = "seed_entities.csv",
    model: str = "gpt-5.1",
    n_parallels: int = 650,
    num_entities: int = 1000,
    entities_per_generation: int = 50,
    entity_batch_frac: float = 0.25,
    existing_entities_cap: int = 100,
    use_dummy: bool = False,
    deduplicate: bool = False,
    deduplicate_sample_seed: int = 42,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    max_timeout: Optional[float] = None,
    template_path: Optional[str] = None,
    existing_entities: Optional[List[str]] = None,
    reset_files: bool = False,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **response_kwargs: Any,
) -> pd.DataFrame:
    """Enforces a representative distribution / diversity of seeds.

    Example Use
    -----------
    Initialize unique personas that match US population distribution.

    Parameters
    ----------
    instructions:
        High-level description of the domain and what constitutes a good seed
        entity.
    save_dir:
        Directory where seed entities and raw responses are stored.
    file_name:
        Name of the CSV to write seed entities to.
    model:
        Model used for generation.
    n_parallels:
        Maximum number of concurrent generation calls.
    num_entities:
        Target number of entities to generate in total.
    entities_per_generation:
        Number of entities requested from each API call.
    entity_batch_frac:
        Fraction of generated entities to keep per batch before deduplication.
    existing_entities_cap:
        Maximum number of prior entities to consider when avoiding duplicates.
    use_dummy:
        If ``True`` emit deterministic dummy seeds for offline testing.
    deduplicate:
        When ``True`` over-generate and apply a shallow deduplication pass
        before returning results.
    deduplicate_sample_seed:
        Random seed used when sampling a deterministic subset after
        deduplication.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    max_timeout:
        Optional timeout in seconds for each API call.
    template_path:
        Optional Jinja2 template override for the seeding prompt.
    existing_entities:
        List of pre-existing entities to avoid regenerating.
    reset_files:
        When ``True`` ignore any saved state in ``save_dir`` and regenerate.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **response_kwargs:
        Additional keyword arguments forwarded to
        :func:`gabriel.utils.openai_utils.get_all_responses`.

    Returns
    -------
    pandas.DataFrame
        DataFrame of seed entities with provenance metadata.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = SeedConfig(
        instructions=instructions,
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        num_entities=num_entities,
        entities_per_generation=entities_per_generation,
        entity_batch_frac=entity_batch_frac,
        existing_entities_cap=existing_entities_cap,
        use_dummy=use_dummy,
        deduplicate=deduplicate,
        deduplicate_sample_seed=deduplicate_sample_seed,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        max_timeout=max_timeout,
    )
    task = Seed(cfg, template_path=template_path)
    return await task.run(
        existing_entities=existing_entities,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
        **response_kwargs,
    )

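A seed() sketch (illustrative, not part of api.py): the persona instructions, counts, and save_dir are assumptions; the signature above is the only thing taken as given.

import asyncio
from gabriel.api import seed

personas = asyncio.run(
    seed(
        "Personas of US adults, diverse in age, region, occupation, and politics.",
        save_dir="~/gabriel_runs/personas",
        num_entities=200,
        entities_per_generation=50,
        deduplicate=True,  # over-generate, then prune near-duplicates
        use_dummy=True,
    )
)
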
async def classify(
    df: pd.DataFrame,
    column_name: Optional[str] = None,
    *,
    labels: Dict[str, str],
    save_dir: str,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-mini",
    differentiate: bool = False,
    circle_column_name: Optional[str] = None,
    square_column_name: Optional[str] = None,
    n_parallels: int = 650,
    n_runs: int = 1,
    n_attributes_per_run: int = 8,
    min_frequency: float = 0.6,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "classify_responses.csv",
    modality: str = "text",
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    search_context_size: str = "medium",
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Classifies texts / images / audio / pdfs / items on whether provided labels apply. Output = one or more classes per item.

    Example Use
    -----------
    Tag news articles, product photos, or interview clips into topical categories.

    Parameters
    ----------
    df:
        DataFrame containing content to classify.
    column_name:
        Column with the main passage text. Can be ``None`` when using paired
        circle/square inputs.
    labels:
        Mapping of label names to definitions the model should follow.
    save_dir:
        Directory where classification artifacts are written.
    additional_instructions:
        Free-form instructions appended to the classification prompt.
    model:
        Model name used for classification.
    differentiate:
        When ``True`` use differentiation mode to highlight contrasts.
    circle_column_name, square_column_name:
        Optional paired columns for contrastive classification.
    n_parallels:
        Maximum number of concurrent classification calls.
    n_runs:
        Number of repeated classification passes.
    n_attributes_per_run:
        Maximum number of labels to evaluate per prompt. Labels are split into
        batches when this count is exceeded.
    min_frequency:
        Minimum label frequency required to keep a label during aggregation.
    reset_files:
        When ``True`` overwrite any existing outputs in ``save_dir``.
    use_dummy:
        If ``True`` return deterministic dummy outputs for offline testing.
    file_name:
        Basename for saved classification CSVs.
    modality:
        Indicates the content modality for prompt rendering.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    search_context_size:
        Context size hint forwarded to the Responses API.
    template_path:
        Override the default classification prompt template.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Extra configuration passed to :class:`gabriel.tasks.classify.ClassifyConfig`.

    Returns
    -------
    pandas.DataFrame
        DataFrame including one column per label plus ``predicted_classes``; aggregates repeated runs using ``min_frequency``.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = ClassifyConfig(
        labels=labels,
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        differentiate=differentiate,
        n_parallels=n_parallels,
        n_runs=n_runs,
        n_attributes_per_run=n_attributes_per_run,
        min_frequency=min_frequency,
        additional_instructions=additional_instructions or "",
        use_dummy=use_dummy,
        modality=modality,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        search_context_size=search_context_size,
        **cfg_kwargs,
    )
    return await Classify(cfg, template_path=template_path).run(
        df,
        column_name,
        circle_column_name=circle_column_name,
        square_column_name=square_column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )

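A classify() sketch (illustrative; the articles, label definitions, and save_dir are invented for the example):

import asyncio
import pandas as pd
from gabriel.api import classify

articles = pd.DataFrame({"text": ["Fed raises rates again.", "New stadium opens downtown."]})

labeled = asyncio.run(
    classify(
        articles,
        "text",
        labels={
            "economy": "Covers macroeconomic policy or markets.",
            "local news": "Covers events specific to one city or region.",
        },
        save_dir="~/gabriel_runs/topics",
        use_dummy=True,
    )
)
# Per the docstring: one column per label plus a predicted_classes summary column.
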
async def ideate(
    topic: str,
    *,
    save_dir: str,
    file_name: str = "ideation.csv",
    model: str = "gpt-5-mini",
    ranking_model: Optional[str] = None,
    n_ideas: int = 1000,
    n_parallels: int = 650,
    evaluation_mode: str = "recursive_rank",
    attributes: Optional[Dict[str, str]] = None,
    rank_attribute: Optional[str] = None,
    recursive_fraction: float = 1.0 / 3.0,
    recursive_min_remaining: int = 30,
    recursive_final_round_multiplier: int = 3,
    recursive_cut_side: str = "top",
    recursive_rate_first_round: bool = True,
    additional_instructions: Optional[str] = None,
    web_search: bool = False,
    use_dummy: bool = False,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    reset_files: bool = False,
    generation_kwargs: Optional[Dict[str, Any]] = None,
    rank_config_updates: Optional[Dict[str, Any]] = None,
    rank_run_kwargs: Optional[Dict[str, Any]] = None,
    rate_config_updates: Optional[Dict[str, Any]] = None,
    rate_run_kwargs: Optional[Dict[str, Any]] = None,
    use_seed_entities: Optional[bool] = None,
    seed_deduplicate: bool = True,
    seed_config_updates: Optional[Dict[str, Any]] = None,
    seed_run_kwargs: Optional[Dict[str, Any]] = None,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
) -> pd.DataFrame:
    """Generates many novel scientific theories and filters the cream of the crop.

    Example Use
    -----------
    Procure novel theories on inflation for potential research.

    Parameters
    ----------
    topic:
        Subject area or question to ideate on.
    save_dir:
        Directory where generated ideas and intermediate rankings are saved.
    file_name:
        CSV name for the consolidated ideation output.
    model, ranking_model:
        Models used for idea generation and ranking (if different).
    n_ideas:
        Target number of ideas to generate before pruning.
    n_parallels:
        Maximum concurrent calls for generation and ranking phases.
    evaluation_mode:
        Strategy used to evaluate ideas (for example ``"recursive_rank"``).
    attributes:
        Optional attributes to rate ideas on during evaluation.
    rank_attribute:
        Name of the attribute used for final ranking when multiple attributes
        are present.
    recursive_*:
        Parameters controlling iterative ranking passes (fraction kept,
        minimum remaining, cut side, etc.).
    additional_instructions:
        Extra guidance injected into prompts for both generation and ranking.
    web_search:
        Enable web search augmentation for generation.
    use_dummy:
        When ``True`` perform deterministic offline runs.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    reset_files:
        Force regeneration of outputs in ``save_dir``.
    *_config_updates, *_run_kwargs:
        Fine-grained overrides for nested Rate/Rank/Seed tasks.
    seed_deduplicate:
        When ``True`` enable deduplication in the nested seed generation.
    template_path:
        Optional template override for the ideation prompts.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.

    Returns
    -------
    pandas.DataFrame
        Ranked list of ideas with evaluation metadata.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)

    cfg_kwargs: Dict[str, Any] = dict(
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        ranking_model=ranking_model,
        n_parallels=n_parallels,
        n_ideas=n_ideas,
        evaluation_mode=evaluation_mode,
        rank_attribute=rank_attribute,
        recursive_fraction=recursive_fraction,
        recursive_min_remaining=recursive_min_remaining,
        recursive_final_round_multiplier=recursive_final_round_multiplier,
        recursive_cut_side=recursive_cut_side,
        recursive_rate_first_round=recursive_rate_first_round,
        additional_instructions=additional_instructions,
        web_search=web_search,
        use_dummy=use_dummy,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        seed_deduplicate=seed_deduplicate,
    )
    if attributes is not None:
        cfg_kwargs["attributes"] = attributes
    cfg = IdeateConfig(**cfg_kwargs)

    def _with_callable_overrides(payload: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        updated = dict(payload or {})
        if response_fn is not None:
            updated.setdefault("response_fn", response_fn)
        if get_all_responses_fn is not None:
            updated.setdefault("get_all_responses_fn", get_all_responses_fn)
        return updated

    generation_kwargs = _with_callable_overrides(generation_kwargs)
    rank_run_kwargs = _with_callable_overrides(rank_run_kwargs)
    rate_run_kwargs = _with_callable_overrides(rate_run_kwargs)
    seed_run_kwargs = _with_callable_overrides(seed_run_kwargs)

    ideator = Ideate(cfg, template_path=template_path)
    return await ideator.run(
        topic,
        additional_instructions=additional_instructions,
        evaluation_mode=evaluation_mode,
        attributes=attributes,
        rank_attribute=rank_attribute,
        reset_files=reset_files,
        generation_kwargs=generation_kwargs,
        rank_config_updates=rank_config_updates,
        rank_run_kwargs=rank_run_kwargs,
        rate_config_updates=rate_config_updates,
        rate_run_kwargs=rate_run_kwargs,
        use_seed_entities=use_seed_entities,
        seed_config_updates=seed_config_updates,
        seed_run_kwargs=seed_run_kwargs,
    )


async def id8(*args, **kwargs) -> pd.DataFrame:
    """Alias for :func:`ideate`."""

    return await ideate(*args, **kwargs)

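An ideate() sketch (illustrative only; the topic, idea count, and save_dir are assumptions, and the default recursive_rank evaluation is left in place):

import asyncio
from gabriel.api import ideate

ideas = asyncio.run(
    ideate(
        "What drives persistent inflation differentials across countries?",
        save_dir="~/gabriel_runs/inflation_ideas",
        n_ideas=60,
        use_dummy=True,
    )
)
# id8(...) is an alias and accepts the same arguments.
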
async def deidentify(
    df: pd.DataFrame,
    column_name: str,
    *,
    save_dir: str,
    grouping_column: Optional[str] = None,
    mapping_column: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    use_dummy: bool = False,
    file_name: str = "deidentified.csv",
    max_words_per_call: int = 7500,
    additional_instructions: Optional[str] = None,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    n_passes: int = 1,
    use_existing_mappings_only: bool = False,
    template_path: Optional[str] = None,
    reset_files: bool = False,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Replaces PII with realistic, consistent fake PII. Outputs anonymized text + mapping.

    Example Use
    -----------
    Replace names, employers, addresses before sharing interview corpora.

    Parameters
    ----------
    df:
        DataFrame containing passages to deidentify.
    column_name:
        Column in ``df`` holding the text to scrub.
    save_dir:
        Directory where anonymised outputs and mappings are written.
    grouping_column:
        Optional column grouping records that should share replacements.
    mapping_column:
        Optional column providing deterministic replacement tokens.
    model:
        Model name used to perform the deidentification.
    n_parallels:
        Maximum concurrent requests.
    use_dummy:
        When ``True`` produce deterministic dummy replacements for testing.
    file_name:
        CSV filename used when persisting deidentified text.
    max_words_per_call:
        Chunk size control for long passages.
    additional_instructions:
        Extra guidance appended to the prompt.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    n_passes:
        Number of deidentification passes to run over each passage.
    use_existing_mappings_only:
        If ``True`` only apply existing mappings and avoid new model calls.
    template_path:
        Custom prompt template path.
    reset_files:
        When ``True`` ignore cached outputs in ``save_dir``.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional overrides for :class:`gabriel.tasks.deidentify.DeidentifyConfig`.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing deidentified text and replacement mappings.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = DeidentifyConfig(
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        use_dummy=use_dummy,
        max_words_per_call=max_words_per_call,
        additional_instructions=additional_instructions,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        n_passes=n_passes,
        use_existing_mappings_only=use_existing_mappings_only,
        **cfg_kwargs,
    )
    return await Deidentifier(cfg, template_path=template_path).run(
        df,
        column_name,
        grouping_column=grouping_column,
        mapping_column=mapping_column,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )

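A deidentify() sketch (illustrative; the transcript text below is invented and contains no real PII, and the save_dir is an assumption):

import asyncio
import pandas as pd
from gabriel.api import deidentify

interviews = pd.DataFrame(
    {"transcript": ["My name is Jane Roe and I work at Acme Corp in Springfield."]}
)

scrubbed = asyncio.run(
    deidentify(
        interviews,
        "transcript",
        save_dir="~/gabriel_runs/deid",
        use_dummy=True,
    )
)
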
async def rank(
    df: pd.DataFrame,
    column_name: str,
    *,
    attributes: Union[Dict[str, str], List[str]],
    save_dir: str,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_rounds: int = 5,
    matches_per_round: int = 3,
    power_matching: bool = True,
    return_raw_scores: bool = False,
    learning_rate: float = 0.1,
    n_parallels: int = 650,
    n_attributes_per_run: int = 8,
    use_dummy: bool = False,
    file_name: str = "rankings",
    reset_files: bool = False,
    modality: str = "text",
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    template_path: Optional[str] = None,
    recursive: bool = False,
    recursive_fraction: float = 1.0 / 3.0,
    recursive_min_remaining: int = 30,
    recursive_final_round_multiplier: int = 3,
    recursive_cut_attr: Optional[str] = None,
    recursive_cut_side: str = "top",
    recursive_rate_first_round: bool = True,
    recursive_rewrite_func: Optional[Callable[[str, str, int], str]] = None,
    recursive_rewrite_text_col: str = "text",
    recursive_keep_stage_columns: bool = True,
    recursive_add_stage_suffix: bool = True,
    initial_rating_pass: bool = True,
    rate_kwargs: Optional[Dict[str, Any]] = None,
    primer_scores: Optional[Dict[str, Dict[str, float]]] = None,
    primer_scale: float = 1.0,
    primer_center: bool = True,
    id_column: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Pairwise comparisons between texts yield Elo-like attribute ratings. Output = grounded, relative z scores for each text.

    Example Use
    -----------
    Rank technologies by "bulkiness" or artworks by "fine brushwork".

    Parameters
    ----------
    df:
        DataFrame containing passages to rank.
    column_name:
        Column holding the content to rank.
    attributes:
        Either a mapping of attribute names to descriptions or a list of
        attribute names (descriptions inferred from templates).
    save_dir:
        Directory where ranking artifacts are saved.
    additional_instructions:
        Free-form prompt additions applied to each comparison.
    model:
        Model name used for ranking calls.
    n_rounds, matches_per_round, power_matching, learning_rate:
        Parameters controlling the Elo-style tournament mechanics.
    n_parallels:
        Maximum concurrent ranking calls.
    n_attributes_per_run:
        Maximum number of attributes to compare per prompt. Attributes are
        batched across prompts when this cap is exceeded.
    use_dummy:
        When ``True`` run deterministic offline ranking.
    file_name:
        Base filename for saved rankings (without extension).
    reset_files:
        Force regeneration of any existing outputs in ``save_dir``.
    modality:
        Content modality forwarded to the prompt.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    template_path:
        Path to a custom ranking prompt template.
    recursive_*:
        Settings for recursive pruning (fraction kept, minimum remaining, etc.).
    initial_rating_pass:
        Whether to run a preliminary rating stage before comparisons. Enabled by
        default to give the tournament grounded starting scores; set to
        ``False`` to skip the rating seed.
    rate_kwargs:
        Additional configuration forwarded to the preliminary rating stage.
    primer_scores, primer_scale, primer_center:
        Optional seed ratings to prime the Bradley–Terry loop. Scores are
        centred per attribute when ``primer_center`` is ``True`` and scaled
        by ``primer_scale``.
    id_column:
        Optional existing identifier column; otherwise hashes of ``column_name``
        are generated.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Extra parameters passed to :class:`gabriel.tasks.rank.RankConfig`.

    Returns
    -------
    pandas.DataFrame
        Ranked outputs. The CSV written to ``save_dir`` always contains raw
        scores and standard errors, but the returned DataFrame hides those
        columns unless ``return_raw_scores`` is ``True``.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = RankConfig(
        attributes=attributes,
        n_rounds=n_rounds,
        matches_per_round=matches_per_round,
        power_matching=power_matching,
        learning_rate=learning_rate,
        model=model,
        n_parallels=n_parallels,
        n_attributes_per_run=n_attributes_per_run,
        use_dummy=use_dummy,
        save_dir=save_dir,
        file_name=file_name,
        additional_instructions=additional_instructions or "",
        modality=modality,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        recursive=recursive,
        recursive_fraction=recursive_fraction,
        recursive_min_remaining=recursive_min_remaining,
        recursive_final_round_multiplier=recursive_final_round_multiplier,
        recursive_cut_attr=recursive_cut_attr,
        recursive_cut_side=recursive_cut_side,
        recursive_rate_first_round=recursive_rate_first_round,
        recursive_rewrite_func=recursive_rewrite_func,
        recursive_rewrite_text_col=recursive_rewrite_text_col,
        recursive_keep_stage_columns=recursive_keep_stage_columns,
        recursive_add_stage_suffix=recursive_add_stage_suffix,
        initial_rating_pass=initial_rating_pass,
        rate_kwargs=rate_kwargs or {},
        primer_scores=primer_scores,
        primer_scale=primer_scale,
        primer_center=primer_center,
        **cfg_kwargs,
    )
    result_df = await Rank(cfg, template_path=template_path).run(
        df,
        column_name,
        id_column=id_column,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )

    # By default only expose the z-score columns (attribute names without suffixes)
    # to API callers while keeping the raw/SE columns persisted in the CSV output.
    if return_raw_scores:
        return result_df

    if isinstance(attributes, dict):
        attr_keys: List[str] = list(attributes.keys())
    else:
        attr_keys = list(attributes)
    drop_cols: List[str] = []
    for attr in attr_keys:
        raw_col = f"{attr}_raw"
        se_col = f"{attr}_se"
        if raw_col in result_df.columns:
            drop_cols.append(raw_col)
        if se_col in result_df.columns:
            drop_cols.append(se_col)
    if drop_cols:
        result_df = result_df.drop(columns=drop_cols)
    return result_df

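A rank() sketch (illustrative; the artwork descriptions, attribute wording, round counts, and save_dir are assumptions):

import asyncio
import pandas as pd
from gabriel.api import rank

artworks = pd.DataFrame(
    {"description": ["A dense impasto landscape.", "A spare ink sketch.", "A photorealistic portrait."]}
)

ranked = asyncio.run(
    rank(
        artworks,
        "description",
        attributes={"fine brushwork": "Degree of fine, controlled brushwork."},
        save_dir="~/gabriel_runs/brushwork",
        n_rounds=2,
        matches_per_round=2,
        use_dummy=True,
    )
)
# Returned frame carries z-score columns per attribute; pass return_raw_scores=True
# to also keep the *_raw and *_se columns in the returned DataFrame.
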
async def codify(
    df: pd.DataFrame,
    column_name: str,
    *,
    save_dir: str,
    categories: Optional[Dict[str, str]] = None,
    additional_instructions: str = "",
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    max_words_per_call: int = 1000,
    max_categories_per_call: int = 8,
    file_name: str = "coding_results.csv",
    reset_files: bool = False,
    debug_print: bool = False,
    use_dummy: bool = False,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    modality: str = "text",
    json_mode: bool = True,
    max_timeout: Optional[float] = None,
    n_rounds: int = 2,
    completion_classifier_instructions: Optional[str] = None,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Passage coding: highlights snippets in text that match qualitative codes.

    Example Use
    -----------
    Flag sentences about "economic insecurity" in speeches; "stressors" mentioned in interview.

    Parameters
    ----------
    df:
        DataFrame containing the passages to code.
    column_name:
        Column with the text to be coded.
    save_dir:
        Directory where coding outputs are written.
    categories:
        Optional mapping of category names to descriptions. If omitted the model
        infers categories.
    additional_instructions:
        Extra guidance appended to the coding prompt.
    model:
        Model used for coding requests.
    n_parallels:
        Maximum number of concurrent coding calls.
    max_words_per_call:
        Chunk size control for each request.
    max_categories_per_call:
        Limit on the number of categories evaluated per call.
    file_name:
        Filename for saved coding responses.
    reset_files:
        When ``True`` regenerate outputs even if files exist.
    debug_print:
        Enable verbose logging of prompts and responses.
    use_dummy:
        Use deterministic dummy outputs for testing.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    modality:
        Content modality hint (text, entity, etc.).
    json_mode:
        Request JSON-mode responses where supported.
    max_timeout:
        Optional per-call timeout.
    n_rounds:
        Number of completion passes to refine codes.
    completion_classifier_instructions:
        Optional classifier guidance for completion steps.
    template_path:
        Custom Jinja2 template for coding prompts.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional overrides passed to :class:`gabriel.tasks.codify.CodifyConfig`.

    Returns
    -------
    pandas.DataFrame
        DataFrame with coded categories and any iterative refinement metadata.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg_kwargs = dict(cfg_kwargs)

    cfg = CodifyConfig(
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        max_words_per_call=max_words_per_call,
        max_categories_per_call=max_categories_per_call,
        debug_print=debug_print,
        use_dummy=use_dummy,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        modality=modality,
        json_mode=json_mode,
        max_timeout=max_timeout,
        n_rounds=n_rounds,
        completion_classifier_instructions=completion_classifier_instructions,
        **cfg_kwargs,
    )
    return await Codify(cfg, template_path=template_path).run(
        df,
        column_name,
        categories=categories,
        additional_instructions=additional_instructions,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )

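A codify() sketch (illustrative; the speech text, category definition, and save_dir are invented for the example):

import asyncio
import pandas as pd
from gabriel.api import codify

speeches = pd.DataFrame(
    {"speech": ["Families are struggling to pay rent, and groceries keep getting pricier."]}
)

coded = asyncio.run(
    codify(
        speeches,
        "speech",
        categories={"economic insecurity": "Mentions of financial strain or precarity."},
        save_dir="~/gabriel_runs/coding",
        use_dummy=True,
    )
)
# Omitting categories lets the model infer its own codes, per the docstring.
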
async def paraphrase(
    df: pd.DataFrame,
    column_name: str,
    *,
    instructions: str,
    save_dir: str,
    revised_column_name: Optional[str] = None,
    n_revisions: int = 1,
    file_name: str = "paraphrase_responses.csv",
    model: str = "gpt-5-mini",
    json_mode: bool = False,
    web_search: Optional[bool] = None,
    n_parallels: int = 650,
    use_dummy: bool = False,
    reset_files: bool = False,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    n_rounds: int = 1,
    recursive_validation: Optional[bool] = None,
    n_initial_candidates: int = 1,
    n_validation_candidates: int = 5,
    use_modified_source: bool = False,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Rewrites texts consistently per instructions.

    Example Use
    -----------
    Summarize earnings call transcripts to remove company specifics.

    Parameters
    ----------
    df:
        DataFrame containing passages to paraphrase.
    column_name:
        Column with text to rewrite.
    instructions:
        Guidance describing how the paraphrase should differ from the source.
    save_dir:
        Directory where paraphrase outputs are written.
    revised_column_name:
        Optional name for the paraphrased column; defaults to a generated one.
    n_revisions:
        Number of paraphrases to produce per passage.
    file_name:
        CSV filename for saved paraphrases.
    model:
        Model name used for generation.
    json_mode:
        Whether to request JSON responses.
    web_search:
        Enable web search augmentation when supported by the model.
    n_parallels:
        Maximum concurrent paraphrase calls.
    use_dummy:
        Produce deterministic dummy paraphrases.
    reset_files:
        When ``True`` regenerate outputs even if files already exist.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    n_rounds:
        Maximum number of paraphrase/validation cycles. ``1`` disables recursion.
    recursive_validation:
        Deprecated boolean flag retained for compatibility; prefer ``n_rounds``.
    n_initial_candidates, n_validation_candidates:
        Control the number of candidates in generation and validation phases.
    use_modified_source:
        If ``True`` allow modified source text to be used during validation.
    template_path:
        Custom template path to override the default paraphrase prompt.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional configuration passed to :class:`gabriel.tasks.paraphrase.ParaphraseConfig`.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing paraphrased text and any validation scores.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = ParaphraseConfig(
        instructions=instructions,
        revised_column_name=revised_column_name,
        n_revisions=n_revisions,
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        json_mode=json_mode,
        web_search=web_search,
        n_parallels=n_parallels,
        use_dummy=use_dummy,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        n_rounds=n_rounds,
        recursive_validation=recursive_validation,
        n_initial_candidates=n_initial_candidates,
        n_validation_candidates=n_validation_candidates,
        use_modified_source=use_modified_source,
        **cfg_kwargs,
    )
    return await Paraphrase(cfg, template_path=template_path).run(
        df,
        column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )

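A paraphrase() sketch (illustrative; the transcript, rewriting instructions, and save_dir are assumptions based on the docstring's example use):

import asyncio
import pandas as pd
from gabriel.api import paraphrase

calls = pd.DataFrame(
    {"transcript": ["Acme Corp grew revenue 12% on strong widget demand in Q3."]}
)

rewritten = asyncio.run(
    paraphrase(
        calls,
        "transcript",
        instructions="Summarize the call and remove all company-identifying details.",
        save_dir="~/gabriel_runs/paraphrase",
        use_dummy=True,
    )
)
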
async def compare(
    df: pd.DataFrame,
    circle_column_name: str,
    square_column_name: str,
    *,
    save_dir: str,
    differentiate: bool = True,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    n_runs: int = 1,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "comparison_responses.csv",
    modality: str = "text",
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Identifies similarities / differences between paired items. Output = list of differences.

    Example Use
    -----------
    Contrast op-eds from different districts; compare two ad campaigns.

    Parameters
    ----------
    df:
        DataFrame containing the paired passages to compare.
    circle_column_name, square_column_name:
        Columns representing the two sides of each comparison.
    save_dir:
        Directory where comparison outputs are written.
    differentiate:
        Whether to prompt the model to emphasise key differences.
    additional_instructions:
        Extra prompt guidance applied to each comparison.
    model:
        Model name for comparison calls.
    n_parallels:
        Maximum number of concurrent comparison requests.
    n_runs:
        Number of repeated comparisons to gather per pair.
    reset_files:
        When ``True`` regenerate results regardless of existing files.
    use_dummy:
        If ``True`` return deterministic dummy comparison outputs.
    file_name:
        CSV filename for saved comparison responses.
    modality:
        Content modality hint for prompt rendering.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    template_path:
        Custom template override for comparison prompts.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional configuration passed to :class:`gabriel.tasks.compare.CompareConfig`.

    Returns
    -------
    pandas.DataFrame
        DataFrame indexed by both input columns with one row per attribute and
        an ``explanation`` field describing the preference rationale.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = CompareConfig(
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        n_runs=n_runs,
        use_dummy=use_dummy,
        differentiate=differentiate,
        additional_instructions=additional_instructions or "",
        modality=modality,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        **cfg_kwargs,
    )
    return await Compare(cfg, template_path=template_path).run(
        df,
        circle_column_name,
        square_column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )
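
# A minimal usage sketch for ``compare`` (illustrative only): the paired columns,
# texts, and save path are hypothetical; ``use_dummy=True`` keeps the sketch
# runnable without API access.
async def _example_compare() -> pd.DataFrame:
    demo = pd.DataFrame(
        {
            "oped_district_a": ["Our district needs more transit investment."],
            "oped_district_b": ["Road expansion should be the budget priority."],
        }
    )
    return await compare(
        demo,
        "oped_district_a",
        "oped_district_b",
        save_dir="~/Documents/runs/compare_demo",  # hypothetical output directory
        n_runs=2,
        use_dummy=True,
    )
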
async def bucket(
    df: pd.DataFrame,
    column_name: str,
    *,
    save_dir: str,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "bucket_definitions.csv",
    bucket_count: int = 10,
    differentiate: bool = False,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Builds taxonomies from many terms. Output = bucket/cluster labels.

    Example Use
    -----------
    Group technologies, artworks, or HR complaints into emergent categories.

    Parameters
    ----------
    df:
        DataFrame containing passages to bucket.
    column_name:
        Column holding the text to cluster.
    save_dir:
        Directory where bucket definitions and intermediate state are saved.
    additional_instructions:
        Extra prompt guidance for bucket creation.
    model:
        Model used to propose bucket definitions.
    n_parallels:
        Maximum number of concurrent bucket definition calls.
    reset_files:
        When ``True`` regenerate outputs despite existing files.
    use_dummy:
        Return deterministic dummy buckets for offline testing.
    file_name:
        Filename for saved bucket definitions.
    bucket_count:
        Target number of buckets to generate.
    differentiate:
        Whether to encourage distinctive bucket descriptions.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    template_path:
        Custom template path for bucket prompts.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional overrides forwarded to :class:`gabriel.tasks.bucket.BucketConfig`.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the finalized bucket names and definitions (one
        row per bucket).
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = BucketConfig(
        bucket_count=bucket_count,
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        use_dummy=use_dummy,
        additional_instructions=additional_instructions,
        differentiate=differentiate,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        **cfg_kwargs,
    )
    return await Bucket(cfg, template_path=template_path).run(
        df,
        column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )
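
# A minimal usage sketch for ``bucket`` (illustrative only): the terms, target
# bucket count, and save path are hypothetical.
async def _example_bucket() -> pd.DataFrame:
    demo = pd.DataFrame(
        {"term": ["solar panel", "wind turbine", "lithium battery", "mRNA vaccine"]}
    )
    return await bucket(
        demo,
        "term",
        save_dir="~/Documents/runs/bucket_demo",  # hypothetical output directory
        bucket_count=3,
        use_dummy=True,
    )
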
async def discover(
    df: pd.DataFrame,
    *,
    column_name: Optional[str] = None,
    circle_column_name: Optional[str] = None,
    square_column_name: Optional[str] = None,
    save_dir: str,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    n_runs: int = 1,
    min_frequency: float = 0.6,
    bucket_count: int = 10,
    differentiate: bool = True,
    max_words_per_call: int = 1000,
    max_categories_per_call: int = 8,
    n_terms_per_prompt: int = 250,
    repeat_bucketing: int = 5,
    repeat_voting: int = 25,
    next_round_frac: float = 0.25,
    top_k_per_round: int = 1,
    raw_term_definitions: bool = True,
    use_dummy: bool = False,
    modality: str = "text",
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    reset_files: bool = False,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> Dict[str, pd.DataFrame]:
    """Discovers natural language features which discriminate two classes of data.

    Example Use
    -----------
    Identify what distinguishes 5 star vs. 1 star reviews or successful vs. failed startups.

    Parameters
    ----------
    df:
        DataFrame containing the corpus to mine for labels.
    column_name:
        Column with free-form text to analyse. Optional when providing paired
        circle/square columns for contrastive discovery.
    circle_column_name, square_column_name:
        Optional paired columns enabling bidirectional discovery.
    save_dir:
        Directory where intermediate and final discovery outputs are saved.
    additional_instructions:
        Extra guidance applied throughout the discovery pipeline.
    model:
        Model used for bucket definitions and classification.
    n_parallels:
        Maximum concurrent calls per stage.
    n_runs:
        Number of classification repetitions to stabilise label prevalence.
    min_frequency:
        Minimum frequency threshold for labels to persist.
    bucket_count:
        Target number of buckets to propose in the initial step.
    differentiate:
        Encourage distinctive bucket descriptions when ``True``.
    max_words_per_call, max_categories_per_call:
        Chunking controls for classification prompts.
    n_terms_per_prompt, repeat_bucketing, repeat_voting:
        Parameters that regulate how many discovered terms are evaluated and how
        often bucketing/voting rounds repeat.
    next_round_frac, top_k_per_round:
        Controls for carrying top-performing terms into subsequent rounds.
    raw_term_definitions:
        Whether to keep raw label definitions in the outputs.
    use_dummy:
        If ``True`` perform deterministic offline discovery.
    modality:
        Content modality hint forwarded to downstream tasks.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    reset_files:
        When ``True`` regenerate all discovery artifacts.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional overrides passed to :class:`gabriel.tasks.discover.DiscoverConfig`.

    Returns
    -------
    Dict[str, pandas.DataFrame]
        Intermediate DataFrames from each step of the discovery pipeline. When
        ``circle_column_name`` and ``square_column_name`` are provided,
        classification is performed twice (circle and square directions). A
        ``summary`` key describes label prevalence differences with
        ``difference_pct`` expressed as circle minus square percentage points.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = DiscoverConfig(
        save_dir=save_dir,
        model=model,
        n_parallels=n_parallels,
        n_runs=n_runs,
        min_frequency=min_frequency,
        bucket_count=bucket_count,
        additional_instructions=additional_instructions,
        differentiate=differentiate,
        max_words_per_call=max_words_per_call,
        max_categories_per_call=max_categories_per_call,
        n_terms_per_prompt=n_terms_per_prompt,
        repeat_bucketing=repeat_bucketing,
        repeat_voting=repeat_voting,
        next_round_frac=next_round_frac,
        top_k_per_round=top_k_per_round,
        raw_term_definitions=raw_term_definitions,
        use_dummy=use_dummy,
        modality=modality,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
        **cfg_kwargs,
    )
    return await Discover(cfg).run(
        df,
        column_name=column_name,
        circle_column_name=circle_column_name,
        square_column_name=square_column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )
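
# A minimal usage sketch for ``discover`` in its paired (circle/square) mode
# (illustrative only): the review texts and save path are hypothetical; the
# returned dictionary of DataFrames includes a ``summary`` entry as documented
# above.
async def _example_discover() -> Dict[str, pd.DataFrame]:
    demo = pd.DataFrame(
        {
            "five_star": ["Fast shipping and the fabric feels premium."],
            "one_star": ["Arrived late and the seams tore within a week."],
        }
    )
    return await discover(
        demo,
        circle_column_name="five_star",
        square_column_name="one_star",
        save_dir="~/Documents/runs/discover_demo",  # hypothetical output directory
        bucket_count=5,
        use_dummy=True,
    )
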
async def deduplicate(
    df: pd.DataFrame,
    column_name: str,
    *,
    save_dir: str,
    additional_instructions: Optional[str] = None,
    modality: str = "entity",
    max_words_per_text: int = 500,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    n_runs: int = 3,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "deduplicate_responses.csv",
    use_embeddings: bool = True,
    group_size: int = 500,
    max_timeout: Optional[float] = None,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Detects conceptual duplicates. Maps all duplicates to one representative term.

    Example Use
    -----------
    Collapse "F-18", "Super Hornet Fighter Jet", "f-18 hornet" into "F-18".

    Parameters
    ----------
    df:
        DataFrame containing the passages to deduplicate.
    column_name:
        Column holding the text to deduplicate.
    save_dir:
        Directory where deduplication artifacts are written.
    additional_instructions:
        Extra guidance appended to the deduplication prompt.
    modality:
        Use ``"entity"`` for short entity strings or ``"text"`` for long-form text snippets.
    max_words_per_text:
        Maximum word count for each text snippet when ``modality="text"``.
    model:
        Model name used for overlap detection.
    n_parallels:
        Maximum number of concurrent calls.
    n_runs:
        Number of passes to run; helps stabilise duplicate detection.
    reset_files:
        When ``True`` regenerate outputs regardless of existing files.
    use_dummy:
        Return deterministic dummy outputs for offline testing.
    file_name:
        CSV filename for saved deduplication responses.
    use_embeddings:
        Whether to use embedding-based prefiltering prior to model calls.
    group_size:
        Number of passages to evaluate per batch during deduplication.
    max_timeout:
        Optional timeout per API call.
    template_path:
        Custom template override for deduplication prompts.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional configuration passed to
        :class:`gabriel.tasks.deduplicate.DeduplicateConfig`.

    Returns
    -------
    pandas.DataFrame
        DataFrame including the original content plus ``mapped_<column_name>`` columns
        (per run and final) indicating the canonical representative for each
        detected duplicate cluster.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = DeduplicateConfig(
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        n_runs=n_runs,
        use_dummy=use_dummy,
        max_timeout=max_timeout,
        additional_instructions=additional_instructions,
        use_embeddings=use_embeddings,
        group_size=group_size,
        modality=modality,
        max_words_per_text=max_words_per_text,
        **cfg_kwargs,
    )
    return await Deduplicate(cfg, template_path=template_path).run(
        df,
        column_name=column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )
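
# A minimal usage sketch for ``deduplicate`` (illustrative only): the entity
# strings mirror the F-18 example in the docstring above, and the save path is
# hypothetical; the result gains ``mapped_<column_name>`` columns as documented.
async def _example_deduplicate() -> pd.DataFrame:
    demo = pd.DataFrame(
        {"entity": ["F-18", "Super Hornet Fighter Jet", "f-18 hornet", "Boeing 747"]}
    )
    return await deduplicate(
        demo,
        "entity",
        save_dir="~/Documents/runs/dedupe_demo",  # hypothetical output directory
        n_runs=1,
        use_dummy=True,
    )
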
async def merge(
    df_left: pd.DataFrame,
    df_right: pd.DataFrame,
    *,
    save_dir: str,
    on: Optional[str] = None,
    left_on: Optional[str] = None,
    right_on: Optional[str] = None,
    how: str = "left",
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-nano",
    n_parallels: int = 650,
    n_runs: int = 1,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "merge_responses.csv",
    use_embeddings: bool = True,
    short_list_len: int = 16,
    long_list_len: int = 256,
    max_attempts: int = 4,
    short_list_multiplier: float = 0.5,
    auto_match_threshold: float = 0.75,
    use_best_auto_match: bool = False,
    candidate_scan_chunks: int = 5,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """Creates crosswalks. Output = merged table with GPT-matched identifiers.

    Example Use
    -----------
    Match two distinct job title directories; link patent titles to product names.

    Parameters
    ----------
    df_left, df_right:
        DataFrames to merge.
    save_dir:
        Directory where merge results and diagnostics are saved.
    on, left_on, right_on:
        Column(s) to match on. ``on`` applies to both sides; ``left_on`` and
        ``right_on`` override per side.
    how:
        Merge strategy (``"left"`` or ``"right"``) determining which side is treated as
        the short/base table.
    additional_instructions:
        Extra prompt context for the model.
    model:
        Model used to compare candidate records.
    n_parallels:
        Maximum number of concurrent merge comparisons.
    n_runs:
        Number of repeated comparisons per candidate.
    reset_files:
        When ``True`` regenerate outputs even if files exist.
    use_dummy:
        If ``True`` return deterministic dummy matches.
    file_name:
        CSV filename for saved merge responses.
    use_embeddings:
        Whether to use embeddings to shortlist candidates before calling the
        model.
    short_list_len, long_list_len, short_list_multiplier:
        Controls for candidate pool sizes.
    max_attempts:
        Maximum retry attempts per match before giving up.
    auto_match_threshold:
        Confidence threshold for automatically accepting matches.
    use_best_auto_match:
        When ``True`` pick the highest confidence candidate when multiple exceed
        ``auto_match_threshold``.
    candidate_scan_chunks:
        Number of candidate batches to scan when building the shortlist.
    template_path:
        Custom template override for merge prompts.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional overrides forwarded to :class:`gabriel.tasks.merge.MergeConfig`.

    Returns
    -------
    pandas.DataFrame
        Merged result keyed to the ``how``-selected short side, enriched with
        model-evaluated matches from the long side and deduplicated on the
        short key.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = MergeConfig(
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        n_runs=n_runs,
        use_dummy=use_dummy,
        additional_instructions=additional_instructions,
        use_embeddings=use_embeddings,
        short_list_len=short_list_len,
        long_list_len=long_list_len,
        max_attempts=max_attempts,
        short_list_multiplier=short_list_multiplier,
        auto_match_threshold=auto_match_threshold,
        use_best_auto_match=use_best_auto_match,
        candidate_scan_chunks=candidate_scan_chunks,
        **cfg_kwargs,
    )
    return await Merge(cfg, template_path=template_path).run(
        df_left,
        df_right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        how=how,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )
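
# A minimal usage sketch for ``merge`` (illustrative only): the two job-title
# tables, the join column, and the save path are hypothetical.
async def _example_merge() -> pd.DataFrame:
    left = pd.DataFrame({"job_title": ["Software Engineer II", "HR Business Partner"]})
    right = pd.DataFrame({"job_title": ["SWE 2", "Human Resources Partner"]})
    return await merge(
        left,
        right,
        on="job_title",
        how="left",
        save_dir="~/Documents/runs/merge_demo",  # hypothetical output directory
        use_dummy=True,
    )
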
async def filter(
    df: pd.DataFrame,
    column_name: str,
    *,
    condition: str,
    save_dir: str,
    entities_per_call: int = 150,
    shuffle: bool = True,
    random_seed: int = 42,
    n_runs: int = 1,
    threshold: float = 0.5,
    additional_instructions: Optional[str] = None,
    model: str = "gpt-5-nano",
    n_parallels: int = 650,
    reset_files: bool = False,
    use_dummy: bool = False,
    file_name: str = "filter_responses.csv",
    max_timeout: Optional[float] = None,
    template_path: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **cfg_kwargs,
) -> pd.DataFrame:
    """High-throughput boolean screening. Outputs items that meet a natural-language condition.

    Example Use
    -----------
    Subset 18M Wikipedia titles to only technologies.

    Parameters
    ----------
    df:
        DataFrame containing passages to filter.
    column_name:
        Column with the text to evaluate.
    condition:
        Natural-language condition that determines whether a passage is kept.
    save_dir:
        Directory where filter responses are saved.
    entities_per_call:
        Number of passages to send in each API call.
    shuffle:
        Whether to randomise order before batching.
    random_seed:
        Seed used when ``shuffle`` is ``True``.
    n_runs:
        Number of repeated evaluations per passage.
    threshold:
        Probability threshold above which a passage is retained.
    additional_instructions:
        Extra guidance appended to the filter prompt.
    model:
        Model used for filtering.
    n_parallels:
        Maximum number of concurrent filtering calls.
    reset_files:
        When ``True`` regenerate outputs even if files exist.
    use_dummy:
        Return deterministic dummy outputs instead of real API responses.
    file_name:
        CSV filename for saved filter responses.
    max_timeout:
        Optional per-call timeout.
    template_path:
        Custom prompt template path.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **cfg_kwargs:
        Additional configuration passed to :class:`gabriel.tasks.filter.FilterConfig`.

    Returns
    -------
    pandas.DataFrame
        Filtered DataFrame with keep/score columns reflecting model decisions.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)
    cfg = FilterConfig(
        condition=condition,
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        n_parallels=n_parallels,
        entities_per_call=entities_per_call,
        shuffle=shuffle,
        random_seed=random_seed,
        n_runs=n_runs,
        threshold=threshold,
        additional_instructions=additional_instructions or "",
        use_dummy=use_dummy,
        max_timeout=max_timeout,
        **cfg_kwargs,
    )
    return await Filter(cfg, template_path=template_path).run(
        df,
        column_name,
        reset_files=reset_files,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
    )
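
# A minimal usage sketch for ``filter`` (illustrative only): the titles, the
# natural-language condition, and the save path are hypothetical. Note that the
# call resolves to the module-level ``filter`` coroutine defined above, not the
# Python builtin.
async def _example_filter() -> pd.DataFrame:
    demo = pd.DataFrame(
        {"title": ["Transistor", "French Revolution", "CRISPR", "Impressionism"]}
    )
    return await filter(
        demo,
        "title",
        condition="The item is a technology.",
        save_dir="~/Documents/runs/filter_demo",  # hypothetical output directory
        use_dummy=True,
    )
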
async def debias(
    df: pd.DataFrame,
    column_name: str,
    *,
    mode: MeasurementMode = "rate",
    measurement_attribute: Optional[str] = None,
    removal_attribute: Optional[str] = None,
    signal_dictionary: Dict[str, str],
    attributes: Optional[Dict[str, str]] = None,
    removal_method: RemovalMethod = "codify",
    save_dir: str = os.path.expanduser("~/Documents/runs"),
    run_name: Optional[str] = None,
    strip_percentages: Optional[List[int]] = None,
    categories_to_strip: Optional[List[str]] = None,
    template_path: Optional[str] = None,
    model: str = "gpt-5-mini",
    n_parallels: int = 650,
    measurement_kwargs: Optional[Dict[str, Any]] = None,
    removal_kwargs: Optional[Dict[str, Any]] = None,
    remaining_signal: bool = True,
    max_words_per_call: Optional[int] = 1000,
    n_rounds: Optional[int] = 3,
    use_dummy: bool = False,
    robust_regression: bool = True,
    random_seed: int = 12345,
    verbose: bool = True,
    reset_files: bool = False,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
) -> pd.DataFrame:
    """Post-process measurements to remove inference bias.

    Example Use
    -----------
    Ensure GPT isn't guessing climate opinions in speeches based on general political lean.

    Parameters
    ----------
    df:
        DataFrame containing passages to measure and debias.
    column_name:
        Column with the text to process.
    mode:
        Measurement mode (e.g., ``"rate"``) determining how bias is estimated.
    measurement_attribute, removal_attribute:
        Specify the attribute used for regression and the key from
        ``signal_dictionary`` that should be removed. When
        ``measurement_attribute`` is omitted the first key from ``attributes``
        is used. ``removal_attribute`` defaults to the measurement attribute
        when present in ``signal_dictionary`` or otherwise the first key from
        ``signal_dictionary``. Notices are printed when inferred and
        ``verbose`` is ``True``.
    signal_dictionary:
        Mapping of bias signals to their definitions.
    attributes:
        Optional rating attributes used during measurement.
    removal_method:
        Strategy for removing bias (for example ``"codify"``).
    save_dir:
        Base directory for all debiasing artifacts.
    run_name:
        Optional run identifier; defaults to a timestamped folder.
    strip_percentages, categories_to_strip:
        Optional controls for category pruning during removal.
    template_path:
        Optional template override used during removal steps.
    model:
        Model used across the measurement and removal stages.
    n_parallels:
        Maximum concurrent API calls.
    measurement_kwargs, removal_kwargs:
        Fine-grained overrides for the measurement and removal tasks.
    remaining_signal:
        When ``True`` (default) measure a remaining-signal prevalence attribute on
        the stripped text and use it in the two-step debiasing regression.
    max_words_per_call, n_rounds:
        Convenience passthroughs for the removal stage. ``max_words_per_call``
        configures the codify task's chunk size, while ``n_rounds`` controls the
        number of completion passes run by codify and any downstream
        paraphrasing steps. Defaults to 3 when not explicitly provided.
    use_dummy:
        If ``True`` run deterministic offline debiasing.
    robust_regression:
        Whether to use robust regression when estimating bias coefficients.
    random_seed:
        Seed for deterministic behaviour in sampling-heavy steps.
    verbose:
        When ``True`` print notices about inferred defaults and progress.
    reset_files:
        When ``True`` propagate reset behaviour to all measurement and removal stages.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.

    Returns
    -------
    pandas.DataFrame
        Debiased results with raw, stripped, and debiased columns appended.
    """

    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    measurement_kwargs = dict(measurement_kwargs or {})
    removal_kwargs = dict(removal_kwargs or {})
    if response_fn is not None:
        measurement_kwargs.setdefault("response_fn", response_fn)
        removal_kwargs.setdefault("response_fn", response_fn)
    if get_all_responses_fn is not None:
        measurement_kwargs.setdefault("get_all_responses_fn", get_all_responses_fn)
        removal_kwargs.setdefault("get_all_responses_fn", get_all_responses_fn)

    if reset_files:
        measurement_kwargs.setdefault("reset_files", True)
        removal_kwargs.setdefault("reset_files", True)

    if removal_method == "codify" and max_words_per_call is not None:
        removal_kwargs.setdefault("max_words_per_call", max_words_per_call)
    if "completion_max_rounds" in removal_kwargs and "n_rounds" not in removal_kwargs:
        replacement = removal_kwargs.pop("completion_max_rounds")
        warnings.warn(
            "completion_max_rounds in removal_kwargs is deprecated; use n_rounds instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        if replacement is not None:
            removal_kwargs.setdefault("n_rounds", replacement)
    if n_rounds is not None:
        removal_kwargs.setdefault("n_rounds", n_rounds)

    cfg = DebiasConfig(
        mode=mode,
        measurement_attribute=measurement_attribute,
        removal_attribute=removal_attribute,
        signal_dictionary=signal_dictionary,
        attributes=attributes or {},
        removal_method=removal_method,
        save_dir=save_dir,
        run_name=run_name,
        strip_percentages=strip_percentages,
        categories_to_strip=categories_to_strip,
        template_path=template_path,
        model=model,
        n_parallels=n_parallels,
        measurement_kwargs=measurement_kwargs,
        removal_kwargs=removal_kwargs,
        remaining_signal=remaining_signal,
        use_dummy=use_dummy,
        robust_regression=robust_regression,
        random_seed=random_seed,
        verbose=verbose,
    )
    pipeline = DebiasPipeline(cfg)
    result = await pipeline.run(df, column_name, reset_files=reset_files)
    return result.results
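
# A minimal usage sketch for ``debias`` (illustrative only): the speeches, the
# measured attribute, the signal definition, and the save path are hypothetical
# stand-ins for the climate-opinion example in the docstring above.
async def _example_debias() -> pd.DataFrame:
    demo = pd.DataFrame(
        {"speech": ["We must act on climate now.", "Taxes are too high."]}
    )
    return await debias(
        demo,
        "speech",
        attributes={"climate concern": "Degree of concern about climate change."},
        signal_dictionary={"political lean": "Language signalling left/right political lean."},
        save_dir="~/Documents/runs/debias_demo",  # hypothetical output directory
        use_dummy=True,
    )
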
async def whatever(
    prompts: Optional[Union[str, List[str], pd.DataFrame]] = None,
    identifiers: Optional[List[str]] = None,
    *,
    save_dir: str,
    df: Optional[pd.DataFrame] = None,
    column_name: Optional[str] = None,
    identifier_column: Optional[str] = None,
    image_column: Optional[str] = None,
    audio_column: Optional[str] = None,
    prompt_images: Optional[Dict[str, List[str]]] = None,
    prompt_audio: Optional[Dict[str, List[Dict[str, str]]]] = None,
    file_name: str = "custom_prompt_responses.csv",
    model: str = "gpt-5-mini",
    json_mode: bool = False,
    web_search: Optional[bool] = None,
    web_search_filters: Optional[Dict[str, Any]] = None,
    search_context_size: str = "medium",
    n_parallels: int = 650,
    use_dummy: bool = False,
    reset_files: bool = False,
    return_original_columns: bool = True,
    drop_prompts: bool = True,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
    response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
    get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
    **kwargs,
) -> pd.DataFrame:
    """Run any GPT prompts, but leverage GABRIEL's parallelization / checkpointing.

    Example Use
    -----------
    Any set of prompts; slots into any pipeline.

    Parameters
    ----------
    prompts:
        Single prompt string, list of prompts, or DataFrame of prompts.
    identifiers:
        Optional identifiers to align responses with custom keys.
    save_dir:
        Directory where raw responses are written.
    df:
        Source DataFrame to pull prompts from when ``prompts`` is not provided.
    column_name:
        Column in ``df`` containing prompts to send.
    identifier_column:
        Column providing identifiers for each prompt row.
    image_column, audio_column:
        Optional columns containing image or audio references to include.
    prompt_images, prompt_audio:
        Pre-constructed multimodal payloads keyed by identifier.
    file_name:
        CSV filename for persisted responses.
    model:
        Model name passed to :func:`gabriel.utils.openai_utils.get_all_responses`.
    json_mode:
        Whether to request JSON-mode responses where supported.
    web_search:
        Enable web search augmentation.
    web_search_filters:
        Filters dict forwarded to the Responses API (allowed domains and optional
        location hints such as ``city`` or ``timezone``).
    search_context_size:
        Context size hint for web-search capable models.
    n_parallels:
        Maximum concurrent response requests.
    use_dummy:
        If ``True`` return deterministic dummy responses.
    reset_files:
        When ``True`` regenerate outputs even if files already exist.
    return_original_columns:
        When ``True`` and ``df`` is provided, merge response columns back onto
        the input DataFrame using the prompt identifiers.
    drop_prompts:
        When ``True`` and merging back onto ``df``, drop the prompt column
        before saving/returning the result.
    reasoning_effort, reasoning_summary:
        Optional OpenAI reasoning controls.
    response_fn:
        Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
        that replaces the per-prompt model invocation. Ignored when
        ``get_all_responses_fn`` is supplied.
    get_all_responses_fn:
        Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
        It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
        ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
    **kwargs:
        Additional parameters forwarded directly to
        :func:`gabriel.utils.openai_utils.get_all_responses`.

    Returns
    -------
    pandas.DataFrame
        DataFrame of prompts, identifiers, and model responses saved to
        ``save_dir/file_name``.
    """
    save_dir = os.path.expandvars(os.path.expanduser(save_dir))
    os.makedirs(save_dir, exist_ok=True)

    if df is None and prompts is None:
        raise ValueError("Either prompts or df must be provided to `whatever`.")

    kwargs = dict(kwargs)
    if response_fn is not None:
        kwargs.setdefault("response_fn", response_fn)
    if get_all_responses_fn is not None:
        kwargs.setdefault("get_all_responses_fn", get_all_responses_fn)

    if web_search is None and "web_search" in kwargs:
        web_search = kwargs.pop("web_search")
    else:
        kwargs.pop("web_search", None)

    if web_search_filters is None and "web_search_filters" in kwargs:
        web_search_filters = kwargs.pop("web_search_filters")
    else:
        kwargs.pop("web_search_filters", None)

    if "search_context_size" in kwargs:
        if search_context_size == "medium":
            search_context_size = kwargs.pop("search_context_size")
        else:
            kwargs.pop("search_context_size")

    cfg = WhateverConfig(
        save_dir=save_dir,
        file_name=file_name,
        model=model,
        json_mode=json_mode,
        web_search=web_search,
        web_search_filters=web_search_filters,
        search_context_size=search_context_size,
        n_parallels=n_parallels,
        use_dummy=use_dummy,
        reasoning_effort=reasoning_effort,
        reasoning_summary=reasoning_summary,
    )

    runner = Whatever(cfg)
    return await runner.run(
        prompts,
        df=df,
        identifiers=identifiers,
        column_name=column_name,
        identifier_column=identifier_column,
        image_column=image_column,
        audio_column=audio_column,
        prompt_images=prompt_images,
        prompt_audio=prompt_audio,
        web_search_filters=web_search_filters,
        reset_files=reset_files,
        return_original_columns=return_original_columns,
        drop_prompts=drop_prompts,
        response_fn=response_fn,
        get_all_responses_fn=get_all_responses_fn,
        **kwargs,
    )
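
# A minimal usage sketch for ``whatever`` (illustrative only): the prompts,
# identifiers, and save path are hypothetical; any extra keyword arguments are
# forwarded to ``get_all_responses`` as documented above.
async def _example_whatever() -> pd.DataFrame:
    return await whatever(
        ["Name one famous bridge.", "Name one famous tunnel."],
        identifiers=["bridge", "tunnel"],
        save_dir="~/Documents/runs/whatever_demo",  # hypothetical output directory
        json_mode=False,
        use_dummy=True,
    )
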
def view(
    df: pd.DataFrame,
    column_name: str,
    attributes: Optional[Union[Mapping[str, Any], Sequence[Any], Any]] = None,
    *,
    header_columns: Optional[Any] = None,
    max_passages: Optional[int] = None,
    font_scale: float = 1.0,
    font_family: Optional[str] = None,
    color_mode: str = "auto",
):
    """UI to view sample texts with ratings / passage coding.

    Example Use
    -----------
    Spot-check classify / rating outputs; view coded passages.

    Parameters
    ----------
    df:
        DataFrame containing passages to display.
    column_name:
        Column with the primary text to render.
    attributes:
        Optional iterable or mapping of attribute columns to include alongside
        the passage text.
    header_columns:
        Optional columns whose values should appear in the viewer header.
    max_passages:
        Optional cap on the number of passages displayed.
    font_scale:
        Scaling factor applied to viewer typography.
    font_family:
        Optional font family override.
    color_mode:
        Either ``"auto"``, ``"light"``, or ``"dark"`` to control the viewer
        theme.

    Returns
    -------
    Any
        The rendered viewer object produced by
        :func:`gabriel.utils.passage_viewer.view`.
    """

    return _view_passages(
        df,
        column_name,
        attributes=attributes,
        header_columns=header_columns,
        max_passages=max_passages,
        font_scale=font_scale,
        font_family=font_family,
        color_mode=color_mode,
    )
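
# A minimal usage sketch for ``view`` (illustrative only): the passage column
# and attribute column are hypothetical; the call simply forwards to
# :func:`gabriel.utils.passage_viewer.view` as documented above.
def _example_view():
    demo = pd.DataFrame(
        {
            "text": ["A short passage to spot-check."],
            "clarity": [7],
        }
    )
    return view(demo, "text", attributes=["clarity"], max_passages=1)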