openai-gabriel 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. gabriel/__init__.py +61 -0
  2. gabriel/_version.py +1 -0
  3. gabriel/api.py +2284 -0
  4. gabriel/cli/__main__.py +60 -0
  5. gabriel/core/__init__.py +7 -0
  6. gabriel/core/llm_client.py +34 -0
  7. gabriel/core/pipeline.py +18 -0
  8. gabriel/core/prompt_template.py +152 -0
  9. gabriel/prompts/__init__.py +1 -0
  10. gabriel/prompts/bucket_prompt.jinja2 +113 -0
  11. gabriel/prompts/classification_prompt.jinja2 +50 -0
  12. gabriel/prompts/codify_prompt.jinja2 +95 -0
  13. gabriel/prompts/comparison_prompt.jinja2 +60 -0
  14. gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
  15. gabriel/prompts/deidentification_prompt.jinja2 +112 -0
  16. gabriel/prompts/extraction_prompt.jinja2 +61 -0
  17. gabriel/prompts/filter_prompt.jinja2 +31 -0
  18. gabriel/prompts/ideation_prompt.jinja2 +80 -0
  19. gabriel/prompts/merge_prompt.jinja2 +47 -0
  20. gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
  21. gabriel/prompts/rankings_prompt.jinja2 +49 -0
  22. gabriel/prompts/ratings_prompt.jinja2 +50 -0
  23. gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
  24. gabriel/prompts/seed.jinja2 +43 -0
  25. gabriel/prompts/snippets.jinja2 +117 -0
  26. gabriel/tasks/__init__.py +63 -0
  27. gabriel/tasks/_attribute_utils.py +69 -0
  28. gabriel/tasks/bucket.py +432 -0
  29. gabriel/tasks/classify.py +562 -0
  30. gabriel/tasks/codify.py +1033 -0
  31. gabriel/tasks/compare.py +235 -0
  32. gabriel/tasks/debias.py +1460 -0
  33. gabriel/tasks/deduplicate.py +341 -0
  34. gabriel/tasks/deidentify.py +316 -0
  35. gabriel/tasks/discover.py +524 -0
  36. gabriel/tasks/extract.py +455 -0
  37. gabriel/tasks/filter.py +169 -0
  38. gabriel/tasks/ideate.py +782 -0
  39. gabriel/tasks/merge.py +464 -0
  40. gabriel/tasks/paraphrase.py +531 -0
  41. gabriel/tasks/rank.py +2041 -0
  42. gabriel/tasks/rate.py +347 -0
  43. gabriel/tasks/seed.py +465 -0
  44. gabriel/tasks/whatever.py +344 -0
  45. gabriel/utils/__init__.py +64 -0
  46. gabriel/utils/audio_utils.py +42 -0
  47. gabriel/utils/file_utils.py +464 -0
  48. gabriel/utils/image_utils.py +22 -0
  49. gabriel/utils/jinja.py +31 -0
  50. gabriel/utils/logging.py +86 -0
  51. gabriel/utils/mapmaker.py +304 -0
  52. gabriel/utils/media_utils.py +78 -0
  53. gabriel/utils/modality_utils.py +148 -0
  54. gabriel/utils/openai_utils.py +5470 -0
  55. gabriel/utils/parsing.py +282 -0
  56. gabriel/utils/passage_viewer.py +2557 -0
  57. gabriel/utils/pdf_utils.py +20 -0
  58. gabriel/utils/plot_utils.py +2881 -0
  59. gabriel/utils/prompt_utils.py +42 -0
  60. gabriel/utils/word_matching.py +158 -0
  61. openai_gabriel-1.0.1.dist-info/METADATA +443 -0
  62. openai_gabriel-1.0.1.dist-info/RECORD +67 -0
  63. openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
  64. openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
  65. openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
  66. openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
  67. openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/api.py ADDED
@@ -0,0 +1,2284 @@
1
+ import os
2
+ import warnings
3
+ import pandas as pd
4
+ from typing import Awaitable, Callable, Dict, Optional, Union, Any, List, Mapping, Sequence
5
+
6
+ from .tasks import (
7
+ Rate,
8
+ RateConfig,
9
+ Classify,
10
+ ClassifyConfig,
11
+ Rank,
12
+ RankConfig,
13
+ Deidentifier,
14
+ DeidentifyConfig,
15
+ Codify,
16
+ CodifyConfig,
17
+ Extract,
18
+ ExtractConfig,
19
+ Paraphrase,
20
+ ParaphraseConfig,
21
+ Compare,
22
+ CompareConfig,
23
+ Merge,
24
+ MergeConfig,
25
+ Deduplicate,
26
+ DeduplicateConfig,
27
+ Bucket,
28
+ BucketConfig,
29
+ Discover,
30
+ DiscoverConfig,
31
+ Seed,
32
+ SeedConfig,
33
+ Filter,
34
+ FilterConfig,
35
+ Whatever,
36
+ WhateverConfig,
37
+ Ideate,
38
+ IdeateConfig,
39
+ )
40
+ from .utils.openai_utils import get_all_responses
41
+ from .utils.passage_viewer import view as _view_passages
42
+ from .tasks.debias import (
43
+ DebiasConfig,
44
+ DebiasPipeline,
45
+ DebiasResult,
46
+ MeasurementMode,
47
+ RemovalMethod,
48
+ )
49
+
50
+ __all__ = [
51
+ "rate",
52
+ "extract",
53
+ "seed",
54
+ "classify",
55
+ "ideate",
56
+ "id8",
57
+ "deidentify",
58
+ "rank",
59
+ "codify",
60
+ "paraphrase",
61
+ "compare",
62
+ "bucket",
63
+ "discover",
64
+ "deduplicate",
65
+ "merge",
66
+ "filter",
67
+ "debias",
68
+ "whatever",
69
+ "view",
70
+ ]
71
+
72
+ async def rate(
73
+ df: pd.DataFrame,
74
+ column_name: str,
75
+ *,
76
+ attributes: Dict[str, str],
77
+ save_dir: str,
78
+ additional_instructions: Optional[str] = None,
79
+ model: str = "gpt-5-mini",
80
+ n_parallels: int = 650,
81
+ n_runs: int = 1,
82
+ n_attributes_per_run: int = 8,
83
+ reset_files: bool = False,
84
+ use_dummy: bool = False,
85
+ file_name: str = "ratings.csv",
86
+ modality: str = "text",
87
+ reasoning_effort: Optional[str] = None,
88
+ reasoning_summary: Optional[str] = None,
89
+ search_context_size: str = "medium",
90
+ template_path: Optional[str] = None,
91
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
92
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
93
+ **cfg_kwargs,
94
+ ) -> pd.DataFrame:
95
+ """Asks GPT to score each text / image / audio / pdf / item on natural language attributes. Output = 0-100 rating.
96
+
97
+ Example Use
98
+ -----------
99
+ Measure "populist rhetoric" in a speech; "toxicity" of tweets; "luxury" in ad images.
100
+
101
+ Parameters
102
+ ----------
103
+ df:
104
+ Source DataFrame containing the passages to rate.
105
+ column_name:
106
+ Column in ``df`` that holds the passages (text, image, audio, or PDF
107
+ references depending on ``modality``).
108
+ attributes:
109
+ Mapping of attribute names to natural-language descriptions that the
110
+ model should evaluate on a 0–100 scale.
111
+ save_dir:
112
+ Directory where raw responses and the aggregated ratings CSV are
113
+ written. Created if it does not exist.
114
+ additional_instructions:
115
+ Optional extra guidance injected into the prompt template.
116
+ model:
117
+ Model name passed through to the OpenAI Responses API.
118
+ n_parallels:
119
+ Maximum number of concurrent requests to issue.
120
+ n_runs:
121
+ Number of repeat rating passes to perform for each passage.
122
+ n_attributes_per_run:
123
+ Maximum number of attributes to include in a single prompt. Attributes
124
+ are split across prompts when this limit is exceeded.
125
+ reset_files:
126
+ When ``True`` existing outputs in ``save_dir`` are ignored and
127
+ regenerated.
128
+ use_dummy:
129
+ If ``True`` use deterministic dummy responses for offline testing.
130
+ file_name:
131
+ Basename (without the automatic ``_raw_responses`` suffix) for saved
132
+ artifacts.
133
+ modality:
134
+ One of ``"text"``, ``"entity"``, ``"web"``, ``"image"``, ``"audio"``, or ``"pdf"``
135
+ to control how inputs are packaged into prompts.
136
+ reasoning_effort, reasoning_summary:
137
+ Optional OpenAI metadata that tunes reasoning depth and summary capture.
138
+ search_context_size:
139
+ Size hint forwarded to web-search capable models.
140
+ template_path:
141
+ Override the default rating prompt template with a custom Jinja2 file.
142
+ response_fn:
143
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
144
+ that replaces the per-prompt model invocation. Ignored when
145
+ ``get_all_responses_fn`` is supplied.
146
+ get_all_responses_fn:
147
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
148
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
149
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
150
+ **cfg_kwargs:
151
+ Additional overrides applied to :class:`gabriel.tasks.rate.RateConfig`.
152
+
153
+ Returns
154
+ -------
155
+ pandas.DataFrame
156
+ Input DataFrame with one column per attribute containing the mean score
157
+ across runs.
158
+ """
159
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
160
+ os.makedirs(save_dir, exist_ok=True)
161
+ cfg = RateConfig(
162
+ attributes=attributes,
163
+ save_dir=save_dir,
164
+ file_name=file_name,
165
+ model=model,
166
+ n_parallels=n_parallels,
167
+ n_runs=n_runs,
168
+ n_attributes_per_run=n_attributes_per_run,
169
+ use_dummy=use_dummy,
170
+ additional_instructions=additional_instructions,
171
+ modality=modality,
172
+ reasoning_effort=reasoning_effort,
173
+ reasoning_summary=reasoning_summary,
174
+ search_context_size=search_context_size,
175
+ **cfg_kwargs,
176
+ )
177
+ return await Rate(cfg, template_path=template_path).run(
178
+ df,
179
+ column_name,
180
+ reset_files=reset_files,
181
+ response_fn=response_fn,
182
+ get_all_responses_fn=get_all_responses_fn,
183
+ )
184
+
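A minimal usage sketch for the rate helper above (not part of api.py itself): the DataFrame contents, attribute wording, and save_dir are illustrative, and use_dummy=True keeps the run offline as the docstring describes.

import asyncio
import pandas as pd
from gabriel.api import rate

speeches = pd.DataFrame({"speech": ["We will fight for the forgotten men and women."]})

ratings = asyncio.run(
    rate(
        speeches,
        "speech",
        attributes={"populist rhetoric": "appeals to 'the people' against elites"},
        save_dir="./rate_demo",   # hypothetical output directory
        use_dummy=True,           # deterministic offline responses
    )
)
print(ratings.head())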
185
+ async def extract(
186
+ df: pd.DataFrame,
187
+ column_name: str,
188
+ *,
189
+ attributes: Dict[str, str],
190
+ save_dir: str,
191
+ additional_instructions: Optional[str] = None,
192
+ model: str = "gpt-5-mini",
193
+ n_parallels: int = 650,
194
+ n_runs: int = 1,
195
+ n_attributes_per_run: int = 8,
196
+ reset_files: bool = False,
197
+ use_dummy: bool = False,
198
+ file_name: str = "extraction.csv",
199
+ modality: str = "entity",
200
+ reasoning_effort: Optional[str] = None,
201
+ reasoning_summary: Optional[str] = None,
202
+ types: Optional[Dict[str, Any]] = None,
203
+ template_path: Optional[str] = None,
204
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
205
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
206
+ **cfg_kwargs,
207
+ ) -> pd.DataFrame:
208
+ """Structured fact extraction on each item. Output = string / numeric values.
209
+
210
+ Example Use
211
+ -----------
212
+ For each product, provide the "company", "CEO", and "year of invention".
213
+
214
+ Parameters
215
+ ----------
216
+ df:
217
+ Source DataFrame containing the passages to parse.
218
+ column_name:
219
+ Column in ``df`` with the content to extract from.
220
+ attributes:
221
+ Mapping of field names to descriptions of what should be extracted.
222
+ save_dir:
223
+ Directory where extraction outputs will be written. Created if absent.
224
+ additional_instructions:
225
+ Optional extra guidance injected into the extraction prompt.
226
+ model:
227
+ Model used for extraction via the OpenAI Responses API.
228
+ n_parallels:
229
+ Maximum number of concurrent extraction calls.
230
+ n_runs:
231
+ Number of extraction passes to perform; results are averaged when
232
+ applicable.
233
+ n_attributes_per_run:
234
+ Maximum number of attributes to include in each prompt. Attributes are
235
+ split into multiple prompts when this threshold is exceeded.
236
+ reset_files:
237
+ When ``True`` forces regeneration of outputs in ``save_dir``.
238
+ use_dummy:
239
+ If ``True`` return deterministic dummy outputs instead of real API
240
+ calls.
241
+ file_name:
242
+ CSV name used when saving extraction results.
243
+ modality:
244
+ Indicates whether the content is ``"entity"`` text or another modality
245
+ supported by the templates.
246
+ reasoning_effort, reasoning_summary:
247
+ Optional OpenAI metadata for reasoning depth and summarisation.
248
+ types:
249
+ Optional mapping of attribute names to explicit Python types for
250
+ stronger downstream typing.
251
+ template_path:
252
+ Custom Jinja2 template path to override the default extraction prompt.
253
+ response_fn:
254
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
255
+ that replaces the per-prompt model invocation. Ignored when
256
+ ``get_all_responses_fn`` is supplied.
257
+ get_all_responses_fn:
258
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
259
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
260
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
261
+ **cfg_kwargs:
262
+ Additional overrides forwarded to :class:`gabriel.tasks.extract.ExtractConfig`.
263
+
264
+ Returns
265
+ -------
266
+ pandas.DataFrame
267
+ The original DataFrame augmented with one column per requested
268
+ attribute.
269
+ """
270
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
271
+ os.makedirs(save_dir, exist_ok=True)
272
+ cfg = ExtractConfig(
273
+ attributes=attributes,
274
+ save_dir=save_dir,
275
+ file_name=file_name,
276
+ model=model,
277
+ n_parallels=n_parallels,
278
+ n_runs=n_runs,
279
+ n_attributes_per_run=n_attributes_per_run,
280
+ use_dummy=use_dummy,
281
+ additional_instructions=additional_instructions,
282
+ modality=modality,
283
+ reasoning_effort=reasoning_effort,
284
+ reasoning_summary=reasoning_summary,
285
+ **cfg_kwargs,
286
+ )
287
+ return await Extract(cfg, template_path=template_path).run(
288
+ df,
289
+ column_name,
290
+ reset_files=reset_files,
291
+ types=types,
292
+ )
293
+
294
+
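Likewise, a hedged sketch of extract (the product rows, field descriptions, and directory are invented for illustration):

import asyncio
import pandas as pd
from gabriel.api import extract

products = pd.DataFrame({"product": ["Walkman", "Game Boy"]})

facts = asyncio.run(
    extract(
        products,
        "product",
        attributes={
            "company": "company that sells the product",
            "year of invention": "year the product was first released",
        },
        save_dir="./extract_demo",   # hypothetical path
        use_dummy=True,              # offline dummy responses
    )
)
print(facts.head())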
295
+ async def seed(
296
+ instructions: str,
297
+ *,
298
+ save_dir: str,
299
+ file_name: str = "seed_entities.csv",
300
+ model: str = "gpt-5.1",
301
+ n_parallels: int = 650,
302
+ num_entities: int = 1000,
303
+ entities_per_generation: int = 50,
304
+ entity_batch_frac: float = 0.25,
305
+ existing_entities_cap: int = 100,
306
+ use_dummy: bool = False,
307
+ deduplicate: bool = False,
308
+ deduplicate_sample_seed: int = 42,
309
+ reasoning_effort: Optional[str] = None,
310
+ reasoning_summary: Optional[str] = None,
311
+ max_timeout: Optional[float] = None,
312
+ template_path: Optional[str] = None,
313
+ existing_entities: Optional[List[str]] = None,
314
+ reset_files: bool = False,
315
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
316
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
317
+ **response_kwargs: Any,
318
+ ) -> pd.DataFrame:
319
+ """Enforces a representative distribution / diversity of seeds.
320
+
321
+ Example Use
322
+ -----------
323
+ Initialize unique personas that match US population distribution.
324
+
325
+ Parameters
326
+ ----------
327
+ instructions:
328
+ High-level description of the domain and what constitutes a good seed
329
+ entity.
330
+ save_dir:
331
+ Directory where seed entities and raw responses are stored.
332
+ file_name:
333
+ Name of the CSV to write seed entities to.
334
+ model:
335
+ Model used for generation.
336
+ n_parallels:
337
+ Maximum number of concurrent generation calls.
338
+ num_entities:
339
+ Target number of entities to generate in total.
340
+ entities_per_generation:
341
+ Number of entities requested from each API call.
342
+ entity_batch_frac:
343
+ Fraction of generated entities to keep per batch before deduplication.
344
+ existing_entities_cap:
345
+ Maximum number of prior entities to consider when avoiding duplicates.
346
+ use_dummy:
347
+ If ``True`` emit deterministic dummy seeds for offline testing.
348
+ deduplicate:
349
+ When ``True`` over-generate and apply a shallow deduplication pass
350
+ before returning results.
351
+ deduplicate_sample_seed:
352
+ Random seed used when sampling a deterministic subset after
353
+ deduplication.
354
+ reasoning_effort, reasoning_summary:
355
+ Optional OpenAI reasoning controls.
356
+ max_timeout:
357
+ Optional timeout in seconds for each API call.
358
+ template_path:
359
+ Optional Jinja2 template override for the seeding prompt.
360
+ existing_entities:
361
+ List of pre-existing entities to avoid regenerating.
362
+ reset_files:
363
+ When ``True`` ignore any saved state in ``save_dir`` and regenerate.
364
+ response_fn:
365
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
366
+ that replaces the per-prompt model invocation. Ignored when
367
+ ``get_all_responses_fn`` is supplied.
368
+ get_all_responses_fn:
369
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
370
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
371
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
372
+ **response_kwargs:
373
+ Additional keyword arguments forwarded to
374
+ :func:`gabriel.utils.openai_utils.get_all_responses`.
375
+
376
+ Returns
377
+ -------
378
+ pandas.DataFrame
379
+ DataFrame of seed entities with provenance metadata.
380
+ """
381
+
382
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
383
+ os.makedirs(save_dir, exist_ok=True)
384
+ cfg = SeedConfig(
385
+ instructions=instructions,
386
+ save_dir=save_dir,
387
+ file_name=file_name,
388
+ model=model,
389
+ n_parallels=n_parallels,
390
+ num_entities=num_entities,
391
+ entities_per_generation=entities_per_generation,
392
+ entity_batch_frac=entity_batch_frac,
393
+ existing_entities_cap=existing_entities_cap,
394
+ use_dummy=use_dummy,
395
+ deduplicate=deduplicate,
396
+ deduplicate_sample_seed=deduplicate_sample_seed,
397
+ reasoning_effort=reasoning_effort,
398
+ reasoning_summary=reasoning_summary,
399
+ max_timeout=max_timeout,
400
+ )
401
+ task = Seed(cfg, template_path=template_path)
402
+ return await task.run(
403
+ existing_entities=existing_entities,
404
+ reset_files=reset_files,
405
+ response_fn=response_fn,
406
+ get_all_responses_fn=get_all_responses_fn,
407
+ **response_kwargs,
408
+ )
409
+
410
+
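A short sketch of seed under the same caveats (the instructions text, counts, and directory are invented):

import asyncio
from gabriel.api import seed

personas = asyncio.run(
    seed(
        "Distinct personas broadly representative of the US adult population",
        save_dir="./seed_demo",       # hypothetical path
        num_entities=100,             # small target for a quick test
        entities_per_generation=25,
        use_dummy=True,               # offline dummy seeds
    )
)
print(personas.head())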
411
+ async def classify(
412
+ df: pd.DataFrame,
413
+ column_name: Optional[str] = None,
414
+ *,
415
+ labels: Dict[str, str],
416
+ save_dir: str,
417
+ additional_instructions: Optional[str] = None,
418
+ model: str = "gpt-5-mini",
419
+ differentiate: bool = False,
420
+ circle_column_name: Optional[str] = None,
421
+ square_column_name: Optional[str] = None,
422
+ n_parallels: int = 650,
423
+ n_runs: int = 1,
424
+ n_attributes_per_run: int = 8,
425
+ min_frequency: float = 0.6,
426
+ reset_files: bool = False,
427
+ use_dummy: bool = False,
428
+ file_name: str = "classify_responses.csv",
429
+ modality: str = "text",
430
+ reasoning_effort: Optional[str] = None,
431
+ reasoning_summary: Optional[str] = None,
432
+ search_context_size: str = "medium",
433
+ template_path: Optional[str] = None,
434
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
435
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
436
+ **cfg_kwargs,
437
+ ) -> pd.DataFrame:
438
+ """Classifies texts / images / audio / pdfs / items on whether provided labels apply. Output = one or more classes per item.
439
+
440
+ Example Use
441
+ -----------
442
+ Tag news articles, product photos, or interview clips into topical categories.
443
+
444
+ Parameters
445
+ ----------
446
+ df:
447
+ DataFrame containing content to classify.
448
+ column_name:
449
+ Column with the main passage text. Can be ``None`` when using paired
450
+ circle/square inputs.
451
+ labels:
452
+ Mapping of label names to definitions the model should follow.
453
+ save_dir:
454
+ Directory where classification artifacts are written.
455
+ additional_instructions:
456
+ Free-form instructions appended to the classification prompt.
457
+ model:
458
+ Model name used for classification.
459
+ differentiate:
460
+ When ``True`` use differentiation mode to highlight contrasts.
461
+ circle_column_name, square_column_name:
462
+ Optional paired columns for contrastive classification.
463
+ n_parallels:
464
+ Maximum number of concurrent classification calls.
465
+ n_runs:
466
+ Number of repeated classification passes.
467
+ n_attributes_per_run:
468
+ Maximum number of labels to evaluate per prompt. Labels are split into
469
+ batches when this count is exceeded.
470
+ min_frequency:
471
+ Minimum label frequency required to keep a label during aggregation.
472
+ reset_files:
473
+ When ``True`` overwrite any existing outputs in ``save_dir``.
474
+ use_dummy:
475
+ If ``True`` return deterministic dummy outputs for offline testing.
476
+ file_name:
477
+ Basename for saved classification CSVs.
478
+ modality:
479
+ Indicates the content modality for prompt rendering.
480
+ reasoning_effort, reasoning_summary:
481
+ Optional OpenAI reasoning controls.
482
+ search_context_size:
483
+ Context size hint forwarded to the Responses API.
484
+ template_path:
485
+ Override the default classification prompt template.
486
+ response_fn:
487
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
488
+ that replaces the per-prompt model invocation. Ignored when
489
+ ``get_all_responses_fn`` is supplied.
490
+ get_all_responses_fn:
491
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
492
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
493
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
494
+ **cfg_kwargs:
495
+ Extra configuration passed to :class:`gabriel.tasks.classify.ClassifyConfig`.
496
+
497
+ Returns
498
+ -------
499
+ pandas.DataFrame
500
+ DataFrame including one column per label plus ``predicted_classes``; aggregates repeated runs using ``min_frequency``.
501
+ """
502
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
503
+ os.makedirs(save_dir, exist_ok=True)
504
+ cfg = ClassifyConfig(
505
+ labels=labels,
506
+ save_dir=save_dir,
507
+ file_name=file_name,
508
+ model=model,
509
+ differentiate=differentiate,
510
+ n_parallels=n_parallels,
511
+ n_runs=n_runs,
512
+ n_attributes_per_run=n_attributes_per_run,
513
+ min_frequency=min_frequency,
514
+ additional_instructions=additional_instructions or "",
515
+ use_dummy=use_dummy,
516
+ modality=modality,
517
+ reasoning_effort=reasoning_effort,
518
+ reasoning_summary=reasoning_summary,
519
+ search_context_size=search_context_size,
520
+ **cfg_kwargs,
521
+ )
522
+ return await Classify(cfg, template_path=template_path).run(
523
+ df,
524
+ column_name,
525
+ circle_column_name=circle_column_name,
526
+ square_column_name=square_column_name,
527
+ reset_files=reset_files,
528
+ response_fn=response_fn,
529
+ get_all_responses_fn=get_all_responses_fn,
530
+ )
531
+
532
+
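A hedged usage sketch for classify (the sample headlines, label definitions, and directory are illustrative):

import asyncio
import pandas as pd
from gabriel.api import classify

articles = pd.DataFrame({"headline": [
    "Central bank raises rates for the third time this year",
    "Local team clinches the championship in overtime",
]})

tagged = asyncio.run(
    classify(
        articles,
        "headline",
        labels={
            "economy": "covers economic or financial policy",
            "sports": "covers athletic competitions",
        },
        save_dir="./classify_demo",   # hypothetical path
        use_dummy=True,
    )
)
print(tagged.head())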
533
+ async def ideate(
534
+ topic: str,
535
+ *,
536
+ save_dir: str,
537
+ file_name: str = "ideation.csv",
538
+ model: str = "gpt-5-mini",
539
+ ranking_model: Optional[str] = None,
540
+ n_ideas: int = 1000,
541
+ n_parallels: int = 650,
542
+ evaluation_mode: str = "recursive_rank",
543
+ attributes: Optional[Dict[str, str]] = None,
544
+ rank_attribute: Optional[str] = None,
545
+ recursive_fraction: float = 1.0 / 3.0,
546
+ recursive_min_remaining: int = 30,
547
+ recursive_final_round_multiplier: int = 3,
548
+ recursive_cut_side: str = "top",
549
+ recursive_rate_first_round: bool = True,
550
+ additional_instructions: Optional[str] = None,
551
+ web_search: bool = False,
552
+ use_dummy: bool = False,
553
+ reasoning_effort: Optional[str] = None,
554
+ reasoning_summary: Optional[str] = None,
555
+ reset_files: bool = False,
556
+ generation_kwargs: Optional[Dict[str, Any]] = None,
557
+ rank_config_updates: Optional[Dict[str, Any]] = None,
558
+ rank_run_kwargs: Optional[Dict[str, Any]] = None,
559
+ rate_config_updates: Optional[Dict[str, Any]] = None,
560
+ rate_run_kwargs: Optional[Dict[str, Any]] = None,
561
+ use_seed_entities: Optional[bool] = None,
562
+ seed_deduplicate: bool = True,
563
+ seed_config_updates: Optional[Dict[str, Any]] = None,
564
+ seed_run_kwargs: Optional[Dict[str, Any]] = None,
565
+ template_path: Optional[str] = None,
566
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
567
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
568
+ ) -> pd.DataFrame:
569
+ """Generates many novel scientific theories and filters the cream of the crop.
570
+
571
+ Example Use
572
+ -----------
573
+ Procure novel theories on inflation for potential research.
574
+
575
+ Parameters
576
+ ----------
577
+ topic:
578
+ Subject area or question to ideate on.
579
+ save_dir:
580
+ Directory where generated ideas and intermediate rankings are saved.
581
+ file_name:
582
+ CSV name for the consolidated ideation output.
583
+ model, ranking_model:
584
+ Models used for idea generation and ranking (if different).
585
+ n_ideas:
586
+ Target number of ideas to generate before pruning.
587
+ n_parallels:
588
+ Maximum concurrent calls for generation and ranking phases.
589
+ evaluation_mode:
590
+ Strategy used to evaluate ideas (for example ``"recursive_rank"``).
591
+ attributes:
592
+ Optional attributes to rate ideas on during evaluation.
593
+ rank_attribute:
594
+ Name of the attribute used for final ranking when multiple attributes
595
+ are present.
596
+ recursive_*:
597
+ Parameters controlling iterative ranking passes (fraction kept,
598
+ minimum remaining, cut side, etc.).
599
+ additional_instructions:
600
+ Extra guidance injected into prompts for both generation and ranking.
601
+ web_search:
602
+ Enable web search augmentation for generation.
603
+ use_dummy:
604
+ When ``True`` perform deterministic offline runs.
605
+ reasoning_effort, reasoning_summary:
606
+ Optional OpenAI reasoning controls.
607
+ reset_files:
608
+ Force regeneration of outputs in ``save_dir``.
609
+ *_config_updates, *_run_kwargs:
610
+ Fine-grained overrides for nested Rate/Rank/Seed tasks.
611
+ seed_deduplicate:
612
+ When ``True`` enable deduplication in the nested seed generation.
613
+ template_path:
614
+ Optional template override for the ideation prompts.
615
+ response_fn:
616
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
617
+ that replaces the per-prompt model invocation. Ignored when
618
+ ``get_all_responses_fn`` is supplied.
619
+ get_all_responses_fn:
620
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
621
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
622
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
623
+
624
+ Returns
625
+ -------
626
+ pandas.DataFrame
627
+ Ranked list of ideas with evaluation metadata.
628
+ """
629
+
630
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
631
+ os.makedirs(save_dir, exist_ok=True)
632
+
633
+ cfg_kwargs: Dict[str, Any] = dict(
634
+ save_dir=save_dir,
635
+ file_name=file_name,
636
+ model=model,
637
+ ranking_model=ranking_model,
638
+ n_parallels=n_parallels,
639
+ n_ideas=n_ideas,
640
+ evaluation_mode=evaluation_mode,
641
+ rank_attribute=rank_attribute,
642
+ recursive_fraction=recursive_fraction,
643
+ recursive_min_remaining=recursive_min_remaining,
644
+ recursive_final_round_multiplier=recursive_final_round_multiplier,
645
+ recursive_cut_side=recursive_cut_side,
646
+ recursive_rate_first_round=recursive_rate_first_round,
647
+ additional_instructions=additional_instructions,
648
+ web_search=web_search,
649
+ use_dummy=use_dummy,
650
+ reasoning_effort=reasoning_effort,
651
+ reasoning_summary=reasoning_summary,
652
+ seed_deduplicate=seed_deduplicate,
653
+ )
654
+ if attributes is not None:
655
+ cfg_kwargs["attributes"] = attributes
656
+ cfg = IdeateConfig(**cfg_kwargs)
657
+
658
+ def _with_callable_overrides(payload: Optional[Dict[str, Any]]) -> Dict[str, Any]:
659
+ updated = dict(payload or {})
660
+ if response_fn is not None:
661
+ updated.setdefault("response_fn", response_fn)
662
+ if get_all_responses_fn is not None:
663
+ updated.setdefault("get_all_responses_fn", get_all_responses_fn)
664
+ return updated
665
+
666
+ generation_kwargs = _with_callable_overrides(generation_kwargs)
667
+ rank_run_kwargs = _with_callable_overrides(rank_run_kwargs)
668
+ rate_run_kwargs = _with_callable_overrides(rate_run_kwargs)
669
+ seed_run_kwargs = _with_callable_overrides(seed_run_kwargs)
670
+
671
+ ideator = Ideate(cfg, template_path=template_path)
672
+ return await ideator.run(
673
+ topic,
674
+ additional_instructions=additional_instructions,
675
+ evaluation_mode=evaluation_mode,
676
+ attributes=attributes,
677
+ rank_attribute=rank_attribute,
678
+ reset_files=reset_files,
679
+ generation_kwargs=generation_kwargs,
680
+ rank_config_updates=rank_config_updates,
681
+ rank_run_kwargs=rank_run_kwargs,
682
+ rate_config_updates=rate_config_updates,
683
+ rate_run_kwargs=rate_run_kwargs,
684
+ use_seed_entities=use_seed_entities,
685
+ seed_config_updates=seed_config_updates,
686
+ seed_run_kwargs=seed_run_kwargs,
687
+ )
688
+
689
+
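A sketch of ideate (the topic, idea count, and directory are illustrative; id8 below is an alias for the same call):

import asyncio
from gabriel.api import ideate

ideas = asyncio.run(
    ideate(
        "Novel mechanisms behind persistent inflation",
        save_dir="./ideate_demo",   # hypothetical path
        n_ideas=50,                 # small batch for a quick trial
        use_dummy=True,             # deterministic offline run
    )
)
print(ideas.head())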
690
+ async def id8(*args, **kwargs) -> pd.DataFrame:
691
+ """Alias for :func:`ideate`."""
692
+
693
+ return await ideate(*args, **kwargs)
694
+
695
+
696
+ async def deidentify(
697
+ df: pd.DataFrame,
698
+ column_name: str,
699
+ *,
700
+ save_dir: str,
701
+ grouping_column: Optional[str] = None,
702
+ mapping_column: Optional[str] = None,
703
+ model: str = "gpt-5-mini",
704
+ n_parallels: int = 650,
705
+ use_dummy: bool = False,
706
+ file_name: str = "deidentified.csv",
707
+ max_words_per_call: int = 7500,
708
+ additional_instructions: Optional[str] = None,
709
+ reasoning_effort: Optional[str] = None,
710
+ reasoning_summary: Optional[str] = None,
711
+ n_passes: int = 1,
712
+ use_existing_mappings_only: bool = False,
713
+ template_path: Optional[str] = None,
714
+ reset_files: bool = False,
715
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
716
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
717
+ **cfg_kwargs,
718
+ ) -> pd.DataFrame:
719
+ """Replaces PII with realistic, consistent fake PII. Outputs anonymized text + mapping.
720
+
721
+ Example Use
722
+ -----------
723
+ Replace names, employers, addresses before sharing interview corpora.
724
+
725
+ Parameters
726
+ ----------
727
+ df:
728
+ DataFrame containing passages to deidentify.
729
+ column_name:
730
+ Column in ``df`` holding the text to scrub.
731
+ save_dir:
732
+ Directory where anonymized outputs and mappings are written.
733
+ grouping_column:
734
+ Optional column grouping records that should share replacements.
735
+ mapping_column:
736
+ Optional column providing deterministic replacement tokens.
737
+ model:
738
+ Model name used to perform the deidentification.
739
+ n_parallels:
740
+ Maximum concurrent requests.
741
+ use_dummy:
742
+ When ``True`` produce deterministic dummy replacements for testing.
743
+ file_name:
744
+ CSV filename used when persisting deidentified text.
745
+ max_words_per_call:
746
+ Chunk size control for long passages.
747
+ additional_instructions:
748
+ Extra guidance appended to the prompt.
749
+ reasoning_effort, reasoning_summary:
750
+ Optional OpenAI reasoning controls.
751
+ n_passes:
752
+ Number of deidentification passes to run over each passage.
753
+ use_existing_mappings_only:
754
+ If ``True`` only apply existing mappings and avoid new model calls.
755
+ template_path:
756
+ Custom prompt template path.
757
+ reset_files:
758
+ When ``True`` ignore cached outputs in ``save_dir``.
759
+ response_fn:
760
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
761
+ that replaces the per-prompt model invocation. Ignored when
762
+ ``get_all_responses_fn`` is supplied.
763
+ get_all_responses_fn:
764
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
765
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
766
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
767
+ **cfg_kwargs:
768
+ Additional overrides for :class:`gabriel.tasks.deidentify.DeidentifyConfig`.
769
+
770
+ Returns
771
+ -------
772
+ pandas.DataFrame
773
+ DataFrame containing deidentified text and replacement mappings.
774
+ """
775
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
776
+ os.makedirs(save_dir, exist_ok=True)
777
+ cfg = DeidentifyConfig(
778
+ save_dir=save_dir,
779
+ file_name=file_name,
780
+ model=model,
781
+ n_parallels=n_parallels,
782
+ use_dummy=use_dummy,
783
+ max_words_per_call=max_words_per_call,
784
+ additional_instructions=additional_instructions,
785
+ reasoning_effort=reasoning_effort,
786
+ reasoning_summary=reasoning_summary,
787
+ n_passes=n_passes,
788
+ use_existing_mappings_only=use_existing_mappings_only,
789
+ **cfg_kwargs,
790
+ )
791
+ return await Deidentifier(cfg, template_path=template_path).run(
792
+ df,
793
+ column_name,
794
+ grouping_column=grouping_column,
795
+ mapping_column=mapping_column,
796
+ reset_files=reset_files,
797
+ response_fn=response_fn,
798
+ get_all_responses_fn=get_all_responses_fn,
799
+ )
800
+
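A minimal deidentify sketch (the transcript and directory are invented; a real run would call the API unless use_dummy is set):

import asyncio
import pandas as pd
from gabriel.api import deidentify

interviews = pd.DataFrame({"transcript": [
    "My name is Jane Doe and I work at Acme Corp in Springfield.",
]})

cleaned = asyncio.run(
    deidentify(
        interviews,
        "transcript",
        save_dir="./deidentify_demo",   # hypothetical path
        use_dummy=True,
    )
)
print(cleaned.head())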
801
+ async def rank(
802
+ df: pd.DataFrame,
803
+ column_name: str,
804
+ *,
805
+ attributes: Union[Dict[str, str], List[str]],
806
+ save_dir: str,
807
+ additional_instructions: Optional[str] = None,
808
+ model: str = "gpt-5-mini",
809
+ n_rounds: int = 5,
810
+ matches_per_round: int = 3,
811
+ power_matching: bool = True,
812
+ return_raw_scores: bool = False,
813
+ learning_rate: float = 0.1,
814
+ n_parallels: int = 650,
815
+ n_attributes_per_run: int = 8,
816
+ use_dummy: bool = False,
817
+ file_name: str = "rankings",
818
+ reset_files: bool = False,
819
+ modality: str = "text",
820
+ reasoning_effort: Optional[str] = None,
821
+ reasoning_summary: Optional[str] = None,
822
+ template_path: Optional[str] = None,
823
+ recursive: bool = False,
824
+ recursive_fraction: float = 1.0 / 3.0,
825
+ recursive_min_remaining: int = 30,
826
+ recursive_final_round_multiplier: int = 3,
827
+ recursive_cut_attr: Optional[str] = None,
828
+ recursive_cut_side: str = "top",
829
+ recursive_rate_first_round: bool = True,
830
+ recursive_rewrite_func: Optional[Callable[[str, str, int], str]] = None,
831
+ recursive_rewrite_text_col: str = "text",
832
+ recursive_keep_stage_columns: bool = True,
833
+ recursive_add_stage_suffix: bool = True,
834
+ initial_rating_pass: bool = True,
835
+ rate_kwargs: Optional[Dict[str, Any]] = None,
836
+ primer_scores: Optional[Dict[str, Dict[str, float]]] = None,
837
+ primer_scale: float = 1.0,
838
+ primer_center: bool = True,
839
+ id_column: Optional[str] = None,
840
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
841
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
842
+ **cfg_kwargs,
843
+ ) -> pd.DataFrame:
844
+ """Pairwise comparisons between texts yields ELO-like attribute ratings. Output = grounded, relative z scores for each text.
845
+
846
+ Example Use
847
+ -----------
848
+ Rank technologies by "bulkiness" or artworks by "fine brushwork".
849
+
850
+ Parameters
851
+ ----------
852
+ df:
853
+ DataFrame containing passages to rank.
854
+ column_name:
855
+ Column holding the content to rank.
856
+ attributes:
857
+ Either a mapping of attribute names to descriptions or a list of
858
+ attribute names (descriptions inferred from templates).
859
+ save_dir:
860
+ Directory where ranking artifacts are saved.
861
+ additional_instructions:
862
+ Free-form prompt additions applied to each comparison.
863
+ model:
864
+ Model name used for ranking calls.
865
+ n_rounds, matches_per_round, power_matching, learning_rate:
866
+ Parameters controlling the Elo-style tournament mechanics.
867
+ n_parallels:
868
+ Maximum concurrent ranking calls.
869
+ n_attributes_per_run:
870
+ Maximum number of attributes to compare per prompt. Attributes are
871
+ batched across prompts when this cap is exceeded.
872
+ use_dummy:
873
+ When ``True`` run deterministic offline ranking.
874
+ file_name:
875
+ Base filename for saved rankings (without extension).
876
+ reset_files:
877
+ Force regeneration of any existing outputs in ``save_dir``.
878
+ modality:
879
+ Content modality forwarded to the prompt.
880
+ reasoning_effort, reasoning_summary:
881
+ Optional OpenAI reasoning controls.
882
+ template_path:
883
+ Path to a custom ranking prompt template.
884
+ recursive_*:
885
+ Settings for recursive pruning (fraction kept, minimum remaining, etc.).
886
+ initial_rating_pass:
887
+ Whether to run a preliminary rating stage before comparisons. Enabled by
888
+ default to give the tournament grounded starting scores; set to
889
+ ``False`` to skip the rating seed.
890
+ rate_kwargs:
891
+ Additional configuration forwarded to the preliminary rating stage.
892
+ primer_scores, primer_scale, primer_center:
893
+ Optional seed ratings to prime the Bradley–Terry loop. Scores are
894
+ centred per attribute when ``primer_center`` is ``True`` and scaled
895
+ by ``primer_scale``.
896
+ id_column:
897
+ Optional existing identifier column; otherwise hashes of ``column_name``
898
+ are generated.
899
+ response_fn:
900
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
901
+ that replaces the per-prompt model invocation. Ignored when
902
+ ``get_all_responses_fn`` is supplied.
903
+ get_all_responses_fn:
904
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
905
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
906
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
907
+ **cfg_kwargs:
908
+ Extra parameters passed to :class:`gabriel.tasks.rank.RankConfig`.
909
+
910
+ Returns
911
+ -------
912
+ pandas.DataFrame
913
+ Ranked outputs. The CSV written to ``save_dir`` always contains raw
914
+ scores and standard errors, but the returned DataFrame hides those
915
+ columns unless ``return_raw_scores`` is ``True``.
916
+ """
917
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
918
+ os.makedirs(save_dir, exist_ok=True)
919
+ cfg = RankConfig(
920
+ attributes=attributes,
921
+ n_rounds=n_rounds,
922
+ matches_per_round=matches_per_round,
923
+ power_matching=power_matching,
924
+ learning_rate=learning_rate,
925
+ model=model,
926
+ n_parallels=n_parallels,
927
+ n_attributes_per_run=n_attributes_per_run,
928
+ use_dummy=use_dummy,
929
+ save_dir=save_dir,
930
+ file_name=file_name,
931
+ additional_instructions=additional_instructions or "",
932
+ modality=modality,
933
+ reasoning_effort=reasoning_effort,
934
+ reasoning_summary=reasoning_summary,
935
+ recursive=recursive,
936
+ recursive_fraction=recursive_fraction,
937
+ recursive_min_remaining=recursive_min_remaining,
938
+ recursive_final_round_multiplier=recursive_final_round_multiplier,
939
+ recursive_cut_attr=recursive_cut_attr,
940
+ recursive_cut_side=recursive_cut_side,
941
+ recursive_rate_first_round=recursive_rate_first_round,
942
+ recursive_rewrite_func=recursive_rewrite_func,
943
+ recursive_rewrite_text_col=recursive_rewrite_text_col,
944
+ recursive_keep_stage_columns=recursive_keep_stage_columns,
945
+ recursive_add_stage_suffix=recursive_add_stage_suffix,
946
+ initial_rating_pass=initial_rating_pass,
947
+ rate_kwargs=rate_kwargs or {},
948
+ primer_scores=primer_scores,
949
+ primer_scale=primer_scale,
950
+ primer_center=primer_center,
951
+ **cfg_kwargs,
952
+ )
953
+ result_df = await Rank(cfg, template_path=template_path).run(
954
+ df,
955
+ column_name,
956
+ id_column=id_column,
957
+ reset_files=reset_files,
958
+ response_fn=response_fn,
959
+ get_all_responses_fn=get_all_responses_fn,
960
+ )
961
+
962
+ # By default only expose the z-score columns (attribute names without suffixes)
963
+ # to API callers while keeping the raw/SE columns persisted in the CSV output.
964
+ if return_raw_scores:
965
+ return result_df
966
+
967
+ if isinstance(attributes, dict):
968
+ attr_keys: List[str] = list(attributes.keys())
969
+ else:
970
+ attr_keys = list(attributes)
971
+ drop_cols: List[str] = []
972
+ for attr in attr_keys:
973
+ raw_col = f"{attr}_raw"
974
+ se_col = f"{attr}_se"
975
+ if raw_col in result_df.columns:
976
+ drop_cols.append(raw_col)
977
+ if se_col in result_df.columns:
978
+ drop_cols.append(se_col)
979
+ if drop_cols:
980
+ result_df = result_df.drop(columns=drop_cols)
981
+ return result_df
982
+
983
+
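A hedged sketch of rank (the item descriptions, attribute, round count, and directory are illustrative):

import asyncio
import pandas as pd
from gabriel.api import rank

gadgets = pd.DataFrame({"description": [
    "A pocket-sized translator",
    "A desktop 3D printer",
    "A room-sized mainframe",
]})

ranked = asyncio.run(
    rank(
        gadgets,
        "description",
        attributes={"bulkiness": "physical size and unwieldiness"},
        save_dir="./rank_demo",   # hypothetical path
        n_rounds=2,               # fewer tournament rounds for a quick test
        use_dummy=True,
    )
)
print(ranked.head())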
984
+ async def codify(
985
+ df: pd.DataFrame,
986
+ column_name: str,
987
+ *,
988
+ save_dir: str,
989
+ categories: Optional[Dict[str, str]] = None,
990
+ additional_instructions: str = "",
991
+ model: str = "gpt-5-mini",
992
+ n_parallels: int = 650,
993
+ max_words_per_call: int = 1000,
994
+ max_categories_per_call: int = 8,
995
+ file_name: str = "coding_results.csv",
996
+ reset_files: bool = False,
997
+ debug_print: bool = False,
998
+ use_dummy: bool = False,
999
+ reasoning_effort: Optional[str] = None,
1000
+ reasoning_summary: Optional[str] = None,
1001
+ modality: str = "text",
1002
+ json_mode: bool = True,
1003
+ max_timeout: Optional[float] = None,
1004
+ n_rounds: int = 2,
1005
+ completion_classifier_instructions: Optional[str] = None,
1006
+ template_path: Optional[str] = None,
1007
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1008
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1009
+ **cfg_kwargs,
1010
+ ) -> pd.DataFrame:
1011
+ """Passage coding: highlights snippets in text that match qualitative codes.
1012
+
1013
+ Example Use
1014
+ -----------
1015
+ Flag sentences about "economic insecurity" in speeches; "stressors" mentioned in interviews.
1016
+
1017
+ Parameters
1018
+ ----------
1019
+ df:
1020
+ DataFrame containing the passages to code.
1021
+ column_name:
1022
+ Column with the text to be coded.
1023
+ save_dir:
1024
+ Directory where coding outputs are written.
1025
+ categories:
1026
+ Optional mapping of category names to descriptions. If omitted the model
1027
+ infers categories.
1028
+ additional_instructions:
1029
+ Extra guidance appended to the coding prompt.
1030
+ model:
1031
+ Model used for coding requests.
1032
+ n_parallels:
1033
+ Maximum number of concurrent coding calls.
1034
+ max_words_per_call:
1035
+ Chunk size control for each request.
1036
+ max_categories_per_call:
1037
+ Limit on the number of categories evaluated per call.
1038
+ file_name:
1039
+ Filename for saved coding responses.
1040
+ reset_files:
1041
+ When ``True`` regenerate outputs even if files exist.
1042
+ debug_print:
1043
+ Enable verbose logging of prompts and responses.
1044
+ use_dummy:
1045
+ Use deterministic dummy outputs for testing.
1046
+ reasoning_effort, reasoning_summary:
1047
+ Optional OpenAI reasoning controls.
1048
+ modality:
1049
+ Content modality hint (text, entity, etc.).
1050
+ json_mode:
1051
+ Request JSON-mode responses where supported.
1052
+ max_timeout:
1053
+ Optional per-call timeout.
1054
+ n_rounds:
1055
+ Number of completion passes to refine codes.
1056
+ completion_classifier_instructions:
1057
+ Optional classifier guidance for completion steps.
1058
+ template_path:
1059
+ Custom Jinja2 template for coding prompts.
1060
+ response_fn:
1061
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1062
+ that replaces the per-prompt model invocation. Ignored when
1063
+ ``get_all_responses_fn`` is supplied.
1064
+ get_all_responses_fn:
1065
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1066
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1067
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1068
+ **cfg_kwargs:
1069
+ Additional overrides passed to :class:`gabriel.tasks.codify.CodifyConfig`.
1070
+
1071
+ Returns
1072
+ -------
1073
+ pandas.DataFrame
1074
+ DataFrame with coded categories and any iterative refinement metadata.
1075
+ """
1076
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1077
+ os.makedirs(save_dir, exist_ok=True)
1078
+ cfg_kwargs = dict(cfg_kwargs)
1079
+
1080
+ cfg = CodifyConfig(
1081
+ save_dir=save_dir,
1082
+ file_name=file_name,
1083
+ model=model,
1084
+ n_parallels=n_parallels,
1085
+ max_words_per_call=max_words_per_call,
1086
+ max_categories_per_call=max_categories_per_call,
1087
+ debug_print=debug_print,
1088
+ use_dummy=use_dummy,
1089
+ reasoning_effort=reasoning_effort,
1090
+ reasoning_summary=reasoning_summary,
1091
+ modality=modality,
1092
+ json_mode=json_mode,
1093
+ max_timeout=max_timeout,
1094
+ n_rounds=n_rounds,
1095
+ completion_classifier_instructions=completion_classifier_instructions,
1096
+ **cfg_kwargs,
1097
+ )
1098
+ return await Codify(cfg, template_path=template_path).run(
1099
+ df,
1100
+ column_name,
1101
+ categories=categories,
1102
+ additional_instructions=additional_instructions,
1103
+ reset_files=reset_files,
1104
+ response_fn=response_fn,
1105
+ get_all_responses_fn=get_all_responses_fn,
1106
+ )
1107
+
1108
+
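A short codify sketch (the speech text, category definition, and directory are invented):

import asyncio
import pandas as pd
from gabriel.api import codify

speeches = pd.DataFrame({"speech": [
    "Families here are struggling to pay rent and keep food on the table.",
]})

coded = asyncio.run(
    codify(
        speeches,
        "speech",
        save_dir="./codify_demo",   # hypothetical path
        categories={"economic insecurity": "worries about jobs, prices, or making ends meet"},
        use_dummy=True,
    )
)
print(coded.head())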
1109
+ async def paraphrase(
1110
+ df: pd.DataFrame,
1111
+ column_name: str,
1112
+ *,
1113
+ instructions: str,
1114
+ save_dir: str,
1115
+ revised_column_name: Optional[str] = None,
1116
+ n_revisions: int = 1,
1117
+ file_name: str = "paraphrase_responses.csv",
1118
+ model: str = "gpt-5-mini",
1119
+ json_mode: bool = False,
1120
+ web_search: Optional[bool] = None,
1121
+ n_parallels: int = 650,
1122
+ use_dummy: bool = False,
1123
+ reset_files: bool = False,
1124
+ reasoning_effort: Optional[str] = None,
1125
+ reasoning_summary: Optional[str] = None,
1126
+ n_rounds: int = 1,
1127
+ recursive_validation: Optional[bool] = None,
1128
+ n_initial_candidates: int = 1,
1129
+ n_validation_candidates: int = 5,
1130
+ use_modified_source: bool = False,
1131
+ template_path: Optional[str] = None,
1132
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1133
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1134
+ **cfg_kwargs,
1135
+ ) -> pd.DataFrame:
1136
+ """Rewrites texts consistently per instructions.
1137
+
1138
+ Example Use
1139
+ -----------
1140
+ Summarize earnings call transcripts to remove company specifics.
1141
+
1142
+ Parameters
1143
+ ----------
1144
+ df:
1145
+ DataFrame containing passages to paraphrase.
1146
+ column_name:
1147
+ Column with text to rewrite.
1148
+ instructions:
1149
+ Guidance describing how the paraphrase should differ from the source.
1150
+ save_dir:
1151
+ Directory where paraphrase outputs are written.
1152
+ revised_column_name:
1153
+ Optional name for the paraphrased column; defaults to a generated one.
1154
+ n_revisions:
1155
+ Number of paraphrases to produce per passage.
1156
+ file_name:
1157
+ CSV filename for saved paraphrases.
1158
+ model:
1159
+ Model name used for generation.
1160
+ json_mode:
1161
+ Whether to request JSON responses.
1162
+ web_search:
1163
+ Enable web search augmentation when supported by the model.
1164
+ n_parallels:
1165
+ Maximum concurrent paraphrase calls.
1166
+ use_dummy:
1167
+ Produce deterministic dummy paraphrases.
1168
+ reset_files:
1169
+ When ``True`` regenerate outputs even if files already exist.
1170
+ reasoning_effort, reasoning_summary:
1171
+ Optional OpenAI reasoning controls.
1172
+ n_rounds:
1173
+ Maximum number of paraphrase/validation cycles. ``1`` disables recursion.
1174
+ recursive_validation:
1175
+ Deprecated boolean flag retained for compatibility; prefer ``n_rounds``.
1176
+ n_initial_candidates, n_validation_candidates:
1177
+ Control the number of candidates in generation and validation phases.
1178
+ use_modified_source:
1179
+ If ``True`` allow modified source text to be used during validation.
1180
+ template_path:
1181
+ Custom template path to override the default paraphrase prompt.
1182
+ response_fn:
1183
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1184
+ that replaces the per-prompt model invocation. Ignored when
1185
+ ``get_all_responses_fn`` is supplied.
1186
+ get_all_responses_fn:
1187
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1188
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1189
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1190
+ **cfg_kwargs:
1191
+ Additional configuration passed to :class:`gabriel.tasks.paraphrase.ParaphraseConfig`.
1192
+
1193
+ Returns
1194
+ -------
1195
+ pandas.DataFrame
1196
+ DataFrame containing paraphrased text and any validation scores.
1197
+ """
1198
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1199
+ os.makedirs(save_dir, exist_ok=True)
1200
+ cfg = ParaphraseConfig(
1201
+ instructions=instructions,
1202
+ revised_column_name=revised_column_name,
1203
+ n_revisions=n_revisions,
1204
+ save_dir=save_dir,
1205
+ file_name=file_name,
1206
+ model=model,
1207
+ json_mode=json_mode,
1208
+ web_search=web_search,
1209
+ n_parallels=n_parallels,
1210
+ use_dummy=use_dummy,
1211
+ reasoning_effort=reasoning_effort,
1212
+ reasoning_summary=reasoning_summary,
1213
+ n_rounds=n_rounds,
1214
+ recursive_validation=recursive_validation,
1215
+ n_initial_candidates=n_initial_candidates,
1216
+ n_validation_candidates=n_validation_candidates,
1217
+ use_modified_source=use_modified_source,
1218
+ **cfg_kwargs,
1219
+ )
1220
+ return await Paraphrase(cfg, template_path=template_path).run(
1221
+ df,
1222
+ column_name,
1223
+ reset_files=reset_files,
1224
+ response_fn=response_fn,
1225
+ get_all_responses_fn=get_all_responses_fn,
1226
+ )
1227
+
1228
+
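A paraphrase sketch under the same caveats (the transcript, instructions, and directory are illustrative):

import asyncio
import pandas as pd
from gabriel.api import paraphrase

calls = pd.DataFrame({"transcript": [
    "Acme Corp expects third-quarter revenue of $12 million, up 8% year over year.",
]})

rewritten = asyncio.run(
    paraphrase(
        calls,
        "transcript",
        instructions="Summarize the passage and strip all company-specific details.",
        save_dir="./paraphrase_demo",   # hypothetical path
        use_dummy=True,
    )
)
print(rewritten.head())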
1229
+ async def compare(
1230
+ df: pd.DataFrame,
1231
+ circle_column_name: str,
1232
+ square_column_name: str,
1233
+ *,
1234
+ save_dir: str,
1235
+ differentiate: bool = True,
1236
+ additional_instructions: Optional[str] = None,
1237
+ model: str = "gpt-5-mini",
1238
+ n_parallels: int = 650,
1239
+ n_runs: int = 1,
1240
+ reset_files: bool = False,
1241
+ use_dummy: bool = False,
1242
+ file_name: str = "comparison_responses.csv",
1243
+ modality: str = "text",
1244
+ reasoning_effort: Optional[str] = None,
1245
+ reasoning_summary: Optional[str] = None,
1246
+ template_path: Optional[str] = None,
1247
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1248
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1249
+ **cfg_kwargs,
1250
+ ) -> pd.DataFrame:
1251
+ """Identifies similarities / differences between paired items. Output = list of differences.
1252
+
1253
+ Example Use
1254
+ -----------
1255
+ Contrast op-eds from different districts; compare two ad campaigns.
1256
+
1257
+ Parameters
1258
+ ----------
1259
+ df:
1260
+ DataFrame containing the paired passages to compare.
1261
+ circle_column_name, square_column_name:
1262
+ Columns representing the two sides of each comparison.
1263
+ save_dir:
1264
+ Directory where comparison outputs are written.
1265
+ differentiate:
1266
+ Whether to prompt the model to emphasise key differences.
1267
+ additional_instructions:
1268
+ Extra prompt guidance applied to each comparison.
1269
+ model:
1270
+ Model name for comparison calls.
1271
+ n_parallels:
1272
+ Maximum number of concurrent comparison requests.
1273
+ n_runs:
1274
+ Number of repeated comparisons to gather per pair.
1275
+ reset_files:
1276
+ When ``True`` regenerate results regardless of existing files.
1277
+ use_dummy:
1278
+ If ``True`` return deterministic dummy comparison outputs.
1279
+ file_name:
1280
+ CSV filename for saved comparison responses.
1281
+ modality:
1282
+ Content modality hint for prompt rendering.
1283
+ reasoning_effort, reasoning_summary:
1284
+ Optional OpenAI reasoning controls.
1285
+ template_path:
1286
+ Custom template override for comparison prompts.
1287
+ response_fn:
1288
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1289
+ that replaces the per-prompt model invocation. Ignored when
1290
+ ``get_all_responses_fn`` is supplied.
1291
+ get_all_responses_fn:
1292
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1293
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1294
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1295
+ **cfg_kwargs:
1296
+ Additional configuration passed to :class:`gabriel.tasks.compare.CompareConfig`.
1297
+
1298
+ Returns
1299
+ -------
1300
+ pandas.DataFrame
1301
+ DataFrame indexed by both input columns with one row per attribute and
1302
+ an ``explanation`` field describing the preference rationale.
1303
+ """
1304
+
1305
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1306
+ os.makedirs(save_dir, exist_ok=True)
1307
+ cfg = CompareConfig(
1308
+ save_dir=save_dir,
1309
+ file_name=file_name,
1310
+ model=model,
1311
+ n_parallels=n_parallels,
1312
+ n_runs=n_runs,
1313
+ use_dummy=use_dummy,
1314
+ differentiate=differentiate,
1315
+ additional_instructions=additional_instructions or "",
1316
+ modality=modality,
1317
+ reasoning_effort=reasoning_effort,
1318
+ reasoning_summary=reasoning_summary,
1319
+ **cfg_kwargs,
1320
+ )
1321
+ return await Compare(cfg, template_path=template_path).run(
1322
+ df,
1323
+ circle_column_name,
1324
+ square_column_name,
1325
+ reset_files=reset_files,
1326
+ response_fn=response_fn,
1327
+ get_all_responses_fn=get_all_responses_fn,
1328
+ )
1329
+
1330
+
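A compare sketch (the two ad-copy columns and the directory are invented):

import asyncio
import pandas as pd
from gabriel.api import compare

ads = pd.DataFrame({
    "campaign_a": ["Ad copy stressing rock-bottom prices."],
    "campaign_b": ["Ad copy stressing premium craftsmanship."],
})

differences = asyncio.run(
    compare(
        ads,
        "campaign_a",
        "campaign_b",
        save_dir="./compare_demo",   # hypothetical path
        use_dummy=True,
    )
)
print(differences.head())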
1331
+ async def bucket(
1332
+ df: pd.DataFrame,
1333
+ column_name: str,
1334
+ *,
1335
+ save_dir: str,
1336
+ additional_instructions: Optional[str] = None,
1337
+ model: str = "gpt-5-mini",
1338
+ n_parallels: int = 650,
1339
+ reset_files: bool = False,
1340
+ use_dummy: bool = False,
1341
+ file_name: str = "bucket_definitions.csv",
1342
+ bucket_count: int = 10,
1343
+ differentiate: bool = False,
1344
+ reasoning_effort: Optional[str] = None,
1345
+ reasoning_summary: Optional[str] = None,
1346
+ template_path: Optional[str] = None,
1347
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1348
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1349
+ **cfg_kwargs,
1350
+ ) -> pd.DataFrame:
1351
+ """Builds taxonomies from many terms. Output = bucket/cluster labels.
1352
+
1353
+ Example Use
1354
+ -----------
1355
+ Group technologies, artworks, or HR complaints into emergent categories.
1356
+
1357
+ Parameters
1358
+ ----------
1359
+ df:
1360
+ DataFrame containing passages to bucket.
1361
+ column_name:
1362
+ Column holding the text to cluster.
1363
+ save_dir:
1364
+ Directory where bucket definitions and intermediate state are saved.
1365
+ additional_instructions:
1366
+ Extra prompt guidance for bucket creation.
1367
+ model:
1368
+ Model used to propose bucket definitions.
1369
+ n_parallels:
1370
+ Maximum number of concurrent bucket definition calls.
1371
+ reset_files:
1372
+ When ``True`` regenerate outputs despite existing files.
1373
+ use_dummy:
1374
+ Return deterministic dummy buckets for offline testing.
1375
+ file_name:
1376
+ Filename for saved bucket definitions.
1377
+ bucket_count:
1378
+ Target number of buckets to generate.
1379
+ differentiate:
1380
+ Whether to encourage distinctive bucket descriptions.
1381
+ reasoning_effort, reasoning_summary:
1382
+ Optional OpenAI reasoning controls.
1383
+ template_path:
1384
+ Custom template path for bucket prompts.
1385
+ response_fn:
1386
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1387
+ that replaces the per-prompt model invocation. Ignored when
1388
+ ``get_all_responses_fn`` is supplied.
1389
+ get_all_responses_fn:
1390
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1391
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1392
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1393
+ **cfg_kwargs:
1394
+ Additional overrides forwarded to :class:`gabriel.tasks.bucket.BucketConfig`.
1395
+
1396
+ Returns
1397
+ -------
1398
+ pandas.DataFrame
1399
+ DataFrame containing the finalized bucket names and definitions (one
1400
+ row per bucket).
1401
+ """
1402
+
1403
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1404
+ os.makedirs(save_dir, exist_ok=True)
1405
+ cfg = BucketConfig(
1406
+ bucket_count=bucket_count,
1407
+ save_dir=save_dir,
1408
+ file_name=file_name,
1409
+ model=model,
1410
+ n_parallels=n_parallels,
1411
+ use_dummy=use_dummy,
1412
+ additional_instructions=additional_instructions,
1413
+ differentiate=differentiate,
1414
+ reasoning_effort=reasoning_effort,
1415
+ reasoning_summary=reasoning_summary,
1416
+ **cfg_kwargs,
1417
+ )
1418
+ return await Bucket(cfg, template_path=template_path).run(
1419
+ df,
1420
+ column_name,
1421
+ reset_files=reset_files,
1422
+ response_fn=response_fn,
1423
+ get_all_responses_fn=get_all_responses_fn,
1424
+ )
1425
+
1426
+
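A bucket sketch (the complaint texts, bucket count, and directory are illustrative):

import asyncio
import pandas as pd
from gabriel.api import bucket

complaints = pd.DataFrame({"complaint": [
    "My manager never acts on feedback from the team.",
    "The office is freezing all winter.",
    "Promotion criteria are unclear.",
]})

buckets = asyncio.run(
    bucket(
        complaints,
        "complaint",
        save_dir="./bucket_demo",   # hypothetical path
        bucket_count=3,
        use_dummy=True,
    )
)
print(buckets.head())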
1427
+ async def discover(
1428
+ df: pd.DataFrame,
1429
+ *,
1430
+ column_name: Optional[str] = None,
1431
+ circle_column_name: Optional[str] = None,
1432
+ square_column_name: Optional[str] = None,
1433
+ save_dir: str,
1434
+ additional_instructions: Optional[str] = None,
1435
+ model: str = "gpt-5-mini",
1436
+ n_parallels: int = 650,
1437
+ n_runs: int = 1,
1438
+ min_frequency: float = 0.6,
1439
+ bucket_count: int = 10,
1440
+ differentiate: bool = True,
1441
+ max_words_per_call: int = 1000,
1442
+ max_categories_per_call: int = 8,
1443
+ n_terms_per_prompt: int = 250,
1444
+ repeat_bucketing: int = 5,
1445
+ repeat_voting: int = 25,
1446
+ next_round_frac: float = 0.25,
1447
+ top_k_per_round: int = 1,
1448
+ raw_term_definitions: bool = True,
1449
+ use_dummy: bool = False,
1450
+ modality: str = "text",
1451
+ reasoning_effort: Optional[str] = None,
1452
+ reasoning_summary: Optional[str] = None,
1453
+ reset_files: bool = False,
1454
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1455
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1456
+ **cfg_kwargs,
1457
+ ) -> Dict[str, pd.DataFrame]:
1458
+ """Discovers natural language features which discriminate two classes of data.
1459
+
1460
+ Example Use
1461
+ -----------
1462
+ Identify what distinguishes 5-star vs. 1-star reviews, or successful vs. failed startups.
1463
+
1464
+ Parameters
1465
+ ----------
1466
+ df:
1467
+ DataFrame containing the corpus to mine for labels.
1468
+ column_name:
1469
+ Column with free-form text to analyse. Optional when providing paired
1470
+ circle/square columns for contrastive discovery.
1471
+ circle_column_name, square_column_name:
1472
+ Optional paired columns enabling bidirectional discovery.
1473
+ save_dir:
1474
+ Directory where intermediate and final discovery outputs are saved.
1475
+ additional_instructions:
1476
+ Extra guidance applied throughout the discovery pipeline.
1477
+ model:
1478
+ Model used for bucket definitions and classification.
1479
+ n_parallels:
1480
+ Maximum concurrent calls per stage.
1481
+ n_runs:
1482
+ Number of classification repetitions to stabilise label prevalence.
1483
+ min_frequency:
1484
+ Minimum frequency threshold for labels to persist.
1485
+ bucket_count:
1486
+ Target number of buckets to propose in the initial step.
1487
+ differentiate:
1488
+ Encourage distinctive bucket descriptions when ``True``.
1489
+ max_words_per_call, max_categories_per_call:
1490
+ Chunking controls for classification prompts.
1491
+ n_terms_per_prompt, repeat_bucketing, repeat_voting:
1492
+ Parameters that regulate how many discovered terms are evaluated and how
1493
+ often bucketing/voting rounds repeat.
1494
+ next_round_frac, top_k_per_round:
1495
+ Controls for carrying top-performing terms into subsequent rounds.
1496
+ raw_term_definitions:
1497
+ Whether to keep raw label definitions in the outputs.
1498
+ use_dummy:
1499
+ If ``True`` perform deterministic offline discovery.
1500
+ modality:
1501
+ Content modality hint forwarded to downstream tasks.
1502
+ reasoning_effort, reasoning_summary:
1503
+ Optional OpenAI reasoning controls.
1504
+ reset_files:
1505
+ When ``True`` regenerate all discovery artifacts.
1506
+ response_fn:
1507
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1508
+ that replaces the per-prompt model invocation. Ignored when
1509
+ ``get_all_responses_fn`` is supplied.
1510
+ get_all_responses_fn:
1511
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1512
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1513
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1514
+ **cfg_kwargs:
1515
+ Additional overrides passed to :class:`gabriel.tasks.discover.DiscoverConfig`.
1516
+
1517
+ Returns
1518
+ -------
1519
+ Dict[str, pandas.DataFrame]
1520
+ Intermediate DataFrames from each step of the discovery pipeline. When
1521
+ ``circle_column_name`` and ``square_column_name`` are provided,
1522
+ classification is performed twice (circle and square directions). A
1523
+ ``summary`` key describes label prevalence differences with
1524
+ ``difference_pct`` expressed as circle minus square percentage points.
1525
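+
+ Examples
+ --------
+ A minimal offline sketch of contrastive discovery; the DataFrame, column
+ names, and ``save_dir`` are illustrative values only, and ``use_dummy=True``
+ avoids real API calls::
+
+     import asyncio
+     import pandas as pd
+     from gabriel import api
+
+     reviews = pd.DataFrame({
+         "five_star": ["Great battery life.", "Arrived a day early."],
+         "one_star": ["Broke after a week.", "Support never replied."],
+     })
+     results = asyncio.run(api.discover(
+         reviews,
+         circle_column_name="five_star",
+         square_column_name="one_star",
+         save_dir="~/gabriel_runs/discover",
+         use_dummy=True,
+     ))
+     summary = results["summary"]  # circle-minus-square prevalence differences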
+ """
1526
+
1527
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1528
+ os.makedirs(save_dir, exist_ok=True)
1529
+ cfg = DiscoverConfig(
1530
+ save_dir=save_dir,
1531
+ model=model,
1532
+ n_parallels=n_parallels,
1533
+ n_runs=n_runs,
1534
+ min_frequency=min_frequency,
1535
+ bucket_count=bucket_count,
1536
+ additional_instructions=additional_instructions,
1537
+ differentiate=differentiate,
1538
+ max_words_per_call=max_words_per_call,
1539
+ max_categories_per_call=max_categories_per_call,
1540
+ n_terms_per_prompt=n_terms_per_prompt,
1541
+ repeat_bucketing=repeat_bucketing,
1542
+ repeat_voting=repeat_voting,
1543
+ next_round_frac=next_round_frac,
1544
+ top_k_per_round=top_k_per_round,
1545
+ raw_term_definitions=raw_term_definitions,
1546
+ use_dummy=use_dummy,
1547
+ modality=modality,
1548
+ reasoning_effort=reasoning_effort,
1549
+ reasoning_summary=reasoning_summary,
1550
+ **cfg_kwargs,
1551
+ )
1552
+ return await Discover(cfg).run(
1553
+ df,
1554
+ column_name=column_name,
1555
+ circle_column_name=circle_column_name,
1556
+ square_column_name=square_column_name,
1557
+ reset_files=reset_files,
1558
+ response_fn=response_fn,
1559
+ get_all_responses_fn=get_all_responses_fn,
1560
+ )
1561
+
1562
+
1563
+ async def deduplicate(
1564
+ df: pd.DataFrame,
1565
+ column_name: str,
1566
+ *,
1567
+ save_dir: str,
1568
+ additional_instructions: Optional[str] = None,
1569
+ modality: str = "entity",
1570
+ max_words_per_text: int = 500,
1571
+ model: str = "gpt-5-mini",
1572
+ n_parallels: int = 650,
1573
+ n_runs: int = 3,
1574
+ reset_files: bool = False,
1575
+ use_dummy: bool = False,
1576
+ file_name: str = "deduplicate_responses.csv",
1577
+ use_embeddings: bool = True,
1578
+ group_size: int = 500,
1579
+ max_timeout: Optional[float] = None,
1580
+ template_path: Optional[str] = None,
1581
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1582
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1583
+ **cfg_kwargs,
1584
+ ) -> pd.DataFrame:
1585
+ """Detects conceptual duplicates. Maps all duplicates to one representative term.
1586
+
1587
+ Example Use
1588
+ -----------
1589
+ Collapse "F-18", "Super Hornet Fighter Jet", "f-18 hornet" into "F-18".
1590
+
1591
+ Parameters
1592
+ ----------
1593
+ df:
1594
+ DataFrame containing the passages to deduplicate.
1595
+ column_name:
1596
+ Column holding the text to deduplicate.
1597
+ save_dir:
1598
+ Directory where deduplication artifacts are written.
1599
+ additional_instructions:
1600
+ Extra guidance appended to the deduplication prompt.
1601
+ modality:
1602
+ Use ``"entity"`` for short entity strings or ``"text"`` for long-form text snippets.
1603
+ max_words_per_text:
1604
+ Maximum word count for each text snippet when ``modality="text"``.
1605
+ model:
1606
+ Model name used for overlap detection.
1607
+ n_parallels:
1608
+ Maximum number of concurrent calls.
1609
+ n_runs:
1610
+ Number of passes to run; helps stabilise duplicate detection.
1611
+ reset_files:
1612
+ When ``True`` regenerate outputs regardless of existing files.
1613
+ use_dummy:
1614
+ Return deterministic dummy outputs for offline testing.
1615
+ file_name:
1616
+ CSV filename for saved deduplication responses.
1617
+ use_embeddings:
1618
+ Whether to use embedding-based prefiltering prior to model calls.
1619
+ group_size:
1620
+ Number of passages to evaluate per batch during deduplication.
1621
+ max_timeout:
1622
+ Optional timeout per API call.
1623
+ template_path:
1624
+ Custom template override for deduplication prompts.
1625
+ response_fn:
1626
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1627
+ that replaces the per-prompt model invocation. Ignored when
1628
+ ``get_all_responses_fn`` is supplied.
1629
+ get_all_responses_fn:
1630
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1631
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1632
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1633
+ **cfg_kwargs:
1634
+ Additional configuration passed to
1635
+ :class:`gabriel.tasks.deduplicate.DeduplicateConfig`.
1636
+
1637
+ Returns
1638
+ -------
1639
+ pandas.DataFrame
1640
+ DataFrame including the original content plus ``mapped_<column_name>`` columns
1641
+ (per run and final) indicating the canonical representative for each
1642
+ detected duplicate cluster.
1643
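+
+ Examples
+ --------
+ A minimal offline sketch; the entity strings and ``save_dir`` are
+ illustrative values only, and ``use_dummy=True`` avoids real API calls::
+
+     import asyncio
+     import pandas as pd
+     from gabriel import api
+
+     jets = pd.DataFrame({"aircraft": ["F-18", "f-18 hornet", "Super Hornet Fighter Jet"]})
+     deduped = asyncio.run(api.deduplicate(
+         jets,
+         "aircraft",
+         save_dir="~/gabriel_runs/dedup",
+         use_dummy=True,
+     ))
+     # deduped includes mapped_aircraft columns giving each canonical representative.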
+ """
1644
+
1645
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1646
+ os.makedirs(save_dir, exist_ok=True)
1647
+ cfg = DeduplicateConfig(
1648
+ save_dir=save_dir,
1649
+ file_name=file_name,
1650
+ model=model,
1651
+ n_parallels=n_parallels,
1652
+ n_runs=n_runs,
1653
+ use_dummy=use_dummy,
1654
+ max_timeout=max_timeout,
1655
+ additional_instructions=additional_instructions,
1656
+ use_embeddings=use_embeddings,
1657
+ group_size=group_size,
1658
+ modality=modality,
1659
+ max_words_per_text=max_words_per_text,
1660
+ **cfg_kwargs,
1661
+ )
1662
+ return await Deduplicate(cfg, template_path=template_path).run(
1663
+ df,
1664
+ column_name=column_name,
1665
+ reset_files=reset_files,
1666
+ response_fn=response_fn,
1667
+ get_all_responses_fn=get_all_responses_fn,
1668
+ )
1669
+
1670
+
1671
+ async def merge(
1672
+ df_left: pd.DataFrame,
1673
+ df_right: pd.DataFrame,
1674
+ *,
1675
+ save_dir: str,
1676
+ on: Optional[str] = None,
1677
+ left_on: Optional[str] = None,
1678
+ right_on: Optional[str] = None,
1679
+ how: str = "left",
1680
+ additional_instructions: Optional[str] = None,
1681
+ model: str = "gpt-5-nano",
1682
+ n_parallels: int = 650,
1683
+ n_runs: int = 1,
1684
+ reset_files: bool = False,
1685
+ use_dummy: bool = False,
1686
+ file_name: str = "merge_responses.csv",
1687
+ use_embeddings: bool = True,
1688
+ short_list_len: int = 16,
1689
+ long_list_len: int = 256,
1690
+ max_attempts: int = 4,
1691
+ short_list_multiplier: float = 0.5,
1692
+ auto_match_threshold: float = 0.75,
1693
+ use_best_auto_match: bool = False,
1694
+ candidate_scan_chunks: int = 5,
1695
+ template_path: Optional[str] = None,
1696
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1697
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1698
+ **cfg_kwargs,
1699
+ ) -> pd.DataFrame:
1700
+ """Creates crosswalks. Output = merged table with GPT-matched identifiers.
1701
+
1702
+ Example Use
1703
+ -----------
1704
+ Match two distinct job title directories; link patent titles to product names.
1705
+
1706
+ Parameters
1707
+ ----------
1708
+ df_left, df_right:
1709
+ DataFrames to merge.
1710
+ save_dir:
1711
+ Directory where merge results and diagnostics are saved.
1712
+ on, left_on, right_on:
1713
+ Column(s) to match on. ``on`` applies to both sides; ``left_on`` and
1714
+ ``right_on`` override per side.
1715
+ how:
1716
+ Merge strategy (``"left"`` or ``"right"``) determining which side is treated as
1717
+ the short/base table.
1718
+ additional_instructions:
1719
+ Extra prompt context for the model.
1720
+ model:
1721
+ Model used to compare candidate records.
1722
+ n_parallels:
1723
+ Maximum number of concurrent merge comparisons.
1724
+ n_runs:
1725
+ Number of repeated comparisons per candidate.
1726
+ reset_files:
1727
+ When ``True`` regenerate outputs even if files exist.
1728
+ use_dummy:
1729
+ If ``True`` return deterministic dummy matches.
1730
+ file_name:
1731
+ CSV filename for saved merge responses.
1732
+ use_embeddings:
1733
+ Whether to use embeddings to shortlist candidates before calling the
1734
+ model.
1735
+ short_list_len, long_list_len, short_list_multiplier:
1736
+ Controls for candidate pool sizes.
1737
+ max_attempts:
1738
+ Maximum retry attempts per match before giving up.
1739
+ auto_match_threshold:
1740
+ Confidence threshold for automatically accepting matches.
1741
+ use_best_auto_match:
1742
+ When ``True`` pick the highest confidence candidate when multiple exceed
1743
+ ``auto_match_threshold``.
1744
+ candidate_scan_chunks:
1745
+ Number of candidate batches to scan when building the shortlist.
1746
+ template_path:
1747
+ Custom template override for merge prompts.
1748
+ response_fn:
1749
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1750
+ that replaces the per-prompt model invocation. Ignored when
1751
+ ``get_all_responses_fn`` is supplied.
1752
+ get_all_responses_fn:
1753
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1754
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1755
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1756
+ **cfg_kwargs:
1757
+ Additional overrides forwarded to :class:`gabriel.tasks.merge.MergeConfig`.
1758
+
1759
+ Returns
1760
+ -------
1761
+ pandas.DataFrame
1762
+ Merged result keyed to the ``how``-selected short side, enriched with
1763
+ model-evaluated matches from the long side and deduplicated on the
1764
+ short key.
1765
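+
+ Examples
+ --------
+ A minimal offline sketch; both DataFrames and ``save_dir`` are illustrative
+ values only, and ``use_dummy=True`` avoids real API calls::
+
+     import asyncio
+     import pandas as pd
+     from gabriel import api
+
+     jobs_a = pd.DataFrame({"title": ["Software Engineer", "Registered Nurse"]})
+     jobs_b = pd.DataFrame({"title": ["SWE II", "RN", "Data Analyst"]})
+     merged = asyncio.run(api.merge(
+         jobs_a,
+         jobs_b,
+         on="title",
+         how="left",
+         save_dir="~/gabriel_runs/merge",
+         use_dummy=True,
+     ))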
+ """
1766
+
1767
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1768
+ os.makedirs(save_dir, exist_ok=True)
1769
+ cfg = MergeConfig(
1770
+ save_dir=save_dir,
1771
+ file_name=file_name,
1772
+ model=model,
1773
+ n_parallels=n_parallels,
1774
+ n_runs=n_runs,
1775
+ use_dummy=use_dummy,
1776
+ additional_instructions=additional_instructions,
1777
+ use_embeddings=use_embeddings,
1778
+ short_list_len=short_list_len,
1779
+ long_list_len=long_list_len,
1780
+ max_attempts=max_attempts,
1781
+ short_list_multiplier=short_list_multiplier,
1782
+ auto_match_threshold=auto_match_threshold,
1783
+ use_best_auto_match=use_best_auto_match,
1784
+ candidate_scan_chunks=candidate_scan_chunks,
1785
+ **cfg_kwargs,
1786
+ )
1787
+ return await Merge(cfg, template_path=template_path).run(
1788
+ df_left,
1789
+ df_right,
1790
+ on=on,
1791
+ left_on=left_on,
1792
+ right_on=right_on,
1793
+ how=how,
1794
+ reset_files=reset_files,
1795
+ response_fn=response_fn,
1796
+ get_all_responses_fn=get_all_responses_fn,
1797
+ )
1798
+
1799
+
1800
+ async def filter(
1801
+ df: pd.DataFrame,
1802
+ column_name: str,
1803
+ *,
1804
+ condition: str,
1805
+ save_dir: str,
1806
+ entities_per_call: int = 150,
1807
+ shuffle: bool = True,
1808
+ random_seed: int = 42,
1809
+ n_runs: int = 1,
1810
+ threshold: float = 0.5,
1811
+ additional_instructions: Optional[str] = None,
1812
+ model: str = "gpt-5-nano",
1813
+ n_parallels: int = 650,
1814
+ reset_files: bool = False,
1815
+ use_dummy: bool = False,
1816
+ file_name: str = "filter_responses.csv",
1817
+ max_timeout: Optional[float] = None,
1818
+ template_path: Optional[str] = None,
1819
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1820
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1821
+ **cfg_kwargs,
1822
+ ) -> pd.DataFrame:
1823
+ """High-throughput boolean screening. Outputs items which meet natural language condition.
1824
+
1825
+ Example Use
1826
+ -----------
1827
+ Subset 18M Wikipedia titles to only technologies.
1828
+
1829
+ Parameters
1830
+ ----------
1831
+ df:
1832
+ DataFrame containing passages to filter.
1833
+ column_name:
1834
+ Column with the text to evaluate.
1835
+ condition:
1836
+ Natural-language condition that determines whether a passage is kept.
1837
+ save_dir:
1838
+ Directory where filter responses are saved.
1839
+ entities_per_call:
1840
+ Number of passages to send in each API call.
1841
+ shuffle:
1842
+ Whether to randomise order before batching.
1843
+ random_seed:
1844
+ Seed used when ``shuffle`` is ``True``.
1845
+ n_runs:
1846
+ Number of repeated evaluations per passage.
1847
+ threshold:
1848
+ Probability threshold above which a passage is retained.
1849
+ additional_instructions:
1850
+ Extra guidance appended to the filter prompt.
1851
+ model:
1852
+ Model used for filtering.
1853
+ n_parallels:
1854
+ Maximum number of concurrent filtering calls.
1855
+ reset_files:
1856
+ When ``True`` regenerate outputs even if files exist.
1857
+ use_dummy:
1858
+ Return deterministic dummy outputs instead of real API responses.
1859
+ file_name:
1860
+ CSV filename for saved filter responses.
1861
+ max_timeout:
1862
+ Optional per-call timeout.
1863
+ template_path:
1864
+ Custom prompt template path.
1865
+ response_fn:
1866
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
1867
+ that replaces the per-prompt model invocation. Ignored when
1868
+ ``get_all_responses_fn`` is supplied.
1869
+ get_all_responses_fn:
1870
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
1871
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
1872
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
1873
+ **cfg_kwargs:
1874
+ Additional configuration passed to :class:`gabriel.tasks.filter.FilterConfig`.
1875
+
1876
+ Returns
1877
+ -------
1878
+ pandas.DataFrame
1879
+ Filtered DataFrame with keep/score columns reflecting model decisions.
1880
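+
+ Examples
+ --------
+ A minimal offline sketch; the titles, condition, and ``save_dir`` are
+ illustrative values only, and ``use_dummy=True`` avoids real API calls::
+
+     import asyncio
+     import pandas as pd
+     from gabriel import api
+
+     titles = pd.DataFrame({"title": ["Transistor", "French Revolution", "CRISPR"]})
+     kept = asyncio.run(api.filter(
+         titles,
+         "title",
+         condition="The title refers to a technology.",
+         save_dir="~/gabriel_runs/filter",
+         use_dummy=True,
+     ))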
+ """
1881
+
1882
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
1883
+ os.makedirs(save_dir, exist_ok=True)
1884
+ cfg = FilterConfig(
1885
+ condition=condition,
1886
+ save_dir=save_dir,
1887
+ file_name=file_name,
1888
+ model=model,
1889
+ n_parallels=n_parallels,
1890
+ entities_per_call=entities_per_call,
1891
+ shuffle=shuffle,
1892
+ random_seed=random_seed,
1893
+ n_runs=n_runs,
1894
+ threshold=threshold,
1895
+ additional_instructions=additional_instructions or "",
1896
+ use_dummy=use_dummy,
1897
+ max_timeout=max_timeout,
1898
+ **cfg_kwargs,
1899
+ )
1900
+ return await Filter(cfg, template_path=template_path).run(
1901
+ df,
1902
+ column_name,
1903
+ reset_files=reset_files,
1904
+ response_fn=response_fn,
1905
+ get_all_responses_fn=get_all_responses_fn,
1906
+ )
1907
+
1908
+
1909
+ async def debias(
1910
+ df: pd.DataFrame,
1911
+ column_name: str,
1912
+ *,
1913
+ mode: MeasurementMode = "rate",
1914
+ measurement_attribute: Optional[str] = None,
1915
+ removal_attribute: Optional[str] = None,
1916
+ signal_dictionary: Dict[str, str],
1917
+ attributes: Optional[Dict[str, str]] = None,
1918
+ removal_method: RemovalMethod = "codify",
1919
+ save_dir: str = os.path.expanduser("~/Documents/runs"),
1920
+ run_name: Optional[str] = None,
1921
+ strip_percentages: Optional[List[int]] = None,
1922
+ categories_to_strip: Optional[List[str]] = None,
1923
+ template_path: Optional[str] = None,
1924
+ model: str = "gpt-5-mini",
1925
+ n_parallels: int = 650,
1926
+ measurement_kwargs: Optional[Dict[str, Any]] = None,
1927
+ removal_kwargs: Optional[Dict[str, Any]] = None,
1928
+ remaining_signal: bool = True,
1929
+ max_words_per_call: Optional[int] = 1000,
1930
+ n_rounds: Optional[int] = 3,
1931
+ use_dummy: bool = False,
1932
+ robust_regression: bool = True,
1933
+ random_seed: int = 12345,
1934
+ verbose: bool = True,
1935
+ reset_files: bool = False,
1936
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
1937
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
1938
+ ) -> pd.DataFrame:
1939
+ """Post-process measurements to remove inference bias.
1940
+
1941
+ Example Use
1942
+ -----------
1943
+ Ensure GPT isn't guessing climate opinions in speeches based on general political lean.
1944
+
1945
+ Parameters
1946
+ ----------
1947
+ df:
1948
+ DataFrame containing passages to measure and debias.
1949
+ column_name:
1950
+ Column with the text to process.
1951
+ mode:
1952
+ Measurement mode (e.g., ``"rate"``) determining how bias is estimated.
1953
+ measurement_attribute, removal_attribute:
1954
+ Specify the attribute used for regression and the key from
1955
+ ``signal_dictionary`` that should be removed. When
1956
+ ``measurement_attribute`` is omitted the first key from ``attributes``
1957
+ is used. ``removal_attribute`` defaults to the measurement attribute
1958
+ when present in ``signal_dictionary`` or otherwise the first key from
1959
+ ``signal_dictionary``. Notices are printed when inferred and
1960
+ ``verbose`` is ``True``.
1961
+ signal_dictionary:
1962
+ Mapping of bias signals to their definitions.
1963
+ attributes:
1964
+ Optional rating attributes used during measurement.
1965
+ removal_method:
1966
+ Strategy for removing bias (for example ``"codify"``).
1967
+ save_dir:
1968
+ Base directory for all debiasing artifacts.
1969
+ run_name:
1970
+ Optional run identifier; defaults to a timestamped folder.
1971
+ strip_percentages, categories_to_strip:
1972
+ Optional controls for category pruning during removal.
1973
+ template_path:
1974
+ Optional template override used during removal steps.
1975
+ model:
1976
+ Model used across the measurement and removal stages.
1977
+ n_parallels:
1978
+ Maximum concurrent API calls.
1979
+ measurement_kwargs, removal_kwargs:
1980
+ Fine-grained overrides for the measurement and removal tasks.
1981
+ remaining_signal:
1982
+ When ``True`` (default) measure a remaining-signal prevalence attribute on
1983
+ the stripped text and use it in the two-step debiasing regression.
1984
+ max_words_per_call, n_rounds:
1985
+ Convenience passthroughs for the removal stage. ``max_words_per_call``
1986
+ configures the codify task's chunk size, while ``n_rounds`` controls the
1987
+ number of completion passes run by codify and any downstream
1988
+ paraphrasing steps. Defaults to 3 when not explicitly provided.
1989
+ use_dummy:
1990
+ If ``True`` run deterministic offline debiasing.
1991
+ robust_regression:
1992
+ Whether to use robust regression when estimating bias coefficients.
1993
+ random_seed:
1994
+ Seed for deterministic behaviour in sampling-heavy steps.
1995
+ verbose:
1996
+ When ``True`` print notices about inferred defaults and progress.
1997
+ reset_files:
1998
+ When ``True`` propagate reset behaviour to all measurement and removal stages.
1999
+ response_fn:
2000
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
2001
+ that replaces the per-prompt model invocation. Ignored when
2002
+ ``get_all_responses_fn`` is supplied.
2003
+ get_all_responses_fn:
2004
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
2005
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
2006
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
2007
+
2008
+ Returns
2009
+ -------
2010
+ pandas.DataFrame
2011
+ Debiased results with raw, stripped, and debiased columns appended.
2012
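+
+ Examples
+ --------
+ A minimal offline sketch; the speeches, attribute definitions, and
+ ``save_dir`` are illustrative values only, and ``use_dummy=True`` avoids
+ real API calls::
+
+     import asyncio
+     import pandas as pd
+     from gabriel import api
+
+     speeches = pd.DataFrame({"speech": [
+         "We must invest in clean energy now.",
+         "Energy regulation is strangling small businesses.",
+     ]})
+     debiased = asyncio.run(api.debias(
+         speeches,
+         "speech",
+         attributes={"climate concern": "Expresses concern about climate change."},
+         signal_dictionary={"political lean": "General left-right political orientation."},
+         save_dir="~/gabriel_runs/debias",
+         use_dummy=True,
+     ))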
+ """
2013
+
2014
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
2015
+ measurement_kwargs = dict(measurement_kwargs or {})
2016
+ removal_kwargs = dict(removal_kwargs or {})
2017
+ if response_fn is not None:
2018
+ measurement_kwargs.setdefault("response_fn", response_fn)
2019
+ removal_kwargs.setdefault("response_fn", response_fn)
2020
+ if get_all_responses_fn is not None:
2021
+ measurement_kwargs.setdefault("get_all_responses_fn", get_all_responses_fn)
2022
+ removal_kwargs.setdefault("get_all_responses_fn", get_all_responses_fn)
2023
+
2024
+ if reset_files:
2025
+ measurement_kwargs.setdefault("reset_files", True)
2026
+ removal_kwargs.setdefault("reset_files", True)
2027
+
2028
+ if removal_method == "codify" and max_words_per_call is not None:
2029
+ removal_kwargs.setdefault("max_words_per_call", max_words_per_call)
2030
+ if "completion_max_rounds" in removal_kwargs and "n_rounds" not in removal_kwargs:
2031
+ replacement = removal_kwargs.pop("completion_max_rounds")
2032
+ warnings.warn(
2033
+ "completion_max_rounds in removal_kwargs is deprecated; use n_rounds instead.",
2034
+ DeprecationWarning,
2035
+ stacklevel=2,
2036
+ )
2037
+ if replacement is not None:
2038
+ removal_kwargs.setdefault("n_rounds", replacement)
2039
+ if n_rounds is not None:
2040
+ removal_kwargs.setdefault("n_rounds", n_rounds)
2041
+
2042
+ cfg = DebiasConfig(
2043
+ mode=mode,
2044
+ measurement_attribute=measurement_attribute,
2045
+ removal_attribute=removal_attribute,
2046
+ signal_dictionary=signal_dictionary,
2047
+ attributes=attributes or {},
2048
+ removal_method=removal_method,
2049
+ save_dir=save_dir,
2050
+ run_name=run_name,
2051
+ strip_percentages=strip_percentages,
2052
+ categories_to_strip=categories_to_strip,
2053
+ template_path=template_path,
2054
+ model=model,
2055
+ n_parallels=n_parallels,
2056
+ measurement_kwargs=measurement_kwargs,
2057
+ removal_kwargs=removal_kwargs,
2058
+ remaining_signal=remaining_signal,
2059
+ use_dummy=use_dummy,
2060
+ robust_regression=robust_regression,
2061
+ random_seed=random_seed,
2062
+ verbose=verbose,
2063
+ )
2064
+ pipeline = DebiasPipeline(cfg)
2065
+ result = await pipeline.run(df, column_name, reset_files=reset_files)
2066
+ return result.results
2067
+
2068
+
2069
+ async def whatever(
2070
+ prompts: Optional[Union[str, List[str], pd.DataFrame]] = None,
2071
+ identifiers: Optional[List[str]] = None,
2072
+ *,
2073
+ save_dir: str,
2074
+ df: Optional[pd.DataFrame] = None,
2075
+ column_name: Optional[str] = None,
2076
+ identifier_column: Optional[str] = None,
2077
+ image_column: Optional[str] = None,
2078
+ audio_column: Optional[str] = None,
2079
+ prompt_images: Optional[Dict[str, List[str]]] = None,
2080
+ prompt_audio: Optional[Dict[str, List[Dict[str, str]]]] = None,
2081
+ file_name: str = "custom_prompt_responses.csv",
2082
+ model: str = "gpt-5-mini",
2083
+ json_mode: bool = False,
2084
+ web_search: Optional[bool] = None,
2085
+ web_search_filters: Optional[Dict[str, Any]] = None,
2086
+ search_context_size: str = "medium",
2087
+ n_parallels: int = 650,
2088
+ use_dummy: bool = False,
2089
+ reset_files: bool = False,
2090
+ return_original_columns: bool = True,
2091
+ drop_prompts: bool = True,
2092
+ reasoning_effort: Optional[str] = None,
2093
+ reasoning_summary: Optional[str] = None,
2094
+ response_fn: Optional[Callable[..., Awaitable[Any]]] = None,
2095
+ get_all_responses_fn: Optional[Callable[..., Awaitable[pd.DataFrame]]] = None,
2096
+ **kwargs,
2097
+ ) -> pd.DataFrame:
2098
+ """Run any GPT prompts, but leverage GABRIEL's parallelization / checkpointing.
2099
+
2100
+ Example Use
2101
+ -----------
2102
+ Any set of prompts; slots into any pipeline.
2103
+
2104
+ Parameters
2105
+ ----------
2106
+ prompts:
2107
+ Single prompt string, list of prompts, or DataFrame of prompts.
2108
+ identifiers:
2109
+ Optional identifiers to align responses with custom keys.
2110
+ save_dir:
2111
+ Directory where raw responses are written.
2112
+ df:
2113
+ Source DataFrame to pull prompts from when ``prompts`` is not provided.
2114
+ column_name:
2115
+ Column in ``df`` containing prompts to send.
2116
+ identifier_column:
2117
+ Column providing identifiers for each prompt row.
2118
+ image_column, audio_column:
2119
+ Optional columns containing image or audio references to include.
2120
+ prompt_images, prompt_audio:
2121
+ Pre-constructed multimodal payloads keyed by identifier.
2122
+ file_name:
2123
+ CSV filename for persisted responses.
2124
+ model:
2125
+ Model name passed to :func:`gabriel.utils.openai_utils.get_all_responses`.
2126
+ json_mode:
2127
+ Whether to request JSON-mode responses where supported.
2128
+ web_search:
2129
+ Enable web search augmentation.
2130
+ web_search_filters:
2131
+ Filters dict forwarded to the Responses API (allowed domains and optional
2132
+ location hints such as ``city`` or ``timezone``).
2133
+ search_context_size:
2134
+ Context size hint for web-search capable models.
2135
+ n_parallels:
2136
+ Maximum concurrent response requests.
2137
+ use_dummy:
2138
+ If ``True`` return deterministic dummy responses.
2139
+ reset_files:
2140
+ When ``True`` regenerate outputs even if files already exist.
2141
+ return_original_columns:
2142
+ When ``True`` and ``df`` is provided, merge response columns back onto
2143
+ the input DataFrame using the prompt identifiers.
2144
+ drop_prompts:
2145
+ When ``True`` and merging back onto ``df``, drop the prompt column
2146
+ before saving/returning the result.
2147
+ reasoning_effort, reasoning_summary:
2148
+ Optional OpenAI reasoning controls.
2149
+ response_fn:
2150
+ Optional callable forwarded to :func:`gabriel.utils.openai_utils.get_all_responses`
2151
+ that replaces the per-prompt model invocation. Ignored when
2152
+ ``get_all_responses_fn`` is supplied.
2153
+ get_all_responses_fn:
2154
+ Optional callable that fully replaces :func:`gabriel.utils.openai_utils.get_all_responses`.
2155
+ It must accept ``prompts`` and ``identifiers`` (and ideally ``model`` and
2156
+ ``json_mode``) and return a DataFrame containing a ``"Response"`` column.
2157
+ **kwargs:
2158
+ Additional parameters forwarded directly to
2159
+ :func:`gabriel.utils.openai_utils.get_all_responses`.
2160
+
2161
+ Returns
2162
+ -------
2163
+ pandas.DataFrame
2164
+ DataFrame of prompts, identifiers, and model responses saved to
2165
+ ``save_dir/file_name``.
2166
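+
+ Examples
+ --------
+ A minimal offline sketch; the prompts and ``save_dir`` are illustrative
+ values only, and ``use_dummy=True`` avoids real API calls::
+
+     import asyncio
+     from gabriel import api
+
+     responses = asyncio.run(api.whatever(
+         [
+             "Summarize the Treaty of Westphalia in one sentence.",
+             "List three industrial uses of gallium.",
+         ],
+         save_dir="~/gabriel_runs/whatever",
+         use_dummy=True,
+     ))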
+ """
2167
+ save_dir = os.path.expandvars(os.path.expanduser(save_dir))
2168
+ os.makedirs(save_dir, exist_ok=True)
2169
+
2170
+ if df is None and prompts is None:
2171
+ raise ValueError("Either prompts or df must be provided to `whatever`.")
2172
+
2173
+ kwargs = dict(kwargs)
2174
+ if response_fn is not None:
2175
+ kwargs.setdefault("response_fn", response_fn)
2176
+ if get_all_responses_fn is not None:
2177
+ kwargs.setdefault("get_all_responses_fn", get_all_responses_fn)
2178
+
2179
+ if web_search is None and "web_search" in kwargs:
2180
+ web_search = kwargs.pop("web_search")
2181
+ else:
2182
+ kwargs.pop("web_search", None)
2183
+
2184
+ if web_search_filters is None and "web_search_filters" in kwargs:
2185
+ web_search_filters = kwargs.pop("web_search_filters")
2186
+ else:
2187
+ kwargs.pop("web_search_filters", None)
2188
+
2189
+ if "search_context_size" in kwargs:
2190
+ if search_context_size == "medium":
2191
+ search_context_size = kwargs.pop("search_context_size")
2192
+ else:
2193
+ kwargs.pop("search_context_size")
2194
+
2195
+ cfg = WhateverConfig(
2196
+ save_dir=save_dir,
2197
+ file_name=file_name,
2198
+ model=model,
2199
+ json_mode=json_mode,
2200
+ web_search=web_search,
2201
+ web_search_filters=web_search_filters,
2202
+ search_context_size=search_context_size,
2203
+ n_parallels=n_parallels,
2204
+ use_dummy=use_dummy,
2205
+ reasoning_effort=reasoning_effort,
2206
+ reasoning_summary=reasoning_summary,
2207
+ )
2208
+
2209
+ runner = Whatever(cfg)
2210
+ return await runner.run(
2211
+ prompts,
2212
+ df=df,
2213
+ identifiers=identifiers,
2214
+ column_name=column_name,
2215
+ identifier_column=identifier_column,
2216
+ image_column=image_column,
2217
+ audio_column=audio_column,
2218
+ prompt_images=prompt_images,
2219
+ prompt_audio=prompt_audio,
2220
+ web_search_filters=web_search_filters,
2221
+ reset_files=reset_files,
2222
+ return_original_columns=return_original_columns,
2223
+ drop_prompts=drop_prompts,
2224
+ response_fn=response_fn,
2225
+ get_all_responses_fn=get_all_responses_fn,
2226
+ **kwargs,
2227
+ )
2228
+
2229
+
2230
+ def view(
2231
+ df: pd.DataFrame,
2232
+ column_name: str,
2233
+ attributes: Optional[Union[Mapping[str, Any], Sequence[Any], Any]] = None,
2234
+ *,
2235
+ header_columns: Optional[Any] = None,
2236
+ max_passages: Optional[int] = None,
2237
+ font_scale: float = 1.0,
2238
+ font_family: Optional[str] = None,
2239
+ color_mode: str = "auto",
2240
+ ):
2241
+ """UI to view sample texts with ratings / passage coding.
2242
+
2243
+ Example Use
2244
+ -----------
2245
+ Spot-check classify / rating outputs; view coded passages.
2246
+
2247
+ Parameters
2248
+ ----------
2249
+ df:
2250
+ DataFrame containing passages to display.
2251
+ column_name:
2252
+ Column with the primary text to render.
2253
+ attributes:
2254
+ Optional iterable or mapping of attribute columns to include alongside
2255
+ the passage text.
2256
+ header_columns:
2257
+ Optional columns whose values should appear in the viewer header.
2258
+ max_passages:
2259
+ Optional cap on the number of passages displayed.
2260
+ font_scale:
2261
+ Scaling factor applied to viewer typography.
2262
+ font_family:
2263
+ Optional font family override.
2264
+ color_mode:
2265
+ Either ``"auto"``, ``"light"``, or ``"dark"`` to control the viewer
2266
+ theme.
2267
+
2268
+ Returns
2269
+ -------
2270
+ Any
2271
+ The rendered viewer object produced by
2272
+ :func:`gabriel.utils.passage_viewer.view`.
2273
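+
+ Examples
+ --------
+ A minimal sketch; the DataFrame and attribute column are illustrative
+ values only::
+
+     import pandas as pd
+     from gabriel import api
+
+     rated = pd.DataFrame({
+         "text": ["The product arrived broken.", "Fast shipping, great value."],
+         "sentiment": [0.1, 0.9],
+     })
+     api.view(rated, "text", attributes=["sentiment"])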
+ """
2274
+
2275
+ return _view_passages(
2276
+ df,
2277
+ column_name,
2278
+ attributes=attributes,
2279
+ header_columns=header_columns,
2280
+ max_passages=max_passages,
2281
+ font_scale=font_scale,
2282
+ font_family=font_family,
2283
+ color_mode=color_mode,
2284
+ )