openai_gabriel-1.0.1-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. gabriel/__init__.py +61 -0
  2. gabriel/_version.py +1 -0
  3. gabriel/api.py +2284 -0
  4. gabriel/cli/__main__.py +60 -0
  5. gabriel/core/__init__.py +7 -0
  6. gabriel/core/llm_client.py +34 -0
  7. gabriel/core/pipeline.py +18 -0
  8. gabriel/core/prompt_template.py +152 -0
  9. gabriel/prompts/__init__.py +1 -0
  10. gabriel/prompts/bucket_prompt.jinja2 +113 -0
  11. gabriel/prompts/classification_prompt.jinja2 +50 -0
  12. gabriel/prompts/codify_prompt.jinja2 +95 -0
  13. gabriel/prompts/comparison_prompt.jinja2 +60 -0
  14. gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
  15. gabriel/prompts/deidentification_prompt.jinja2 +112 -0
  16. gabriel/prompts/extraction_prompt.jinja2 +61 -0
  17. gabriel/prompts/filter_prompt.jinja2 +31 -0
  18. gabriel/prompts/ideation_prompt.jinja2 +80 -0
  19. gabriel/prompts/merge_prompt.jinja2 +47 -0
  20. gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
  21. gabriel/prompts/rankings_prompt.jinja2 +49 -0
  22. gabriel/prompts/ratings_prompt.jinja2 +50 -0
  23. gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
  24. gabriel/prompts/seed.jinja2 +43 -0
  25. gabriel/prompts/snippets.jinja2 +117 -0
  26. gabriel/tasks/__init__.py +63 -0
  27. gabriel/tasks/_attribute_utils.py +69 -0
  28. gabriel/tasks/bucket.py +432 -0
  29. gabriel/tasks/classify.py +562 -0
  30. gabriel/tasks/codify.py +1033 -0
  31. gabriel/tasks/compare.py +235 -0
  32. gabriel/tasks/debias.py +1460 -0
  33. gabriel/tasks/deduplicate.py +341 -0
  34. gabriel/tasks/deidentify.py +316 -0
  35. gabriel/tasks/discover.py +524 -0
  36. gabriel/tasks/extract.py +455 -0
  37. gabriel/tasks/filter.py +169 -0
  38. gabriel/tasks/ideate.py +782 -0
  39. gabriel/tasks/merge.py +464 -0
  40. gabriel/tasks/paraphrase.py +531 -0
  41. gabriel/tasks/rank.py +2041 -0
  42. gabriel/tasks/rate.py +347 -0
  43. gabriel/tasks/seed.py +465 -0
  44. gabriel/tasks/whatever.py +344 -0
  45. gabriel/utils/__init__.py +64 -0
  46. gabriel/utils/audio_utils.py +42 -0
  47. gabriel/utils/file_utils.py +464 -0
  48. gabriel/utils/image_utils.py +22 -0
  49. gabriel/utils/jinja.py +31 -0
  50. gabriel/utils/logging.py +86 -0
  51. gabriel/utils/mapmaker.py +304 -0
  52. gabriel/utils/media_utils.py +78 -0
  53. gabriel/utils/modality_utils.py +148 -0
  54. gabriel/utils/openai_utils.py +5470 -0
  55. gabriel/utils/parsing.py +282 -0
  56. gabriel/utils/passage_viewer.py +2557 -0
  57. gabriel/utils/pdf_utils.py +20 -0
  58. gabriel/utils/plot_utils.py +2881 -0
  59. gabriel/utils/prompt_utils.py +42 -0
  60. gabriel/utils/word_matching.py +158 -0
  61. openai_gabriel-1.0.1.dist-info/METADATA +443 -0
  62. openai_gabriel-1.0.1.dist-info/RECORD +67 -0
  63. openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
  64. openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
  65. openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
  66. openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
  67. openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/tasks/ideate.py
@@ -0,0 +1,782 @@
+ from __future__ import annotations
+
+ import os
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import pandas as pd
+
+ from gabriel.core.prompt_template import PromptTemplate, resolve_template
+ from gabriel.utils.openai_utils import get_all_responses, response_to_text
+ from gabriel.utils.logging import announce_prompt_rendering
+ from gabriel.tasks.rank import Rank, RankConfig
+ from gabriel.tasks.rate import Rate, RateConfig
+ from gabriel.tasks.deduplicate import Deduplicate, DeduplicateConfig
+ from gabriel.tasks.seed import Seed, SeedConfig
+
+
+ _DEF_ATTR_LABEL = "major new contribution to literature"
+ _DEF_ATTR_DESCRIPTION = (
+     "Measures how original, well-reasoned, and consequential the proposed theory is. "
+     "High scores correspond to ideas that introduce novel and creative thought, "
+     "but above all are just genuinely superior scientific theory pursuant to the topic. "
+     "Use your best professor hat to judge 'major new contribution to literature' just as a high quality journal would, seeing past basic tricks and rehashes to identify true brilliance and novel cleverness. "
+     "Theories that contribute to the literature say something usefully new and interesting, capturing something in the real world better "
+     "than existing thought. Novel yet specific, testable, non-trivial, and brilliant/inspired such that top professors would admire it deeply; "
+     "a high standard requiring deep thought and consideration, worthy of evaluating frontier research theories. "
+     "Give low ratings to anything but a truly exceptional new theory that goes beyond existing work; penalize uncreative and unambitious theories that parrot existing work, "
+     "while rewarding clearly new ideas that are clever, logically sensible, and explain important (NOT trivial and vague) things about the world that existing theories don't. "
+     "Don't reward focus on niches and fads that don't really matter. Winning theories can be old school or new school, as long as they speak to something genuinely important in the topic and the world. "
+     "Reward interesting and important and clever, not just slapping old work onto something new like quantum or smartphones if it is just for the sake of it. "
+     "Penalize lack of clarity, where jargon or complex writing obfuscates the underlying ideas. Penalize proposals that just try to sound smart by being complicated / are unreadable. Penalize if core ideas aren't truly clear, parsimonious, well written, or presented with the intention to convey understanding. "
+     "Parsimony and clarity are key. "
+     "A major contribution to the literature MUST explain something big and significant, NOT tiny effects that don't really matter. "
+     "Default to low ratings unless you are fully convinced this is truly brilliant work deserving of research and publication."
+ )
+
+
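+ # Default single-attribute rubric used for ranking/rating when the caller
+ # does not supply a custom `attributes` mapping.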
+ def _default_attributes() -> Dict[str, str]:
+     return {_DEF_ATTR_LABEL: _DEF_ATTR_DESCRIPTION}
+
+
+ @dataclass
+ class IdeateConfig:
+     """Configuration for :class:`Ideate`."""
+
+     save_dir: str = os.path.expanduser("~/Documents/runs")
+     file_name: str = "ideation.csv"
+     model: str = "gpt-5-mini"
+     ranking_model: Optional[str] = None
+     n_parallels: int = 650
+     n_ideas: int = 1000
+     evaluation_mode: str = "recursive_rank"
+     attributes: Dict[str, str] = field(default_factory=_default_attributes)
+     rank_attribute: Optional[str] = None
+     recursive_fraction: float = 1.0 / 3.0
+     recursive_min_remaining: int = 30
+     recursive_final_round_multiplier: int = 3
+     recursive_cut_side: str = "top"
+     recursive_rate_first_round: bool = True
+     additional_instructions: Optional[str] = None
+     use_dummy: bool = False
+     web_search: bool = False
+     reasoning_effort: Optional[str] = None
+     reasoning_summary: Optional[str] = None
+     use_seed_entities: bool = True
+     seed_num_entities: Optional[int] = None
+     seed_entities_per_generation: Optional[int] = None
+     seed_entity_batch_frac: Optional[float] = None
+     seed_existing_entities_cap: Optional[int] = None
+     seed_additional_instructions: Optional[str] = None
+     seed_template_path: Optional[str] = None
+     seed_deduplicate: bool = True
+     deduplicate_ideas: bool = True
+
+     def __post_init__(self) -> None:
+         if self.additional_instructions is not None:
+             cleaned = str(self.additional_instructions).strip()
+             self.additional_instructions = cleaned or None
+         if self.seed_additional_instructions is not None:
+             cleaned_seed = str(self.seed_additional_instructions).strip()
+             self.seed_additional_instructions = cleaned_seed or None
+
+
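+ # Evaluation modes accepted by Ideate.run (validated there):
+ #   "recursive_rank" - Rank task with recursive=True (culling behavior is
+ #                      controlled by the recursive_* fields above)
+ #   "rank"           - a single Rank pass over all ideas
+ #   "rate"           - Rate task scoring each idea on the attributes
+ #   "none"           - generation (and optional deduplication) only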
+ class Ideate:
+     """Generate and optionally score frontier scientific theories."""
+
+     def __init__(
+         self,
+         cfg: IdeateConfig,
+         template: Optional[PromptTemplate] = None,
+         template_path: Optional[str] = None,
+     ) -> None:
+         expanded = Path(os.path.expandvars(os.path.expanduser(cfg.save_dir)))
+         expanded.mkdir(parents=True, exist_ok=True)
+         cfg.save_dir = str(expanded)
+         self.cfg = cfg
+         self.template = resolve_template(
+             template=template,
+             template_path=template_path,
+             reference_filename="ideation_prompt.jinja2",
+         )
+
+     async def run(
+         self,
+         topic: str,
+         *,
+         additional_instructions: Optional[str] = None,
+         evaluation_mode: Optional[str] = None,
+         attributes: Optional[Dict[str, str]] = None,
+         rank_attribute: Optional[str] = None,
+         reset_files: bool = False,
+         generation_kwargs: Optional[Dict[str, Any]] = None,
+         rank_config_updates: Optional[Dict[str, Any]] = None,
+         rank_run_kwargs: Optional[Dict[str, Any]] = None,
+         rate_config_updates: Optional[Dict[str, Any]] = None,
+         rate_run_kwargs: Optional[Dict[str, Any]] = None,
+         use_seed_entities: Optional[bool] = None,
+         seed_config_updates: Optional[Dict[str, Any]] = None,
+         seed_run_kwargs: Optional[Dict[str, Any]] = None,
+         deduplicate_ideas: Optional[bool] = None,
+         deduplicate_config_updates: Optional[Dict[str, Any]] = None,
+         deduplicate_run_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> pd.DataFrame:
+         """Generate a large batch of theories and optionally score them."""
+
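+         # Pipeline: reuse cached final CSV if present -> generate reports
+         # (optionally seeded) -> parse sections -> deduplicate -> rank/rate
+         # -> save to <file_name>_final.csv.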
+         base_name = os.path.splitext(self.cfg.file_name)[0]
+         final_path = os.path.join(self.cfg.save_dir, f"{base_name}_final.csv")
+
+         if not reset_files and os.path.exists(final_path):
+             try:
+                 print(f"[Ideate] Loading cached results from {final_path}")
+                 cached = pd.read_csv(final_path)
+                 return cached
+             except Exception:
+                 print("[Ideate] Failed to load cached results; recomputing.")
+
+         attrs = attributes or self.cfg.attributes
+         if not attrs:
+             raise ValueError("At least one attribute must be provided for scoring")
+         attr_key = str(
+             rank_attribute or self.cfg.rank_attribute or next(iter(attrs))
+         ).strip()
+
+         mode = (evaluation_mode or self.cfg.evaluation_mode or "none").lower()
+         if mode not in {"recursive_rank", "rank", "rate", "none"}:
+             raise ValueError(
+                 "evaluation_mode must be one of 'recursive_rank', 'rank', 'rate', or 'none'"
+             )
+
+         gen_kwargs = dict(generation_kwargs or {})
+         rank_cfg_updates = dict(rank_config_updates or {})
+         rank_run_kwargs = dict(rank_run_kwargs or {})
+         rate_cfg_updates = dict(rate_config_updates or {})
+         rate_run_kwargs = dict(rate_run_kwargs or {})
+
+         use_seed = (
+             self.cfg.use_seed_entities if use_seed_entities is None else use_seed_entities
+         )
+         use_dedup = (
+             self.cfg.deduplicate_ideas if deduplicate_ideas is None else deduplicate_ideas
+         )
+         dedup_cfg_updates = dict(deduplicate_config_updates or {})
+         dedup_run_kwargs = dict(deduplicate_run_kwargs or {})
+
+         raw_df, _ = await self._generate_reports(
+             topic,
+             additional_instructions or self.cfg.additional_instructions,
+             reset_files=reset_files,
+             use_seed_entities=use_seed,
+             **gen_kwargs,
+             seed_config_updates=seed_config_updates or {},
+             seed_run_kwargs=seed_run_kwargs or {},
+         )
+         parsed_df = self._parse_reports(raw_df, topic)
+         self._print_random_previews(parsed_df)
+
+         if use_dedup:
+             parsed_df = await self._deduplicate_ideas(
+                 parsed_df,
+                 reset_files=reset_files,
+                 config_updates=dedup_cfg_updates,
+                 run_kwargs=dedup_run_kwargs,
+             )
+
+         topic_instruction = (
+             "Research field/topic the theories are situated in, and should be judged in the context of: "
+             f"{topic}"
+         )
+
+         if mode == "none":
+             parsed_df.to_csv(final_path, index=False)
+             return parsed_df
+
+         if mode == "rate":
+             scored_df = await self._apply_rate(
+                 parsed_df,
+                 attrs,
+                 attr_key,
+                 topic_instruction,
+                 reset_files=reset_files,
+                 config_updates=rate_cfg_updates,
+                 run_kwargs=rate_run_kwargs,
+             )
+         else:
+             recursive = mode == "recursive_rank"
+             scored_df = await self._apply_rank(
+                 parsed_df,
+                 attrs,
+                 attr_key,
+                 topic_instruction,
+                 recursive=recursive,
+                 reset_files=reset_files,
+                 config_updates=rank_cfg_updates,
+                 run_kwargs=rank_run_kwargs,
+             )
+
+         self._print_rank_summaries(scored_df, attr_key)
+         scored_df.to_csv(final_path, index=False)
+         return scored_df
+
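+     # Renders one prompt per idea. When seeding is enabled, each prompt gets
+     # a seed entity; if there are fewer unique seeds than n_ideas, the seeds
+     # are recycled to cover every prompt.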
+     async def _generate_reports(
+         self,
+         topic: str,
+         additional_instructions: Optional[str],
+         *,
+         reset_files: bool,
+         use_seed_entities: bool,
+         seed_config_updates: Dict[str, Any],
+         seed_run_kwargs: Dict[str, Any],
+         **generation_kwargs: Any,
+     ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
+         base_name = os.path.splitext(self.cfg.file_name)[0]
+         raw_path = os.path.join(self.cfg.save_dir, f"{base_name}_raw_responses.csv")
+         print(
+             f"[Ideate] Generating {self.cfg.n_ideas} theories with model {self.cfg.model}."
+         )
+
+         seed_assignments: List[Optional[str]] = []
+         seed_df: Optional[pd.DataFrame] = None
+         seeds_enabled = use_seed_entities
+         if seeds_enabled:
+             seed_df = await self._generate_seed_entities(
+                 topic,
+                 additional_instructions,
+                 reset_files=reset_files,
+                 config_updates=seed_config_updates,
+                 run_kwargs=seed_run_kwargs,
+             )
+             seed_assignments = (
+                 seed_df["entity"].astype(str).str.strip().tolist() if seed_df is not None else []
+             )
+             seed_assignments = [s for s in seed_assignments if s]
+             if len(seed_assignments) < self.cfg.n_ideas:
+                 print(
+                     "[Ideate] Warning: insufficient unique seeds; recycling to cover all prompts."
+                 )
+             if not seed_assignments:
+                 seeds_enabled = False
+             if seeds_enabled:
+                 while len(seed_assignments) < self.cfg.n_ideas:
+                     deficit = self.cfg.n_ideas - len(seed_assignments)
+                     seed_assignments.extend(seed_assignments[:deficit])
+                 seed_assignments = seed_assignments[: self.cfg.n_ideas]
+
+         prompts: List[str] = []
+         identifiers: List[str] = []
+         announce_prompt_rendering("Ideate", self.cfg.n_ideas)
+         for idx in range(self.cfg.n_ideas):
+             seed_text = seed_assignments[idx] if seeds_enabled and idx < len(seed_assignments) else None
+             prompts.append(
+                 self.template.render(
+                     topic=topic,
+                     additional_instructions=additional_instructions or "",
+                     seed=seed_text,
+                 )
+             )
+             identifiers.append(f"idea-{idx:05d}")
+
+         kwargs = dict(
+             model=self.cfg.model,
+             n_parallels=self.cfg.n_parallels,
+             save_path=raw_path,
+             reset_files=reset_files,
+             use_dummy=self.cfg.use_dummy,
+             reasoning_effort=self.cfg.reasoning_effort,
+             reasoning_summary=self.cfg.reasoning_summary,
+             print_example_prompt=True,
+         )
+         kwargs.update(generation_kwargs)
+         if "web_search" not in kwargs:
+             kwargs["web_search"] = self.cfg.web_search
+
+         df_resp = await get_all_responses(
+             prompts=prompts,
+             identifiers=identifiers,
+             **kwargs,
+         )
+         if not isinstance(df_resp, pd.DataFrame):
+             raise RuntimeError("get_all_responses returned no DataFrame")
+         df_resp = df_resp.copy()
+         df_resp["idea_id"] = df_resp["Identifier"].astype(str)
+         df_resp["topic"] = topic
+         df_resp["report_text"] = df_resp["Response"].apply(response_to_text)
+         df_resp["report_text"] = df_resp["report_text"].astype(str).str.strip()
+         if seeds_enabled:
+             df_resp["seed_text"] = seed_assignments[: len(df_resp)]
+         else:
+             df_resp["seed_text"] = None
+         return df_resp, seed_df
+
+     async def _generate_seed_entities(
+         self,
+         topic: str,
+         additional_instructions: Optional[str],
+         *,
+         reset_files: bool,
+         config_updates: Dict[str, Any],
+         run_kwargs: Dict[str, Any],
+     ) -> pd.DataFrame:
+         config_updates = dict(config_updates)
+         instructions = self._build_seed_instruction(topic, additional_instructions)
+         base_name = os.path.splitext(self.cfg.file_name)[0]
+         seed_save = os.path.join(self.cfg.save_dir, "seed")
+         template_override = config_updates.pop("template_path", None)
+         cfg_kwargs: Dict[str, Any] = dict(
+             instructions=instructions,
+             save_dir=seed_save,
+             file_name=f"{base_name}_seed_entities.csv",
+             model=self.cfg.model,
+             n_parallels=self.cfg.n_parallels,
+             num_entities=self.cfg.seed_num_entities or self.cfg.n_ideas,
+             entities_per_generation=self.cfg.seed_entities_per_generation or 20,
+             entity_batch_frac=self.cfg.seed_entity_batch_frac or 0.25,
+             existing_entities_cap=self.cfg.seed_existing_entities_cap or 100,
+             use_dummy=self.cfg.use_dummy,
+             deduplicate=self.cfg.seed_deduplicate,
+             reasoning_effort=self.cfg.reasoning_effort,
+             reasoning_summary=self.cfg.reasoning_summary,
+         )
+         if self.cfg.seed_additional_instructions:
+             cfg_kwargs["instructions"] = (
+                 f"{cfg_kwargs['instructions'].rstrip()}\n\nAdditional guidance:\n{self.cfg.seed_additional_instructions}"
+             )
+         cfg_kwargs.update(config_updates)
+         seed_cfg = SeedConfig(**cfg_kwargs)
+         template_path = template_override or self.cfg.seed_template_path
+         seed_task = Seed(seed_cfg, template_path=template_path)
+         run_opts = dict(run_kwargs)
+         seed_df = await seed_task.run(reset_files=reset_files, **run_opts)
+         if not isinstance(seed_df, pd.DataFrame):
+             raise RuntimeError("Seed generation did not return a DataFrame")
+         return seed_df
+
+     def _build_seed_instruction(
+         self, topic: str, additional_instructions: Optional[str]
+     ) -> str:
+         base_lines = [
+             "Generate concise, specific seed concepts that can anchor frontier scientific theories. ",
+             "Each seed should describe a sharply defined angle, mechanism, dataset, real world phenomena or scenario, expressed in 1-2 specific sentences. ",
+             "Seeds must be mutually unique and grounded in the topic. ",
+             "Do not draft the full theory—provide only the inspirational seed or scenario to explore. ",
+             "Be genuinely novel and creative; think deeply about the topic and provide interesting seeds for frontier work that are clearly distinct from one another ",
+             "and would lead to completely different theories and ideas if fully explored. ",
+             "Again: don't describe a theory, just some details/a domain that would be interesting to pursue a novel theory. ",
+             "For each seed, just give some light nudges towards a research focus, NOT the full theory. ",
+             "Each seed should touch on important, non-trivial specific subdomains for research; avoid niches, fads, etc that don't have real significance in the research field or the real world. ",
+             "Don't obsess with recent events like quantum or DeFi; can be old school too, not necessarily anything to do with current events. ",
+             "Can be anything, old events or more recent, wacky or traditional, as long as interesting research focus related to the topic. Present a broad range of seeds across very different interesting angles."
+         ]
+         base_lines.append("Primary topic focus:")
+         base_lines.append(topic.strip())
+         if additional_instructions:
+             base_lines.append("Contextual guidance from the user:")
+             base_lines.append(additional_instructions.strip())
+         return "\n".join(line for line in base_lines if line)
+
+     def _parse_reports(self, df: pd.DataFrame, topic: str) -> pd.DataFrame:
+         print("[Ideate] Parsing structured sections from each report.")
+         df_proc = df.copy()
+         df_proc["report_text"] = df_proc["report_text"].apply(response_to_text)
+         df_proc["report_text"] = df_proc["report_text"].astype(str).str.strip()
+
+         sections: Dict[str, List[Optional[str]]] = {
+             "title": [],
+             "in_a_nutshell": [],
+             "in_one_paragraph": [],
+             "illustrative_examples": [],
+             "testable_predictions": [],
+             "full_thinking": [],
+             "summary_preview": [],
+             "report_preview": [],
+         }
+
+         for text in df_proc["report_text"].astype(str):
+             parsed = self._extract_sections(text)
+             sections["title"].append(parsed.get("title"))
+             sections["in_a_nutshell"].append(parsed.get("in_a_nutshell"))
+             sections["in_one_paragraph"].append(parsed.get("in_one_paragraph"))
+             sections["illustrative_examples"].append(parsed.get("illustrative_examples"))
+             sections["testable_predictions"].append(parsed.get("testable_predictions"))
+             sections["full_thinking"].append(parsed.get("full_thinking"))
+             preview_parts: List[str] = []
+             for key, label in [
+                 ("title", "Title"),
+                 ("in_a_nutshell", "In a nutshell"),
+                 ("in_one_paragraph", "In one paragraph"),
+                 ("illustrative_examples", "Illustrative examples"),
+                 ("testable_predictions", "Testable predictions"),
+             ]:
+                 value = parsed.get(key)
+                 if value:
+                     preview_parts.append(f"{label}: {value}")
+             preview_text = "\n\n".join(preview_parts) if preview_parts else None
+             sections["summary_preview"].append(preview_text)
+             sections["report_preview"].append(preview_text)
+
+         for key, values in sections.items():
+             df_proc[key] = values
+
+         df_proc["topic"] = topic
+         return self._clean_columns(df_proc)
+
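+     # Near-duplicate reports are mapped onto a single representative text via
+     # the Deduplicate task; the original wording is preserved in
+     # "report_text_original".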
+     async def _deduplicate_ideas(
+         self,
+         df: pd.DataFrame,
+         *,
+         reset_files: bool,
+         config_updates: Dict[str, Any],
+         run_kwargs: Dict[str, Any],
+     ) -> pd.DataFrame:
+         print("[Ideate] Deduplicating ideas before scoring.")
+         dedup_save = os.path.join(self.cfg.save_dir, "ideate_deduplicate")
+         base_name = os.path.splitext(self.cfg.file_name)[0]
+         dedup_instruction = (
+             "You do not need exact matches. Deduplicate ideas that are highly similar, "
+             "operate in the same conceptual space, or describe the same underlying theory. "
+             "Pick the representative text as the clearest, best-stated, and most complete version."
+         )
+         extra_instruction = config_updates.get("additional_instructions")
+         cfg_kwargs: Dict[str, Any] = dict(
+             save_dir=dedup_save,
+             file_name=f"{base_name}_deduplicate.csv",
+             model=self.cfg.model,
+             n_parallels=self.cfg.n_parallels,
+             n_runs=1,
+             use_dummy=self.cfg.use_dummy,
+             modality="text",
+             max_words_per_text=500,
+             group_size=25,
+             additional_instructions=dedup_instruction,
+         )
+         cfg_kwargs.update(config_updates)
+         if extra_instruction:
+             cfg_kwargs["additional_instructions"] = (
+                 f"{dedup_instruction}\n\n{extra_instruction}"
+             )
+         df_proc = df.copy()
+         if "report_text_original" not in df_proc.columns:
+             df_proc["report_text_original"] = df_proc["report_text"]
+         dedup_cfg = DeduplicateConfig(**cfg_kwargs)
+         dedup_task = Deduplicate(dedup_cfg)
+         dedup_run_opts = dict(run_kwargs)
+         dedup_df = await dedup_task.run(
+             df_proc,
+             column_name="report_text",
+             reset_files=reset_files,
+             **dedup_run_opts,
+         )
+         if "mapped_report_text" in dedup_df.columns:
+             dedup_df["report_text"] = dedup_df["mapped_report_text"]
+         return dedup_df
+
+     def _clean_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Drop raw response metadata and present a consistent column order."""
+
+         raw_columns = {
+             "Identifier",
+             "Response",
+             "Time Taken",
+             "Input Tokens",
+             "Reasoning Tokens",
+             "Output Tokens",
+             "Reasoning Effort",
+             "Reasoning Summary",
+             "Successful",
+             "Error Log",
+             "Response IDs",
+             "Response ID",
+         }
+         cleaned = df.drop(columns=[col for col in raw_columns if col in df.columns])
+
+         preferred_order = [
+             "idea_id",
+             "topic",
+             "seed_text",
+             "report_text",
+             "report_text_original",
+             "title",
+             "in_a_nutshell",
+             "in_one_paragraph",
+             "illustrative_examples",
+             "testable_predictions",
+             "full_thinking",
+             "summary_preview",
+             "report_preview",
+         ]
+
+         ordered = [col for col in preferred_order if col in cleaned.columns]
+         remaining = [col for col in cleaned.columns if col not in ordered]
+         return cleaned.loc[:, ordered + remaining]
+
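+     # Reports are expected to use plain "Header:" lines, e.g.
+     #   Title: ...
+     #   In a nutshell: ...
+     #   In one paragraph: ...
+     #   Illustrative examples: ...
+     #   Testable predictions: ...
+     #   The full thinking: ...
+     # Lines under an unrecognized header are appended to the current section.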
+     def _extract_sections(self, text: str) -> Dict[str, Optional[str]]:
+         headers = {
+             "title": "title",
+             "in a nutshell": "in_a_nutshell",
+             "in one paragraph": "in_one_paragraph",
+             "illustrative examples": "illustrative_examples",
+             "testable predictions": "testable_predictions",
+             "the full thinking": "full_thinking",
+         }
+         result: Dict[str, Optional[str]] = {v: None for v in headers.values()}
+         current_key: Optional[str] = None
+         buffer: List[str] = []
+         for raw_line in text.splitlines():
+             line = raw_line.strip()
+             if not line and current_key is None:
+                 continue
+             lowered = line.lower()
+             matched = None
+             for header_text, key in headers.items():
+                 if lowered.startswith(f"{header_text}:"):
+                     matched = key
+                     content = line[len(header_text) + 1 :].strip()
+                     if current_key is not None:
+                         result[current_key] = "\n".join(buffer).strip() or None
+                     buffer = [content] if content else []
+                     current_key = key
+                     break
+             if matched is None:
+                 if current_key is not None:
+                     buffer.append(raw_line.rstrip())
+         if current_key is not None:
+             result[current_key] = "\n".join(buffer).strip() or None
+         return result
+
+     async def _apply_rate(
+         self,
+         df: pd.DataFrame,
+         attributes: Dict[str, str],
+         attr_key: str,
+         topic_instruction: str,
+         *,
+         reset_files: bool,
+         config_updates: Dict[str, Any],
+         run_kwargs: Dict[str, Any],
+     ) -> pd.DataFrame:
+         print("[Ideate] Scoring reports with Rate task.")
+         rate_save = os.path.join(self.cfg.save_dir, "rate")
+         base_name = os.path.splitext(self.cfg.file_name)[0]
+         cfg_kwargs: Dict[str, Any] = dict(
+             attributes=attributes,
+             save_dir=rate_save,
+             file_name=f"{base_name}_ratings.csv",
+             model=self.cfg.ranking_model or self.cfg.model,
+             n_parallels=self.cfg.n_parallels,
+             use_dummy=self.cfg.use_dummy,
+             reasoning_effort=self.cfg.reasoning_effort,
+             reasoning_summary=self.cfg.reasoning_summary,
+         )
+         cfg_kwargs.update(config_updates)
+         existing_instruction = cfg_kwargs.get("additional_instructions")
+         if existing_instruction:
+             cfg_kwargs["additional_instructions"] = (
+                 f"{existing_instruction.rstrip()}\n\n{topic_instruction}"
+             )
+         else:
+             cfg_kwargs["additional_instructions"] = topic_instruction
+         rate_cfg = RateConfig(**cfg_kwargs)
+         rate_task = Rate(rate_cfg)
+         rate_run_opts = dict(run_kwargs)
+         rate_run_opts.setdefault("web_search", False)
+         df_scored = await rate_task.run(
+             df,
+             "report_text",
+             reset_files=reset_files,
+             **rate_run_opts,
+         )
+         return self._sort_results(df_scored, attr_key)
+
+     async def _apply_rank(
+         self,
+         df: pd.DataFrame,
+         attributes: Dict[str, str],
+         attr_key: str,
+         topic_instruction: str,
+         *,
+         recursive: bool,
+         reset_files: bool,
+         config_updates: Dict[str, Any],
+         run_kwargs: Dict[str, Any],
+     ) -> pd.DataFrame:
+         print("[Ideate] Ranking reports with Rank task.")
+         rank_save = os.path.join(self.cfg.save_dir, "rank")
+         base_name = os.path.splitext(self.cfg.file_name)[0]
+         cfg_kwargs: Dict[str, Any] = dict(
+             attributes=attributes,
+             save_dir=rank_save,
+             file_name=f"{base_name}_rankings",
+             model=self.cfg.ranking_model or self.cfg.model,
+             n_parallels=self.cfg.n_parallels,
+             use_dummy=self.cfg.use_dummy,
+             reasoning_effort=self.cfg.reasoning_effort,
+             reasoning_summary=self.cfg.reasoning_summary,
+             recursive=recursive,
+             recursive_fraction=self.cfg.recursive_fraction,
+             recursive_min_remaining=self.cfg.recursive_min_remaining,
+             recursive_final_round_multiplier=self.cfg.recursive_final_round_multiplier,
+             recursive_cut_side=self.cfg.recursive_cut_side,
+             recursive_rate_first_round=self.cfg.recursive_rate_first_round,
+         )
+         if attr_key and cfg_kwargs.get("recursive"):
+             cfg_kwargs.setdefault("recursive_cut_attr", attr_key)
+         cfg_kwargs.update(config_updates)
+         existing_instruction = cfg_kwargs.get("additional_instructions")
+         if existing_instruction:
+             cfg_kwargs["additional_instructions"] = (
+                 f"{existing_instruction.rstrip()}\n\n{topic_instruction}"
+             )
+         else:
+             cfg_kwargs["additional_instructions"] = topic_instruction
+         rank_cfg = RankConfig(**cfg_kwargs)
+         rank_task = Rank(rank_cfg)
+         rank_run_opts = dict(run_kwargs)
+         rank_run_opts.setdefault("web_search", False)
+         df_ranked = await rank_task.run(
+             df,
+             "report_text",
+             id_column="idea_id",
+             reset_files=reset_files,
+             **rank_run_opts,
+         )
+         return self._sort_results(df_ranked, attr_key)
+
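+     # Results are sorted by the resolved attribute score (descending, NaNs
+     # last) and given a nullable Int64 "<attr_key>_rank" position column.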
+     def _sort_results(self, df: pd.DataFrame, attr_key: str) -> pd.DataFrame:
+         resolved = self._resolve_attr_column(df, attr_key)
+         if resolved is None:
+             return df.reset_index(drop=True)
+         df_sorted = df.copy()
+         if resolved != attr_key:
+             df_sorted = df_sorted.rename(columns={resolved: attr_key})
+             resolved = attr_key
+         if not pd.api.types.is_numeric_dtype(df_sorted[resolved]):
+             df_sorted[resolved] = pd.to_numeric(df_sorted[resolved], errors="coerce")
+         df_sorted = df_sorted.sort_values(by=resolved, ascending=False, na_position="last").copy()
+         df_sorted.reset_index(drop=True, inplace=True)
+         rank_col = f"{attr_key}_rank"
+         positions: List[Optional[int]] = []
+         counter = 1
+         for value in df_sorted[resolved]:
+             if pd.isna(value):
+                 positions.append(None)
+             else:
+                 positions.append(counter)
+                 counter += 1
+         df_sorted[rank_col] = pd.Series(positions, dtype="Int64")
+         return df_sorted
+
+     def _print_random_previews(self, df: pd.DataFrame, count: int = 5) -> None:
+         if df.empty:
+             return
+         preview_columns = [
+             "summary_preview",
+             "title",
+             "in_a_nutshell",
+             "in_one_paragraph",
+             "illustrative_examples",
+             "testable_predictions",
+         ]
+         missing_columns = [col for col in preview_columns if col not in df.columns]
+         if missing_columns:
+             return
+         mask = df[preview_columns].notna().any(axis=1)
+         available = df[mask]
+         if available.empty:
+             return
+         sample_count = min(count, len(available))
+         print(f"[Ideate] Showing {sample_count} random generated ideas:")
+         samples = available.sample(n=sample_count, replace=False)
+         for idx, (_, row) in enumerate(samples.iterrows(), start=1):
+             preview = self._build_preview(row)
+             print(f"\n--- Random Idea {idx} ({row.get('idea_id', 'N/A')}) ---")
+             print(preview)
+
+     def _print_rank_summaries(
+         self, df: pd.DataFrame, attr_key: str, count: int = 5
+     ) -> None:
+         if df.empty:
+             print("[Ideate] Skipping ranked summaries (missing score column or empty data).")
+             return
+         resolved = self._resolve_attr_column(df, attr_key)
+         if resolved is None:
+             print("[Ideate] Skipping ranked summaries (missing score column or empty data).")
+             return
+         df_local = df.copy()
+         if resolved != attr_key:
+             df_local = df_local.rename(columns={resolved: attr_key})
+             resolved = attr_key
+         if not pd.api.types.is_numeric_dtype(df_local[resolved]):
+             df_local[resolved] = pd.to_numeric(df_local[resolved], errors="coerce")
+         non_null = df_local[df_local[resolved].notna()].copy()
+         non_null = non_null.sort_values(by=resolved, ascending=False, na_position="last")
+         if non_null.empty:
+             print("[Ideate] Skipping ranked summaries (no scored entries available).")
+             return
+         top_count = min(count, len(non_null))
+         print(f"\n[Ideate] Top {top_count} ideas by '{attr_key}':")
+         for position, (_, row) in enumerate(non_null.head(top_count).iterrows(), start=1):
+             preview = self._build_preview(row)
+             score = row.get(attr_key, row.get(resolved, "N/A"))
+             print(f"\n#{position} (Score: {score}) - {row.get('idea_id', 'N/A')}")
+             print(preview)
+
+         bottom_count = min(count, len(non_null))
+         print(f"\n[Ideate] Bottom {bottom_count} ideas by '{attr_key}':")
+         tail_rows = non_null.tail(bottom_count).iloc[::-1]
+         for offset, (_, row) in enumerate(tail_rows.iterrows()):
+             # tail_rows is reversed (worst idea first), so positions count
+             # down from the total number of scored ideas.
+             position = len(non_null) - offset
+             preview = self._build_preview(row)
+             score = row.get(attr_key, row.get(resolved, "N/A"))
+             print(f"\n#{position} (Score: {score}) - {row.get('idea_id', 'N/A')}")
+             print(preview)
+
+     @staticmethod
+     def _normalize_label(label: str) -> str:
+         return re.sub(r"[^a-z0-9]+", "", str(label).lower())
+
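+     # Rank/Rate outputs may prefix attribute columns (e.g. "cumulative_",
+     # "final_", "stage2_", "round1_"); this helper strips those prefixes so
+     # such columns still resolve to the requested attribute.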
+     def _resolve_attr_column(self, df: pd.DataFrame, attr_key: str) -> Optional[str]:
+         target = self._normalize_label(attr_key)
+         for column in df.columns:
+             if self._normalize_label(column) == target:
+                 return column
+         prefixes = ["cumulative_", "final_"]
+         pattern = re.compile(r"^(stage\d+_|round\d+_)")
+         for column in df.columns:
+             stripped = column
+             for prefix in prefixes:
+                 if stripped.startswith(prefix):
+                     stripped = stripped[len(prefix) :]
+                     break
+             stripped = pattern.sub("", stripped)
+             if self._normalize_label(stripped) == target:
+                 return column
+         return None
+
+     def _build_preview(self, row: pd.Series) -> str:
+         parts: List[str] = []
+         if "summary_preview" in row and isinstance(row["summary_preview"], str):
+             return row["summary_preview"].strip()
+         if "report_preview" in row and isinstance(row["report_preview"], str):
+             return row["report_preview"].strip()
+         title = row.get("title")
+         nutshell = row.get("in_a_nutshell")
+         paragraph = row.get("in_one_paragraph")
+         examples = row.get("illustrative_examples")
+         predictions = row.get("testable_predictions")
+         seed = row.get("seed_text")
+         if seed:
+             parts.append(f"Seed: {seed}")
+         if title:
+             parts.append(f"Title: {title}")
+         if nutshell:
+             parts.append(f"In a nutshell: {nutshell}")
+         if paragraph:
+             parts.append(f"In one paragraph: {paragraph}")
+         if examples:
+             parts.append(f"Illustrative examples: {examples}")
+         if predictions:
+             parts.append(f"Testable predictions: {predictions}")
+         return "\n\n".join(parts) if parts else "(No preview available)"