openai-gabriel 1.0.1 (openai_gabriel-1.0.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. gabriel/__init__.py +61 -0
  2. gabriel/_version.py +1 -0
  3. gabriel/api.py +2284 -0
  4. gabriel/cli/__main__.py +60 -0
  5. gabriel/core/__init__.py +7 -0
  6. gabriel/core/llm_client.py +34 -0
  7. gabriel/core/pipeline.py +18 -0
  8. gabriel/core/prompt_template.py +152 -0
  9. gabriel/prompts/__init__.py +1 -0
  10. gabriel/prompts/bucket_prompt.jinja2 +113 -0
  11. gabriel/prompts/classification_prompt.jinja2 +50 -0
  12. gabriel/prompts/codify_prompt.jinja2 +95 -0
  13. gabriel/prompts/comparison_prompt.jinja2 +60 -0
  14. gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
  15. gabriel/prompts/deidentification_prompt.jinja2 +112 -0
  16. gabriel/prompts/extraction_prompt.jinja2 +61 -0
  17. gabriel/prompts/filter_prompt.jinja2 +31 -0
  18. gabriel/prompts/ideation_prompt.jinja2 +80 -0
  19. gabriel/prompts/merge_prompt.jinja2 +47 -0
  20. gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
  21. gabriel/prompts/rankings_prompt.jinja2 +49 -0
  22. gabriel/prompts/ratings_prompt.jinja2 +50 -0
  23. gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
  24. gabriel/prompts/seed.jinja2 +43 -0
  25. gabriel/prompts/snippets.jinja2 +117 -0
  26. gabriel/tasks/__init__.py +63 -0
  27. gabriel/tasks/_attribute_utils.py +69 -0
  28. gabriel/tasks/bucket.py +432 -0
  29. gabriel/tasks/classify.py +562 -0
  30. gabriel/tasks/codify.py +1033 -0
  31. gabriel/tasks/compare.py +235 -0
  32. gabriel/tasks/debias.py +1460 -0
  33. gabriel/tasks/deduplicate.py +341 -0
  34. gabriel/tasks/deidentify.py +316 -0
  35. gabriel/tasks/discover.py +524 -0
  36. gabriel/tasks/extract.py +455 -0
  37. gabriel/tasks/filter.py +169 -0
  38. gabriel/tasks/ideate.py +782 -0
  39. gabriel/tasks/merge.py +464 -0
  40. gabriel/tasks/paraphrase.py +531 -0
  41. gabriel/tasks/rank.py +2041 -0
  42. gabriel/tasks/rate.py +347 -0
  43. gabriel/tasks/seed.py +465 -0
  44. gabriel/tasks/whatever.py +344 -0
  45. gabriel/utils/__init__.py +64 -0
  46. gabriel/utils/audio_utils.py +42 -0
  47. gabriel/utils/file_utils.py +464 -0
  48. gabriel/utils/image_utils.py +22 -0
  49. gabriel/utils/jinja.py +31 -0
  50. gabriel/utils/logging.py +86 -0
  51. gabriel/utils/mapmaker.py +304 -0
  52. gabriel/utils/media_utils.py +78 -0
  53. gabriel/utils/modality_utils.py +148 -0
  54. gabriel/utils/openai_utils.py +5470 -0
  55. gabriel/utils/parsing.py +282 -0
  56. gabriel/utils/passage_viewer.py +2557 -0
  57. gabriel/utils/pdf_utils.py +20 -0
  58. gabriel/utils/plot_utils.py +2881 -0
  59. gabriel/utils/prompt_utils.py +42 -0
  60. gabriel/utils/word_matching.py +158 -0
  61. openai_gabriel-1.0.1.dist-info/METADATA +443 -0
  62. openai_gabriel-1.0.1.dist-info/RECORD +67 -0
  63. openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
  64. openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
  65. openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
  66. openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
  67. openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/utils/parsing.py
@@ -0,0 +1,282 @@
+ from __future__ import annotations
+
+ import ast
+ import json
+ import os
+ import re
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import pandas as pd
+
+ _JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.S)
+
+ # model used when an LLM is required to reformat malformed JSON
+ JSON_LLM_MODEL = os.getenv("JSON_LLM_MODEL", "gpt-5-mini")
+
+
+ def _parse_json(txt: Any) -> Union[dict, list]:
+     """Strict JSON parsing with common cleaning heuristics."""
+     if isinstance(txt, dict):
+         return txt
+
+     if isinstance(txt, list):
+         if len(txt) == 1:
+             return _parse_json(txt[0])
+         return txt
+
+     if isinstance(txt, (bytes, bytearray)):
+         txt = txt.decode(errors="ignore")
+
+     if txt is None:
+         raise ValueError("None provided")
+
+     cleaned = str(txt).strip()
+
+     if (cleaned.startswith('"') and cleaned.endswith('"')) or (
+         cleaned.startswith("'") and cleaned.endswith("'")
+     ):
+         cleaned = cleaned[1:-1].strip()
+
+     m = _JSON_FENCE_RE.search(cleaned)
+     if m:
+         cleaned = m.group(1).strip()
+
+     for parser in (json.loads, ast.literal_eval):
+         try:
+             out = parser(cleaned)
+             if isinstance(out, (dict, list)):
+                 return out
+         except Exception:
+             pass
+
+     # attempt to strip `//` and `/* */` style comments before parsing
+     try:
+         no_line_comments = re.sub(r"(?<!:)//.*$", "", cleaned, flags=re.MULTILINE)
+         no_comments = re.sub(r"/\*.*?\*/", "", no_line_comments, flags=re.S)
+         out = json.loads(no_comments)
+         if isinstance(out, (dict, list)):
+             return out
+     except Exception:
+         pass
+
+     brace = re.search(r"\{[\s\S]*\}", cleaned)
+     if brace:
+         try:
+             out = json.loads(brace.group(0))
+             if isinstance(out, (dict, list)):
+                 return out
+         except Exception:
+             pass
+
+     bracket = re.search(r"\[[\s\S]*\]", cleaned)
+     if bracket:
+         candidate = bracket.group(0).strip()
+         try:
+             out = json.loads(candidate)
+             if isinstance(out, (dict, list)):
+                 return out
+         except Exception:
+             pass
+
+         m = re.fullmatch(r"\[\s*(['\"])(.*)\1\s*\]", candidate, re.S)
+         if m:
+             inner = m.group(2).strip()
+             try:
+                 out = json.loads(inner)
+                 if isinstance(out, (dict, list)):
+                     return out
+             except Exception:
+                 inner_bracket = re.search(r"\[[\s\S]*\]", inner)
+                 if inner_bracket:
+                     try:
+                         out = json.loads(inner_bracket.group(0))
+                         if isinstance(out, (dict, list)):
+                             return out
+                     except Exception:
+                         pass
+
+     raise ValueError(f"Failed to parse JSON: {cleaned[:200]}")
+
+
+ def safe_json(txt: Any) -> Union[dict, list]:
+     """Best-effort JSON parser returning ``{}`` on failure.
+
+     This helper runs synchronously and never uses the LLM; it simply applies a
+     number of heuristics locally to coerce ``txt`` into a JSON object or list.
+     """
+
+     try:
+         return _parse_json(txt)
+     except Exception:
+         return {}
+
+
+ async def safest_json(
+     txt: Any,
+     *,
+     model: Optional[str] = None,
+     use_llm_fallback: bool = False,
+     llm_timeout: Optional[float] = 60.0,
+ ) -> Union[dict, list, Any]:
+     """Parse JSON with an optional LLM-based repair step.
+
+     The function first attempts to parse ``txt`` locally using ``_parse_json``.
+     When ``use_llm_fallback`` is ``False`` (the default) any parsing failure
+     results in ``None``. When ``use_llm_fallback`` is ``True`` an extra
+     call is made to :func:`gabriel.utils.openai_utils.get_response` to request
+     that the model reformat the text as valid JSON. A timeout can be provided
+     to prevent the repair step from hanging indefinitely.
+     """
+
+     try:
+         return _parse_json(txt)
+     except Exception:
+         if not use_llm_fallback:
+             return None
+
+         if model is None:
+             model = JSON_LLM_MODEL
+
+         from gabriel.utils.openai_utils import get_response
+
+         use_dummy = model == "dummy"
+         fixed, _ = await get_response(
+             prompt=(
+                 "Please parse the following text **without changing any content** "
+                 "into valid JSON. This is a pure formatting task.\n\n" + str(txt)
+             ),
+             model=model,
+             json_mode=True,
+             use_dummy=use_dummy,
+             timeout=llm_timeout,
+         )
+         if fixed:
+             try:
+                 return _parse_json(fixed[0])
+             except Exception:
+                 return None
+         return None
+
+
+ async def clean_json_df(
+     df: pd.DataFrame,
+     columns: List[str],
+     *,
+     id_col: str,
+     model: str = "gpt-5-mini",
+     exclude_valid_json: bool = False,
+     save_path: Optional[str] = None,
+     reasoning_effort: Optional[str] = None,
+     reasoning_summary: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """Ensure specified DataFrame columns contain valid JSON.
+
+     Parameters
+     ----------
+     df:
+         Input DataFrame whose columns may contain malformed JSON strings.
+     columns:
+         Names of columns to inspect and clean.
+     id_col:
+         Name of a **unique** column in ``df`` used to merge cleaned JSON
+         responses back into the original DataFrame. A :class:`ValueError` is
+         raised if the column is missing or contains duplicate values.
+     model:
+         Model name passed to :func:`get_all_responses` when attempting to
+         repair invalid JSON. Defaults to ``"gpt-5-mini"``.
+     reasoning_effort:
+         Reasoning effort level forwarded to the model.
+     reasoning_summary:
+         Optional reasoning summary mode (e.g., ``"auto"``, ``"concise"``,
+         or ``"detailed"``) forwarded to the model. When ``None`` (default),
+         no reasoning summary is requested.
+     exclude_valid_json:
+         When ``False`` (default), only entries that fail to parse are sent to
+         the model. When ``True``, all entries are processed regardless of
+         validity.
+     save_path:
+         Optional path where the intermediate CSV from
+         :func:`get_all_responses` will be saved. When ``None`` (default), a
+         temporary file is created and deleted after processing.
+
+     Returns
+     -------
+     DataFrame with new ``<column>_cleaned`` columns containing the cleaned
+     JSON structures. Rows that were already valid retain their original value.
+     """
+
+     from gabriel.utils.openai_utils import get_all_responses
+     import tempfile
+     df = df.copy()
+
+     if id_col not in df.columns:
+         raise ValueError(f"Column '{id_col}' not found in DataFrame")
+     if not df[id_col].is_unique:
+         raise ValueError(f"Column '{id_col}' must contain unique values")
+
+     prompts: List[str] = []
+     identifiers: List[str] = []
+     # ``mapping`` maps each identifier to its originating column and ``id_col``
+     # value so responses can be merged back using a stable key.
+     mapping: Dict[str, Tuple[str, Any]] = {}
+
+     for col in columns:
+         cleaned_col = f"{col}_cleaned"
+         df[cleaned_col] = None
+         col_idx = df.columns.get_loc(cleaned_col)
+         for row_pos, (id_val, val) in enumerate(zip(df[id_col], df[col])):
+             valid = True
+             try:
+                 _parse_json(val)
+             except Exception:
+                 valid = False
+             if exclude_valid_json or not valid:
+                 prompt = (
+                     "Please parse the following text **without changing any content** "
+                     "into valid JSON. This is a pure formatting task.\n\n" + str(val)
+                 )
+                 ident = f"{id_val}__{col}"
+                 prompts.append(prompt)
+                 identifiers.append(ident)
+                 mapping[ident] = (col, id_val)
+             else:
+                 df.iat[row_pos, col_idx] = val
+
+     if prompts:
+         use_dummy = model == "dummy"
+         cleanup = False
+         if save_path is None:
+             tmp_fd, tmp_path = tempfile.mkstemp(suffix=".csv")
+             os.close(tmp_fd)
+             os.remove(tmp_path)
+             cleanup = True
+         else:
+             tmp_path = save_path
+         try:
+             resp_df = await get_all_responses(
+                 prompts=prompts,
+                 identifiers=identifiers,
+                 model=model,
+                 json_mode=True,
+                 use_dummy=use_dummy,
+                 reasoning_effort=reasoning_effort,
+                 reasoning_summary=reasoning_summary,
+                 print_example_prompt=False,
+                 save_path=tmp_path,
+                 reset_files=True,
+             )
+         finally:
+             if cleanup:
+                 try:
+                     os.remove(tmp_path)
+                 except Exception:
+                     pass
+         for _, row in resp_df.iterrows():
+             ident = str(row.get("Identifier", "")).strip()
+             if ident not in mapping:
+                 continue
+             col, id_val = mapping[ident]
+             col_idx = df.columns.get_loc(f"{col}_cleaned")
+             df.loc[df[id_col] == id_val, df.columns[col_idx]] = row["Response"]
+
+     return df
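
For orientation, below is a minimal usage sketch for the helpers added above in gabriel/utils/parsing.py. It is not part of the package diff: the inputs are made up, it assumes openai-gabriel and its dependencies are installed, and it exercises only the fully local code paths (use_llm_fallback stays off and clean_json_df is not called), so no API key or network access is needed.

# Usage sketch (illustrative, not from the package): local-only paths of
# gabriel.utils.parsing. The inputs below are invented for demonstration.
import asyncio

from gabriel.utils.parsing import safe_json, safest_json

# safe_json is synchronous, applies only the local heuristics, and returns {}
# when nothing can be parsed.
print(safe_json('```json\n{"a": 1}\n```'))  # code fence stripped -> {'a': 1}
print(safe_json("not json at all"))         # unparseable -> {}


async def main() -> None:
    # safest_json tries the same local heuristics first; with the default
    # use_llm_fallback=False it never calls the API and returns None on failure.
    ok = await safest_json('{"a": 1,}')        # trailing comma recovered by the ast.literal_eval heuristic
    bad = await safest_json("still not json")  # -> None
    print(ok, bad)
    # clean_json_df(df, ["col"], id_col="id") would go one step further and
    # send cells that still fail to parse to get_all_responses for repair
    # (not run here, since that path contacts the model).


asyncio.run(main())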