openai-gabriel 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gabriel/__init__.py +61 -0
- gabriel/_version.py +1 -0
- gabriel/api.py +2284 -0
- gabriel/cli/__main__.py +60 -0
- gabriel/core/__init__.py +7 -0
- gabriel/core/llm_client.py +34 -0
- gabriel/core/pipeline.py +18 -0
- gabriel/core/prompt_template.py +152 -0
- gabriel/prompts/__init__.py +1 -0
- gabriel/prompts/bucket_prompt.jinja2 +113 -0
- gabriel/prompts/classification_prompt.jinja2 +50 -0
- gabriel/prompts/codify_prompt.jinja2 +95 -0
- gabriel/prompts/comparison_prompt.jinja2 +60 -0
- gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
- gabriel/prompts/deidentification_prompt.jinja2 +112 -0
- gabriel/prompts/extraction_prompt.jinja2 +61 -0
- gabriel/prompts/filter_prompt.jinja2 +31 -0
- gabriel/prompts/ideation_prompt.jinja2 +80 -0
- gabriel/prompts/merge_prompt.jinja2 +47 -0
- gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
- gabriel/prompts/rankings_prompt.jinja2 +49 -0
- gabriel/prompts/ratings_prompt.jinja2 +50 -0
- gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
- gabriel/prompts/seed.jinja2 +43 -0
- gabriel/prompts/snippets.jinja2 +117 -0
- gabriel/tasks/__init__.py +63 -0
- gabriel/tasks/_attribute_utils.py +69 -0
- gabriel/tasks/bucket.py +432 -0
- gabriel/tasks/classify.py +562 -0
- gabriel/tasks/codify.py +1033 -0
- gabriel/tasks/compare.py +235 -0
- gabriel/tasks/debias.py +1460 -0
- gabriel/tasks/deduplicate.py +341 -0
- gabriel/tasks/deidentify.py +316 -0
- gabriel/tasks/discover.py +524 -0
- gabriel/tasks/extract.py +455 -0
- gabriel/tasks/filter.py +169 -0
- gabriel/tasks/ideate.py +782 -0
- gabriel/tasks/merge.py +464 -0
- gabriel/tasks/paraphrase.py +531 -0
- gabriel/tasks/rank.py +2041 -0
- gabriel/tasks/rate.py +347 -0
- gabriel/tasks/seed.py +465 -0
- gabriel/tasks/whatever.py +344 -0
- gabriel/utils/__init__.py +64 -0
- gabriel/utils/audio_utils.py +42 -0
- gabriel/utils/file_utils.py +464 -0
- gabriel/utils/image_utils.py +22 -0
- gabriel/utils/jinja.py +31 -0
- gabriel/utils/logging.py +86 -0
- gabriel/utils/mapmaker.py +304 -0
- gabriel/utils/media_utils.py +78 -0
- gabriel/utils/modality_utils.py +148 -0
- gabriel/utils/openai_utils.py +5470 -0
- gabriel/utils/parsing.py +282 -0
- gabriel/utils/passage_viewer.py +2557 -0
- gabriel/utils/pdf_utils.py +20 -0
- gabriel/utils/plot_utils.py +2881 -0
- gabriel/utils/prompt_utils.py +42 -0
- gabriel/utils/word_matching.py +158 -0
- openai_gabriel-1.0.1.dist-info/METADATA +443 -0
- openai_gabriel-1.0.1.dist-info/RECORD +67 -0
- openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
- openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
- openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
- openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
- openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/utils/parsing.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.S)
|
|
12
|
+
|
|
13
|
+
# model used when an LLM is required to reformat malformed JSON
|
|
14
|
+
JSON_LLM_MODEL = os.getenv("JSON_LLM_MODEL", "gpt-5-mini")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _parse_json(txt: Any) -> Union[dict, list]:
|
|
18
|
+
"""Strict JSON parsing with common cleaning heuristics."""
|
|
19
|
+
if isinstance(txt, dict):
|
|
20
|
+
return txt
|
|
21
|
+
|
|
22
|
+
if isinstance(txt, list):
|
|
23
|
+
if len(txt) == 1:
|
|
24
|
+
return _parse_json(txt[0])
|
|
25
|
+
return txt
|
|
26
|
+
|
|
27
|
+
if isinstance(txt, (bytes, bytearray)):
|
|
28
|
+
txt = txt.decode(errors="ignore")
|
|
29
|
+
|
|
30
|
+
if txt is None:
|
|
31
|
+
raise ValueError("None provided")
|
|
32
|
+
|
|
33
|
+
cleaned = str(txt).strip()
|
|
34
|
+
|
|
35
|
+
if (cleaned.startswith('"') and cleaned.endswith('"')) or (
|
|
36
|
+
cleaned.startswith("'") and cleaned.endswith("'")
|
|
37
|
+
):
|
|
38
|
+
cleaned = cleaned[1:-1].strip()
|
|
39
|
+
|
|
40
|
+
m = _JSON_FENCE_RE.search(cleaned)
|
|
41
|
+
if m:
|
|
42
|
+
cleaned = m.group(1).strip()
|
|
43
|
+
|
|
44
|
+
for parser in (json.loads, ast.literal_eval):
|
|
45
|
+
try:
|
|
46
|
+
out = parser(cleaned)
|
|
47
|
+
if isinstance(out, (dict, list)):
|
|
48
|
+
return out
|
|
49
|
+
except Exception:
|
|
50
|
+
pass
|
|
51
|
+
|
|
52
|
+
# attempt to strip `//` and `/* */` style comments before parsing
|
|
53
|
+
try:
|
|
54
|
+
no_line_comments = re.sub(r"(?<!:)//.*$", "", cleaned, flags=re.MULTILINE)
|
|
55
|
+
no_comments = re.sub(r"/\*.*?\*/", "", no_line_comments, flags=re.S)
|
|
56
|
+
out = json.loads(no_comments)
|
|
57
|
+
if isinstance(out, (dict, list)):
|
|
58
|
+
return out
|
|
59
|
+
except Exception:
|
|
60
|
+
pass
|
|
61
|
+
|
|
62
|
+
brace = re.search(r"\{[\s\S]*\}", cleaned)
|
|
63
|
+
if brace:
|
|
64
|
+
try:
|
|
65
|
+
out = json.loads(brace.group(0))
|
|
66
|
+
if isinstance(out, (dict, list)):
|
|
67
|
+
return out
|
|
68
|
+
except Exception:
|
|
69
|
+
pass
|
|
70
|
+
|
|
71
|
+
bracket = re.search(r"\[[\s\S]*\]", cleaned)
|
|
72
|
+
if bracket:
|
|
73
|
+
candidate = bracket.group(0).strip()
|
|
74
|
+
try:
|
|
75
|
+
out = json.loads(candidate)
|
|
76
|
+
if isinstance(out, (dict, list)):
|
|
77
|
+
return out
|
|
78
|
+
except Exception:
|
|
79
|
+
pass
|
|
80
|
+
|
|
81
|
+
m = re.fullmatch(r"\[\s*(['\"])(.*)\1\s*\]", candidate, re.S)
|
|
82
|
+
if m:
|
|
83
|
+
inner = m.group(2).strip()
|
|
84
|
+
try:
|
|
85
|
+
out = json.loads(inner)
|
|
86
|
+
if isinstance(out, (dict, list)):
|
|
87
|
+
return out
|
|
88
|
+
except Exception:
|
|
89
|
+
inner_bracket = re.search(r"\[[\s\S]*\]", inner)
|
|
90
|
+
if inner_bracket:
|
|
91
|
+
try:
|
|
92
|
+
out = json.loads(inner_bracket.group(0))
|
|
93
|
+
if isinstance(out, (dict, list)):
|
|
94
|
+
return out
|
|
95
|
+
except Exception:
|
|
96
|
+
pass
|
|
97
|
+
|
|
98
|
+
raise ValueError(f"Failed to parse JSON: {cleaned[:200]}")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def safe_json(txt: Any) -> Union[dict, list]:
    """Coerce ``txt`` to a JSON object or list, returning ``{}`` when impossible.

    Purely local and synchronous: delegates to :func:`_parse_json` and maps
    any parsing failure to an empty dict instead of raising. No LLM is used.
    """
    try:
        parsed = _parse_json(txt)
    except Exception:
        return {}
    return parsed
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
async def safest_json(
    txt: Any,
    *,
    model: Optional[str] = None,
    use_llm_fallback: bool = False,
    llm_timeout: Optional[float] = 60.0,
) -> Union[dict, list, Any]:
    """Parse JSON locally, optionally asking an LLM to repair malformed text.

    ``txt`` is first handed to :func:`_parse_json`. On failure the result is
    ``None`` unless ``use_llm_fallback`` is ``True``, in which case
    :func:`gabriel.utils.openai_utils.get_response` is asked to reformat the
    text as valid JSON (``llm_timeout`` guards against the repair call
    hanging) and the repaired text is parsed locally once more. ``model``
    defaults to ``JSON_LLM_MODEL``; the special name ``"dummy"`` routes the
    request through the dummy backend.
    """
    try:
        return _parse_json(txt)
    except Exception:
        pass

    if not use_llm_fallback:
        return None

    # Deferred import so parsing stays usable without loading the LLM stack.
    from gabriel.utils.openai_utils import get_response

    repair_model = JSON_LLM_MODEL if model is None else model
    responses, _ = await get_response(
        prompt=(
            "Please parse the following text **without changing any content** "
            "into valid JSON. This is a pure formatting task.\n\n" + str(txt)
        ),
        model=repair_model,
        json_mode=True,
        use_dummy=repair_model == "dummy",
        timeout=llm_timeout,
    )
    if not responses:
        return None
    try:
        return _parse_json(responses[0])
    except Exception:
        return None
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
async def clean_json_df(
    df: pd.DataFrame,
    columns: List[str],
    *,
    id_col: str,
    model: str = "gpt-5-mini",
    exclude_valid_json: bool = False,
    save_path: Optional[str] = None,
    reasoning_effort: Optional[str] = None,
    reasoning_summary: Optional[str] = None,
) -> pd.DataFrame:
    """Ensure specified DataFrame columns contain valid JSON.

    Parameters
    ----------
    df:
        Input DataFrame whose columns may contain malformed JSON strings.
    columns:
        Names of columns to inspect and clean.
    id_col:
        Name of a **unique** column in ``df`` used to merge cleaned JSON
        responses back into the original DataFrame. A :class:`ValueError` is
        raised if the column is missing or contains duplicate values.
    model:
        Model name passed to :func:`get_all_responses` when attempting to
        repair invalid JSON. Defaults to ``"gpt-5-mini"``.
    reasoning_effort:
        Reasoning effort level forwarded to the model.
    reasoning_summary:
        Optional reasoning summary mode (e.g., ``"auto"``, ``"concise"``,
        or ``"detailed"``) forwarded to the model. When ``None`` (default),
        no reasoning summary is requested.
    exclude_valid_json:
        When ``False`` (default), only entries that fail to parse are sent to
        the model. When ``True``, all entries are processed regardless of
        validity.
    save_path:
        Optional path where the intermediate CSV from
        :func:`get_all_responses` will be saved. When ``None`` (default), a
        temporary file is created and deleted after processing.

    Returns
    -------
    DataFrame with new ``<column>_cleaned`` columns containing the cleaned
    JSON structures. Rows that were already valid retain their original value.
    """

    # Deferred import so this module can be used without the LLM stack loaded.
    from gabriel.utils.openai_utils import get_all_responses
    import tempfile

    # Work on a copy so the caller's DataFrame is never mutated.
    df = df.copy()

    if id_col not in df.columns:
        raise ValueError(f"Column '{id_col}' not found in DataFrame")
    if not df[id_col].is_unique:
        raise ValueError(f"Column '{id_col}' must contain unique values")

    prompts: List[str] = []
    identifiers: List[str] = []
    # ``mapping`` maps each identifier to its originating column and ``id_col``
    # value so responses can be merged back using a stable key.
    mapping: Dict[str, Tuple[str, Any]] = {}

    for col in columns:
        cleaned_col = f"{col}_cleaned"
        # New output column; positional index cached for the ``iat`` writes
        # below (columns appended later do not shift this position).
        df[cleaned_col] = None
        col_idx = df.columns.get_loc(cleaned_col)
        for row_pos, (id_val, val) in enumerate(zip(df[id_col], df[col])):
            # Cheap local validity probe — the parsed value is discarded.
            valid = True
            try:
                _parse_json(val)
            except Exception:
                valid = False
            if exclude_valid_json or not valid:
                prompt = (
                    "Please parse the following text **without changing any content** "
                    "into valid JSON. This is a pure formatting task.\n\n" + str(val)
                )
                # NOTE(review): identifiers are "<id>__<col>"; an id value that
                # itself contains "__" could collide with another (id, col)
                # pair — confirm upstream id conventions.
                ident = f"{id_val}__{col}"
                prompts.append(prompt)
                identifiers.append(ident)
                mapping[ident] = (col, id_val)
            else:
                # Already-valid entries keep their original value verbatim.
                df.iat[row_pos, col_idx] = val

    if prompts:
        use_dummy = model == "dummy"
        cleanup = False
        if save_path is None:
            # Reserve a unique path, then delete the empty file so the callee
            # can write its own CSV there. NOTE(review): the create-then-remove
            # leaves a small window in which another process could claim the
            # name — acceptable for a scratch file, but worth confirming.
            tmp_fd, tmp_path = tempfile.mkstemp(suffix=".csv")
            os.close(tmp_fd)
            os.remove(tmp_path)
            cleanup = True
        else:
            tmp_path = save_path
        try:
            resp_df = await get_all_responses(
                prompts=prompts,
                identifiers=identifiers,
                model=model,
                json_mode=True,
                use_dummy=use_dummy,
                reasoning_effort=reasoning_effort,
                reasoning_summary=reasoning_summary,
                print_example_prompt=False,
                save_path=tmp_path,
                reset_files=True,
            )
        finally:
            # Best-effort removal of the scratch CSV; only when we created it.
            if cleanup:
                try:
                    os.remove(tmp_path)
                except Exception:
                    pass
        # Merge model output back by the stable "<id>__<col>" key.
        for _, row in resp_df.iterrows():
            ident = str(row.get("Identifier", "")).strip()
            if ident not in mapping:
                # Unknown identifier (e.g. stale rows in a reused CSV) — skip.
                continue
            col, id_val = mapping[ident]
            col_idx = df.columns.get_loc(f"{col}_cleaned")
            # NOTE(review): the raw ``Response`` cell is stored un-parsed here,
            # while the docstring promises cleaned JSON *structures*; verify
            # get_all_responses' Response dtype (string vs list) and whether a
            # safe_json() pass is expected downstream.
            df.loc[df[id_col] == id_val, df.columns[col_idx]] = row["Response"]

    return df
|