openai-gabriel 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gabriel/__init__.py +61 -0
- gabriel/_version.py +1 -0
- gabriel/api.py +2284 -0
- gabriel/cli/__main__.py +60 -0
- gabriel/core/__init__.py +7 -0
- gabriel/core/llm_client.py +34 -0
- gabriel/core/pipeline.py +18 -0
- gabriel/core/prompt_template.py +152 -0
- gabriel/prompts/__init__.py +1 -0
- gabriel/prompts/bucket_prompt.jinja2 +113 -0
- gabriel/prompts/classification_prompt.jinja2 +50 -0
- gabriel/prompts/codify_prompt.jinja2 +95 -0
- gabriel/prompts/comparison_prompt.jinja2 +60 -0
- gabriel/prompts/deduplicate_prompt.jinja2 +41 -0
- gabriel/prompts/deidentification_prompt.jinja2 +112 -0
- gabriel/prompts/extraction_prompt.jinja2 +61 -0
- gabriel/prompts/filter_prompt.jinja2 +31 -0
- gabriel/prompts/ideation_prompt.jinja2 +80 -0
- gabriel/prompts/merge_prompt.jinja2 +47 -0
- gabriel/prompts/paraphrase_prompt.jinja2 +17 -0
- gabriel/prompts/rankings_prompt.jinja2 +49 -0
- gabriel/prompts/ratings_prompt.jinja2 +50 -0
- gabriel/prompts/regional_analysis_prompt.jinja2 +40 -0
- gabriel/prompts/seed.jinja2 +43 -0
- gabriel/prompts/snippets.jinja2 +117 -0
- gabriel/tasks/__init__.py +63 -0
- gabriel/tasks/_attribute_utils.py +69 -0
- gabriel/tasks/bucket.py +432 -0
- gabriel/tasks/classify.py +562 -0
- gabriel/tasks/codify.py +1033 -0
- gabriel/tasks/compare.py +235 -0
- gabriel/tasks/debias.py +1460 -0
- gabriel/tasks/deduplicate.py +341 -0
- gabriel/tasks/deidentify.py +316 -0
- gabriel/tasks/discover.py +524 -0
- gabriel/tasks/extract.py +455 -0
- gabriel/tasks/filter.py +169 -0
- gabriel/tasks/ideate.py +782 -0
- gabriel/tasks/merge.py +464 -0
- gabriel/tasks/paraphrase.py +531 -0
- gabriel/tasks/rank.py +2041 -0
- gabriel/tasks/rate.py +347 -0
- gabriel/tasks/seed.py +465 -0
- gabriel/tasks/whatever.py +344 -0
- gabriel/utils/__init__.py +64 -0
- gabriel/utils/audio_utils.py +42 -0
- gabriel/utils/file_utils.py +464 -0
- gabriel/utils/image_utils.py +22 -0
- gabriel/utils/jinja.py +31 -0
- gabriel/utils/logging.py +86 -0
- gabriel/utils/mapmaker.py +304 -0
- gabriel/utils/media_utils.py +78 -0
- gabriel/utils/modality_utils.py +148 -0
- gabriel/utils/openai_utils.py +5470 -0
- gabriel/utils/parsing.py +282 -0
- gabriel/utils/passage_viewer.py +2557 -0
- gabriel/utils/pdf_utils.py +20 -0
- gabriel/utils/plot_utils.py +2881 -0
- gabriel/utils/prompt_utils.py +42 -0
- gabriel/utils/word_matching.py +158 -0
- openai_gabriel-1.0.1.dist-info/METADATA +443 -0
- openai_gabriel-1.0.1.dist-info/RECORD +67 -0
- openai_gabriel-1.0.1.dist-info/WHEEL +5 -0
- openai_gabriel-1.0.1.dist-info/entry_points.txt +2 -0
- openai_gabriel-1.0.1.dist-info/licenses/LICENSE +201 -0
- openai_gabriel-1.0.1.dist-info/licenses/NOTICE +13 -0
- openai_gabriel-1.0.1.dist-info/top_level.txt +1 -0
gabriel/utils/file_utils.py
ADDED

@@ -0,0 +1,464 @@
from __future__ import annotations

import importlib
import os
from typing import Any, Dict, Iterable, List, Optional, Set

import pandas as pd

from .logging import get_logger

logger = get_logger(__name__)

TEXTUAL_MODALITIES = {"text", "entity", "web"}
PATH_MODALITIES = {"image", "audio", "pdf"}
ALL_MODALITIES = TEXTUAL_MODALITIES | PATH_MODALITIES
TABULAR_EXTENSIONS = {".csv", ".tsv", ".xlsx", ".xls", ".parquet", ".pq", ".feather"}
IMAGE_EXTENSIONS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
    ".webp",
    ".svg",
}
PDF_EXTENSIONS = {".pdf"}
TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".rtf",
    ".html",
    ".htm",
    ".xml",
    ".json",
    ".csv",
    ".tsv",
}
AUDIO_EXTENSIONS = {
    ".mp3",
    ".wav",
    ".flac",
    ".m4a",
    ".aac",
    ".ogg",
    ".oga",
    ".opus",
    ".aiff",
    ".aif",
    ".aifc",
    ".wma",
    ".alac",
}
IMAGE_EXTENSION_SUFFIXES = {ext.lstrip(".") for ext in IMAGE_EXTENSIONS}
AUDIO_EXTENSION_SUFFIXES = {ext.lstrip(".") for ext in AUDIO_EXTENSIONS}
PDF_EXTENSION_SUFFIXES = {ext.lstrip(".") for ext in PDF_EXTENSIONS}


def load(
    folder_path: str,
    extensions: Optional[Iterable[str]] = None,
    *,
    tag_dict: Optional[Dict[str, Any]] = None,
    save_name: str = "gabriel_aggregated_content.csv",
    save_dir: Optional[str] = None,
    reset_files: bool = False,
    modality: Optional[str] = None,
) -> pd.DataFrame:
    """Aggregate files from a folder into a single CSV.

    Parameters
    ----------
    folder_path:
        Path to a directory containing media files or to a single file. When a
        CSV/Excel file is provided, it is loaded directly without creating a
        copy.
    extensions:
        Optional iterable of file extensions (without leading dots) to include.
        When ``None`` all files are processed.
    tag_dict:
        Optional mapping of substrings to tag values. The first matching
        substring found in a file name determines the ``tag`` column value.
    save_name:
        Name of the output CSV written inside ``save_dir``. Defaults to
        ``"gabriel_aggregated_content.csv"``.
    save_dir:
        Optional directory for the aggregated CSV. When omitted, the data is
        saved inside ``folder_path`` (or the parent directory if
        ``folder_path`` points to a file).
    reset_files:
        When ``False`` (default), an existing file at ``save_path`` is reused
        instead of being regenerated. Set to ``True`` to overwrite the file.
    modality:
        Optional modality hint. ``"text"``, ``"entity"``, and ``"web"`` are
        treated as text; ``"image"``, ``"audio"``, and ``"pdf"`` collect file paths. When
        ``None`` (default) the modality is inferred from the first matching file.

    Returns
    -------
    DataFrame
        The aggregated contents or file paths of the processed files.
    """

    folder_path = os.path.expanduser(os.path.expandvars(folder_path))
    target_dir = _resolve_save_directory(folder_path, save_dir)
    save_path = os.path.join(target_dir, save_name)

    if os.path.exists(save_path) and not reset_files:
        logger.info("Loading existing aggregated file from %s", save_path)
        df = _read_tabular_file(save_path)
        print(df.head())
        print(f"Loaded existing aggregated file from {save_path}")
        return df

    extset = {e.lower().lstrip(".") for e in extensions} if extensions else None
    modality = _resolve_modality(folder_path, extset, save_name, modality)
    is_textual = _is_textual_modality(modality)

    path_key = "path"
    rows: List[Dict[str, Any]] = []
    max_layers = 0

    warned_pdf = False
    warned_image = False
    warned_audio = False
    warned_doc = False
    has_non_pdf = False
    has_pdf = False

    if os.path.isfile(folder_path):
        ext = os.path.splitext(folder_path)[1].lower()
        if ext == ".doc":
            if not warned_doc:
                print(
                    "[gabriel.load] Ignoring legacy .doc files. Please convert them "
                    "to .docx or PDF before loading."
                )
                warned_doc = True
        if ext == ".pdf":
            has_pdf = True
        if is_textual and ext in TABULAR_EXTENSIONS:
            logger.info(
                "Input path %s is a tabular file; loading it without creating a copy.",
                folder_path,
            )
            df = _read_tabular_file(folder_path)
            print(df.head())
            print(f"Loaded existing file from {folder_path}")
            return df
        name = os.path.basename(folder_path)
        if ext != ".doc":
            warned_pdf, warned_image, warned_audio = _warn_for_media_mismatch(
                ext,
                modality,
                warned_pdf,
                warned_image,
                warned_audio,
                folder_path,
            )
            rows.append(
                _build_row(
                    file_path=folder_path,
                    name=name,
                    layers=(),
                    tag_dict=tag_dict,
                    is_textual=is_textual,
                )
            )
    else:
        for root, _, files in os.walk(folder_path):
            for fname in files:
                if fname == save_name:
                    continue
                ext = os.path.splitext(fname)[1].lower()
                if ext == ".doc":
                    if not warned_doc:
                        print(
                            "[gabriel.load] Ignoring legacy .doc files. Please convert "
                            "them to .docx or PDF before loading."
                        )
                        warned_doc = True
                    continue
                short_ext = ext.lstrip(".")
                if ext == ".pdf":
                    has_pdf = True
                if modality == "pdf" and ext != ".pdf":
                    has_non_pdf = True
                warned_pdf, warned_image, warned_audio = _warn_for_media_mismatch(
                    ext,
                    modality,
                    warned_pdf,
                    warned_image,
                    warned_audio,
                    folder_path,
                )
                if not _should_include_file(short_ext, modality, extset):
                    continue
                file_path = os.path.join(root, fname)
                rel = os.path.relpath(file_path, folder_path)
                parts = rel.split(os.sep)
                name = parts[-1]
                layers = parts[:-1]
                max_layers = max(max_layers, len(layers))
                rows.append(
                    _build_row(
                        file_path=file_path,
                        name=name,
                        layers=layers,
                        tag_dict=tag_dict,
                        is_textual=is_textual,
                    )
                )

    if modality == "pdf" and has_non_pdf:
        print(
            "[gabriel.load] Detected non-PDF files in a PDF run. Only PDFs were "
            "ingested. Set modality='text' (or 'entity'/'web') in gabriel.load if you "
            "need to extract text from PDFs and include non-PDF files."
        )
    if modality == "pdf" and has_pdf:
        print(
            "[gabriel.load] PDF modality attaches PDFs directly (richer layout, figures, and "
            "images). Set modality='text' (or 'entity'/'web') to extract text-only "
            "versions of PDFs."
        )

    df = pd.DataFrame(rows)
    for i in range(1, max_layers + 1):
        col = f"layer_{i}"
        if col not in df.columns:
            df[col] = None

    cols = ["name", path_key] + [f"layer_{i}" for i in range(1, max_layers + 1)]
    if tag_dict:
        cols.append("tag")
    else:
        df.drop(columns=["tag"], inplace=True, errors="ignore")
    if is_textual:
        cols.append("text")
    else:
        df.drop(columns=["text"], inplace=True, errors="ignore")
    if not df.empty:
        df = df[cols]
    df.to_csv(save_path, index=False)
    print(df.head())
    print(f"Saved aggregated file to {save_path}")
    return df


def _build_row(
    *,
    file_path: str,
    name: str,
    layers: Iterable[str],
    tag_dict: Optional[Dict[str, Any]],
    is_textual: bool,
) -> Dict[str, Any]:
    tag = _match_tag(name, tag_dict)
    row: Dict[str, Any] = {
        "name": name,
        "path": file_path,
        "tag": tag,
    }
    if is_textual:
        row["text"] = _extract_text(file_path)
    for i, layer in enumerate(layers, start=1):
        row[f"layer_{i}"] = layer
    return row


def _extract_text(file_path: str) -> str:
    ext = os.path.splitext(file_path)[1].lower()
    if ext in TEXT_EXTENSIONS or not ext:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()
    if ext == ".pdf":
        pypdf = _optional_import("pypdf", "pypdf")
        reader = pypdf.PdfReader(file_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()
    if ext == ".docx":
        docx = _optional_import("docx", "python-docx")
        document = docx.Document(file_path)
        return "\n".join(p.text for p in document.paragraphs).strip()
    if ext == ".doc":
        return ""
    with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
        return fh.read()


def _optional_import(module_name: str, package_name: str):
    if importlib.util.find_spec(module_name) is None:
        raise ImportError(
            f"Missing optional dependency '{package_name}'. Install it to "
            f"extract {module_name} documents."
        )
    return importlib.import_module(module_name)


def _match_tag(name: str, tag_dict: Optional[Dict[str, Any]]) -> Optional[Any]:
    if not tag_dict:
        return None
    lower_name = name.lower()
    for key, val in tag_dict.items():
        if key.lower() in lower_name:
            return val
    return None


def _resolve_modality(
    folder_path: str,
    extset: Optional[Set[str]],
    save_name: str,
    requested_modality: Optional[str],
) -> str:
    if requested_modality:
        normalized = requested_modality.lower()
        if normalized not in ALL_MODALITIES:
            logger.info(
                "Unknown modality '%s'; defaulting to text-style processing.",
                normalized,
            )
        return normalized
    detected = _detect_modality(folder_path, extset, save_name)
    logger.info("Detected %s modality for %s", detected, folder_path)
    return detected


def _detect_modality(
    folder_path: str,
    extset: Optional[Set[str]],
    save_name: str,
) -> str:
    detected: Set[str] = set()
    for file_path in _iter_candidate_files(folder_path, extset, save_name):
        ext = os.path.splitext(file_path)[1].lower()
        if ext in PDF_EXTENSIONS:
            detected.add("pdf")
        elif ext in IMAGE_EXTENSIONS:
            detected.add("image")
        elif ext in AUDIO_EXTENSIONS:
            detected.add("audio")
        else:
            detected.add("text")
        if "text" in detected:
            break
    if not detected:
        return "text"
    if "text" in detected:
        return "text"
    if len(detected) == 1:
        return detected.pop()
    return "text"


def _iter_candidate_files(
    folder_path: str,
    extset: Optional[Set[str]],
    save_name: str,
) -> Iterable[str]:
    if os.path.isfile(folder_path):
        yield folder_path
        return
    for root, _, files in os.walk(folder_path):
        for fname in files:
            if fname == save_name:
                continue
            short_ext = os.path.splitext(fname)[1].lower().lstrip(".")
            if extset and short_ext not in extset:
                continue
            yield os.path.join(root, fname)


def _is_textual_modality(modality: str) -> bool:
    if modality in TEXTUAL_MODALITIES:
        return True
    if modality in PATH_MODALITIES:
        return False
    return True


def _should_include_file(
    short_ext: str,
    modality: str,
    extset: Optional[Set[str]],
) -> bool:
    if extset and short_ext not in extset:
        return False
    if modality in TEXTUAL_MODALITIES:
        if short_ext in IMAGE_EXTENSION_SUFFIXES or short_ext in AUDIO_EXTENSION_SUFFIXES:
            return False
        return True
    if modality == "image":
        return short_ext in IMAGE_EXTENSION_SUFFIXES
    if modality == "audio":
        return short_ext in AUDIO_EXTENSION_SUFFIXES
    if modality == "pdf":
        return short_ext in PDF_EXTENSION_SUFFIXES
    return True


def _warn_for_media_mismatch(
    ext: str,
    modality: str,
    warned_pdf: bool,
    warned_image: bool,
    warned_audio: bool,
    folder_path: str,
) -> tuple[bool, bool, bool]:
    if ext in PDF_EXTENSIONS and modality != "pdf" and not warned_pdf:
        print(
            f"[gabriel.load] Found PDF files in {folder_path} while modality='{modality}'. "
            "PDFs will be extracted into plain text. For best PDF fidelity (layout, "
            "figures, and images), set modality='pdf' here and in the downstream "
            "gabriel call."
        )
        warned_pdf = True
    if ext in IMAGE_EXTENSIONS and modality != "image" and not warned_image:
        print(
            f"[gabriel.load] Found image files in {folder_path}. "
            "Set modality='image' to attach images directly to GPT calls."
        )
        warned_image = True
    if ext in AUDIO_EXTENSIONS and modality != "audio" and not warned_audio:
        print(
            f"[gabriel.load] Found audio files in {folder_path}. "
            "Set modality='audio' to attach audio directly to GPT calls."
        )
        warned_audio = True
    return warned_pdf, warned_image, warned_audio


def _resolve_save_directory(folder_path: str, save_dir: Optional[str]) -> str:
    if save_dir:
        resolved = os.path.expanduser(os.path.expandvars(save_dir))
    else:
        if os.path.isdir(folder_path):
            resolved = folder_path
        else:
            parent = os.path.dirname(folder_path)
            if not parent:
                parent = os.path.dirname(os.path.abspath(folder_path))
            resolved = parent
    if not resolved:
        resolved = os.getcwd()
    if os.path.isfile(resolved):
        raise ValueError(f"save_dir must be a directory path, got file {resolved}")
    os.makedirs(resolved, exist_ok=True)
    return resolved


def _read_tabular_file(path: str) -> pd.DataFrame:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".csv":
        return pd.read_csv(path)
    if ext == ".tsv":
        return pd.read_csv(path, sep="\t")
    if ext in {".xlsx", ".xls"}:
        return pd.read_excel(path)
    if ext in {".parquet", ".pq"}:
        return pd.read_parquet(path)
    if ext == ".feather":
        return pd.read_feather(path)
    return pd.read_csv(path)
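A minimal usage sketch for the load helper above, assuming it is re-exported at the package root (the "[gabriel.load]" messages in the source suggest this); the folder path and tag mapping are illustrative only:

    import gabriel  # if load is not re-exported, use: from gabriel.utils.file_utils import load

    df = gabriel.load(
        "reports/",                    # hypothetical folder of .txt/.pdf/.docx files
        tag_dict={"draft": "draft"},   # file names containing "draft" get tag value "draft"
        modality="text",               # extract text rather than collecting file paths
        reset_files=True,              # regenerate the aggregated CSV even if it already exists
    )
    # resulting columns: name, path, layer_1..layer_N (subfolders), tag, text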
gabriel/utils/image_utils.py
ADDED

@@ -0,0 +1,22 @@
import base64
from typing import Optional


def encode_image(image_path: str) -> Optional[str]:
    """Return the contents of ``image_path`` as a base64 string.

    Parameters
    ----------
    image_path: str
        Path to the image file to encode.

    Returns
    -------
    str or None
        The base64-encoded contents of the file, or ``None`` if reading fails.
    """
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
    except Exception:
        return None
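A short usage sketch for encode_image; the image path is illustrative:

    from gabriel.utils.image_utils import encode_image

    encoded = encode_image("charts/revenue.png")  # hypothetical path
    if encoded is None:
        print("image could not be read")          # encode_image swallows errors and returns None
    else:
        print(f"{len(encoded)} base64 characters")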
gabriel/utils/jinja.py
ADDED
@@ -0,0 +1,31 @@
import os
import random
from collections import OrderedDict
import json
from jinja2 import Environment, FileSystemLoader


def shuffled(it, seed=None):
    """Return a new list with the same elements, shuffled."""
    seq = list(it)
    rnd = random.Random(seed) if seed is not None else random
    rnd.shuffle(seq)
    return seq


def shuffled_dict(d, seed=None):
    """Return a JSON-formatted dict string with items shuffled."""
    items = list(d.items())
    rnd = random.Random(seed) if seed is not None else random
    rnd.shuffle(items)
    ordered = OrderedDict(items)
    return json.dumps(ordered, ensure_ascii=False, indent=2)


def get_env():
    """Return a Jinja2 environment with shuffle filters preloaded."""
    templates_dir = os.path.join(os.path.dirname(__file__), "..", "prompts")
    env = Environment(loader=FileSystemLoader(os.path.abspath(templates_dir)))
    env.filters["shuffled"] = shuffled
    env.filters["shuffled_dict"] = shuffled_dict
    return env
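A sketch of rendering one of the bundled prompt templates with the environment returned by get_env; the variables passed to render are assumptions, since each template defines its own inputs:

    from gabriel.utils.jinja import get_env

    env = get_env()  # FileSystemLoader rooted at gabriel/prompts, with shuffle filters registered
    template = env.get_template("paraphrase_prompt.jinja2")
    # the keyword below is illustrative; inspect the template for its real inputs
    prompt = template.render(text="An example passage to paraphrase.")
    print(prompt)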
gabriel/utils/logging.py
ADDED
@@ -0,0 +1,86 @@
"""Simple logging helpers with configurable verbosity.

This module centralises logging configuration for the project. Users can
control verbosity either programmatically through :func:`set_log_level` or via
the ``GABRIEL_LOG_LEVEL`` environment variable. Levels mirror typical logging
conventions and add a "silent" option which suppresses all log output.
"""

from __future__ import annotations

import logging
import os
from typing import Union

# ---------------------------------------------------------------------------
# Verbosity handling
# ---------------------------------------------------------------------------

LOG_LEVELS = {
    "silent": logging.CRITICAL + 1,
    "error": logging.ERROR,
    "warning": logging.WARNING,
    "info": logging.INFO,
    "debug": logging.DEBUG,
}


def _parse_level(level: Union[str, int, None]) -> int:
    """Translate a human friendly level to ``logging`` constants."""

    if isinstance(level, str):
        return LOG_LEVELS.get(level.lower(), logging.INFO)
    if isinstance(level, int):
        return level
    return logging.INFO


CURRENT_LEVEL = _parse_level(os.getenv("GABRIEL_LOG_LEVEL", "warning"))


def set_log_level(level: Union[str, int]) -> None:
    """Set the global logging level for all GABRIEL loggers."""

    global CURRENT_LEVEL
    CURRENT_LEVEL = _parse_level(level)
    root = logging.getLogger()
    root.setLevel(CURRENT_LEVEL)
    if not root.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        handler.setFormatter(formatter)
        root.addHandler(handler)
    for handler in root.handlers:
        handler.setLevel(CURRENT_LEVEL)
    # Update existing loggers to the new level
    for logger in logging.getLogger().manager.loggerDict.values():
        if isinstance(logger, logging.Logger):
            logger.setLevel(CURRENT_LEVEL)
            for h in logger.handlers:
                h.setLevel(CURRENT_LEVEL)


def get_logger(name: str) -> logging.Logger:
    """Return a module logger configured with the global level."""

    logger = logging.getLogger(name)
    logger.setLevel(CURRENT_LEVEL)
    return logger


def announce_prompt_rendering(task: str, count: int) -> None:
    """Emit a lightweight notice when prompt rendering begins.

    Direct ``print`` is intentional here so users see activity immediately in
    notebooks/terminals without needing logging configuration.
    """

    if count <= 0:
        return
    print(f"[{task}] Rendering {count} prompts…", flush=True)


# Configure root logger on import according to ``GABRIEL_LOG_LEVEL``.
set_log_level(CURRENT_LEVEL)
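A usage sketch for the logging helpers; the same behaviour is available by exporting GABRIEL_LOG_LEVEL before import:

    from gabriel.utils.logging import get_logger, set_log_level

    set_log_level("debug")        # any of: silent, error, warning, info, debug
    log = get_logger(__name__)
    log.debug("visible at debug verbosity")
    set_log_level("silent")       # CRITICAL + 1 suppresses all records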