eval-ai-library 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic. Click here for more details.
- eval_ai_library-0.1.0.dist-info/METADATA +753 -0
- eval_ai_library-0.1.0.dist-info/RECORD +34 -0
- eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
- eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
- eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
- eval_lib/__init__.py +122 -0
- eval_lib/agent_metrics/__init__.py +12 -0
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
- eval_lib/datagenerator/datagenerator.py +230 -0
- eval_lib/datagenerator/document_loader.py +510 -0
- eval_lib/datagenerator/prompts.py +192 -0
- eval_lib/evaluate.py +335 -0
- eval_lib/evaluation_schema.py +63 -0
- eval_lib/llm_client.py +286 -0
- eval_lib/metric_pattern.py +229 -0
- eval_lib/metrics/__init__.py +25 -0
- eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
- eval_lib/metrics/bias_metric/bias.py +114 -0
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
- eval_lib/metrics/custom_metric/custom_eval.py +303 -0
- eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
- eval_lib/metrics/geval/geval.py +326 -0
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
- eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
- eval_lib/price.py +37 -0
- eval_lib/py.typed +1 -0
- eval_lib/testcases_schema.py +27 -0
- eval_lib/utils.py +99 -0
|
@@ -0,0 +1,510 @@
|
|
|
1
|
+
# document_loader.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from langchain_core.documents import Document
|
|
7
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
8
|
+
|
|
9
|
+
# LangChain loaders (keep the existing ones)
|
|
10
|
+
from langchain_community.document_loaders import PyPDFLoader
|
|
11
|
+
from langchain_community.document_loaders import Docx2txtLoader
|
|
12
|
+
from langchain_community.document_loaders import TextLoader
|
|
13
|
+
|
|
14
|
+
import html2text
|
|
15
|
+
import markdown
|
|
16
|
+
|
|
17
|
+
import io
|
|
18
|
+
import json
|
|
19
|
+
import zipfile
|
|
20
|
+
|
|
21
|
+
# ---------------------------
|
|
22
|
+
# Helper functions
|
|
23
|
+
# ---------------------------
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _read_text(p: Path) -> str:
    """Return the contents of ``p`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``) instead
    of raising, so the call only fails when the file itself cannot be opened.
    """
    with p.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _read_bytes(p: Path) -> bytes:
    """Return the raw, undecoded contents of the file at ``p``."""
    with p.open("rb") as handle:
        return handle.read()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _csv_tsv_to_text(p: Path) -> str:
    """Normalise a CSV/TSV file into comma-separated text.

    The file is parsed with pandas (every column read as ``str``, missing
    cells replaced by empty strings) and re-serialised as CSV without the
    index column. A ``.csv`` suffix selects a comma delimiter; any other
    suffix is read as tab-separated.

    If anything in the pandas path fails (including pandas being
    unavailable), the raw file text is returned instead; if that also fails,
    an empty string is returned. The function never raises.
    """
    try:
        import pandas as pd

        delimiter = "\t" if p.suffix.lower() != ".csv" else ","
        frame = pd.read_csv(
            str(p),
            sep=delimiter,
            dtype=str,
            encoding="utf-8",
            engine="python",
        ).fillna("")
        # With no target buffer, to_csv returns the rendered text directly.
        return frame.to_csv(index=False)
    except Exception:
        # Best-effort fallback: hand back the unparsed file contents.
        try:
            raw = _read_text(p)
        except Exception:
            raw = ""
        return raw
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _xlsx_to_text(p: Path) -> str:
    """Render an XLSX workbook as CSV text.

    The workbook is read through pandas with the ``openpyxl`` engine, every
    cell as ``str`` and missing cells as empty strings. No sheet is named, so
    pandas' default sheet selection applies. Returns ``""`` on any failure
    (missing dependency, unreadable file, ...).
    """
    try:
        import pandas as pd

        frame = pd.read_excel(str(p), engine="openpyxl", dtype=str).fillna("")
        # With no target buffer, to_csv returns the rendered text directly.
        return frame.to_csv(index=False)
    except Exception:
        return ""
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _pptx_to_text(p: Path) -> str:
    """Collect the text of every shape on every slide of a PPTX file.

    Shapes without a ``text`` attribute, or with empty text, are skipped.
    The remaining texts are joined with newlines in slide/shape order.
    Returns ``""`` on any failure (e.g. python-pptx missing, unreadable file).
    """
    try:
        from pptx import Presentation

        deck = Presentation(str(p))
        found = []
        for slide in deck.slides:
            for shape in slide.shapes:
                shape_text = getattr(shape, "text", None)
                if shape_text:
                    found.append(shape_text)
        return "\n".join(found)
    except Exception:
        return ""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _json_to_text(p: Path) -> str:
    """Flatten a JSON file into ``dotted.path: value`` lines.

    Dict keys and list indices are chained with dots to form the path of each
    leaf value; one line is produced per leaf. If the file cannot be parsed
    or flattened, the raw file text is returned instead.
    """
    try:
        parsed = json.loads(_read_text(p))

        def emit(node, path=""):
            # Containers contribute only through their children; leaves
            # become a single "path: value" line.
            if isinstance(node, dict):
                children = node.items()
            elif isinstance(node, list):
                children = enumerate(node)
            else:
                # ``path`` always ends with a dot; drop it for display.
                return [f"{path[:-1]}: {node}"]
            collected = []
            for key, value in children:
                collected.extend(emit(value, f"{path}{key}."))
            return collected

        return "\n".join(emit(parsed))
    except Exception:
        return _read_text(p)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _yaml_to_text(p: Path) -> str:
    """Render a YAML file as pretty-printed JSON text.

    The file is parsed with ``yaml.safe_load`` and dumped as JSON with a
    two-space indent, keeping non-ASCII characters as they are. If PyYAML is
    missing, or parsing or serialisation fails, the raw file text is returned.
    """
    try:
        import yaml

        parsed = yaml.safe_load(_read_text(p))
        rendered = json.dumps(parsed, indent=2, ensure_ascii=False)
    except Exception:
        rendered = _read_text(p)
    return rendered
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _xml_to_text(p: Path) -> str:
    """Flatten an XML file into ``dotted.tag.path: text`` lines.

    Each element with non-blank text yields one line whose key is the chain
    of ancestor tags joined by dots. Text that follows a child element inside
    its parent (the child's ``tail``) is emitted under the parent's key, so
    mixed content such as ``<p>Hello <b>big</b> world</p>`` keeps ``world``
    instead of silently dropping it.

    If parsing or walking fails, the raw file text is returned instead.
    """
    # NOTE(review): xml.etree is not hardened against maliciously crafted
    # XML; consider a hardened parser if inputs can be untrusted.
    try:
        from xml.etree import ElementTree as ET

        root = ET.parse(str(p)).getroot()
        lines = []

        def walk(node, prefix=""):
            tag = node.tag
            text = (node.text or "").strip()
            if text:
                lines.append(f"{prefix}{tag}: {text}")
            for child in node:
                walk(child, f"{prefix}{tag}.")
                # A child's tail is text belonging to *this* node that
                # appears after the child's closing tag.
                tail = (child.tail or "").strip()
                if tail:
                    lines.append(f"{prefix}{tag}: {tail}")

        walk(root)
        return "\n".join(lines)
    except Exception:
        return _read_text(p)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _rtf_to_text(p: Path) -> str:
    """Convert an RTF file to plain text using ``striprtf``.

    Returns ``""`` on any failure (e.g. striprtf missing, unreadable file).
    """
    try:
        from striprtf.striprtf import rtf_to_text

        raw_rtf = _read_text(p)
        return rtf_to_text(raw_rtf)
    except Exception:
        return ""
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _odt_to_text(p: Path) -> str:
    """Extract the text of an OpenDocument Text (.odt) file.

    The archive member ``content.xml`` is parsed and every non-blank text
    fragment is collected in document order, one fragment per line. Both the
    text inside an element and the text that follows an inline child element
    (its ``tail``) are included, so a paragraph like
    ``<p>Hello <span>bold</span> world</p>`` keeps the trailing ``world``.

    Returns ``""`` on any failure (not a zip, no ``content.xml``, bad XML).
    """
    try:
        from xml.etree import ElementTree as ET

        with zipfile.ZipFile(str(p)) as archive:
            with archive.open("content.xml") as stream:
                root = ET.parse(stream).getroot()

        # itertext() walks text and tail strings in document order.
        fragments = (piece.strip() for piece in root.itertext())
        return "\n".join(piece for piece in fragments if piece)
    except Exception:
        return ""
|
|
148
|
+
|
|
149
|
+
# ---------------------------
|
|
150
|
+
# PDF: LangChain -> pypdf -> PyMuPDF -> OCR(PyMuPDF+pytesseract)
|
|
151
|
+
# ---------------------------
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _pdf_text_pypdf(p: Path) -> str:
    """Extract the text layer of a PDF with ``pypdf``.

    Pages whose extracted text is blank are skipped; the rest are joined with
    newlines. Returns ``""`` on any failure (pypdf missing, unreadable file).
    """
    try:
        from pypdf import PdfReader  # specifically pypdf

        page_texts = (
            page.extract_text() or "" for page in PdfReader(str(p)).pages
        )
        return "\n".join(t for t in page_texts if t.strip())
    except Exception:
        return ""
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _pdf_text_pymupdf(p: Path) -> str:
    """Extract the text layer of a PDF with PyMuPDF (``fitz``).

    Pages whose extracted text is blank are skipped; the rest are joined with
    newlines. Returns ``""`` on any failure (PyMuPDF missing, unreadable file).
    """
    try:
        import fitz  # PyMuPDF

        with fitz.open(str(p)) as pdf:
            page_texts = [page.get_text("text") or "" for page in pdf]
        return "\n".join(t for t in page_texts if t.strip())
    except Exception:
        return ""
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _pdf_ocr_via_pymupdf(p: Path) -> str:
    """OCR a PDF by rendering each page with PyMuPDF and reading it with pytesseract.

    Each page is rendered at 2x zoom without an alpha channel, encoded as
    PNG, opened with Pillow and passed to ``pytesseract.image_to_string``.
    Pages that yield only blank text are skipped; the rest are joined with
    newlines. Works only when PyMuPDF, Pillow and pytesseract (with the
    tesseract binary) are available; returns ``""`` on any failure.
    """
    try:
        import fitz  # PyMuPDF
        import pytesseract
        from PIL import Image
        from io import BytesIO

        scale = 2.0
        render_matrix = fitz.Matrix(scale, scale)
        recognised = []
        with fitz.open(str(p)) as pdf:
            for page in pdf:
                png = page.get_pixmap(matrix=render_matrix, alpha=False).tobytes("png")
                page_text = pytesseract.image_to_string(Image.open(BytesIO(png))) or ""
                if page_text.strip():
                    recognised.append(page_text)
        return "\n".join(recognised)
    except Exception:
        return ""
|
|
203
|
+
|
|
204
|
+
# ---------------------------
|
|
205
|
+
# Images (OCR)
|
|
206
|
+
# ---------------------------
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _ocr_image_bytes(img_bytes: bytes) -> str:
    """Run pytesseract OCR over an encoded image held in memory.

    The bytes are opened with Pillow and passed to
    ``pytesseract.image_to_string``. Returns ``""`` when OCR produces nothing
    or on any failure (missing dependency, undecodable image).
    """
    try:
        import pytesseract
        from PIL import Image
        from io import BytesIO

        picture = Image.open(BytesIO(img_bytes))
        recognised = pytesseract.image_to_string(picture)
        return recognised if recognised else ""
    except Exception:
        return ""
|
|
218
|
+
|
|
219
|
+
# ---------------------------
|
|
220
|
+
# Docx
|
|
221
|
+
# ---------------------------
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _docx_to_text_python_docx(p: Path) -> str:
    """Extract DOCX text with ``python-docx``.

    Non-empty paragraphs come first, followed by table rows. Each table row
    becomes one line with its cell texts separated by tabs; rows whose cells
    are all blank are skipped. Returns ``""`` on any failure.
    """
    try:
        import docx  # python-docx

        document = docx.Document(str(p))
        pieces = [para.text for para in document.paragraphs if para.text]
        # Also capture the text held in tables.
        for table in document.tables:
            for row in table.rows:
                row_cells = [cell.text for cell in row.cells]
                if any(value.strip() for value in row_cells):
                    pieces.append("\t".join(row_cells))
        return "\n".join(pieces)
    except Exception:
        return ""
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _docx_to_text_mammoth(p: Path) -> str:
    """Extract raw DOCX text with ``mammoth``.

    Returns the stripped ``value`` of ``mammoth.extract_raw_text``, or ``""``
    when it is empty or on any failure (mammoth missing, unreadable file).
    """
    try:
        import mammoth

        with p.open("rb") as stream:
            extracted = mammoth.extract_raw_text(stream).value
        return (extracted or "").strip()
    except Exception:
        return ""
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _docx_to_text_zipxml(p: Path) -> str:
    """Dependency-free DOCX fallback: read the XML parts and collect every ``w:t``.

    ``word/document.xml`` is processed first (when present), then every
    ``word/header*.xml`` and ``word/footer*.xml`` member in archive order,
    because headers and footers may also contain text. Each non-blank ``t``
    element contributes its stripped text as one line. Returns ``""`` on any
    failure.
    """
    try:
        import zipfile
        from xml.etree import ElementTree as ET

        def run_texts(archive, member):
            """Return the stripped text of every non-blank ``t`` element in ``member``."""
            with archive.open(member) as stream:
                root = ET.parse(stream).getroot()
            found = []
            for node in root.iter():
                local_name = node.tag.rsplit("}", 1)[-1]  # strip the namespace
                content = node.text
                if local_name == "t" and content and content.strip():
                    found.append(content.strip())
            return found

        collected = []
        with zipfile.ZipFile(str(p)) as archive:
            members = archive.namelist()
            # Main document body.
            if "word/document.xml" in members:
                collected.extend(run_texts(archive, "word/document.xml"))
            # Headers and footers.
            for member in members:
                if member.endswith(".xml") and member.startswith(
                    ("word/header", "word/footer")
                ):
                    collected.extend(run_texts(archive, member))
        return "\n".join(collected)
    except Exception:
        return ""
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _doc_to_text_textract(p: Path) -> str:
    """Extract text from a legacy ``.doc`` file via ``textract``.

    Works only when textract and its system binaries (antiword/catdoc) are
    installed. The output is decoded as UTF-8 with undecodable bytes dropped.
    Returns ``""`` on any failure.
    """
    try:
        import textract

        raw_output = textract.process(str(p))
        return raw_output.decode("utf-8", errors="ignore")
    except Exception:
        return ""
|
|
296
|
+
|
|
297
|
+
# ---------------------------
|
|
298
|
+
# General functions (extended)
|
|
299
|
+
# ---------------------------
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def load_documents(file_paths: List[str]) -> List[Document]:
    """Load every supported file in ``file_paths`` into ``Document`` objects.

    The loader is chosen from the lower-cased file suffix:

    * ``.pdf``  - LangChain ``PyPDFLoader``; if that fails or yields only
      blank pages, fall back to pypdf, then PyMuPDF, then OCR.
    * ``.docx`` - LangChain ``Docx2txtLoader`` (blank documents discarded);
      otherwise python-docx, then mammoth, then the zip+xml fallback.
    * ``.doc``  - textract.
    * ``.txt``  - LangChain ``TextLoader`` (UTF-8).
    * ``.html``/``.htm``/``.md`` - converted to text with ``html2text``
      (Markdown is rendered to HTML first).
    * tabular/markup formats (csv, tsv, xlsx, pptx, json, yaml, yml, xml,
      rtf, odt) - the matching ``_*_to_text`` helper.
    * images (png, jpg, jpeg, tif, tiff, bmp) - OCR.

    Files that produce no text, have an unsupported suffix, or raise while
    being read are reported with ``print`` and skipped; they never abort the
    whole run. Documents built here carry ``source`` and ``filetype``
    metadata.
    """
    # suffix -> (extractor, "filetype" metadata value, label used in the warning)
    text_extractors = {
        ".csv": (_csv_tsv_to_text, "csv", "CSV/TSV"),
        ".tsv": (_csv_tsv_to_text, "tsv", "CSV/TSV"),
        ".xlsx": (_xlsx_to_text, "xlsx", "XLSX"),
        ".pptx": (_pptx_to_text, "pptx", "PPTX"),
        ".json": (_json_to_text, "json", "JSON"),
        ".yaml": (_yaml_to_text, "yaml", "YAML"),
        ".yml": (_yaml_to_text, "yaml", "YAML"),
        ".xml": (_xml_to_text, "xml", "XML"),
        ".rtf": (_rtf_to_text, "rtf", "RTF"),
        ".odt": (_odt_to_text, "odt", "ODT"),
    }
    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp")

    documents: List[Document] = []

    def build(text, source, filetype):
        """Wrap ``text`` in a Document tagged with its source path and file type."""
        return Document(
            page_content=text,
            metadata={"source": str(source), "filetype": filetype},
        )

    def keep_or_warn(text, source, filetype, warning):
        """Store ``text`` as a Document when it is not blank, else print ``warning``."""
        if text.strip():
            documents.append(build(text, source, filetype))
        else:
            print(warning)

    def first_with_text(source, extractors):
        """Try ``extractors`` in order and stop at the first non-blank result."""
        text = ""
        for extract in extractors:
            text = extract(source)
            if text.strip():
                break
        return text

    for raw_path in file_paths:
        path = Path(raw_path)
        ext = path.suffix.lower()

        try:
            # ---- PDF ----
            if ext == ".pdf":
                # 1) LangChain PyPDFLoader
                langchain_ok = False
                try:
                    pages = PyPDFLoader(str(path)).load()
                    if pages and any((pg.page_content or "").strip() for pg in pages):
                        documents.extend(pages)
                        langchain_ok = True
                except Exception:
                    langchain_ok = False

                if not langchain_ok:
                    # 2) pypdf -> 3) PyMuPDF -> 4) OCR via PyMuPDF
                    text = first_with_text(
                        path,
                        (_pdf_text_pypdf, _pdf_text_pymupdf, _pdf_ocr_via_pymupdf),
                    )
                    keep_or_warn(
                        text, path, "pdf",
                        f"⚠️ PDF has no extractable text (maybe scanned): {path.name}",
                    )

            # ---- DOCX ----
            elif ext == ".docx":
                # 1) Try the standard Docx2txtLoader first
                langchain_ok = False
                try:
                    loaded = Docx2txtLoader(str(path)).load()
                    # Docx2txt sometimes returns a Document with empty page_content
                    non_blank = [d for d in loaded if (d.page_content or "").strip()]
                    if non_blank:
                        documents.extend(non_blank)
                        langchain_ok = True
                except Exception:
                    langchain_ok = False

                if not langchain_ok:
                    # 2) python-docx -> 3) mammoth -> 4) zip+xml fallback
                    text = first_with_text(
                        path,
                        (
                            _docx_to_text_python_docx,
                            _docx_to_text_mammoth,
                            _docx_to_text_zipxml,
                        ),
                    )
                    keep_or_warn(
                        text, path, "docx",
                        f"⚠️ DOCX produced no text: {path.name}",
                    )

            # ---- DOC (legacy format) ----
            elif ext == ".doc":
                keep_or_warn(
                    _doc_to_text_textract(path), path, "doc",
                    f"⚠️ .DOC not extractable (install textract/antiword?): {path.name}",
                )

            # ---- TXT ----
            elif ext == ".txt":
                documents.extend(TextLoader(str(path), encoding="utf-8").load())

            # ---- HTML ----
            elif ext in (".html", ".htm"):
                page_html = _read_text(path)
                documents.append(build(html2text.html2text(page_html), path, "html"))

            # ---- Markdown ----
            elif ext == ".md":
                rendered_html = markdown.markdown(_read_text(path))
                documents.append(build(html2text.html2text(rendered_html), path, "md"))

            # ---- CSV / TSV / XLSX / PPTX / JSON / YAML / XML / RTF / ODT ----
            elif ext in text_extractors:
                extract, filetype, label = text_extractors[ext]
                keep_or_warn(
                    extract(path), path, filetype,
                    f"⚠️ Empty {label}: {path.name}",
                )

            # ---- Images (OCR) ----
            elif ext in image_suffixes:
                keep_or_warn(
                    _ocr_image_bytes(_read_bytes(path)), path, "image",
                    f"⚠️ Image has no OCR text: {path.name}",
                )

            else:
                print(f"⚠️ Unsupported format: {path.name} — skipped")

        except Exception as exc:
            print(f"❌ Error reading {path.name}: {exc}")

    return documents
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def chunk_documents(
    docs: List[Document],
    chunk_size: int = 1024,
    chunk_overlap: int = 100,
) -> List[Document]:
    """Split each document into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with the separators
    ``"\\n\\n"``, ``"\\n"``, ``" "`` and ``""``. Every chunk becomes a new
    ``Document`` carrying a copy of its parent's metadata plus a
    ``chunk_index`` giving its position within that parent.

    Args:
        docs: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The chunks of all documents, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )

    return [
        Document(
            page_content=piece,
            # Copy the parent metadata so chunks never share one dict.
            metadata=dict(parent.metadata, chunk_index=position),
        )
        for parent in docs
        for position, piece in enumerate(splitter.split_text(parent.page_content))
    ]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
def get_question_style_guidance(question_openness: str, question_length: str) -> str:
    """Build the bullet-point guidance for the requested question style.

    Args:
        question_openness: ``"open"`` or ``"closed"``; any other value is
            treated as mixed.
        question_length: ``"short"`` or ``"long"``; any other value is
            treated as mixed.

    Returns:
        The openness bullets followed by the length bullets, one per line.
    """
    if question_openness == "open":
        openness_bullets = (
            "- Favor open-ended questions that allow for detailed, explanatory responses",
            "- Include 'how', 'why', 'explain', 'describe' type questions",
        )
    elif question_openness == "closed":
        openness_bullets = (
            "- Focus on specific, factual questions with definitive answers",
            "- Include yes/no questions, specific data requests, and factual lookups",
        )
    else:  # mixed
        openness_bullets = (
            "- Mix both open-ended and closed questions",
            "- Balance exploratory questions with specific factual queries",
        )

    if question_length == "short":
        length_bullets = ("- Keep inputs concise and direct (1-2 sentences)",)
    elif question_length == "long":
        length_bullets = (
            "- Include detailed context and background in inputs (3+ sentences)",
            "- Provide scenarios with multiple parts or complex requirements",
        )
    else:  # mixed
        length_bullets = (
            "- Vary input length from brief queries to detailed scenarios",
        )

    return "\n".join(openness_bullets + length_bullets)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_trap_guidance(trap_density: float) -> str:
    """Build the prompt section describing trap questions.

    Args:
        trap_density: Fraction of test cases that should be traps
            (e.g. ``0.25`` for 25%). Zero or a negative value means no traps.

    Returns:
        A "no trap questions" notice when the density is not positive,
        otherwise instructions headed by the density as a whole percentage.
    """
    if trap_density <= 0:
        return "**No Trap Questions:** All questions should be answerable using the provided information."

    # round(), not int(): 0.29 * 100 is 28.999999999999996 in binary floating
    # point, which truncation would report as 28%.
    trap_percentage = round(trap_density * 100)

    return f"""**Trap Questions ({trap_percentage}% of total):**
Create realistic scenarios where:
- The user asks about information NOT present in the reference material
- Questions contain subtle factual errors or misconceptions
- Requests involve information that would require knowledge beyond the provided context
- Make traps subtle and realistic - they should feel like genuine user mistakes or knowledge gaps
- The agent should be able to politely indicate the limitation or correct the misconception"""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def dataset_generation_prompt(
    chunk: str,
    rows_per_chunk: int,
    agent_description: str,
    input_format: str,
    expected_output_format: str,
    test_types: list[str],
    question_length: str,
    question_openness: str,
    trap_density: float,
    language: str
) -> str:
    """Build the LLM prompt that generates test cases from one reference chunk.

    The prompt is assembled from sections separated by blank lines: agent
    context, input/output formats, the reference ``chunk`` wrapped in triple
    quotes, the task description, test types, question-style guidance,
    quality requirements, trap guidance, the required JSON output format and
    the target language.

    Returns:
        The complete prompt text.
    """
    style_notes = get_question_style_guidance(question_openness, question_length)
    trap_notes = get_trap_guidance(trap_density)
    available_types = ", ".join(test_types)

    sections = [
        "You are an expert test case designer creating realistic evaluation scenarios for an AI agent.",
        f"**Agent Context:**\n{agent_description}",
        (
            f"**Input Format:** {input_format}\n"
            f"**Expected Output Format:** {expected_output_format}"
        ),
        f"**Reference Information:**\n\"\"\"{chunk}\"\"\"",
        (
            "**Your Task:**\n"
            f"Create {rows_per_chunk} realistic test cases that would naturally occur when evaluating this agent. Each test case should:"
        ),
        (
            "1. **Reflect Real User Interactions:** Generate inputs that actual users would provide in real scenarios\n"
            "2. **Be Contextually Relevant:** Use the reference information naturally, as if a user discovered it and wants to interact with the agent about it\n"
            "3. **Vary in Complexity:** Include both straightforward and nuanced scenarios\n"
            "4. **Test Different Capabilities:** Cover various aspects of the agent's expected functionality"
        ),
        f"**Test Types Available:** {available_types}",
        (
            "**Question Characteristics:**\n"
            f"- **Length:** {question_length}\n"
            f"- **Style:** {question_openness}\n"
            f"{style_notes}"
        ),
        (
            "**Quality Requirements:**\n"
            "- Use natural, conversational language that real users would employ\n"
            "- Avoid academic or artificial phrasing\n"
            "- Include context and background when users would naturally provide it\n"
            "- Make questions specific enough to have clear, verifiable answers\n"
            "- Ensure variety in question types and complexity levels"
        ),
        f"{trap_notes}",
        (
            "**Output Format:**\n"
            f"Generate exactly {rows_per_chunk} JSON objects in an array. Each object must have:\n"
            "- \"input\": The user's natural input/question\n"
            "- \"expected_output\": The ideal agent response\n"
            f"- \"test_type\": One of {test_types}\n"
            "- \"is_trap\": boolean indicating if this is a trap question"
        ),
        f"**Language:** {language}",
        "Return ONLY the JSON array, no additional text or formatting.",
    ]

    # Every section is separated from the next by exactly one blank line.
    return "\n\n".join(sections)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def dataset_generation_from_scratch_prompt(
    max_rows: int,
    agent_description: str,
    input_format: str,
    expected_output_format: str,
    test_types: list[str],
    question_length: str,
    question_openness: str,
    trap_density: float,
    language: str
) -> str:
    """Build the LLM prompt that generates test cases without reference material.

    The prompt is assembled from sections separated by blank lines: agent
    context, input/output formats, the task description, test design
    principles, test types, question-style guidance, quality requirements,
    trap guidance, diversity guidelines, the required JSON output format and
    the target language.

    Returns:
        The complete prompt text.
    """
    style_notes = get_question_style_guidance(question_openness, question_length)
    trap_notes = get_trap_guidance(trap_density)
    available_types = ", ".join(test_types)

    sections = [
        "You are an expert test case designer creating comprehensive evaluation scenarios for an AI agent.",
        f"**Agent Context:**\n{agent_description}",
        (
            f"**Input Format:** {input_format}\n"
            f"**Expected Output Format:** {expected_output_format}"
        ),
        (
            "**Your Task:**\n"
            f"Create {max_rows} diverse, realistic test cases that thoroughly evaluate this agent's capabilities. Design scenarios that would naturally occur in real-world usage."
        ),
        "**Test Design Principles:**",
        (
            "1. **Real-World Scenarios:** Create inputs that actual users would provide in genuine situations\n"
            "2. **Comprehensive Coverage:** Test different aspects of the agent's functionality and knowledge domains\n"
            "3. **Varied Complexity:** Include simple queries, moderate challenges, and complex multi-step scenarios\n"
            "4. **Edge Cases:** Include boundary conditions and unusual but valid requests\n"
            "5. **Common Use Cases:** Focus on frequent user interaction patterns"
        ),
        f"**Test Types Available:** {available_types}",
        (
            "**Question Characteristics:**\n"
            f"- **Length:** {question_length}\n"
            f"- **Style:** {question_openness}\n"
            f"{style_notes}"
        ),
        (
            "**Quality Requirements:**\n"
            "- Use natural, conversational language that real users would employ\n"
            "- Avoid repetitive patterns or artificial academic phrasing\n"
            "- Include appropriate context when users would naturally provide it\n"
            "- Create specific, testable scenarios with clear success criteria\n"
            "- Ensure broad coverage of the agent's expected capabilities\n"
            "- Make each test case unique and valuable for evaluation"
        ),
        f"{trap_notes}",
        (
            "**Diversity Guidelines:**\n"
            "- Vary question topics and domains relevant to the agent\n"
            "- Include different user personas and interaction styles\n"
            "- Test both common workflows and edge cases\n"
            "- Balance straightforward requests with more complex scenarios"
        ),
        (
            "**Output Format:**\n"
            f"Generate exactly {max_rows} JSON objects in an array. Each object must have:\n"
            "- \"input\": The user's natural input/question\n"
            "- \"expected_output\": The ideal agent response\n"
            f"- \"test_type\": One of {test_types}\n"
            "- \"is_trap\": boolean indicating if this is a trap question"
        ),
        f"**Language:** {language}",
        "Return ONLY the JSON array, no additional text or formatting.",
    ]

    # Every section is separated from the next by exactly one blank line.
    return "\n\n".join(sections)
|