dingo-python 2.2.2__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dingo/config/input_args.py +11 -1
- dingo/exec/local.py +2 -1
- dingo/io/output/__init__.py +1 -0
- dingo/io/output/result_info.py +16 -0
- dingo/model/llm/compare/llm_html_extract_compare.py +17 -2
- dingo/model/llm/compare/llm_html_extract_compare_v2.py +1 -1
- dingo/model/llm/compare/llm_html_extract_compare_v3.py +221 -0
- dingo/model/llm/hhh/llm_text_3h.py +1 -1
- dingo/model/llm/llm_classify_qr.py +4 -2
- dingo/model/llm/llm_custom_metric.py +211 -0
- dingo/model/llm/llm_document_parsing_ocr.py +6 -2
- dingo/model/llm/llm_factcheck_public.py +1 -1
- dingo/model/llm/llm_keyword_matcher.py +1 -1
- dingo/model/llm/llm_scout.py +1 -1
- dingo/model/llm/mineru/vlm_document_parsing.py +4 -8
- dingo/model/llm/mineru/vlm_document_parsing_ocr_train.py +4 -8
- dingo/model/llm/rag/llm_rag_answer_relevancy.py +1 -1
- dingo/model/llm/rag/llm_rag_chunk_quality.py +99 -0
- dingo/model/llm/rag/llm_rag_context_precision.py +1 -1
- dingo/model/llm/rag/llm_rag_context_recall.py +1 -1
- dingo/model/llm/rag/llm_rag_faithfulness.py +1 -1
- dingo/model/llm/vlm_image_relevant.py +9 -52
- dingo/model/llm/vlm_layout_quality.py +3 -54
- dingo/model/model.py +37 -24
- dingo/model/rule/rule_common.py +76 -0
- dingo/model/rule/rule_image.py +41 -32
- dingo/model/rule/scibase/__init__.py +1 -0
- dingo/model/rule/scibase/rule_quanliang.py +655 -0
- dingo/run/cli.py +22 -1
- dingo/utils/image_loader.py +141 -0
- {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/METADATA +22 -1
- {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/RECORD +36 -30
- {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/WHEEL +0 -0
- {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/entry_points.txt +0 -0
- {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/licenses/LICENSE +0 -0
- {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,655 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, List
|
|
6
|
+
|
|
7
|
+
from dingo.config.input_args import EvaluatorRuleArgs
|
|
8
|
+
from dingo.io.input import Data, RequiredField
|
|
9
|
+
from dingo.io.output.eval_detail import EvalDetail, QualityLabel
|
|
10
|
+
from dingo.model.model import Model
|
|
11
|
+
from dingo.model.rule.base import BaseRule
|
|
12
|
+
|
|
13
|
+
# Absolute http(s) URL: scheme in any letter case, "://", a first character
# that is not "/", "$", ".", "?" or "#", then anything at all (the tail is
# [\s\S]*, so whitespace and newlines are accepted too).
URL_RE = re.compile(r"^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$")
# Bare DOI: "10.", 4-9 digits, "/", then a suffix with no uppercase A-Z,
# no whitespace and no "|".
DOI_RE = re.compile(r"^10\.\d{4,9}/([^A-Z\s\|]*)$")
# Code points treated as "invisible" in free-text fields.
# NOTE(review): \u00A0 and \xa0 are the same code point (no-break space).
INVISIBLE_RE = re.compile(r"[\u2000-\u200F\u202F\u205F\u3000\uFEFF\u00A0\u2060-\u206F\xa0]")
# Page range written as "<digits>-<digits>".
PAGE_RANGE_RE = re.compile(r"^\d+-\d+$")
# ISSN shape "NNNN-NNNC" where C is a digit or an uppercase "X".
ISSN_RE = re.compile(r"^\d{4}-\d{3}[\dX]$")
# Separator characters that must not appear inside a single author entry.
# NOTE(review): ";" is listed twice; possibly one of them was a full-width
# semicolon that was lost in transit - confirm against the upstream file.
AUTHOR_SEP_RE = re.compile(r"[|;;]")

# Closed vocabularies. Where "" is a member, an empty value is accepted.
OA_BOOL_VALUES = {"true", "false", "unknown"}
METADATA_TYPE_VALUES = {"paper", "ebook"}
OA_STATUS_VALUES = {"diamond", "gold", "green", "hybrid", "bronze", "closed", ""}
LOC_TYPE_VALUES = {"download", "reader", "display", ""}
# Record fields that normalize_record() tries to decode from JSON text.
JSON_LIST_FIELDS = {
    "isbns",
    "author",
    "contributors",
    "locations",
    "access_oa_url",
    "publication_venue_issn",
    "references",
    "related_works",
}
# Accepted license identifiers for entries in `locations`.
LICENSE_VALUES = {
    "cc-by",
    "cc-by-nc",
    "cc-by-sa",
    "cc-by-nd",
    "cc-by-nc-sa",
    "cc-by-nc-nd",
    "other-oa",
    "cc0",
    "",
    "public-domain",
    "publisher-specific-oa",
    "publisher-specific",
    "wiley-specific",
    "elsevier-specific",
    "oup-specific",
    "acs-specific",
    "rsc-specific",
    "iop-specific",
    "unspecified-oa",
    "implied-oa",
    "nonexclusive-distrib",
    "gpl-v1",
    "gpl-v2",
    "gpl-v3",
    "mit",
    "ogl-c",
    "pd",
}
# Independent copy of LICENSE_VALUES, used for the top-level access_license.
ACCESS_LICENSE_VALUES = set(LICENSE_VALUES)
GRADE_CLASS_VALUES = {"k12", "higher-edu", "vocational-edu", "other", ""}
# Chinese labels: primary school, junior high school, senior high school.
GRADE_VALUES = {"小学", "初中", "高中", ""}

# Fallback language codes used when the ISO-639 asset cannot be loaded.
_DEFAULT_LANGUAGE_VALUES = {"zh", "en", "ja", "de", "fr", "es", "ru", "ko", "ar"}
# "assets" directory located next to this module.
ASSETS_DIR = Path(__file__).resolve().parent / "assets"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _load_language_allowed_values() -> set[str]:
    """Load the accepted language codes from ``assets/to_iso-639.json``.

    The asset is read as a JSON object and its string values form the
    allowed set. The built-in defaults are returned when the file is
    missing, cannot be read, is not valid JSON/UTF-8, or is not a JSON
    object.

    Returns:
        A new set of language codes (never the shared default object).
    """
    base = ASSETS_DIR / "to_iso-639.json"
    try:
        with base.open("r", encoding="utf-8") as f:
            values = json.load(f)
    except (OSError, TypeError, ValueError):
        # OSError: missing/unreadable file or a directory in its place.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        # This runs at import time, so a bad asset must not break the import.
        return set(_DEFAULT_LANGUAGE_VALUES)
    if isinstance(values, dict):
        return set(str(v) for v in values.values() if isinstance(v, str))
    return set(_DEFAULT_LANGUAGE_VALUES)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _load_journal_mapping() -> Dict[str, str]:
    """Build the source-name -> target-name journal table from the CSV asset.

    Rows are read with ``csv.DictReader``; a row contributes only when both
    ``source_journal_name`` and ``target_journal_name`` are non-empty. A later
    row with the same source name overrides an earlier one. An empty dict is
    returned when the asset file does not exist.
    """
    asset = ASSETS_DIR / "journal_name_mapping_execute_20260512.csv"
    if not asset.exists():
        return {}
    # Imported here so the module is only loaded when the asset is present.
    import csv

    with asset.open("r", encoding="utf-8", newline="") as handle:
        pairs = (
            (row.get("source_journal_name"), row.get("target_journal_name"))
            for row in csv.DictReader(handle)
        )
        return {source: target for source, target in pairs if source and target}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Lookup tables built once, when this module is imported, by the two loader
# functions above (both read files under ASSETS_DIR).
LANGUAGE_ALLOWED_VALUES = _load_language_allowed_values()
JOURNAL_NAME_MAPPING = _load_journal_mapping()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _valid_isbn10(code: str) -> bool:
|
|
107
|
+
if not re.fullmatch(r"\d{9}[\dXx]", code):
|
|
108
|
+
return False
|
|
109
|
+
total = sum((10 - idx) * int(ch) for idx, ch in enumerate(code[:9]))
|
|
110
|
+
check = code[9].upper()
|
|
111
|
+
check_value = 10 if check == "X" else int(check)
|
|
112
|
+
total += check_value
|
|
113
|
+
return total % 11 == 0
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _valid_isbn13(code: str) -> bool:
|
|
117
|
+
if not re.fullmatch(r"\d{13}", code):
|
|
118
|
+
return False
|
|
119
|
+
if not (code.startswith("978") or code.startswith("979")):
|
|
120
|
+
return False
|
|
121
|
+
total = sum(int(ch) * (1 if idx % 2 == 0 else 3) for idx, ch in enumerate(code))
|
|
122
|
+
return total % 10 == 0
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _valid_issn(code: str) -> bool:
    """Return True when ``code`` is ``NNNN-NNNC`` with a valid ISSN check character.

    The first seven digits are weighted 8 down to 2; the check value is
    ``(11 - total % 11) % 11`` and is written as ``X`` when it equals 10.
    """
    # ISSN_RE uses \d, which also matches non-ASCII decimal digits; require
    # ASCII so that such look-alike strings are rejected.
    if not (code.isascii() and ISSN_RE.fullmatch(code)):
        return False
    digits = code.replace("-", "")
    total = sum(int(ch) * (8 - idx) for idx, ch in enumerate(digits[:7]))
    calculated = (11 - (total % 11)) % 11
    expected = "X" if calculated == 10 else str(calculated)
    return digits[7].upper() == expected
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def check_metadata_type(metadata_type: Any) -> bool:
    """Return True when ``metadata_type`` is invalid.

    Invalid means: not a string (including ``None``), blank after stripping,
    or not one of ``METADATA_TYPE_VALUES``.
    """
    if not isinstance(metadata_type, str):
        # Also covers None.
        return True
    if not metadata_type.strip():
        return True
    return metadata_type not in METADATA_TYPE_VALUES
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def check_doi(doi: Any, metadata_type: Any) -> bool:
    """Return True when ``doi`` is invalid for the given ``metadata_type``.

    Nothing is reported (False) when ``metadata_type`` itself is not a
    recognised value. A DOI is required for ``"paper"`` records, so ``None``
    or ``""`` is invalid there and acceptable otherwise. A present DOI must be
    a string, all lowercase, must not contain ``https://doi.org/`` and must
    match ``DOI_RE``.
    """
    # The isinstance guard keeps unhashable values (list/dict) from raising
    # TypeError in the set-membership test below.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "paper"
    if doi is None:
        return required
    if not isinstance(doi, str):
        return True
    if doi == "":
        return required
    if doi != doi.lower():
        return True
    # doi is known to be lowercase here, so a plain substring test suffices.
    if "https://doi.org/" in doi:
        return True
    return not bool(DOI_RE.fullmatch(doi))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def check_isbns(isbns: Any, metadata_type: Any) -> bool:
    """Return True when ``isbns`` is invalid for the given ``metadata_type``.

    Nothing is reported (False) when ``metadata_type`` is not a recognised
    value. ISBNs are required for ``"ebook"`` records, so ``None`` or an empty
    list is invalid there and acceptable otherwise. A present value must be a
    list of strings, each a valid ISBN-10 or ISBN-13.
    """
    # The isinstance guard keeps unhashable values (list/dict) from raising
    # TypeError in the set-membership test below.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "ebook"
    if isbns is None:
        return required
    if not (isinstance(isbns, list) and all(isinstance(x, str) for x in isbns)):
        return True
    if not isbns:
        return required
    return not all(_valid_isbn10(item) or _valid_isbn13(item) for item in isbns)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def check_isbn13(isbn13: Any, metadata_type: Any) -> bool:
    """Return True when ``isbn13`` is invalid for the given ``metadata_type``.

    Nothing is reported (False) when ``metadata_type`` is not a recognised
    value. The field is required for ``"ebook"`` records, so ``None`` or ``""``
    is invalid there and acceptable otherwise. A present value must be a
    string that passes ``_valid_isbn13``.
    """
    # The isinstance guard keeps unhashable values (list/dict) from raising
    # TypeError in the set-membership test below.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "ebook"
    if isbn13 is None:
        return required
    if not isinstance(isbn13, str):
        return True
    if isbn13 == "":
        return required
    return not _valid_isbn13(isbn13)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def check_title(title: Any) -> bool:
    """Return True when ``title`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted. Any other string is invalid when it contains a character
    matched by ``INVISIBLE_RE``.
    """
    if not isinstance(title, str):
        return True
    return title != "" and INVISIBLE_RE.search(title) is not None
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def check_abstract(abstract: Any) -> bool:
    """Return True when ``abstract`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted. Any other string is invalid when it contains a character
    matched by ``INVISIBLE_RE``.
    """
    if not isinstance(abstract, str):
        return True
    if not abstract:
        return False
    return INVISIBLE_RE.search(abstract) is not None
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def check_language(language: Any) -> bool:
    """Return True when ``language`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted, and so is every string when ``LANGUAGE_ALLOWED_VALUES`` is
    empty. Otherwise the value must be a member of that set.
    """
    if not isinstance(language, str):
        return True
    if language == "" or not LANGUAGE_ALLOWED_VALUES:
        return False
    return language not in LANGUAGE_ALLOWED_VALUES
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def check_author(author: Any) -> bool:
    """Return True when ``author`` is invalid.

    The value must be a list of strings (an empty list is accepted), and no
    entry may contain a character matched by ``AUTHOR_SEP_RE``.
    """
    if not isinstance(author, list):
        # Also covers None.
        return True
    if any(not isinstance(name, str) for name in author):
        return True
    return any(AUTHOR_SEP_RE.search(name) for name in author)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def check_contributors(contributors: Any) -> bool:
    """Return True when ``contributors`` is invalid.

    The value must be a list of strings (an empty list is accepted), and no
    entry may contain a character matched by ``AUTHOR_SEP_RE``.
    """
    if not isinstance(contributors, list):
        # Also covers None.
        return True
    if not all(isinstance(entry, str) for entry in contributors):
        return True
    for entry in contributors:
        if AUTHOR_SEP_RE.search(entry) is not None:
            return True
    return False
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def check_locations(locations: Any) -> bool:
    """Return True when ``locations`` is invalid.

    The value must be a list (an empty list is accepted) of dicts. Each dict
    must carry the keys ``type``, ``url``, ``license`` and ``is_oa`` where:

    * ``type`` is a string in ``LOC_TYPE_VALUES``,
    * ``url`` is a string matching ``URL_RE``,
    * ``license`` is a string in ``LICENSE_VALUES``,
    * ``is_oa`` is a string in ``OA_BOOL_VALUES``.
    """
    if not isinstance(locations, list):
        # Also covers None.
        return True
    for item in locations:
        if not isinstance(item, dict):
            return True
        if any(key not in item for key in ("type", "url", "license", "is_oa")):
            return True
        url = item["url"]
        if not (isinstance(url, str) and URL_RE.fullmatch(url)):
            return True
        # All three vocabularies contain only strings. Checking the type
        # first keeps unhashable JSON values (list/dict) from raising
        # TypeError in the set-membership test.
        for key, allowed in (
            ("type", LOC_TYPE_VALUES),
            ("license", LICENSE_VALUES),
            ("is_oa", OA_BOOL_VALUES),
        ):
            candidate = item[key]
            if not isinstance(candidate, str) or candidate not in allowed:
                return True
    return False
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def check_access_is_oa(access_is_oa: Any, metadata_type: Any) -> bool:
    """Return True when ``access_is_oa`` is invalid for the given ``metadata_type``.

    Nothing is reported (False) when ``metadata_type`` is not a recognised
    value. The field is required for ``"paper"`` records, so ``None`` or ``""``
    is invalid there and acceptable otherwise. A present value must be a
    string in ``OA_BOOL_VALUES``.
    """
    # The isinstance guard keeps unhashable values (list/dict) from raising
    # TypeError in the set-membership test below.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "paper"
    if access_is_oa is None:
        return required
    if not isinstance(access_is_oa, str):
        return True
    if access_is_oa == "":
        return required
    return access_is_oa not in OA_BOOL_VALUES
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def check_access_oa_status(access_oa_status: Any) -> bool:
    """Return True when ``access_oa_status`` is not a string in ``OA_STATUS_VALUES``.

    ``None`` and other non-string values are invalid.
    """
    return not isinstance(access_oa_status, str) or access_oa_status not in OA_STATUS_VALUES
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def check_access_oa_url(access_oa_url: Any) -> bool:
    """Return True when ``access_oa_url`` is invalid.

    The value must be a list (an empty list is accepted) whose entries are
    all strings matching ``URL_RE``.
    """
    if not isinstance(access_oa_url, list):
        # Also covers None.
        return True
    for entry in access_oa_url:
        if not (isinstance(entry, str) and URL_RE.fullmatch(entry)):
            return True
    return False
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def check_access_license(access_license: Any) -> bool:
    """Return True when ``access_license`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted; any other string must be in ``ACCESS_LICENSE_VALUES``.
    """
    if not isinstance(access_license, str):
        return True
    if not access_license:
        return False
    return access_license not in ACCESS_LICENSE_VALUES
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def check_publication_published_date(publication_published_date: Any) -> bool:
    """Return True when ``publication_published_date`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted. Any other string must be written as ``YYYY-MM-DD`` with ASCII
    digits and must name a real calendar date.
    """
    if not isinstance(publication_published_date, str):
        return True
    if publication_published_date == "":
        return False
    # [0-9] rather than \d: \d (and strptime after it) also accepts
    # non-ASCII decimal digits such as full-width forms.
    if not re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", publication_published_date):
        return True
    try:
        datetime.strptime(publication_published_date, "%Y-%m-%d")
    except ValueError:
        # Right shape, impossible date (e.g. month 13 or 30 February).
        return True
    return False
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def check_publication_published_year(publication_published_year: Any) -> bool:
    """Return True when ``publication_published_year`` is invalid.

    ``None`` is accepted. Otherwise the value must be an ``int`` (``bool`` is
    rejected) strictly between 0 and 2100.
    """
    if publication_published_year is None:
        return False
    if isinstance(publication_published_year, bool):
        # bool is a subclass of int, so it has to be excluded explicitly.
        return True
    if not isinstance(publication_published_year, int):
        return True
    return publication_published_year <= 0 or publication_published_year >= 2100
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def check_publication_venue_issn(publication_venue_issn: Any) -> bool:
    """Return True when ``publication_venue_issn`` is invalid.

    The value must be a list (an empty list is accepted) whose entries are
    all strings accepted by ``_valid_issn``.
    """
    if not isinstance(publication_venue_issn, list):
        # Also covers None.
        return True
    return any(
        not isinstance(entry, str) or not _valid_issn(entry)
        for entry in publication_venue_issn
    )
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def check_publication_venue_biblio_volume(publication_venue_biblio_volume: Any) -> bool:
    """Return True when ``publication_venue_biblio_volume`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted. Any other string is valid exactly when ``int()`` can parse it.
    """
    volume = publication_venue_biblio_volume
    if not isinstance(volume, str):
        return True
    if not volume:
        return False
    try:
        int(volume)
    except (TypeError, ValueError):
        return True
    return False
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def check_publication_venue_biblio_issue(publication_venue_biblio_issue: Any) -> bool:
    """Return True when ``publication_venue_biblio_issue`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted. Any other string is valid exactly when ``int()`` can parse it.
    """
    issue = publication_venue_biblio_issue
    if not isinstance(issue, str):
        return True
    if issue == "":
        return False
    try:
        int(issue)
    except (TypeError, ValueError):
        parsed = False
    else:
        parsed = True
    return not parsed
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def check_publication_venue_biblio_pages(publication_venue_biblio_pages: Any) -> bool:
    """Return True when ``publication_venue_biblio_pages`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted. Any other string must be ``<start>-<end>`` in ASCII digits with
    ``0 < start <= end``.
    """
    pages = publication_venue_biblio_pages
    if not isinstance(pages, str):
        return True
    if pages == "":
        return False
    # PAGE_RANGE_RE uses \d, which also matches non-ASCII decimal digits;
    # require ASCII so that such look-alike strings are rejected.
    if not (pages.isascii() and PAGE_RANGE_RE.fullmatch(pages)):
        return True
    # The pattern guarantees exactly one "-" with digits on both sides.
    start_text, end_text = pages.split("-")
    start, end = int(start_text), int(end_text)
    return start <= 0 or end <= 0 or start > end
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def check_publication_pages(publication_pages: Any) -> bool:
    """Return True when ``publication_pages`` is invalid.

    ``None`` is accepted. Otherwise the value must be a positive ``int``
    (``bool`` is rejected).
    """
    if publication_pages is None:
        return False
    # bool is a subclass of int, so it has to be excluded explicitly.
    is_plain_int = isinstance(publication_pages, int) and not isinstance(publication_pages, bool)
    if not is_plain_int:
        return True
    return publication_pages <= 0
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def check_publication_venue_name_unified(
    publication_venue_name_unified: Any, publication_venue_name: Any
) -> bool:
    """Return True when the unified venue name is invalid.

    The unified name must be a string, and the raw venue name must be a
    string or ``None``. An empty unified name is accepted. A non-empty
    unified name must equal the ``JOURNAL_NAME_MAPPING`` entry for the raw
    name, or the raw name itself when it has no entry. A non-empty unified
    name without a non-empty raw name is invalid.
    """
    unified = publication_venue_name_unified
    raw = publication_venue_name
    if not isinstance(unified, str):
        # Also covers None.
        return True
    if raw is not None and not isinstance(raw, str):
        return True
    if unified == "":
        return False
    if not raw:
        # raw is None or "": there is nothing to compare the unified name to.
        return True
    return unified != JOURNAL_NAME_MAPPING.get(raw, raw)
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def check_grade_class(grade_class: Any) -> bool:
    """Return True when ``grade_class`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted; any other string must be in ``GRADE_CLASS_VALUES``.
    """
    if not isinstance(grade_class, str):
        return True
    return bool(grade_class) and grade_class not in GRADE_CLASS_VALUES
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def check_grade(grade: Any, grade_class: Any) -> bool:
    """Return True when ``grade`` is invalid given ``grade_class``.

    ``grade`` must be a string and ``grade_class`` a string or ``None``. An
    empty ``grade`` is accepted. A non-empty ``grade`` must be in
    ``GRADE_VALUES`` and is only allowed when ``grade_class`` is ``"k12"``.
    """
    if not isinstance(grade, str):
        # Also covers None.
        return True
    if not (grade_class is None or isinstance(grade_class, str)):
        return True
    if grade == "":
        return False
    # grade is non-empty from here on.
    return grade not in GRADE_VALUES or grade_class != "k12"
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def check_references(references: Any) -> bool:
    """Return True when ``references`` is invalid.

    The value must be a list (an empty list is accepted) whose entries are
    all strings matching ``URL_RE``.
    """
    if not isinstance(references, list):
        # Also covers None.
        return True
    for entry in references:
        if not isinstance(entry, str) or URL_RE.fullmatch(entry) is None:
            return True
    return False
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def check_related_works(related_works: Any) -> bool:
    """Return True when ``related_works`` is invalid.

    The value must be a list (an empty list is accepted) whose entries are
    all strings matching ``URL_RE``.
    """
    if not isinstance(related_works, list):
        # Also covers None.
        return True
    return not all(
        isinstance(entry, str) and URL_RE.fullmatch(entry) is not None
        for entry in related_works
    )
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def check_cited_by_api_url(cited_by_api_url: Any) -> bool:
    """Return True when ``cited_by_api_url`` is invalid.

    A non-string value (including ``None``) is invalid. An empty string is
    accepted; any other string must match ``URL_RE``.
    """
    if not isinstance(cited_by_api_url, str):
        return True
    if not cited_by_api_url:
        return False
    return URL_RE.fullmatch(cited_by_api_url) is None
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def check_access_xinghe_repository_sha256(
    access_xinghe_repository_sha256: Any, access_xinghe_repository_has_fulltext: Any
) -> bool:
    """Return True when the repository sha256 field is invalid.

    The sha256 value must be a string or a list of strings, and the
    has-fulltext flag must be a ``bool``. When the flag is ``True`` the value
    must be non-empty; when it is ``False`` any well-typed value is accepted.
    """
    sha = access_xinghe_repository_sha256
    has_fulltext = access_xinghe_repository_has_fulltext
    if sha is None or not isinstance(has_fulltext, bool):
        return True
    if isinstance(sha, str):
        return has_fulltext and sha == ""
    if not isinstance(sha, list) or any(not isinstance(x, str) for x in sha):
        return True
    return has_fulltext and not sha
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def check_access_xinghe_repository_origin_path(
    access_xinghe_repository_origin_path: Any, access_xinghe_repository_has_fulltext: Any
) -> bool:
    """Return True when the repository origin path is invalid.

    The path must be a string and the has-fulltext flag a ``bool``. When the
    flag is ``True`` the path must not be blank; when it is ``False`` any
    string is accepted.
    """
    path = access_xinghe_repository_origin_path
    has_fulltext = access_xinghe_repository_has_fulltext
    if not (isinstance(path, str) and isinstance(has_fulltext, bool)):
        return True
    return has_fulltext and path.strip() == ""
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def _normalize_json_like_field(value: Any) -> Any:
|
|
510
|
+
if not isinstance(value, str):
|
|
511
|
+
return value
|
|
512
|
+
stripped = value.strip()
|
|
513
|
+
if not stripped:
|
|
514
|
+
return value
|
|
515
|
+
if stripped[0] not in ("[", "{"):
|
|
516
|
+
return value
|
|
517
|
+
try:
|
|
518
|
+
return json.loads(stripped)
|
|
519
|
+
except (TypeError, ValueError, json.JSONDecodeError):
|
|
520
|
+
cleaned = stripped.replace("\r", " ").replace("\n", " ").replace("\t", " ")
|
|
521
|
+
cleaned = "".join(ch if ord(ch) >= 32 else " " for ch in cleaned)
|
|
522
|
+
invalid_escape_re = re.compile(r'\\(?!["\\/bfnrtu])')
|
|
523
|
+
for _ in range(10):
|
|
524
|
+
next_cleaned = invalid_escape_re.sub(r"\\\\", cleaned)
|
|
525
|
+
if next_cleaned == cleaned:
|
|
526
|
+
break
|
|
527
|
+
cleaned = next_cleaned
|
|
528
|
+
try:
|
|
529
|
+
return json.loads(cleaned)
|
|
530
|
+
except (TypeError, ValueError, json.JSONDecodeError):
|
|
531
|
+
return value
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def _normalize_bool_field(value: Any) -> Any:
|
|
535
|
+
if isinstance(value, bool):
|
|
536
|
+
return value
|
|
537
|
+
if isinstance(value, int):
|
|
538
|
+
if value in (0, 1):
|
|
539
|
+
return bool(value)
|
|
540
|
+
return value
|
|
541
|
+
if isinstance(value, str):
|
|
542
|
+
lowered = value.strip().lower()
|
|
543
|
+
if lowered in ("1", "true"):
|
|
544
|
+
return True
|
|
545
|
+
if lowered in ("0", "false"):
|
|
546
|
+
return False
|
|
547
|
+
return value
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of ``record`` with list-like and boolean fields decoded.

    Every field named in ``JSON_LIST_FIELDS`` that is present is passed
    through ``_normalize_json_like_field``. The
    ``access_xinghe_repository_has_fulltext`` key is always set in the result
    (to the normalized value, or ``None`` when it was absent). The input dict
    is not modified.
    """
    result = dict(record)
    for name in JSON_LIST_FIELDS:
        if name not in result:
            continue
        result[name] = _normalize_json_like_field(result[name])
    flag_key = "access_xinghe_repository_has_fulltext"
    result[flag_key] = _normalize_bool_field(result.get(flag_key))
    return result
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
# Field name -> validator. Each validator receives the whole normalized record
# and returns True when that field is INVALID. Insertion order is significant:
# it is the default order in which fields are checked and reported.
FIELD_VALIDATORS = {
    "metadata_type": lambda rec: check_metadata_type(rec.get("metadata_type")),
    "doi": lambda rec: check_doi(rec.get("doi"), rec.get("metadata_type")),
    "isbns": lambda rec: check_isbns(rec.get("isbns"), rec.get("metadata_type")),
    "isbn13": lambda rec: check_isbn13(rec.get("isbn13"), rec.get("metadata_type")),
    "title": lambda rec: check_title(rec.get("title")),
    "abstract": lambda rec: check_abstract(rec.get("abstract")),
    "language": lambda rec: check_language(rec.get("language")),
    "author": lambda rec: check_author(rec.get("author")),
    "contributors": lambda rec: check_contributors(rec.get("contributors")),
    "locations": lambda rec: check_locations(rec.get("locations")),
    "access_is_oa": lambda rec: check_access_is_oa(
        rec.get("access_is_oa"), rec.get("metadata_type")
    ),
    "access_oa_status": lambda rec: check_access_oa_status(rec.get("access_oa_status")),
    "access_oa_url": lambda rec: check_access_oa_url(rec.get("access_oa_url")),
    "access_license": lambda rec: check_access_license(rec.get("access_license")),
    "publication_published_date": lambda rec: check_publication_published_date(
        rec.get("publication_published_date")
    ),
    "publication_published_year": lambda rec: check_publication_published_year(
        rec.get("publication_published_year")
    ),
    "publication_venue_issn": lambda rec: check_publication_venue_issn(
        rec.get("publication_venue_issn")
    ),
    "publication_venue_biblio_volume": lambda rec: check_publication_venue_biblio_volume(
        rec.get("publication_venue_biblio_volume")
    ),
    "publication_venue_biblio_issue": lambda rec: check_publication_venue_biblio_issue(
        rec.get("publication_venue_biblio_issue")
    ),
    "publication_venue_biblio_pages": lambda rec: check_publication_venue_biblio_pages(
        rec.get("publication_venue_biblio_pages")
    ),
    "publication_pages": lambda rec: check_publication_pages(rec.get("publication_pages")),
    "publication_venue_name_unified": lambda rec: check_publication_venue_name_unified(
        rec.get("publication_venue_name_unified"),
        rec.get("publication_venue_name"),
    ),
    "grade_class": lambda rec: check_grade_class(rec.get("grade_class")),
    "grade": lambda rec: check_grade(rec.get("grade"), rec.get("grade_class")),
    "references": lambda rec: check_references(rec.get("references")),
    "related_works": lambda rec: check_related_works(rec.get("related_works")),
    "cited_by_api_url": lambda rec: check_cited_by_api_url(rec.get("cited_by_api_url")),
    "access_xinghe_repository_sha256": lambda rec: check_access_xinghe_repository_sha256(
        rec.get("access_xinghe_repository_sha256"),
        rec.get("access_xinghe_repository_has_fulltext"),
    ),
    "access_xinghe_repository_origin_path": lambda rec: check_access_xinghe_repository_origin_path(
        rec.get("access_xinghe_repository_origin_path"),
        rec.get("access_xinghe_repository_has_fulltext"),
    ),
}
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
@Model.rule_register("QUALITY_BAD_EFFECTIVENESS", ["xinghe", "quanliang"])
class RuleQuanliangFieldValidation(BaseRule):
    """Rule that runs the per-field validators over one metadata record."""

    _metric_info = {
        "category": "Rule-Based Metadata Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleQuanliangFieldValidation",
        "description": "Validate Quanliang metadata fields and report invalid fields",
        "paper_title": "",
        "paper_url": "",
        "paper_authors": "",
        "evaluation_results": "",
    }

    _required_fields = [RequiredField.METADATA]
    # By default every known field is checked, in FIELD_VALIDATORS order.
    dynamic_config = EvaluatorRuleArgs(key_list=list(FIELD_VALIDATORS))

    @classmethod
    def eval(cls, input_data: Data) -> EvalDetail:
        """Validate the configured fields of ``input_data``.

        The record is normalized first. For each name in
        ``dynamic_config.key_list`` a failure is recorded when the name has no
        validator ("unsupported field"), is absent from the record
        ("missing field"), or its validator returns a truthy value
        ("<name> invalid"). With failures, ``status`` is set to True and
        ``label`` / ``reason`` hold the field names and reasons in matching
        order; otherwise ``label`` is ``[QualityLabel.QUALITY_GOOD]``.
        """
        detail = EvalDetail(metric=cls.__name__)
        record = normalize_record(input_data.to_dict())

        failures = []  # (field name, reason) pairs, in evaluation order
        for name in cls.dynamic_config.key_list or []:
            if name not in FIELD_VALIDATORS:
                failures.append((name, "unsupported field"))
            elif name not in record:
                failures.append((name, "missing field"))
            elif FIELD_VALIDATORS[name](record):
                failures.append((name, f"{name} invalid"))

        if not failures:
            detail.label = [QualityLabel.QUALITY_GOOD]
            return detail

        detail.status = True
        detail.label = [name for name, _ in failures]
        detail.reason = [why for _, why in failures]
        return detail
|
dingo/run/cli.py
CHANGED
|
@@ -53,6 +53,13 @@ def parse_args():
|
|
|
53
53
|
default=False,
|
|
54
54
|
help="Output as JSON",
|
|
55
55
|
)
|
|
56
|
+
info_parser.add_argument(
|
|
57
|
+
"--count",
|
|
58
|
+
action="store_true",
|
|
59
|
+
default=False,
|
|
60
|
+
help="Print metric counts (rules, llm, groups, total_metrics=rules+llm). "
|
|
61
|
+
"Human mode: counts only. With --json: prepend a \"counts\" object to the payload.",
|
|
62
|
+
)
|
|
56
63
|
|
|
57
64
|
# --- dingo serve ---
|
|
58
65
|
serve_parser = subparsers.add_parser("serve", help="Start MCP server for AI agent integration")
|
|
@@ -177,9 +184,23 @@ def cmd_info(args):
|
|
|
177
184
|
groups[group_name] = [cls.__name__ for cls in rule_list]
|
|
178
185
|
info["groups"] = groups
|
|
179
186
|
|
|
187
|
+
counts = {
|
|
188
|
+
"rules": len(Model.rule_name_map),
|
|
189
|
+
"llm": len(Model.llm_name_map),
|
|
190
|
+
"groups": len(Model.rule_groups),
|
|
191
|
+
"total_metrics": len(Model.rule_name_map) + len(Model.llm_name_map),
|
|
192
|
+
}
|
|
193
|
+
|
|
180
194
|
if args.json:
|
|
181
|
-
|
|
195
|
+
if args.count:
|
|
196
|
+
payload = {"counts": counts, **info}
|
|
197
|
+
json.dump(payload, sys.stdout, indent=2, ensure_ascii=False)
|
|
198
|
+
else:
|
|
199
|
+
json.dump(info, sys.stdout, indent=2, ensure_ascii=False)
|
|
182
200
|
sys.stdout.write("\n")
|
|
201
|
+
elif args.count:
|
|
202
|
+
for key in ("rules", "llm", "groups", "total_metrics"):
|
|
203
|
+
print(f"{key}: {counts[key]}")
|
|
183
204
|
else:
|
|
184
205
|
_print_info_table(info)
|
|
185
206
|
|