dingo-python 2.2.2__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. dingo/config/input_args.py +11 -1
  2. dingo/exec/local.py +2 -1
  3. dingo/io/output/__init__.py +1 -0
  4. dingo/io/output/result_info.py +16 -0
  5. dingo/model/llm/compare/llm_html_extract_compare.py +17 -2
  6. dingo/model/llm/compare/llm_html_extract_compare_v2.py +1 -1
  7. dingo/model/llm/compare/llm_html_extract_compare_v3.py +221 -0
  8. dingo/model/llm/hhh/llm_text_3h.py +1 -1
  9. dingo/model/llm/llm_classify_qr.py +4 -2
  10. dingo/model/llm/llm_custom_metric.py +211 -0
  11. dingo/model/llm/llm_document_parsing_ocr.py +6 -2
  12. dingo/model/llm/llm_factcheck_public.py +1 -1
  13. dingo/model/llm/llm_keyword_matcher.py +1 -1
  14. dingo/model/llm/llm_scout.py +1 -1
  15. dingo/model/llm/mineru/vlm_document_parsing.py +4 -8
  16. dingo/model/llm/mineru/vlm_document_parsing_ocr_train.py +4 -8
  17. dingo/model/llm/rag/llm_rag_answer_relevancy.py +1 -1
  18. dingo/model/llm/rag/llm_rag_chunk_quality.py +99 -0
  19. dingo/model/llm/rag/llm_rag_context_precision.py +1 -1
  20. dingo/model/llm/rag/llm_rag_context_recall.py +1 -1
  21. dingo/model/llm/rag/llm_rag_faithfulness.py +1 -1
  22. dingo/model/llm/vlm_image_relevant.py +9 -52
  23. dingo/model/llm/vlm_layout_quality.py +3 -54
  24. dingo/model/model.py +37 -24
  25. dingo/model/rule/rule_common.py +76 -0
  26. dingo/model/rule/rule_image.py +41 -32
  27. dingo/model/rule/scibase/__init__.py +1 -0
  28. dingo/model/rule/scibase/rule_quanliang.py +655 -0
  29. dingo/run/cli.py +22 -1
  30. dingo/utils/image_loader.py +141 -0
  31. {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/METADATA +22 -1
  32. {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/RECORD +36 -30
  33. {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/WHEEL +0 -0
  34. {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/entry_points.txt +0 -0
  35. {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/licenses/LICENSE +0 -0
  36. {dingo_python-2.2.2.dist-info → dingo_python-2.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,655 @@
1
+ import json
2
+ import re
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+ from dingo.config.input_args import EvaluatorRuleArgs
8
+ from dingo.io.input import Data, RequiredField
9
+ from dingo.io.output.eval_detail import EvalDetail, QualityLabel
10
+ from dingo.model.model import Model
11
+ from dingo.model.rule.base import BaseRule
12
+
13
# --- Compiled patterns shared by the field validators -----------------------
# Absolute http(s) URL (scheme is case-insensitive).
URL_RE = re.compile(r"^[Hh][Tt][Tt][Pp][Ss]?://[^/$.?#][\s\S]*$")
# DOI: "10.<4-9 digits>/<suffix>"; the suffix may not contain A-Z, whitespace or "|".
DOI_RE = re.compile(r"^10\.\d{4,9}/([^A-Z\s\|]*)$")
# Characters treated as invisible/unwanted inside free-text fields.
INVISIBLE_RE = re.compile(r"[\u2000-\u200F\u202F\u205F\u3000\uFEFF\u00A0\u2060-\u206F\xa0]")
# "<start>-<end>" page range made of digits only.
PAGE_RANGE_RE = re.compile(r"^\d+-\d+$")
# ISSN shape "NNNN-NNNC" where C is a digit or "X".
ISSN_RE = re.compile(r"^\d{4}-\d{3}[\dX]$")
# Separators that must not appear inside a single author/contributor entry.
AUTHOR_SEP_RE = re.compile(r"[|;;]")

# --- Closed vocabularies -----------------------------------------------------
OA_BOOL_VALUES = {"true", "false", "unknown"}
METADATA_TYPE_VALUES = {"paper", "ebook"}
OA_STATUS_VALUES = {"diamond", "gold", "green", "hybrid", "bronze", "closed", ""}
LOC_TYPE_VALUES = {"download", "reader", "display", ""}

# Record fields that normalize_record() passes through the JSON-string decoder.
JSON_LIST_FIELDS = {
    "isbns",
    "author",
    "contributors",
    "locations",
    "access_oa_url",
    "publication_venue_issn",
    "references",
    "related_works",
}

LICENSE_VALUES = {
    # Creative Commons family
    "cc-by",
    "cc-by-nc",
    "cc-by-sa",
    "cc-by-nd",
    "cc-by-nc-sa",
    "cc-by-nc-nd",
    "other-oa",
    "cc0",
    # Empty string is an accepted value.
    "",
    "public-domain",
    # Publisher-specific licenses
    "publisher-specific-oa",
    "publisher-specific",
    "wiley-specific",
    "elsevier-specific",
    "oup-specific",
    "acs-specific",
    "rsc-specific",
    "iop-specific",
    "unspecified-oa",
    "implied-oa",
    "nonexclusive-distrib",
    # Software / government licenses
    "gpl-v1",
    "gpl-v2",
    "gpl-v3",
    "mit",
    "ogl-c",
    "pd",
}
# Independent copy so the two vocabularies can diverge without aliasing.
ACCESS_LICENSE_VALUES = set(LICENSE_VALUES)
GRADE_CLASS_VALUES = {"k12", "higher-edu", "vocational-edu", "other", ""}
GRADE_VALUES = {"小学", "初中", "高中", ""}

# Fallback language codes used when the ISO-639 asset cannot be loaded.
_DEFAULT_LANGUAGE_VALUES = {"zh", "en", "ja", "de", "fr", "es", "ru", "ko", "ar"}
# Directory holding the data files shipped next to this module.
ASSETS_DIR = Path(__file__).resolve().parent / "assets"
69
+
70
+
71
def _load_language_allowed_values() -> set[str]:
    """Load the permitted language codes from the bundled ISO-639 asset.

    Returns:
        The string values of the JSON object stored in
        ``assets/to_iso-639.json``. A copy of ``_DEFAULT_LANGUAGE_VALUES`` is
        returned when the file is missing, cannot be read or parsed, or does
        not contain a JSON object.
    """
    asset_path = ASSETS_DIR / "to_iso-639.json"
    try:
        with asset_path.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except (OSError, TypeError, ValueError):
        # OSError: missing or unreadable file (this runs at import time, so it
        # must not propagate). ValueError: also covers json.JSONDecodeError
        # and UnicodeDecodeError.
        return set(_DEFAULT_LANGUAGE_VALUES)
    if isinstance(payload, dict):
        return {code for code in payload.values() if isinstance(code, str)}
    return set(_DEFAULT_LANGUAGE_VALUES)
83
+
84
+
85
def _load_journal_mapping() -> Dict[str, str]:
    """Read the source-name -> unified-name journal mapping from the CSV asset.

    Returns:
        A dict keyed by ``source_journal_name`` with ``target_journal_name``
        values; rows where either column is empty are ignored. An empty dict
        is returned when the CSV file does not exist.
    """
    mapping_file = ASSETS_DIR / "journal_name_mapping_execute_20260512.csv"
    if not mapping_file.exists():
        return {}

    # csv is only needed by this loader, so it is imported locally.
    import csv

    with mapping_file.open("r", encoding="utf-8", newline="") as handle:
        pairs = (
            (row.get("source_journal_name"), row.get("target_journal_name"))
            for row in csv.DictReader(handle)
        )
        # Later rows overwrite earlier ones for a repeated source name.
        return {source: target for source, target in pairs if source and target}
100
+
101
+
102
+ LANGUAGE_ALLOWED_VALUES = _load_language_allowed_values()
103
+ JOURNAL_NAME_MAPPING = _load_journal_mapping()
104
+
105
+
106
+ def _valid_isbn10(code: str) -> bool:
107
+ if not re.fullmatch(r"\d{9}[\dXx]", code):
108
+ return False
109
+ total = sum((10 - idx) * int(ch) for idx, ch in enumerate(code[:9]))
110
+ check = code[9].upper()
111
+ check_value = 10 if check == "X" else int(check)
112
+ total += check_value
113
+ return total % 11 == 0
114
+
115
+
116
+ def _valid_isbn13(code: str) -> bool:
117
+ if not re.fullmatch(r"\d{13}", code):
118
+ return False
119
+ if not (code.startswith("978") or code.startswith("979")):
120
+ return False
121
+ total = sum(int(ch) * (1 if idx % 2 == 0 else 3) for idx, ch in enumerate(code))
122
+ return total % 10 == 0
123
+
124
+
125
def _valid_issn(code: str) -> bool:
    """Return True when *code* looks like ``NNNN-NNNC`` and its check character matches.

    The first seven digits are weighted 8 down to 2; the check value is
    ``(11 - sum % 11) % 11`` and is written as ``X`` when it equals 10.
    """
    if ISSN_RE.fullmatch(code) is None:
        return False
    compact = code.replace("-", "")
    # zip() stops after the seven weights, leaving the check character out.
    weighted = sum(weight * int(digit) for weight, digit in zip(range(8, 1, -1), compact))
    remainder = (11 - weighted % 11) % 11
    check_char = "X" if remainder == 10 else str(remainder)
    return compact[7].upper() == check_char
133
+
134
+
135
def check_metadata_type(metadata_type: Any) -> bool:
    """Return True (invalid) unless *metadata_type* is one of METADATA_TYPE_VALUES.

    ``None``, non-strings and blank strings are all reported as invalid.
    """
    if not isinstance(metadata_type, str):
        return True
    if not metadata_type.strip():
        return True
    return metadata_type not in METADATA_TYPE_VALUES
143
+
144
+
145
def check_doi(doi: Any, metadata_type: Any) -> bool:
    """Return True when the ``doi`` field is invalid for the given record type.

    Args:
        doi: Raw ``doi`` value from the record.
        metadata_type: Raw ``metadata_type`` value from the record.

    Returns:
        False when *metadata_type* is not a known type (the check is skipped).
        For ``"paper"`` a missing/empty DOI is invalid; for other known types
        it is accepted. A present DOI must be a lower-case string, must not
        contain the ``https://doi.org/`` prefix and must match ``DOI_RE``.
    """
    # The isinstance guard keeps unhashable values (e.g. a list decoded from
    # JSON) from raising TypeError in the set membership test.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "paper"
    if doi is None:
        return required
    if not isinstance(doi, str):
        return True
    if doi == "":
        return required
    # Once the value is known to be lower-case, the prefix test needs no
    # further case folding.
    if doi != doi.lower() or "https://doi.org/" in doi:
        return True
    return DOI_RE.fullmatch(doi) is None
160
+
161
+
162
def check_isbns(isbns: Any, metadata_type: Any) -> bool:
    """Return True when the ``isbns`` field is invalid for the given record type.

    Args:
        isbns: Raw ``isbns`` value; expected to be a list of strings.
        metadata_type: Raw ``metadata_type`` value from the record.

    Returns:
        False when *metadata_type* is not a known type (the check is skipped).
        For ``"ebook"`` a missing/empty list is invalid; for other known types
        it is accepted. Every entry must be a valid ISBN-10 or ISBN-13.
    """
    # The isinstance guard keeps unhashable values (e.g. a list decoded from
    # JSON) from raising TypeError in the set membership test.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "ebook"
    if isbns is None:
        return required
    if not isinstance(isbns, list) or not all(isinstance(entry, str) for entry in isbns):
        return True
    if not isbns:
        return required
    return not all(_valid_isbn10(entry) or _valid_isbn13(entry) for entry in isbns)
176
+
177
+
178
def check_isbn13(isbn13: Any, metadata_type: Any) -> bool:
    """Return True when the ``isbn13`` field is invalid for the given record type.

    Args:
        isbn13: Raw ``isbn13`` value; expected to be a string.
        metadata_type: Raw ``metadata_type`` value from the record.

    Returns:
        False when *metadata_type* is not a known type (the check is skipped).
        For ``"ebook"`` a missing/empty value is invalid; for other known
        types it is accepted. A present value must be a valid ISBN-13.
    """
    # The isinstance guard keeps unhashable values (e.g. a list decoded from
    # JSON) from raising TypeError in the set membership test.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "ebook"
    if isbn13 is None:
        return required
    if not isinstance(isbn13, str):
        return True
    if isbn13 == "":
        return required
    return not _valid_isbn13(isbn13)
189
+
190
+
191
def check_title(title: Any) -> bool:
    """Return True (invalid) when *title* is not a string or contains invisible characters.

    An empty string is accepted.
    """
    if not isinstance(title, str):
        # Covers None as well as any other non-string value.
        return True
    return title != "" and INVISIBLE_RE.search(title) is not None
199
+
200
+
201
def check_abstract(abstract: Any) -> bool:
    """Return True (invalid) when *abstract* is not a string or contains invisible characters.

    An empty string is accepted.
    """
    if not isinstance(abstract, str):
        # Covers None as well as any other non-string value.
        return True
    return abstract != "" and INVISIBLE_RE.search(abstract) is not None
209
+
210
+
211
def check_language(language: Any) -> bool:
    """Return True (invalid) when *language* is not an allowed language code.

    Non-strings (including ``None``) are invalid. An empty string is
    accepted, and everything is accepted when ``LANGUAGE_ALLOWED_VALUES``
    is empty.
    """
    if not isinstance(language, str):
        return True
    if language == "" or not LANGUAGE_ALLOWED_VALUES:
        return False
    return language not in LANGUAGE_ALLOWED_VALUES
221
+
222
+
223
def check_author(author: Any) -> bool:
    """Return True (invalid) unless *author* is a list of separator-free strings.

    An empty list is accepted. An entry matching ``AUTHOR_SEP_RE`` suggests
    several names were packed into one element and is reported as invalid.
    """
    if not isinstance(author, list):
        return True
    if not all(isinstance(name, str) for name in author):
        return True
    return any(AUTHOR_SEP_RE.search(name) is not None for name in author)
234
+
235
+
236
def check_contributors(contributors: Any) -> bool:
    """Return True (invalid) unless *contributors* is a list of separator-free strings.

    An empty list is accepted. An entry matching ``AUTHOR_SEP_RE`` is
    reported as invalid.
    """
    if not isinstance(contributors, list):
        return True
    if not all(isinstance(name, str) for name in contributors):
        return True
    return any(AUTHOR_SEP_RE.search(name) is not None for name in contributors)
247
+
248
+
249
def check_locations(locations: Any) -> bool:
    """Return True when the ``locations`` field is invalid.

    Args:
        locations: Raw ``locations`` value; expected to be a list of dicts,
            each carrying ``type``, ``url``, ``license`` and ``is_oa``.

    Returns:
        True when *locations* is not a list, when an entry is not a dict or
        lacks a required key, or when any of the four values is not a string
        from its allowed vocabulary (``url`` must match ``URL_RE``). An empty
        list is accepted.
    """
    if not isinstance(locations, list):
        return True

    def _in_vocabulary(value: Any, allowed: set) -> bool:
        # Non-strings are rejected before the membership test so unhashable
        # JSON values (lists/dicts) are flagged instead of raising TypeError.
        return isinstance(value, str) and value in allowed

    for entry in locations:
        if not isinstance(entry, dict):
            return True
        if any(key not in entry for key in ("type", "url", "license", "is_oa")):
            return True
        if not _in_vocabulary(entry["type"], LOC_TYPE_VALUES):
            return True
        url = entry["url"]
        if not (isinstance(url, str) and URL_RE.fullmatch(url)):
            return True
        if not _in_vocabulary(entry["license"], LICENSE_VALUES):
            return True
        if not _in_vocabulary(entry["is_oa"], OA_BOOL_VALUES):
            return True
    return False
271
+
272
+
273
def check_access_is_oa(access_is_oa: Any, metadata_type: Any) -> bool:
    """Return True when the ``access_is_oa`` field is invalid for the given record type.

    Args:
        access_is_oa: Raw ``access_is_oa`` value; expected to be a string.
        metadata_type: Raw ``metadata_type`` value from the record.

    Returns:
        False when *metadata_type* is not a known type (the check is skipped).
        For ``"paper"`` a missing/empty value is invalid; for other known
        types it is accepted. A present value must be in ``OA_BOOL_VALUES``.
    """
    # The isinstance guard keeps unhashable values (e.g. a list decoded from
    # JSON) from raising TypeError in the set membership test.
    if not isinstance(metadata_type, str) or metadata_type not in METADATA_TYPE_VALUES:
        return False
    required = metadata_type == "paper"
    if access_is_oa is None:
        return required
    if not isinstance(access_is_oa, str):
        return True
    if access_is_oa == "":
        return required
    return access_is_oa not in OA_BOOL_VALUES
284
+
285
+
286
def check_access_oa_status(access_oa_status: Any) -> bool:
    """Return True (invalid) unless *access_oa_status* is a string in OA_STATUS_VALUES."""
    is_text = isinstance(access_oa_status, str)
    return not is_text or access_oa_status not in OA_STATUS_VALUES
292
+
293
+
294
def check_access_oa_url(access_oa_url: Any) -> bool:
    """Return True (invalid) unless *access_oa_url* is a list of http(s) URL strings.

    An empty list is accepted.
    """
    if not isinstance(access_oa_url, list):
        return True
    if not all(isinstance(link, str) for link in access_oa_url):
        return True
    for link in access_oa_url:
        if URL_RE.fullmatch(link) is None:
            return True
    return False
302
+
303
+
304
def check_access_license(access_license: Any) -> bool:
    """Return True (invalid) unless *access_license* is a string in ACCESS_LICENSE_VALUES.

    An empty string is accepted.
    """
    if not isinstance(access_license, str):
        return True
    if not access_license:
        return False
    return access_license not in ACCESS_LICENSE_VALUES
312
+
313
+
314
def check_publication_published_date(publication_published_date: Any) -> bool:
    """Return True (invalid) unless the value is ``""`` or a real ``YYYY-MM-DD`` date.

    The shape is checked with a regex first so that non-padded forms such as
    ``2020-2-1`` are rejected; ``strptime`` then rejects impossible dates.
    """
    value = publication_published_date
    if not isinstance(value, str):
        return True
    if value == "":
        return False
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", value) is None:
        return True
    try:
        datetime.strptime(value, "%Y-%m-%d")
    except ValueError:
        return True
    return False
328
+
329
+
330
def check_publication_published_year(publication_published_year: Any) -> bool:
    """Return True (invalid) unless the year is ``None`` or an int in 1..2099.

    ``bool`` is rejected explicitly because it is a subclass of ``int``.
    """
    year = publication_published_year
    if year is None:
        return False
    if isinstance(year, bool) or not isinstance(year, int):
        return True
    return year <= 0 or year >= 2100
336
+
337
+
338
def check_publication_venue_issn(publication_venue_issn: Any) -> bool:
    """Return True (invalid) unless the value is a list of valid ISSN strings.

    An empty list is accepted.
    """
    issns = publication_venue_issn
    if not isinstance(issns, list):
        return True
    if not all(isinstance(entry, str) for entry in issns):
        return True
    return any(not _valid_issn(entry) for entry in issns)
349
+
350
+
351
def check_publication_venue_biblio_volume(publication_venue_biblio_volume: Any) -> bool:
    """Return True (invalid) unless the volume is ``""`` or a string ``int()`` can parse."""
    volume = publication_venue_biblio_volume
    if not isinstance(volume, str):
        return True
    if not volume:
        return False
    try:
        int(volume)
    except (TypeError, ValueError):
        return True
    return False
363
+
364
+
365
def check_publication_venue_biblio_issue(publication_venue_biblio_issue: Any) -> bool:
    """Return True (invalid) unless the issue is ``""`` or a string ``int()`` can parse."""
    issue = publication_venue_biblio_issue
    if not isinstance(issue, str):
        return True
    if not issue:
        return False
    try:
        int(issue)
    except (TypeError, ValueError):
        return True
    return False
377
+
378
+
379
def check_publication_venue_biblio_pages(publication_venue_biblio_pages: Any) -> bool:
    """Return True (invalid) unless the value is ``""`` or a positive, ordered ``start-end`` range."""
    pages = publication_venue_biblio_pages
    if not isinstance(pages, str):
        return True
    if pages == "":
        return False
    if PAGE_RANGE_RE.fullmatch(pages) is None:
        return True
    # The pattern guarantees exactly two digit runs around a single hyphen.
    first, last = (int(part) for part in pages.split("-"))
    return not (0 < first <= last)
390
+
391
+
392
def check_publication_pages(publication_pages: Any) -> bool:
    """Return True (invalid) unless the page count is ``None`` or a positive int.

    ``bool`` is rejected explicitly because it is a subclass of ``int``.
    """
    if publication_pages is None:
        return False
    if isinstance(publication_pages, bool) or not isinstance(publication_pages, int):
        return True
    return not publication_pages > 0
398
+
399
+
400
def check_publication_venue_name_unified(
    publication_venue_name_unified: Any, publication_venue_name: Any
) -> bool:
    """Return True (invalid) when the unified venue name disagrees with the mapping.

    Args:
        publication_venue_name_unified: The unified name stored on the record.
        publication_venue_name: The raw venue name it was derived from.

    Returns:
        True when either argument has the wrong type, or when a non-empty
        unified name has no source name to be derived from, or differs from
        ``JOURNAL_NAME_MAPPING.get(source, source)``. An empty unified name
        is accepted.
    """
    unified = publication_venue_name_unified
    source = publication_venue_name
    if not isinstance(unified, str):
        return True
    if source is not None and not isinstance(source, str):
        return True
    if unified == "":
        return False
    if not source:
        # Source is None or "": nothing the unified name could be mapped from.
        return True
    return unified != JOURNAL_NAME_MAPPING.get(source, source)
417
+
418
+
419
def check_grade_class(grade_class: Any) -> bool:
    """Return True (invalid) unless *grade_class* is a string in GRADE_CLASS_VALUES.

    An empty string is accepted.
    """
    if not isinstance(grade_class, str):
        return True
    if not grade_class:
        return False
    return grade_class not in GRADE_CLASS_VALUES
427
+
428
+
429
def check_grade(grade: Any, grade_class: Any) -> bool:
    """Return True (invalid) when *grade* is inconsistent with *grade_class*.

    Args:
        grade: Raw ``grade`` value; expected to be a string.
        grade_class: Raw ``grade_class`` value; ``None`` or a string.

    Returns:
        True when either argument has the wrong type, when a non-empty grade
        is not in ``GRADE_VALUES``, or when a non-empty grade is given for a
        grade class other than ``"k12"``. An empty grade is accepted.
    """
    if not isinstance(grade, str):
        return True
    if grade_class is not None and not isinstance(grade_class, str):
        return True
    if grade == "":
        return False
    if grade not in GRADE_VALUES:
        return True
    # A concrete grade is only meaningful for the k12 grade class.
    return grade_class != "k12"
443
+
444
+
445
def check_references(references: Any) -> bool:
    """Return True (invalid) unless *references* is a list of http(s) URL strings.

    An empty list is accepted.
    """
    if not isinstance(references, list):
        return True
    if not all(isinstance(link, str) for link in references):
        return True
    for link in references:
        if URL_RE.fullmatch(link) is None:
            return True
    return False
453
+
454
+
455
def check_related_works(related_works: Any) -> bool:
    """Return True (invalid) unless *related_works* is a list of http(s) URL strings.

    An empty list is accepted.
    """
    if not isinstance(related_works, list):
        return True
    if not all(isinstance(link, str) for link in related_works):
        return True
    for link in related_works:
        if URL_RE.fullmatch(link) is None:
            return True
    return False
463
+
464
+
465
def check_cited_by_api_url(cited_by_api_url: Any) -> bool:
    """Return True (invalid) unless the value is ``""`` or an http(s) URL string."""
    if not isinstance(cited_by_api_url, str):
        return True
    if not cited_by_api_url:
        return False
    return URL_RE.fullmatch(cited_by_api_url) is None
473
+
474
+
475
def check_access_xinghe_repository_sha256(
    access_xinghe_repository_sha256: Any, access_xinghe_repository_has_fulltext: Any
) -> bool:
    """Return True (invalid) when the sha256 field conflicts with the full-text flag.

    Args:
        access_xinghe_repository_sha256: A string or a list of strings.
        access_xinghe_repository_has_fulltext: Must be a ``bool``.

    Returns:
        True when the digest is ``None``, the flag is not a bool, the digest
        has the wrong type, or the flag is True while the digest is empty
        (``""`` or ``[]``). When the flag is False any well-typed digest is
        accepted.
    """
    digest = access_xinghe_repository_sha256
    has_fulltext = access_xinghe_repository_has_fulltext
    if digest is None or not isinstance(has_fulltext, bool):
        return True
    if isinstance(digest, str):
        return has_fulltext and digest == ""
    if not isinstance(digest, list):
        return True
    if not all(isinstance(entry, str) for entry in digest):
        return True
    return has_fulltext and len(digest) == 0
495
+
496
+
497
def check_access_xinghe_repository_origin_path(
    access_xinghe_repository_origin_path: Any, access_xinghe_repository_has_fulltext: Any
) -> bool:
    """Return True (invalid) when the origin path conflicts with the full-text flag.

    The path must be a string and the flag a bool. A blank path is only
    invalid when the flag says full text exists.
    """
    path = access_xinghe_repository_origin_path
    has_fulltext = access_xinghe_repository_has_fulltext
    if not (isinstance(path, str) and isinstance(has_fulltext, bool)):
        return True
    return has_fulltext and path.strip() == ""
507
+
508
+
509
+ def _normalize_json_like_field(value: Any) -> Any:
510
+ if not isinstance(value, str):
511
+ return value
512
+ stripped = value.strip()
513
+ if not stripped:
514
+ return value
515
+ if stripped[0] not in ("[", "{"):
516
+ return value
517
+ try:
518
+ return json.loads(stripped)
519
+ except (TypeError, ValueError, json.JSONDecodeError):
520
+ cleaned = stripped.replace("\r", " ").replace("\n", " ").replace("\t", " ")
521
+ cleaned = "".join(ch if ord(ch) >= 32 else " " for ch in cleaned)
522
+ invalid_escape_re = re.compile(r'\\(?!["\\/bfnrtu])')
523
+ for _ in range(10):
524
+ next_cleaned = invalid_escape_re.sub(r"\\\\", cleaned)
525
+ if next_cleaned == cleaned:
526
+ break
527
+ cleaned = next_cleaned
528
+ try:
529
+ return json.loads(cleaned)
530
+ except (TypeError, ValueError, json.JSONDecodeError):
531
+ return value
532
+
533
+
534
+ def _normalize_bool_field(value: Any) -> Any:
535
+ if isinstance(value, bool):
536
+ return value
537
+ if isinstance(value, int):
538
+ if value in (0, 1):
539
+ return bool(value)
540
+ return value
541
+ if isinstance(value, str):
542
+ lowered = value.strip().lower()
543
+ if lowered in ("1", "true"):
544
+ return True
545
+ if lowered in ("0", "false"):
546
+ return False
547
+ return value
548
+
549
+
550
def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of *record* with list-like and boolean fields decoded.

    Fields named in ``JSON_LIST_FIELDS`` that are present are passed through
    ``_normalize_json_like_field``. The
    ``access_xinghe_repository_has_fulltext`` key is always written (as
    ``None`` when absent from the input) after boolean normalisation. The
    input dict is not modified.
    """
    result = dict(record)
    decoded = {
        name: _normalize_json_like_field(result[name])
        for name in JSON_LIST_FIELDS
        if name in result
    }
    result.update(decoded)
    flag_key = "access_xinghe_repository_has_fulltext"
    result[flag_key] = _normalize_bool_field(result.get(flag_key))
    return result
559
+
560
+
561
# Field name -> callable taking the normalized record and returning True when
# that field is INVALID. Validators that depend on a second field read it from
# the same record.
FIELD_VALIDATORS = {
    # -- identity ------------------------------------------------------------
    "metadata_type": lambda record: check_metadata_type(record.get("metadata_type")),
    "doi": lambda record: check_doi(record.get("doi"), record.get("metadata_type")),
    "isbns": lambda record: check_isbns(record.get("isbns"), record.get("metadata_type")),
    "isbn13": lambda record: check_isbn13(record.get("isbn13"), record.get("metadata_type")),
    # -- descriptive text ----------------------------------------------------
    "title": lambda record: check_title(record.get("title")),
    "abstract": lambda record: check_abstract(record.get("abstract")),
    "language": lambda record: check_language(record.get("language")),
    "author": lambda record: check_author(record.get("author")),
    "contributors": lambda record: check_contributors(record.get("contributors")),
    # -- access --------------------------------------------------------------
    "locations": lambda record: check_locations(record.get("locations")),
    "access_is_oa": lambda record: check_access_is_oa(
        record.get("access_is_oa"), record.get("metadata_type")
    ),
    "access_oa_status": lambda record: check_access_oa_status(record.get("access_oa_status")),
    "access_oa_url": lambda record: check_access_oa_url(record.get("access_oa_url")),
    "access_license": lambda record: check_access_license(record.get("access_license")),
    # -- publication ---------------------------------------------------------
    "publication_published_date": lambda record: check_publication_published_date(
        record.get("publication_published_date")
    ),
    "publication_published_year": lambda record: check_publication_published_year(
        record.get("publication_published_year")
    ),
    "publication_venue_issn": lambda record: check_publication_venue_issn(
        record.get("publication_venue_issn")
    ),
    "publication_venue_biblio_volume": lambda record: check_publication_venue_biblio_volume(
        record.get("publication_venue_biblio_volume")
    ),
    "publication_venue_biblio_issue": lambda record: check_publication_venue_biblio_issue(
        record.get("publication_venue_biblio_issue")
    ),
    "publication_venue_biblio_pages": lambda record: check_publication_venue_biblio_pages(
        record.get("publication_venue_biblio_pages")
    ),
    "publication_pages": lambda record: check_publication_pages(record.get("publication_pages")),
    "publication_venue_name_unified": lambda record: check_publication_venue_name_unified(
        record.get("publication_venue_name_unified"),
        record.get("publication_venue_name"),
    ),
    # -- education -----------------------------------------------------------
    "grade_class": lambda record: check_grade_class(record.get("grade_class")),
    "grade": lambda record: check_grade(record.get("grade"), record.get("grade_class")),
    # -- citation graph ------------------------------------------------------
    "references": lambda record: check_references(record.get("references")),
    "related_works": lambda record: check_related_works(record.get("related_works")),
    "cited_by_api_url": lambda record: check_cited_by_api_url(record.get("cited_by_api_url")),
    # -- repository ----------------------------------------------------------
    "access_xinghe_repository_sha256": lambda record: check_access_xinghe_repository_sha256(
        record.get("access_xinghe_repository_sha256"),
        record.get("access_xinghe_repository_has_fulltext"),
    ),
    "access_xinghe_repository_origin_path": lambda record: check_access_xinghe_repository_origin_path(
        record.get("access_xinghe_repository_origin_path"),
        record.get("access_xinghe_repository_has_fulltext"),
    ),
}
611
+
612
+
613
@Model.rule_register("QUALITY_BAD_EFFECTIVENESS", ["xinghe", "quanliang"])
class RuleQuanliangFieldValidation(BaseRule):
    """Run the configured Quanliang field validators against one record."""

    _metric_info = {
        "category": "Rule-Based Metadata Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleQuanliangFieldValidation",
        "description": "Validate Quanliang metadata fields and report invalid fields",
        "paper_title": "",
        "paper_url": "",
        "paper_authors": "",
        "evaluation_results": "",
    }

    _required_fields = [RequiredField.METADATA]
    # By default every known field is validated; key_list can narrow the set.
    dynamic_config = EvaluatorRuleArgs(key_list=list(FIELD_VALIDATORS))

    @classmethod
    def eval(cls, input_data: Data) -> EvalDetail:
        """Validate the fields selected in ``dynamic_config.key_list``.

        Args:
            input_data: Record to check; it is converted with ``to_dict()``
                and normalized before validation.

        Returns:
            An ``EvalDetail`` whose ``label`` lists the offending field names
            and whose ``reason`` lists the matching messages (with ``status``
            set to True) when any field fails; otherwise ``label`` is
            ``[QualityLabel.QUALITY_GOOD]``.
        """
        detail = EvalDetail(metric=cls.__name__)
        record = normalize_record(input_data.to_dict())

        # (field name, reason) pairs, in the order the fields were requested.
        failures = []
        for name in cls.dynamic_config.key_list or []:
            if name not in FIELD_VALIDATORS:
                failures.append((name, "unsupported field"))
            elif name not in record:
                failures.append((name, "missing field"))
            elif FIELD_VALIDATORS[name](record):
                failures.append((name, f"{name} invalid"))

        if not failures:
            detail.label = [QualityLabel.QUALITY_GOOD]
            return detail

        detail.status = True
        detail.label = [name for name, _ in failures]
        detail.reason = [reason for _, reason in failures]
        return detail
dingo/run/cli.py CHANGED
@@ -53,6 +53,13 @@ def parse_args():
53
53
  default=False,
54
54
  help="Output as JSON",
55
55
  )
56
+ info_parser.add_argument(
57
+ "--count",
58
+ action="store_true",
59
+ default=False,
60
+ help="Print metric counts (rules, llm, groups, total_metrics=rules+llm). "
61
+ "Human mode: counts only. With --json: prepend a \"counts\" object to the payload.",
62
+ )
56
63
 
57
64
  # --- dingo serve ---
58
65
  serve_parser = subparsers.add_parser("serve", help="Start MCP server for AI agent integration")
@@ -177,9 +184,23 @@ def cmd_info(args):
177
184
  groups[group_name] = [cls.__name__ for cls in rule_list]
178
185
  info["groups"] = groups
179
186
 
187
+ counts = {
188
+ "rules": len(Model.rule_name_map),
189
+ "llm": len(Model.llm_name_map),
190
+ "groups": len(Model.rule_groups),
191
+ "total_metrics": len(Model.rule_name_map) + len(Model.llm_name_map),
192
+ }
193
+
180
194
  if args.json:
181
- json.dump(info, sys.stdout, indent=2, ensure_ascii=False)
195
+ if args.count:
196
+ payload = {"counts": counts, **info}
197
+ json.dump(payload, sys.stdout, indent=2, ensure_ascii=False)
198
+ else:
199
+ json.dump(info, sys.stdout, indent=2, ensure_ascii=False)
182
200
  sys.stdout.write("\n")
201
+ elif args.count:
202
+ for key in ("rules", "llm", "groups", "total_metrics"):
203
+ print(f"{key}: {counts[key]}")
183
204
  else:
184
205
  _print_info_table(info)
185
206