dingo-python 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,6 +53,10 @@ class DatasetFieldArgs(BaseModel):
53
53
  image: str = ''
54
54
 
55
55
 
56
+ class DatasetMinerUArgs(BaseModel):
57
+ include_types: Optional[List[str]] = None # Keep only the listed block types; None keeps every block
58
+
59
+
56
60
  class DatasetArgs(BaseModel):
57
61
  source: str = 'hugging_face'
58
62
  format: str = 'json'
@@ -64,6 +68,7 @@ class DatasetArgs(BaseModel):
64
68
  excel_config: DatasetExcelArgs = DatasetExcelArgs()
65
69
  csv_config: DatasetCsvArgs = DatasetCsvArgs()
66
70
  parquet_config: DatasetParquetArgs = DatasetParquetArgs()
71
+ mineru_config: DatasetMinerUArgs = DatasetMinerUArgs()
67
72
 
68
73
 
69
74
  class ExecutorResultSaveArgs(BaseModel):
@@ -1,3 +1,4 @@
1
+ import dingo.data.converter.mineru # noqa: F401 — registers mineru / mineru_v2
1
2
  from dingo.data.converter.base import BaseConverter
2
3
 
3
4
  converters = BaseConverter.converters
@@ -0,0 +1,245 @@
1
+ """MinerU output format converters.
2
+
3
+ Supports two MinerU structured output files:
4
+ - ``content_list.json`` (format: ``"mineru"``) — flat array of blocks
5
+ - ``content_list_v2.json`` (format: ``"mineru_v2"``) — pages × blocks
6
+ """
7
+
8
+ import json
9
+ from typing import Callable, Dict, List, Union
10
+
11
+ from dingo.config import InputArgs
12
+ from dingo.data.converter.base import BaseConverter
13
+ from dingo.io import Data
14
+
15
+
16
+ def _flatten_spans(spans: List[Dict]) -> str:
17
+ """Concatenate a V2 span list into plain text.
18
+
19
+ Spans look like ``[{"type": "text", "content": "..."}, ...]``.
20
+ Hyperlink spans may carry ``children`` with nested text spans;
21
+ when present we use the top-level ``content`` which is already the
22
+ concatenated text.
23
+ """
24
+ parts = []
25
+ for span in spans:
26
+ parts.append(span.get("content", ""))
27
+ return "".join(parts)
28
+
29
+
30
+ def _wrap_image(img_path) -> List[str]:
31
+ """Ensure img_path is a list of strings (Dingo convention)."""
32
+ if not img_path:
33
+ return []
34
+ if isinstance(img_path, list):
35
+ return img_path
36
+ return [img_path]
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # V1: content_list.json
41
+ # ---------------------------------------------------------------------------
42
+
43
+ _V1_TEXT_TYPES = frozenset({
44
+ "text", "equation", "header", "footer",
45
+ "page_number", "aside_text", "page_footnote",
46
+ })
47
+
48
+
49
+ def _map_block_v1(block: dict, block_idx: int) -> dict:
50
+ """Map a single content_list.json block to a Data-compatible dict."""
51
+ btype = block.get("type", "")
52
+ page_idx = block.get("page_idx", 0)
53
+
54
+ data_dict = dict(block)
55
+ data_dict["data_id"] = f"p{page_idx}-b{block_idx}"
56
+
57
+ if btype in _V1_TEXT_TYPES:
58
+ data_dict["content"] = block.get("text", "")
59
+
60
+ elif btype == "image":
61
+ data_dict["content"] = ""
62
+ data_dict["image"] = _wrap_image(block.get("img_path"))
63
+
64
+ elif btype == "table":
65
+ data_dict["content"] = block.get("table_body", "")
66
+ data_dict["image"] = _wrap_image(block.get("img_path"))
67
+
68
+ elif btype == "chart":
69
+ data_dict["content"] = block.get("content", "")
70
+ data_dict["image"] = _wrap_image(block.get("img_path"))
71
+
72
+ elif btype == "code":
73
+ data_dict["content"] = block.get("code_body", "")
74
+
75
+ elif btype == "list":
76
+ items = block.get("list_items", [])
77
+ data_dict["content"] = "\n".join(items) if items else ""
78
+
79
+ else:
80
+ data_dict.setdefault("content", block.get("text", ""))
81
+
82
+ if "img_path" in data_dict and "image" not in data_dict:
83
+ img = _wrap_image(data_dict["img_path"])
84
+ if img:
85
+ data_dict["image"] = img
86
+
87
+ return data_dict
88
+
89
+
90
@BaseConverter.register("mineru")
class MinerUConverter(BaseConverter):
    """Converter for MinerU ``content_list.json`` (flat block array)."""

    def __init__(self):
        super().__init__()

    @classmethod
    def convertor(cls, input_args: InputArgs) -> Callable:
        """Build the generator function that turns a payload into ``Data``.

        Args:
            input_args: Run configuration. When ``dataset.mineru_config``
                exists and its ``include_types`` is non-empty, only blocks
                whose ``type`` is listed are emitted.

        Returns:
            A function taking the raw payload (a JSON document as ``str``,
            ``bytes`` or ``bytearray``, or an already-parsed block list)
            and yielding one ``Data`` per retained block.
        """
        include = None
        # getattr + None check: tolerate configs without the attribute as
        # well as ones where it is explicitly unset.
        cfg = getattr(input_args.dataset, "mineru_config", None)
        if cfg is not None and cfg.include_types:
            include = frozenset(cfg.include_types)

        def _convert(raw: Union[str, list]):
            blocks = raw
            # json.loads accepts bytes too; without this a bytes payload
            # would be iterated byte by byte.
            if isinstance(raw, (str, bytes, bytearray)):
                blocks = json.loads(raw)
            for block_idx, block in enumerate(blocks):
                # block_idx counts filtered blocks too, so data_id does not
                # depend on the include filter.
                if include and block.get("type", "") not in include:
                    continue
                data_dict = _map_block_v1(block, block_idx)
                yield Data(**data_dict)

        return _convert
116
+
117
+
118
+ # ---------------------------------------------------------------------------
119
+ # V2: content_list_v2.json
120
+ # ---------------------------------------------------------------------------
121
+
122
+ def _map_block_v2(block: dict, page_idx: int, block_idx: int) -> dict:
123
+ """Map a single content_list_v2.json block to a Data-compatible dict."""
124
+ btype = block.get("type", "")
125
+ inner = block.get("content")
126
+
127
+ if not isinstance(inner, dict):
128
+ data_dict = {
129
+ "data_id": f"p{page_idx}-b{block_idx}",
130
+ "type": btype,
131
+ "page_idx": page_idx,
132
+ "content": inner if isinstance(inner, str) else "",
133
+ "raw_content": inner,
134
+ }
135
+ if "bbox" in block:
136
+ data_dict["bbox"] = block["bbox"]
137
+ return data_dict
138
+
139
+ data_dict = {
140
+ "data_id": f"p{page_idx}-b{block_idx}",
141
+ "type": btype,
142
+ "page_idx": page_idx,
143
+ }
144
+
145
+ if "bbox" in block:
146
+ data_dict["bbox"] = block["bbox"]
147
+ if "anchor" in block:
148
+ data_dict["anchor"] = block["anchor"]
149
+ if "sub_type" in block:
150
+ data_dict["sub_type"] = block["sub_type"]
151
+
152
+ if btype == "title":
153
+ spans = inner.get("title_content", [])
154
+ data_dict["content"] = _flatten_spans(spans)
155
+ data_dict["text_level"] = inner.get("level", 0)
156
+
157
+ elif btype == "paragraph":
158
+ spans = inner.get("paragraph_content", [])
159
+ data_dict["content"] = _flatten_spans(spans)
160
+
161
+ elif btype == "equation_interline":
162
+ data_dict["content"] = inner.get("math_content", "")
163
+ if "math_type" in inner:
164
+ data_dict["math_type"] = inner["math_type"]
165
+
166
+ elif btype == "image":
167
+ data_dict["content"] = ""
168
+ data_dict["image"] = _wrap_image(inner.get("img_path"))
169
+ for key in ("image_caption", "image_footnote"):
170
+ if key in inner:
171
+ data_dict[key] = inner[key]
172
+
173
+ elif btype == "table":
174
+ data_dict["content"] = inner.get("table_body", "")
175
+ data_dict["image"] = _wrap_image(inner.get("img_path"))
176
+ for key in ("table_caption", "table_footnote"):
177
+ if key in inner:
178
+ data_dict[key] = inner[key]
179
+
180
+ elif btype == "chart":
181
+ data_dict["content"] = inner.get("content", "")
182
+ data_dict["image"] = _wrap_image(inner.get("img_path"))
183
+ for key in ("chart_caption", "chart_footnote"):
184
+ if key in inner:
185
+ data_dict[key] = inner[key]
186
+
187
+ elif btype == "code":
188
+ data_dict["content"] = inner.get("code_content", "")
189
+ for key in ("code_caption", "code_footnote", "code_language"):
190
+ if key in inner:
191
+ data_dict[key] = inner[key]
192
+
193
+ elif btype == "algorithm":
194
+ data_dict["content"] = inner.get("algorithm_content", "")
195
+ for key in ("algorithm_caption", "algorithm_footnote"):
196
+ if key in inner:
197
+ data_dict[key] = inner[key]
198
+
199
+ elif btype in ("list", "index"):
200
+ items = inner.get("list_items", [])
201
+ data_dict["content"] = "\n".join(items) if items else ""
202
+ data_dict["list_items"] = items
203
+
204
+ else:
205
+ content_key = f"{btype}_content"
206
+ spans = inner.get(content_key, [])
207
+ if isinstance(spans, list) and spans and isinstance(spans[0], dict):
208
+ data_dict["content"] = _flatten_spans(spans)
209
+ elif isinstance(inner, str):
210
+ data_dict["content"] = inner
211
+ else:
212
+ data_dict["content"] = ""
213
+
214
+ data_dict["raw_content"] = inner
215
+
216
+ return data_dict
217
+
218
+
219
@BaseConverter.register("mineru_v2")
class MinerUV2Converter(BaseConverter):
    """Converter for MinerU ``content_list_v2.json`` (pages x blocks)."""

    def __init__(self):
        super().__init__()

    @classmethod
    def convertor(cls, input_args: InputArgs) -> Callable:
        """Build the generator function that turns a payload into ``Data``.

        Args:
            input_args: Run configuration. When ``dataset.mineru_config``
                exists and its ``include_types`` is non-empty, only blocks
                whose ``type`` is listed are emitted.

        Returns:
            A function taking the raw payload (a JSON document as ``str``,
            ``bytes`` or ``bytearray``, or an already-parsed list of pages,
            each a list of blocks) and yielding one ``Data`` per retained
            block.
        """
        include = None
        # getattr + None check: tolerate configs without the attribute as
        # well as ones where it is explicitly unset.
        cfg = getattr(input_args.dataset, "mineru_config", None)
        if cfg is not None and cfg.include_types:
            include = frozenset(cfg.include_types)

        def _convert(raw: Union[str, list]):
            pages = raw
            # json.loads accepts bytes too; without this a bytes payload
            # would be iterated byte by byte.
            if isinstance(raw, (str, bytes, bytearray)):
                pages = json.loads(raw)
            for page_idx, page_blocks in enumerate(pages):
                # A null page is treated as empty; it still consumes its
                # page index so later pages keep their numbering.
                for block_idx, block in enumerate(page_blocks or []):
                    if include and block.get("type", "") not in include:
                        continue
                    data_dict = _map_block_v2(block, page_idx, block_idx)
                    yield Data(**data_dict)

        return _convert
@@ -395,7 +395,7 @@ class LocalDataSource(DataSource):
395
395
  elif os.path.exists(self.path) and os.path.isdir(self.path):
396
396
  self._find_all_files(self.path, f_list)
397
397
 
398
- by_line = self.input_args.dataset.format not in ["json", "listjson"]
398
+ by_line = self.input_args.dataset.format not in ["json", "listjson", "mineru", "mineru_v2"]
399
399
 
400
400
  for f in f_list:
401
401
  # Check if file is CSV
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dingo-python
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: A Comprehensive AI Data Quality Evaluation Tool for Large Models
5
5
  Home-page: https://github.com/MigoXLab/dingo
6
6
  Author: Dingo
@@ -1,10 +1,11 @@
1
1
  dingo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  dingo/config/__init__.py,sha256=SaeOmGEUG0Hp5lqHxnHUTE_9ysN5KzA_Icilb9xY2mQ,349
3
- dingo/config/input_args.py,sha256=ngOJcAyq_LdLqNszMTljfOUp_utXN6FX-nBsD_ADyeg,4388
3
+ dingo/config/input_args.py,sha256=lu4ZcG9me_9PgjXScnB9C17mBosVj7ArTU-JNuE4qgc,4592
4
4
  dingo/data/__init__.py,sha256=reCw4XQoInUTtvRW6c1wY_LH1EWJ7XpZDQcBCW61Lf8,214
5
- dingo/data/converter/__init__.py,sha256=1MiG4H8Sg2sYHQmYdg0F9_1okP_YoMNHyQorPEAf6zw,91
5
+ dingo/data/converter/__init__.py,sha256=nZOTVKQ5B4_VmMhxNM2289KisdWXeom-IcciuAv9iqI,174
6
6
  dingo/data/converter/base.py,sha256=_WXa_plKj83iFgQyHABchGbX-dv3d17QuODua-bd83w,12820
7
7
  dingo/data/converter/img_utils.py,sha256=ulySpvbmdn-LoeufJw7pVS1_k5XRJMbptzP2qLOIBjU,3527
8
+ dingo/data/converter/mineru.py,sha256=bVB8dKGaiAEIXgms4fM_ykiPVPb5tKMfJf8_hPmh2j8,8136
8
9
  dingo/data/dataset/__init__.py,sha256=IWXEBNDBcffxOnteKLJDOuWTyx2rgk0MN1kHv5eirFY,574
9
10
  dingo/data/dataset/base.py,sha256=orRyPrgYqRgxO6WiFtCE2hX0yZXloMKfydkosFJzdGU,5437
10
11
  dingo/data/dataset/huggingface.py,sha256=DNor2VudY8ckvNZxNHk3XY0PqKvyyvKg6sYZzUE1GZ0,7155
@@ -15,7 +16,7 @@ dingo/data/dataset/sql.py,sha256=hERVYF3Ij7UzQWJmucurBj0OUiKjU5cGtfGF-J1_8AM,259
15
16
  dingo/data/datasource/__init__.py,sha256=_cZDMi4xAHGfals0UpXEcNxRDuYOSjJHCV29tQB6dso,386
16
17
  dingo/data/datasource/base.py,sha256=pK5h8oxSGnEJtP_vbA4rkzRhaM5GTssQ6mQuF-HVlEY,2936
17
18
  dingo/data/datasource/huggingface.py,sha256=wWruojFogbToghatcXnqGCg1inQ1NYyN74yfyDFNZBw,3768
18
- dingo/data/datasource/local.py,sha256=8IQ9FvW_T_vFSr4k6Pu0l3Am8UVLt94RrBRLsJsuGUs,18787
19
+ dingo/data/datasource/local.py,sha256=O4R6qpS5RGa-M57EdY2nm1jK56Pe-aFogjcaI4LdSyw,18810
19
20
  dingo/data/datasource/s3.py,sha256=FKDfrtZfiwby2x1WLUngbBO2ILfb4r-Dz-ho6-ThRSA,2963
20
21
  dingo/data/datasource/sql.py,sha256=LQBaPyP7Pigs-Rdm6ODfrbo45TILYY_CFUIPWp8qUk0,3910
21
22
  dingo/data/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -154,9 +155,9 @@ dingo/utils/exception.py,sha256=fh58dSLSmYSnwW4MQXg-Jfai2QcZfDruTaYGbaWk7Wc,446
154
155
  dingo/utils/image_loader.py,sha256=RqnziJhqDB6zWf1s7S4zKgFqLA9e9CHkf5aU1R4fqnM,4479
155
156
  dingo/utils/log_util/__init__.py,sha256=VfzAAHUV8RuN-QaySahfAPfhM__-myigUlKx7ywVerA,717
156
157
  dingo/utils/log_util/logger.py,sha256=spGK0w22UgXsCcArd1rpt2teLPy7QPlIuvBaKYioHdY,1414
157
- dingo_python-2.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
158
- dingo_python-2.3.0.dist-info/METADATA,sha256=F6glwcTym-X-iH-k5LDOT2bu5lmZdzdDWgwH4_n2Eg4,27555
159
- dingo_python-2.3.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
160
- dingo_python-2.3.0.dist-info/entry_points.txt,sha256=Vo_p8qSVnOENdy1uubqxJRppZIpiQ753JG3WPAUeYps,45
161
- dingo_python-2.3.0.dist-info/top_level.txt,sha256=gSXQSLowu_WOQRi75wK3qyjbHxeN5PqsaA4ChGmJdek,6
162
- dingo_python-2.3.0.dist-info/RECORD,,
158
+ dingo_python-2.4.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
159
+ dingo_python-2.4.0.dist-info/METADATA,sha256=0RovhGosWHE5sqmznPjief9qCa2BN13cKQ1yfNwsmRw,27555
160
+ dingo_python-2.4.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
161
+ dingo_python-2.4.0.dist-info/entry_points.txt,sha256=Vo_p8qSVnOENdy1uubqxJRppZIpiQ753JG3WPAUeYps,45
162
+ dingo_python-2.4.0.dist-info/top_level.txt,sha256=gSXQSLowu_WOQRi75wK3qyjbHxeN5PqsaA4ChGmJdek,6
163
+ dingo_python-2.4.0.dist-info/RECORD,,