dingo-python 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dingo/config/input_args.py +5 -0
- dingo/data/converter/__init__.py +1 -0
- dingo/data/converter/mineru.py +245 -0
- dingo/data/datasource/local.py +1 -1
- {dingo_python-2.3.0.dist-info → dingo_python-2.4.0.dist-info}/METADATA +1 -1
- {dingo_python-2.3.0.dist-info → dingo_python-2.4.0.dist-info}/RECORD +10 -9
- {dingo_python-2.3.0.dist-info → dingo_python-2.4.0.dist-info}/WHEEL +0 -0
- {dingo_python-2.3.0.dist-info → dingo_python-2.4.0.dist-info}/entry_points.txt +0 -0
- {dingo_python-2.3.0.dist-info → dingo_python-2.4.0.dist-info}/licenses/LICENSE +0 -0
- {dingo_python-2.3.0.dist-info → dingo_python-2.4.0.dist-info}/top_level.txt +0 -0
dingo/config/input_args.py
CHANGED
|
@@ -53,6 +53,10 @@ class DatasetFieldArgs(BaseModel):
|
|
|
53
53
|
image: str = ''
|
|
54
54
|
|
|
55
55
|
|
|
56
|
+
class DatasetMinerUArgs(BaseModel):
    """Options for the MinerU converters, exposed as ``DatasetArgs.mineru_config``."""

    include_types: Optional[List[str]] = None  # Keep only the listed block types; None keeps every block
|
|
58
|
+
|
|
59
|
+
|
|
56
60
|
class DatasetArgs(BaseModel):
|
|
57
61
|
source: str = 'hugging_face'
|
|
58
62
|
format: str = 'json'
|
|
@@ -64,6 +68,7 @@ class DatasetArgs(BaseModel):
|
|
|
64
68
|
excel_config: DatasetExcelArgs = DatasetExcelArgs()
|
|
65
69
|
csv_config: DatasetCsvArgs = DatasetCsvArgs()
|
|
66
70
|
parquet_config: DatasetParquetArgs = DatasetParquetArgs()
|
|
71
|
+
mineru_config: DatasetMinerUArgs = DatasetMinerUArgs()
|
|
67
72
|
|
|
68
73
|
|
|
69
74
|
class ExecutorResultSaveArgs(BaseModel):
|
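dingo/data/converter/__init__.py
CHANGED
(the one-line addition to this file is not shown; the hunk below is the newly added dingo/data/converter/mineru.py)
dingo/data/converter/mineru.py
ADDED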
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""MinerU output format converters.
|
|
2
|
+
|
|
3
|
+
Supports two MinerU structured output files:
|
|
4
|
+
- ``content_list.json`` (format: ``"mineru"``) — flat array of blocks
|
|
5
|
+
- ``content_list_v2.json`` (format: ``"mineru_v2"``) — pages × blocks
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from typing import Callable, Dict, List, Union
|
|
10
|
+
|
|
11
|
+
from dingo.config import InputArgs
|
|
12
|
+
from dingo.data.converter.base import BaseConverter
|
|
13
|
+
from dingo.io import Data
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _flatten_spans(spans: List[Dict]) -> str:
|
|
17
|
+
"""Concatenate a V2 span list into plain text.
|
|
18
|
+
|
|
19
|
+
Spans look like ``[{"type": "text", "content": "..."}, ...]``.
|
|
20
|
+
Hyperlink spans may carry ``children`` with nested text spans;
|
|
21
|
+
when present we use the top-level ``content`` which is already the
|
|
22
|
+
concatenated text.
|
|
23
|
+
"""
|
|
24
|
+
parts = []
|
|
25
|
+
for span in spans:
|
|
26
|
+
parts.append(span.get("content", ""))
|
|
27
|
+
return "".join(parts)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _wrap_image(img_path) -> List[str]:
|
|
31
|
+
"""Ensure img_path is a list of strings (Dingo convention)."""
|
|
32
|
+
if not img_path:
|
|
33
|
+
return []
|
|
34
|
+
if isinstance(img_path, list):
|
|
35
|
+
return img_path
|
|
36
|
+
return [img_path]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# V1: content_list.json
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
_V1_TEXT_TYPES = frozenset({
|
|
44
|
+
"text", "equation", "header", "footer",
|
|
45
|
+
"page_number", "aside_text", "page_footnote",
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _map_block_v1(block: dict, block_idx: int) -> dict:
|
|
50
|
+
"""Map a single content_list.json block to a Data-compatible dict."""
|
|
51
|
+
btype = block.get("type", "")
|
|
52
|
+
page_idx = block.get("page_idx", 0)
|
|
53
|
+
|
|
54
|
+
data_dict = dict(block)
|
|
55
|
+
data_dict["data_id"] = f"p{page_idx}-b{block_idx}"
|
|
56
|
+
|
|
57
|
+
if btype in _V1_TEXT_TYPES:
|
|
58
|
+
data_dict["content"] = block.get("text", "")
|
|
59
|
+
|
|
60
|
+
elif btype == "image":
|
|
61
|
+
data_dict["content"] = ""
|
|
62
|
+
data_dict["image"] = _wrap_image(block.get("img_path"))
|
|
63
|
+
|
|
64
|
+
elif btype == "table":
|
|
65
|
+
data_dict["content"] = block.get("table_body", "")
|
|
66
|
+
data_dict["image"] = _wrap_image(block.get("img_path"))
|
|
67
|
+
|
|
68
|
+
elif btype == "chart":
|
|
69
|
+
data_dict["content"] = block.get("content", "")
|
|
70
|
+
data_dict["image"] = _wrap_image(block.get("img_path"))
|
|
71
|
+
|
|
72
|
+
elif btype == "code":
|
|
73
|
+
data_dict["content"] = block.get("code_body", "")
|
|
74
|
+
|
|
75
|
+
elif btype == "list":
|
|
76
|
+
items = block.get("list_items", [])
|
|
77
|
+
data_dict["content"] = "\n".join(items) if items else ""
|
|
78
|
+
|
|
79
|
+
else:
|
|
80
|
+
data_dict.setdefault("content", block.get("text", ""))
|
|
81
|
+
|
|
82
|
+
if "img_path" in data_dict and "image" not in data_dict:
|
|
83
|
+
img = _wrap_image(data_dict["img_path"])
|
|
84
|
+
if img:
|
|
85
|
+
data_dict["image"] = img
|
|
86
|
+
|
|
87
|
+
return data_dict
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@BaseConverter.register("mineru")
class MinerUConverter(BaseConverter):
    """Converter for MinerU ``content_list.json`` (flat block array).

    Registered under the dataset format name ``"mineru"``.
    """

    def __init__(self):
        super().__init__()

    @classmethod
    def convertor(cls, input_args: InputArgs) -> Callable:
        """Build the generator function that turns raw input into ``Data``.

        Args:
            input_args: Run configuration. When its dataset section has a
                ``mineru_config`` with a non-empty ``include_types``, only
                blocks of those types are emitted.

        Returns:
            A function taking the file content (a JSON string or an already
            parsed list of blocks) and yielding one ``Data`` per kept block.
        """
        dataset = input_args.dataset
        wanted = None
        if hasattr(dataset, "mineru_config"):
            type_names = dataset.mineru_config.include_types
            if type_names:
                wanted = frozenset(type_names)

        def _convert(raw: Union[str, list]):
            entries = json.loads(raw) if isinstance(raw, str) else raw
            for position, entry in enumerate(entries):
                # ``position`` counts filtered-out blocks too, so data_id
                # always reflects the block's index in the source file.
                if wanted and entry.get("type", "") not in wanted:
                    continue
                yield Data(**_map_block_v1(entry, position))

        return _convert
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
# V2: content_list_v2.json
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
|
|
122
|
+
def _map_block_v2(block: dict, page_idx: int, block_idx: int) -> dict:
|
|
123
|
+
"""Map a single content_list_v2.json block to a Data-compatible dict."""
|
|
124
|
+
btype = block.get("type", "")
|
|
125
|
+
inner = block.get("content")
|
|
126
|
+
|
|
127
|
+
if not isinstance(inner, dict):
|
|
128
|
+
data_dict = {
|
|
129
|
+
"data_id": f"p{page_idx}-b{block_idx}",
|
|
130
|
+
"type": btype,
|
|
131
|
+
"page_idx": page_idx,
|
|
132
|
+
"content": inner if isinstance(inner, str) else "",
|
|
133
|
+
"raw_content": inner,
|
|
134
|
+
}
|
|
135
|
+
if "bbox" in block:
|
|
136
|
+
data_dict["bbox"] = block["bbox"]
|
|
137
|
+
return data_dict
|
|
138
|
+
|
|
139
|
+
data_dict = {
|
|
140
|
+
"data_id": f"p{page_idx}-b{block_idx}",
|
|
141
|
+
"type": btype,
|
|
142
|
+
"page_idx": page_idx,
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
if "bbox" in block:
|
|
146
|
+
data_dict["bbox"] = block["bbox"]
|
|
147
|
+
if "anchor" in block:
|
|
148
|
+
data_dict["anchor"] = block["anchor"]
|
|
149
|
+
if "sub_type" in block:
|
|
150
|
+
data_dict["sub_type"] = block["sub_type"]
|
|
151
|
+
|
|
152
|
+
if btype == "title":
|
|
153
|
+
spans = inner.get("title_content", [])
|
|
154
|
+
data_dict["content"] = _flatten_spans(spans)
|
|
155
|
+
data_dict["text_level"] = inner.get("level", 0)
|
|
156
|
+
|
|
157
|
+
elif btype == "paragraph":
|
|
158
|
+
spans = inner.get("paragraph_content", [])
|
|
159
|
+
data_dict["content"] = _flatten_spans(spans)
|
|
160
|
+
|
|
161
|
+
elif btype == "equation_interline":
|
|
162
|
+
data_dict["content"] = inner.get("math_content", "")
|
|
163
|
+
if "math_type" in inner:
|
|
164
|
+
data_dict["math_type"] = inner["math_type"]
|
|
165
|
+
|
|
166
|
+
elif btype == "image":
|
|
167
|
+
data_dict["content"] = ""
|
|
168
|
+
data_dict["image"] = _wrap_image(inner.get("img_path"))
|
|
169
|
+
for key in ("image_caption", "image_footnote"):
|
|
170
|
+
if key in inner:
|
|
171
|
+
data_dict[key] = inner[key]
|
|
172
|
+
|
|
173
|
+
elif btype == "table":
|
|
174
|
+
data_dict["content"] = inner.get("table_body", "")
|
|
175
|
+
data_dict["image"] = _wrap_image(inner.get("img_path"))
|
|
176
|
+
for key in ("table_caption", "table_footnote"):
|
|
177
|
+
if key in inner:
|
|
178
|
+
data_dict[key] = inner[key]
|
|
179
|
+
|
|
180
|
+
elif btype == "chart":
|
|
181
|
+
data_dict["content"] = inner.get("content", "")
|
|
182
|
+
data_dict["image"] = _wrap_image(inner.get("img_path"))
|
|
183
|
+
for key in ("chart_caption", "chart_footnote"):
|
|
184
|
+
if key in inner:
|
|
185
|
+
data_dict[key] = inner[key]
|
|
186
|
+
|
|
187
|
+
elif btype == "code":
|
|
188
|
+
data_dict["content"] = inner.get("code_content", "")
|
|
189
|
+
for key in ("code_caption", "code_footnote", "code_language"):
|
|
190
|
+
if key in inner:
|
|
191
|
+
data_dict[key] = inner[key]
|
|
192
|
+
|
|
193
|
+
elif btype == "algorithm":
|
|
194
|
+
data_dict["content"] = inner.get("algorithm_content", "")
|
|
195
|
+
for key in ("algorithm_caption", "algorithm_footnote"):
|
|
196
|
+
if key in inner:
|
|
197
|
+
data_dict[key] = inner[key]
|
|
198
|
+
|
|
199
|
+
elif btype in ("list", "index"):
|
|
200
|
+
items = inner.get("list_items", [])
|
|
201
|
+
data_dict["content"] = "\n".join(items) if items else ""
|
|
202
|
+
data_dict["list_items"] = items
|
|
203
|
+
|
|
204
|
+
else:
|
|
205
|
+
content_key = f"{btype}_content"
|
|
206
|
+
spans = inner.get(content_key, [])
|
|
207
|
+
if isinstance(spans, list) and spans and isinstance(spans[0], dict):
|
|
208
|
+
data_dict["content"] = _flatten_spans(spans)
|
|
209
|
+
elif isinstance(inner, str):
|
|
210
|
+
data_dict["content"] = inner
|
|
211
|
+
else:
|
|
212
|
+
data_dict["content"] = ""
|
|
213
|
+
|
|
214
|
+
data_dict["raw_content"] = inner
|
|
215
|
+
|
|
216
|
+
return data_dict
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@BaseConverter.register("mineru_v2")
class MinerUV2Converter(BaseConverter):
    """Converter for MinerU ``content_list_v2.json`` (pages x blocks).

    Registered under the dataset format name ``"mineru_v2"``.
    """

    def __init__(self):
        super().__init__()

    @classmethod
    def convertor(cls, input_args: InputArgs) -> Callable:
        """Build the generator function that turns raw input into ``Data``.

        Args:
            input_args: Run configuration. When its dataset section has a
                ``mineru_config`` with a non-empty ``include_types``, only
                blocks of those types are emitted.

        Returns:
            A function taking the file content (a JSON string or an already
            parsed list of pages, each a list of blocks) and yielding one
            ``Data`` per kept block.
        """
        dataset = input_args.dataset
        wanted = None
        if hasattr(dataset, "mineru_config"):
            type_names = dataset.mineru_config.include_types
            if type_names:
                wanted = frozenset(type_names)

        def _convert(raw: Union[str, list]):
            document = json.loads(raw) if isinstance(raw, str) else raw
            for page_no, page in enumerate(document):
                for slot, entry in enumerate(page):
                    # ``slot`` counts filtered-out blocks too, so data_id
                    # always reflects the block's index within its page.
                    if wanted and entry.get("type", "") not in wanted:
                        continue
                    yield Data(**_map_block_v2(entry, page_no, slot))

        return _convert
|
dingo/data/datasource/local.py
CHANGED
|
@@ -395,7 +395,7 @@ class LocalDataSource(DataSource):
|
|
|
395
395
|
elif os.path.exists(self.path) and os.path.isdir(self.path):
|
|
396
396
|
self._find_all_files(self.path, f_list)
|
|
397
397
|
|
|
398
|
-
by_line = self.input_args.dataset.format not in ["json", "listjson"]
|
|
398
|
+
by_line = self.input_args.dataset.format not in ["json", "listjson", "mineru", "mineru_v2"]
|
|
399
399
|
|
|
400
400
|
for f in f_list:
|
|
401
401
|
# Check if file is CSV
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
dingo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
dingo/config/__init__.py,sha256=SaeOmGEUG0Hp5lqHxnHUTE_9ysN5KzA_Icilb9xY2mQ,349
|
|
3
|
-
dingo/config/input_args.py,sha256=
|
|
3
|
+
dingo/config/input_args.py,sha256=lu4ZcG9me_9PgjXScnB9C17mBosVj7ArTU-JNuE4qgc,4592
|
|
4
4
|
dingo/data/__init__.py,sha256=reCw4XQoInUTtvRW6c1wY_LH1EWJ7XpZDQcBCW61Lf8,214
|
|
5
|
-
dingo/data/converter/__init__.py,sha256=
|
|
5
|
+
dingo/data/converter/__init__.py,sha256=nZOTVKQ5B4_VmMhxNM2289KisdWXeom-IcciuAv9iqI,174
|
|
6
6
|
dingo/data/converter/base.py,sha256=_WXa_plKj83iFgQyHABchGbX-dv3d17QuODua-bd83w,12820
|
|
7
7
|
dingo/data/converter/img_utils.py,sha256=ulySpvbmdn-LoeufJw7pVS1_k5XRJMbptzP2qLOIBjU,3527
|
|
8
|
+
dingo/data/converter/mineru.py,sha256=bVB8dKGaiAEIXgms4fM_ykiPVPb5tKMfJf8_hPmh2j8,8136
|
|
8
9
|
dingo/data/dataset/__init__.py,sha256=IWXEBNDBcffxOnteKLJDOuWTyx2rgk0MN1kHv5eirFY,574
|
|
9
10
|
dingo/data/dataset/base.py,sha256=orRyPrgYqRgxO6WiFtCE2hX0yZXloMKfydkosFJzdGU,5437
|
|
10
11
|
dingo/data/dataset/huggingface.py,sha256=DNor2VudY8ckvNZxNHk3XY0PqKvyyvKg6sYZzUE1GZ0,7155
|
|
@@ -15,7 +16,7 @@ dingo/data/dataset/sql.py,sha256=hERVYF3Ij7UzQWJmucurBj0OUiKjU5cGtfGF-J1_8AM,259
|
|
|
15
16
|
dingo/data/datasource/__init__.py,sha256=_cZDMi4xAHGfals0UpXEcNxRDuYOSjJHCV29tQB6dso,386
|
|
16
17
|
dingo/data/datasource/base.py,sha256=pK5h8oxSGnEJtP_vbA4rkzRhaM5GTssQ6mQuF-HVlEY,2936
|
|
17
18
|
dingo/data/datasource/huggingface.py,sha256=wWruojFogbToghatcXnqGCg1inQ1NYyN74yfyDFNZBw,3768
|
|
18
|
-
dingo/data/datasource/local.py,sha256=
|
|
19
|
+
dingo/data/datasource/local.py,sha256=O4R6qpS5RGa-M57EdY2nm1jK56Pe-aFogjcaI4LdSyw,18810
|
|
19
20
|
dingo/data/datasource/s3.py,sha256=FKDfrtZfiwby2x1WLUngbBO2ILfb4r-Dz-ho6-ThRSA,2963
|
|
20
21
|
dingo/data/datasource/sql.py,sha256=LQBaPyP7Pigs-Rdm6ODfrbo45TILYY_CFUIPWp8qUk0,3910
|
|
21
22
|
dingo/data/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -154,9 +155,9 @@ dingo/utils/exception.py,sha256=fh58dSLSmYSnwW4MQXg-Jfai2QcZfDruTaYGbaWk7Wc,446
|
|
|
154
155
|
dingo/utils/image_loader.py,sha256=RqnziJhqDB6zWf1s7S4zKgFqLA9e9CHkf5aU1R4fqnM,4479
|
|
155
156
|
dingo/utils/log_util/__init__.py,sha256=VfzAAHUV8RuN-QaySahfAPfhM__-myigUlKx7ywVerA,717
|
|
156
157
|
dingo/utils/log_util/logger.py,sha256=spGK0w22UgXsCcArd1rpt2teLPy7QPlIuvBaKYioHdY,1414
|
|
157
|
-
dingo_python-2.
|
|
158
|
-
dingo_python-2.
|
|
159
|
-
dingo_python-2.
|
|
160
|
-
dingo_python-2.
|
|
161
|
-
dingo_python-2.
|
|
162
|
-
dingo_python-2.
|
|
158
|
+
dingo_python-2.4.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
159
|
+
dingo_python-2.4.0.dist-info/METADATA,sha256=0RovhGosWHE5sqmznPjief9qCa2BN13cKQ1yfNwsmRw,27555
|
|
160
|
+
dingo_python-2.4.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
|
161
|
+
dingo_python-2.4.0.dist-info/entry_points.txt,sha256=Vo_p8qSVnOENdy1uubqxJRppZIpiQ753JG3WPAUeYps,45
|
|
162
|
+
dingo_python-2.4.0.dist-info/top_level.txt,sha256=gSXQSLowu_WOQRi75wK3qyjbHxeN5PqsaA4ChGmJdek,6
|
|
163
|
+
dingo_python-2.4.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|