PorosData-Designer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- porosdata_designer-0.1.0/LICENSE +21 -0
- porosdata_designer-0.1.0/MANIFEST.in +60 -0
- porosdata_designer-0.1.0/PKG-INFO +177 -0
- porosdata_designer-0.1.0/README.md +143 -0
- porosdata_designer-0.1.0/README_CN.md +143 -0
- porosdata_designer-0.1.0/pyproject.toml +64 -0
- porosdata_designer-0.1.0/setup.cfg +4 -0
- porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/PKG-INFO +177 -0
- porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/SOURCES.txt +37 -0
- porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/dependency_links.txt +1 -0
- porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/entry_points.txt +2 -0
- porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/requires.txt +22 -0
- porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/top_level.txt +1 -0
- porosdata_designer-0.1.0/src/porosdata_designer/__init__.py +89 -0
- porosdata_designer-0.1.0/src/porosdata_designer/__main__.py +9 -0
- porosdata_designer-0.1.0/src/porosdata_designer/adapters/__init__.py +10 -0
- porosdata_designer-0.1.0/src/porosdata_designer/adapters/content_list_adapter.py +121 -0
- porosdata_designer-0.1.0/src/porosdata_designer/cli.py +95 -0
- porosdata_designer-0.1.0/src/porosdata_designer/config.py +1 -0
- porosdata_designer-0.1.0/src/porosdata_designer/mappers/__init__.py +3 -0
- porosdata_designer-0.1.0/src/porosdata_designer/mappers/asset_anchoring.py +143 -0
- porosdata_designer-0.1.0/src/porosdata_designer/mappers/data_mining_mapper.py +295 -0
- porosdata_designer-0.1.0/src/porosdata_designer/plugin_system.py +1 -0
- porosdata_designer-0.1.0/src/porosdata_designer/py.typed +2 -0
- porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/__init__.py +17 -0
- porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/content_filter.py +150 -0
- porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/multimodal_interleaver.py +1006 -0
- porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/paragraph_classifier.py +162 -0
- porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/text_aggregator.py +829 -0
- porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/token_marker.py +69 -0
- porosdata_designer-0.1.0/src/porosdata_designer/runtime/__init__.py +3 -0
- porosdata_designer-0.1.0/src/porosdata_designer/runtime/commands.py +584 -0
- porosdata_designer-0.1.0/src/porosdata_designer/runtime/config.py +112 -0
- porosdata_designer-0.1.0/src/porosdata_designer/runtime/pipelines.py +391 -0
- porosdata_designer-0.1.0/src/porosdata_designer/runtime/plugin_system.py +273 -0
- porosdata_designer-0.1.0/src/porosdata_designer/utils/__init__.py +1 -0
- porosdata_designer-0.1.0/src/porosdata_designer/validators/__init__.py +3 -0
- porosdata_designer-0.1.0/src/porosdata_designer/validators/latex_validator.py +114 -0
- porosdata_designer-0.1.0/src/porosdata_designer/validators/schema_validator.py +140 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Kivent
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Minimal release metadata
|
|
2
|
+
include README.md
|
|
3
|
+
include README_CN.md
|
|
4
|
+
include LICENSE
|
|
5
|
+
include pyproject.toml
|
|
6
|
+
|
|
7
|
+
# Package typing marker
|
|
8
|
+
include src/porosdata_designer/py.typed
|
|
9
|
+
|
|
10
|
+
# Keep the source distribution clean
|
|
11
|
+
prune tests
|
|
12
|
+
prune data
|
|
13
|
+
prune logs
|
|
14
|
+
prune scripts
|
|
15
|
+
prune examples
|
|
16
|
+
prune docs
|
|
17
|
+
prune docs/audit
|
|
18
|
+
prune .cursor
|
|
19
|
+
prune .github
|
|
20
|
+
prune .idea
|
|
21
|
+
prune .pytest_cache
|
|
22
|
+
prune .vscode
|
|
23
|
+
|
|
24
|
+
exclude .gitignore
|
|
25
|
+
exclude CHANGELOG.md
|
|
26
|
+
exclude requirements-dev.txt
|
|
27
|
+
exclude Pipfile
|
|
28
|
+
exclude Pipfile.lock
|
|
29
|
+
exclude .env
|
|
30
|
+
exclude .env.*
|
|
31
|
+
|
|
32
|
+
global-exclude *.py[cod]
|
|
33
|
+
global-exclude __pycache__
|
|
34
|
+
global-exclude .pytest_cache
|
|
35
|
+
global-exclude *.log
|
|
36
|
+
global-exclude *.ipynb
|
|
37
|
+
global-exclude .ipynb_checkpoints
|
|
38
|
+
global-exclude *.pem
|
|
39
|
+
global-exclude *.key
|
|
40
|
+
global-exclude *.png
|
|
41
|
+
global-exclude *.jpg
|
|
42
|
+
global-exclude *.jpeg
|
|
43
|
+
global-exclude *.gif
|
|
44
|
+
global-exclude *.svg
|
|
45
|
+
global-exclude *.webp
|
|
46
|
+
global-exclude *.pdf
|
|
47
|
+
global-exclude *.csv
|
|
48
|
+
global-exclude *.tsv
|
|
49
|
+
global-exclude *.xls
|
|
50
|
+
global-exclude *.xlsx
|
|
51
|
+
global-exclude *.db
|
|
52
|
+
global-exclude *.sqlite
|
|
53
|
+
global-exclude *.sqlite3
|
|
54
|
+
global-exclude *.parquet
|
|
55
|
+
global-exclude *.feather
|
|
56
|
+
global-exclude *.pkl
|
|
57
|
+
global-exclude *.joblib
|
|
58
|
+
global-exclude *.npy
|
|
59
|
+
global-exclude *.npz
|
|
60
|
+
global-exclude .DS_Store
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: PorosData-Designer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Structured delivery toolkit for converting MinerU parses into full_text, datamining, and multimodal outputs.
|
|
5
|
+
Author-email: Kivent <72405514@cityu-dg.edu.cn>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: mineru,document-processing,scientific-data,multimodal,structured-output
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: typing-extensions>=4.0; python_version < "3.10"
|
|
20
|
+
Requires-Dist: dataclasses>=0.6; python_version < "3.7"
|
|
21
|
+
Requires-Dist: pydantic>=2.0
|
|
22
|
+
Requires-Dist: loguru>=0.7.0
|
|
23
|
+
Requires-Dist: pathlib2>=2.3.0; python_version < "3.4"
|
|
24
|
+
Requires-Dist: dataclasses-json>=0.5.0
|
|
25
|
+
Requires-Dist: tqdm>=4.64.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
29
|
+
Requires-Dist: black; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy; extra == "dev"
|
|
31
|
+
Provides-Extra: optional
|
|
32
|
+
Requires-Dist: cleanlit>=0.2.0; extra == "optional"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# PorosData-Designer
|
|
36
|
+
|
|
37
|
+
`PorosData-Designer` converts MinerU-generated document parses into three stable deliverables: `full_text`, `datamining`, and `multimodal`.
|
|
38
|
+
|
|
39
|
+
It is designed for scientific data processing, structure-aware training preparation, and atomic document design centered on paragraphs, formulas, chemical expressions, and figure assets.
|
|
40
|
+
|
|
41
|
+
## What It Does
|
|
42
|
+
|
|
43
|
+
- Builds a structure-aware full-text view from `*_content_list.json`.
|
|
44
|
+
- Maps document sections, formulas, chemical expressions, and asset references into a datamining view.
|
|
45
|
+
- Extracts image-caption-mention relationships into a multimodal view with copied assets and Markdown outputs.
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install porosdata-designer
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
`--input_dir` should point to a directory tree that contains MinerU outputs. In practice, Designer expects:
|
|
56
|
+
|
|
57
|
+
- one or more `*_content_list.json` files
|
|
58
|
+
- image assets that remain resolvable relative to those input files
|
|
59
|
+
|
|
60
|
+
When running the installed package, pass explicit output and log directories (`--output_dir` and `--log_dir`).
|
|
61
|
+
|
|
62
|
+
Run the full pipeline:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
porosdata-designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Equivalent module mode:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python -m porosdata_designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Stage Commands
|
|
75
|
+
|
|
76
|
+
Run text structuring only:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
porosdata-designer run text --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Run multimodal extraction only:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
porosdata-designer run multimodal --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Outputs
|
|
89
|
+
|
|
90
|
+
Designer produces three delivery views by default:
|
|
91
|
+
|
|
92
|
+
- `full_text`: for structure-aware training and human review.
|
|
93
|
+
- `datamining`: for retrieval, extraction, indexing, and downstream storage.
|
|
94
|
+
- `multimodal`: for image-text linking, multimodal indexing, and asset delivery.
|
|
95
|
+
|
|
96
|
+
Typical output layout:
|
|
97
|
+
|
|
98
|
+
```text
|
|
99
|
+
path/to/output_dir/
|
|
100
|
+
├── full_text/{doc_id}/
|
|
101
|
+
│ ├── {doc_id}_structured.json
|
|
102
|
+
│ └── {doc_id}_structured.txt
|
|
103
|
+
├── datamining/{doc_id}/
|
|
104
|
+
│ └── {doc_id}_datamining.json
|
|
105
|
+
└── multimodal/{doc_id}/
|
|
106
|
+
  ├── {doc_id}_index.json
|
|
107
|
+
  ├── fig_n.md
|
|
108
|
+
  └── assets/
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Validation
|
|
112
|
+
|
|
113
|
+
Audit structured outputs:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
porosdata-designer audit structured --root_dir "path/to/output_dir"
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Validate text outputs:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
porosdata-designer validate structured --output_dir "path/to/output_dir/full_text"
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Validate multimodal outputs:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
porosdata-designer validate multimodal --output_dir "path/to/output_dir/multimodal"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Run final acceptance validation:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
porosdata-designer validate acceptance --output_dir "path/to/output_dir/multimodal"
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Python Usage
|
|
138
|
+
|
|
139
|
+
You can also use the package directly in Python:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from porosdata_designer import DataMiningMapper, MultimodalInterleaver, TextAggregator
|
|
143
|
+
|
|
144
|
+
aggregator = TextAggregator()
|
|
145
|
+
mapper = DataMiningMapper()
|
|
146
|
+
interleaver = MultimodalInterleaver()
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
A more complete text-side example:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from porosdata_designer import DataMiningMapper, TextAggregator
|
|
153
|
+
|
|
154
|
+
content_list = [
|
|
155
|
+
{"type": "text", "text_level": 1, "text": "Abstract", "page_idx": 0},
|
|
156
|
+
{"type": "text", "text": "This work studies a Cu-Zr metallic glass system.", "page_idx": 0},
|
|
157
|
+
{"type": "text", "text_level": 1, "text": "Results and Discussion", "page_idx": 1},
|
|
158
|
+
{"type": "text", "text": "Figure 1 shows the microstructure evolution at 700 K.", "page_idx": 1},
|
|
159
|
+
]
|
|
160
|
+
|
|
161
|
+
aggregator = TextAggregator()
|
|
162
|
+
structured_text = aggregator.aggregate(content_list)
|
|
163
|
+
|
|
164
|
+
mapper = DataMiningMapper()
|
|
165
|
+
datamining_view = mapper.map(structured_text, {"doc_id": "demo-0001"})
|
|
166
|
+
|
|
167
|
+
print(structured_text)
|
|
168
|
+
print(datamining_view.pure_text_stream)
|
|
169
|
+
print(datamining_view.structured_json["sections"])
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Expected outcome:
|
|
173
|
+
|
|
174
|
+
- `structured_text` contains Poros tags such as `<poros_doc>`, `<poros_section_*>`, and `<poros_paragraph>`.
|
|
175
|
+
- `pure_text_stream` removes the structure tags while keeping readable text.
|
|
176
|
+
- `structured_json` exposes mined fields such as `sections`, `formulas`, `chemical_formulas`, and `asset_refs`.
|
|
177
|
+
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# PorosData-Designer
|
|
2
|
+
|
|
3
|
+
`PorosData-Designer` converts MinerU-generated document parses into three stable deliverables: `full_text`, `datamining`, and `multimodal`.
|
|
4
|
+
|
|
5
|
+
It is designed for scientific data processing, structure-aware training preparation, and atomic document design centered on paragraphs, formulas, chemical expressions, and figure assets.
|
|
6
|
+
|
|
7
|
+
## What It Does
|
|
8
|
+
|
|
9
|
+
- Builds a structure-aware full-text view from `*_content_list.json`.
|
|
10
|
+
- Maps document sections, formulas, chemical expressions, and asset references into a datamining view.
|
|
11
|
+
- Extracts image-caption-mention relationships into a multimodal view with copied assets and Markdown outputs.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install porosdata-designer
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
`--input_dir` should point to a directory tree that contains MinerU outputs. In practice, Designer expects:
|
|
22
|
+
|
|
23
|
+
- one or more `*_content_list.json` files
|
|
24
|
+
- image assets that remain resolvable relative to those input files
|
|
25
|
+
|
|
26
|
+
When running the installed package, pass explicit output and log directories (`--output_dir` and `--log_dir`).
|
|
27
|
+
|
|
28
|
+
Run the full pipeline:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
porosdata-designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Equivalent module mode:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
python -m porosdata_designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Stage Commands
|
|
41
|
+
|
|
42
|
+
Run text structuring only:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
porosdata-designer run text --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Run multimodal extraction only:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
porosdata-designer run multimodal --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Outputs
|
|
55
|
+
|
|
56
|
+
Designer produces three delivery views by default:
|
|
57
|
+
|
|
58
|
+
- `full_text`: for structure-aware training and human review.
|
|
59
|
+
- `datamining`: for retrieval, extraction, indexing, and downstream storage.
|
|
60
|
+
- `multimodal`: for image-text linking, multimodal indexing, and asset delivery.
|
|
61
|
+
|
|
62
|
+
Typical output layout:
|
|
63
|
+
|
|
64
|
+
```text
|
|
65
|
+
path/to/output_dir/
|
|
66
|
+
├── full_text/{doc_id}/
|
|
67
|
+
│ ├── {doc_id}_structured.json
|
|
68
|
+
│ └── {doc_id}_structured.txt
|
|
69
|
+
├── datamining/{doc_id}/
|
|
70
|
+
│ └── {doc_id}_datamining.json
|
|
71
|
+
└── multimodal/{doc_id}/
|
|
72
|
+
  ├── {doc_id}_index.json
|
|
73
|
+
  ├── fig_n.md
|
|
74
|
+
  └── assets/
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Validation
|
|
78
|
+
|
|
79
|
+
Audit structured outputs:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
porosdata-designer audit structured --root_dir "path/to/output_dir"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Validate text outputs:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
porosdata-designer validate structured --output_dir "path/to/output_dir/full_text"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Validate multimodal outputs:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
porosdata-designer validate multimodal --output_dir "path/to/output_dir/multimodal"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Run final acceptance validation:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
porosdata-designer validate acceptance --output_dir "path/to/output_dir/multimodal"
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Python Usage
|
|
104
|
+
|
|
105
|
+
You can also use the package directly in Python:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from porosdata_designer import DataMiningMapper, MultimodalInterleaver, TextAggregator
|
|
109
|
+
|
|
110
|
+
aggregator = TextAggregator()
|
|
111
|
+
mapper = DataMiningMapper()
|
|
112
|
+
interleaver = MultimodalInterleaver()
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
A more complete text-side example:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from porosdata_designer import DataMiningMapper, TextAggregator
|
|
119
|
+
|
|
120
|
+
content_list = [
|
|
121
|
+
{"type": "text", "text_level": 1, "text": "Abstract", "page_idx": 0},
|
|
122
|
+
{"type": "text", "text": "This work studies a Cu-Zr metallic glass system.", "page_idx": 0},
|
|
123
|
+
{"type": "text", "text_level": 1, "text": "Results and Discussion", "page_idx": 1},
|
|
124
|
+
{"type": "text", "text": "Figure 1 shows the microstructure evolution at 700 K.", "page_idx": 1},
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
aggregator = TextAggregator()
|
|
128
|
+
structured_text = aggregator.aggregate(content_list)
|
|
129
|
+
|
|
130
|
+
mapper = DataMiningMapper()
|
|
131
|
+
datamining_view = mapper.map(structured_text, {"doc_id": "demo-0001"})
|
|
132
|
+
|
|
133
|
+
print(structured_text)
|
|
134
|
+
print(datamining_view.pure_text_stream)
|
|
135
|
+
print(datamining_view.structured_json["sections"])
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Expected outcome:
|
|
139
|
+
|
|
140
|
+
- `structured_text` contains Poros tags such as `<poros_doc>`, `<poros_section_*>`, and `<poros_paragraph>`.
|
|
141
|
+
- `pure_text_stream` removes the structure tags while keeping readable text.
|
|
142
|
+
- `structured_json` exposes mined fields such as `sections`, `formulas`, `chemical_formulas`, and `asset_refs`.
|
|
143
|
+
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# PorosData-Designer
|
|
2
|
+
|
|
3
|
+
`PorosData-Designer` 用来把 MinerU 生成的文档解析结果转换成三类稳定交付物:`full_text`、`datamining` 和 `multimodal`。
|
|
4
|
+
|
|
5
|
+
它面向科学数据处理、结构感知训练准备,以及围绕段落、公式、化学式和图表资产展开的原子化文档设计。
|
|
6
|
+
|
|
7
|
+
## 它解决什么问题
|
|
8
|
+
|
|
9
|
+
- 把 `*_content_list.json` 构造成适合训练和审阅的结构化全文视图。
|
|
10
|
+
- 把章节、公式、化学式和资产引用整理成稳定的数据挖掘视图。
|
|
11
|
+
- 把图像、图注、正文提及和复制后的图片资产组织成多模态输出。
|
|
12
|
+
|
|
13
|
+
## 安装
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install porosdata-designer
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## 快速开始
|
|
20
|
+
|
|
21
|
+
`--input_dir` 应指向包含 MinerU 输出结果的目录树。通常要求:
|
|
22
|
+
|
|
23
|
+
- 存在一个或多个 `*_content_list.json`
|
|
24
|
+
- 对应图片资源能通过与输入文件的相对路径被正确定位
|
|
25
|
+
|
|
26
|
+
作为 PyPI 安装用户,建议显式传入输出目录和日志目录。
|
|
27
|
+
|
|
28
|
+
安装后可直接运行:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
porosdata-designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
等价的模块调用方式:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
python -m porosdata_designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## 分阶段运行
|
|
41
|
+
|
|
42
|
+
只运行文本结构化:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
porosdata-designer run text --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
只运行多模态抽取:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
porosdata-designer run multimodal --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 输出结果
|
|
55
|
+
|
|
56
|
+
Designer 默认会生成三类交付结果:
|
|
57
|
+
|
|
58
|
+
- `full_text`:适合结构感知训练和人工审阅。
|
|
59
|
+
- `datamining`:适合检索、抽取、入库和后续数据分析。
|
|
60
|
+
- `multimodal`:适合图文关联、多模态索引和图片资产交付。
|
|
61
|
+
|
|
62
|
+
典型输出目录如下:
|
|
63
|
+
|
|
64
|
+
```text
|
|
65
|
+
path/to/output_dir/
|
|
66
|
+
├── full_text/{doc_id}/
|
|
67
|
+
│ ├── {doc_id}_structured.json
|
|
68
|
+
│ └── {doc_id}_structured.txt
|
|
69
|
+
├── datamining/{doc_id}/
|
|
70
|
+
│ └── {doc_id}_datamining.json
|
|
71
|
+
└── multimodal/{doc_id}/
|
|
72
|
+
  ├── {doc_id}_index.json
|
|
73
|
+
  ├── fig_n.md
|
|
74
|
+
  └── assets/
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## 校验与验收
|
|
78
|
+
|
|
79
|
+
审计结构化输出:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
porosdata-designer audit structured --root_dir "path/to/output_dir"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
校验文本输出:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
porosdata-designer validate structured --output_dir "path/to/output_dir/full_text"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
校验多模态输出:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
porosdata-designer validate multimodal --output_dir "path/to/output_dir/multimodal"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
执行最终验收校验:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
porosdata-designer validate acceptance --output_dir "path/to/output_dir/multimodal"
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Python 用法
|
|
104
|
+
|
|
105
|
+
你也可以直接在 Python 中使用这些核心组件:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from porosdata_designer import DataMiningMapper, MultimodalInterleaver, TextAggregator
|
|
109
|
+
|
|
110
|
+
aggregator = TextAggregator()
|
|
111
|
+
mapper = DataMiningMapper()
|
|
112
|
+
interleaver = MultimodalInterleaver()
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
下面是一个更完整的文本侧示例:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from porosdata_designer import DataMiningMapper, TextAggregator
|
|
119
|
+
|
|
120
|
+
content_list = [
|
|
121
|
+
{"type": "text", "text_level": 1, "text": "Abstract", "page_idx": 0},
|
|
122
|
+
{"type": "text", "text": "This work studies a Cu-Zr metallic glass system.", "page_idx": 0},
|
|
123
|
+
{"type": "text", "text_level": 1, "text": "Results and Discussion", "page_idx": 1},
|
|
124
|
+
{"type": "text", "text": "Figure 1 shows the microstructure evolution at 700 K.", "page_idx": 1},
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
aggregator = TextAggregator()
|
|
128
|
+
structured_text = aggregator.aggregate(content_list)
|
|
129
|
+
|
|
130
|
+
mapper = DataMiningMapper()
|
|
131
|
+
datamining_view = mapper.map(structured_text, {"doc_id": "demo-0001"})
|
|
132
|
+
|
|
133
|
+
print(structured_text)
|
|
134
|
+
print(datamining_view.pure_text_stream)
|
|
135
|
+
print(datamining_view.structured_json["sections"])
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
这个例子执行后,你可以看到:
|
|
139
|
+
|
|
140
|
+
- `structured_text` 会生成带 Poros 标签的结构化文本,例如 `<poros_doc>`、`<poros_section_*>`、`<poros_paragraph>`。
|
|
141
|
+
- `pure_text_stream` 会去掉结构标签,保留适合继续训练或检索的纯文本内容。
|
|
142
|
+
- `structured_json` 会暴露 `sections`、`formulas`、`chemical_formulas`、`asset_refs` 等数据挖掘字段。
|
|
143
|
+
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "PorosData-Designer"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Structured delivery toolkit for converting MinerU parses into full_text, datamining, and multimodal outputs."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.8"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Kivent", email = "72405514@cityu-dg.edu.cn"},
|
|
10
|
+
]
|
|
11
|
+
keywords = ["mineru", "document-processing", "scientific-data", "multimodal", "structured-output"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.8",
|
|
17
|
+
"Programming Language :: Python :: 3.9",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
dependencies = [
|
|
24
|
+
# Original dependencies
|
|
25
|
+
"typing-extensions>=4.0; python_version<'3.10'",
|
|
26
|
+
"dataclasses>=0.6; python_version<'3.7'", # NOTE: this marker never matches while requires-python is ">=3.8"
|
|
27
|
+
"pydantic>=2.0",
|
|
28
|
+
"loguru>=0.7.0",
|
|
29
|
+
|
|
30
|
+
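# Newly added required dependencies
|
|
31
|
+
"pathlib2>=2.3.0; python_version<'3.4'", # Path-handling compatibility (NOTE: this marker never matches while requires-python is ">=3.8")
|
|
32
|
+
"dataclasses-json>=0.5.0", # JSON serialization support
|
|
33
|
+
"tqdm>=4.64.0", # Progress bar display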
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=7.0",
|
|
39
|
+
"pytest-cov",
|
|
40
|
+
"black",
|
|
41
|
+
"mypy",
|
|
42
|
+
]
|
|
43
|
+
optional = [
|
|
44
|
+
"cleanlit>=0.2.0",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[build-system]
|
|
48
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
49
|
+
build-backend = "setuptools.build_meta"
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.packages.find]
|
|
52
|
+
where = ["src"]
|
|
53
|
+
include = ["porosdata_designer*"]
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.package-data]
|
|
56
|
+
porosdata_designer = ["py.typed"]
|
|
57
|
+
|
|
58
|
+
[project.scripts]
|
|
59
|
+
porosdata-designer = "porosdata_designer.cli:main"
|
|
60
|
+
|
|
61
|
+
[tool.pytest.ini_options]
|
|
62
|
+
testpaths = ["tests"]
|
|
63
|
+
pythonpath = ["src"]
|
|
64
|
+
|