PorosData-Designer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. porosdata_designer-0.1.0/LICENSE +21 -0
  2. porosdata_designer-0.1.0/MANIFEST.in +60 -0
  3. porosdata_designer-0.1.0/PKG-INFO +177 -0
  4. porosdata_designer-0.1.0/README.md +143 -0
  5. porosdata_designer-0.1.0/README_CN.md +143 -0
  6. porosdata_designer-0.1.0/pyproject.toml +64 -0
  7. porosdata_designer-0.1.0/setup.cfg +4 -0
  8. porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/PKG-INFO +177 -0
  9. porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/SOURCES.txt +37 -0
  10. porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/dependency_links.txt +1 -0
  11. porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/entry_points.txt +2 -0
  12. porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/requires.txt +22 -0
  13. porosdata_designer-0.1.0/src/PorosData_Designer.egg-info/top_level.txt +1 -0
  14. porosdata_designer-0.1.0/src/porosdata_designer/__init__.py +89 -0
  15. porosdata_designer-0.1.0/src/porosdata_designer/__main__.py +9 -0
  16. porosdata_designer-0.1.0/src/porosdata_designer/adapters/__init__.py +10 -0
  17. porosdata_designer-0.1.0/src/porosdata_designer/adapters/content_list_adapter.py +121 -0
  18. porosdata_designer-0.1.0/src/porosdata_designer/cli.py +95 -0
  19. porosdata_designer-0.1.0/src/porosdata_designer/config.py +1 -0
  20. porosdata_designer-0.1.0/src/porosdata_designer/mappers/__init__.py +3 -0
  21. porosdata_designer-0.1.0/src/porosdata_designer/mappers/asset_anchoring.py +143 -0
  22. porosdata_designer-0.1.0/src/porosdata_designer/mappers/data_mining_mapper.py +295 -0
  23. porosdata_designer-0.1.0/src/porosdata_designer/plugin_system.py +1 -0
  24. porosdata_designer-0.1.0/src/porosdata_designer/py.typed +2 -0
  25. porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/__init__.py +17 -0
  26. porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/content_filter.py +150 -0
  27. porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/multimodal_interleaver.py +1006 -0
  28. porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/paragraph_classifier.py +162 -0
  29. porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/text_aggregator.py +829 -0
  30. porosdata_designer-0.1.0/src/porosdata_designer/reorganizers/token_marker.py +69 -0
  31. porosdata_designer-0.1.0/src/porosdata_designer/runtime/__init__.py +3 -0
  32. porosdata_designer-0.1.0/src/porosdata_designer/runtime/commands.py +584 -0
  33. porosdata_designer-0.1.0/src/porosdata_designer/runtime/config.py +112 -0
  34. porosdata_designer-0.1.0/src/porosdata_designer/runtime/pipelines.py +391 -0
  35. porosdata_designer-0.1.0/src/porosdata_designer/runtime/plugin_system.py +273 -0
  36. porosdata_designer-0.1.0/src/porosdata_designer/utils/__init__.py +1 -0
  37. porosdata_designer-0.1.0/src/porosdata_designer/validators/__init__.py +3 -0
  38. porosdata_designer-0.1.0/src/porosdata_designer/validators/latex_validator.py +114 -0
  39. porosdata_designer-0.1.0/src/porosdata_designer/validators/schema_validator.py +140 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Kivent
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,60 @@
1
+ # Minimal release metadata
2
+ include README.md
3
+ include README_CN.md
4
+ include LICENSE
5
+ include pyproject.toml
6
+
7
+ # Package typing marker
8
+ include src/porosdata_designer/py.typed
9
+
10
+ # Keep the source distribution clean
11
+ prune tests
12
+ prune data
13
+ prune logs
14
+ prune scripts
15
+ prune examples
16
+ prune docs
17
+ prune docs/audit
18
+ prune .cursor
19
+ prune .github
20
+ prune .idea
21
+ prune .pytest_cache
22
+ prune .vscode
23
+
24
+ exclude .gitignore
25
+ exclude CHANGELOG.md
26
+ exclude requirements-dev.txt
27
+ exclude Pipfile
28
+ exclude Pipfile.lock
29
+ exclude .env
30
+ exclude .env.*
31
+
32
+ global-exclude *.py[cod]
33
+ global-exclude __pycache__
34
+ global-exclude .pytest_cache
35
+ global-exclude *.log
36
+ global-exclude *.ipynb
37
+ global-exclude .ipynb_checkpoints
38
+ global-exclude *.pem
39
+ global-exclude *.key
40
+ global-exclude *.png
41
+ global-exclude *.jpg
42
+ global-exclude *.jpeg
43
+ global-exclude *.gif
44
+ global-exclude *.svg
45
+ global-exclude *.webp
46
+ global-exclude *.pdf
47
+ global-exclude *.csv
48
+ global-exclude *.tsv
49
+ global-exclude *.xls
50
+ global-exclude *.xlsx
51
+ global-exclude *.db
52
+ global-exclude *.sqlite
53
+ global-exclude *.sqlite3
54
+ global-exclude *.parquet
55
+ global-exclude *.feather
56
+ global-exclude *.pkl
57
+ global-exclude *.joblib
58
+ global-exclude *.npy
59
+ global-exclude *.npz
60
+ global-exclude .DS_Store
@@ -0,0 +1,177 @@
1
+ Metadata-Version: 2.4
2
+ Name: PorosData-Designer
3
+ Version: 0.1.0
4
+ Summary: Structured delivery toolkit for converting MinerU parses into full_text, datamining, and multimodal outputs.
5
+ Author-email: Kivent <72405514@cityu-dg.edu.cn>
6
+ License: MIT
7
+ Keywords: mineru,document-processing,scientific-data,multimodal,structured-output
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: typing-extensions>=4.0; python_version < "3.10"
20
+ Requires-Dist: dataclasses>=0.6; python_version < "3.7"
21
+ Requires-Dist: pydantic>=2.0
22
+ Requires-Dist: loguru>=0.7.0
23
+ Requires-Dist: pathlib2>=2.3.0; python_version < "3.4"
24
+ Requires-Dist: dataclasses-json>=0.5.0
25
+ Requires-Dist: tqdm>=4.64.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0; extra == "dev"
28
+ Requires-Dist: pytest-cov; extra == "dev"
29
+ Requires-Dist: black; extra == "dev"
30
+ Requires-Dist: mypy; extra == "dev"
31
+ Provides-Extra: optional
32
+ Requires-Dist: cleanlit>=0.2.0; extra == "optional"
33
+ Dynamic: license-file
34
+
35
+ # PorosData-Designer
36
+
37
+ `PorosData-Designer` converts MinerU-generated document parses into three stable deliverables: `full_text`, `datamining`, and `multimodal`.
38
+
39
+ It is designed for scientific data processing, structure-aware training preparation, and atomic document design centered on paragraphs, formulas, chemical expressions, and figure assets.
40
+
41
+ ## What It Does
42
+
43
+ - Builds a structure-aware full-text view from `*_content_list.json`.
44
+ - Maps document sections, formulas, chemical expressions, and asset references into a datamining view.
45
+ - Extracts image-caption-mention relationships into a multimodal view with copied assets and Markdown outputs.
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install porosdata-designer
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ `--input_dir` should point to a directory tree that contains MinerU outputs. In practice, Designer expects:
56
+
57
+ - one or more `*_content_list.json` files
58
+ - image assets that remain resolvable relative to those input files
59
+
60
+ When running the installed package, pass `--output_dir` and `--log_dir` explicitly.
61
+
62
+ Run the full pipeline:
63
+
64
+ ```bash
65
+ porosdata-designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
66
+ ```
67
+
68
+ Equivalent module mode:
69
+
70
+ ```bash
71
+ python -m porosdata_designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
72
+ ```
73
+
74
+ ## Stage Commands
75
+
76
+ Run text structuring only:
77
+
78
+ ```bash
79
+ porosdata-designer run text --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
80
+ ```
81
+
82
+ Run multimodal extraction only:
83
+
84
+ ```bash
85
+ porosdata-designer run multimodal --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
86
+ ```
87
+
88
+ ## Outputs
89
+
90
+ Designer produces three delivery views by default:
91
+
92
+ - `full_text`: for structure-aware training and human review.
93
+ - `datamining`: for retrieval, extraction, indexing, and downstream storage.
94
+ - `multimodal`: for image-text linking, multimodal indexing, and asset delivery.
95
+
96
+ Typical output layout:
97
+
98
+ ```text
99
+ path/to/output_dir/
100
+ ├── full_text/{doc_id}/
101
+ │ ├── {doc_id}_structured.json
102
+ │ └── {doc_id}_structured.txt
103
+ ├── datamining/{doc_id}/
104
+ │ └── {doc_id}_datamining.json
105
+ └── multimodal/{doc_id}/
106
+   ├── {doc_id}_index.json
106
+   ├── fig_n.md
107
+   └── assets/
109
+ ```
110
+
111
+ ## Validation
112
+
113
+ Audit structured outputs:
114
+
115
+ ```bash
116
+ porosdata-designer audit structured --root_dir "path/to/output_dir"
117
+ ```
118
+
119
+ Validate text outputs:
120
+
121
+ ```bash
122
+ porosdata-designer validate structured --output_dir "path/to/output_dir/full_text"
123
+ ```
124
+
125
+ Validate multimodal outputs:
126
+
127
+ ```bash
128
+ porosdata-designer validate multimodal --output_dir "path/to/output_dir/multimodal"
129
+ ```
130
+
131
+ Run final acceptance validation:
132
+
133
+ ```bash
134
+ porosdata-designer validate acceptance --output_dir "path/to/output_dir/multimodal"
135
+ ```
136
+
137
+ ## Python Usage
138
+
139
+ You can also use the package directly in Python:
140
+
141
+ ```python
142
+ from porosdata_designer import DataMiningMapper, MultimodalInterleaver, TextAggregator
143
+
144
+ aggregator = TextAggregator()
145
+ mapper = DataMiningMapper()
146
+ interleaver = MultimodalInterleaver()
147
+ ```
148
+
149
+ A more complete text-side example:
150
+
151
+ ```python
152
+ from porosdata_designer import DataMiningMapper, TextAggregator
153
+
154
+ content_list = [
155
+ {"type": "text", "text_level": 1, "text": "Abstract", "page_idx": 0},
156
+ {"type": "text", "text": "This work studies a Cu-Zr metallic glass system.", "page_idx": 0},
157
+ {"type": "text", "text_level": 1, "text": "Results and Discussion", "page_idx": 1},
158
+ {"type": "text", "text": "Figure 1 shows the microstructure evolution at 700 K.", "page_idx": 1},
159
+ ]
160
+
161
+ aggregator = TextAggregator()
162
+ structured_text = aggregator.aggregate(content_list)
163
+
164
+ mapper = DataMiningMapper()
165
+ datamining_view = mapper.map(structured_text, {"doc_id": "demo-0001"})
166
+
167
+ print(structured_text)
168
+ print(datamining_view.pure_text_stream)
169
+ print(datamining_view.structured_json["sections"])
170
+ ```
171
+
172
+ Expected outcome:
173
+
174
+ - `structured_text` contains Poros tags such as `<poros_doc>`, `<poros_section_*>`, and `<poros_paragraph>`.
175
+ - `pure_text_stream` removes the structure tags while keeping readable text.
176
+ - `structured_json` exposes mined fields such as `sections`, `formulas`, `chemical_formulas`, and `asset_refs`.
177
+
@@ -0,0 +1,143 @@
1
+ # PorosData-Designer
2
+
3
+ `PorosData-Designer` converts MinerU-generated document parses into three stable deliverables: `full_text`, `datamining`, and `multimodal`.
4
+
5
+ It is designed for scientific data processing, structure-aware training preparation, and atomic document design centered on paragraphs, formulas, chemical expressions, and figure assets.
6
+
7
+ ## What It Does
8
+
9
+ - Builds a structure-aware full-text view from `*_content_list.json`.
10
+ - Maps document sections, formulas, chemical expressions, and asset references into a datamining view.
11
+ - Extracts image-caption-mention relationships into a multimodal view with copied assets and Markdown outputs.
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ pip install porosdata-designer
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ `--input_dir` should point to a directory tree that contains MinerU outputs. In practice, Designer expects:
22
+
23
+ - one or more `*_content_list.json` files
24
+ - image assets that remain resolvable relative to those input files
25
+
26
+ When running the installed package, pass `--output_dir` and `--log_dir` explicitly.
27
+
28
+ Run the full pipeline:
29
+
30
+ ```bash
31
+ porosdata-designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
32
+ ```
33
+
34
+ Equivalent module mode:
35
+
36
+ ```bash
37
+ python -m porosdata_designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
38
+ ```
39
+
40
+ ## Stage Commands
41
+
42
+ Run text structuring only:
43
+
44
+ ```bash
45
+ porosdata-designer run text --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
46
+ ```
47
+
48
+ Run multimodal extraction only:
49
+
50
+ ```bash
51
+ porosdata-designer run multimodal --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
52
+ ```
53
+
54
+ ## Outputs
55
+
56
+ Designer produces three delivery views by default:
57
+
58
+ - `full_text`: for structure-aware training and human review.
59
+ - `datamining`: for retrieval, extraction, indexing, and downstream storage.
60
+ - `multimodal`: for image-text linking, multimodal indexing, and asset delivery.
61
+
62
+ Typical output layout:
63
+
64
+ ```text
65
+ path/to/output_dir/
66
+ ├── full_text/{doc_id}/
67
+ │ ├── {doc_id}_structured.json
68
+ │ └── {doc_id}_structured.txt
69
+ ├── datamining/{doc_id}/
70
+ │ └── {doc_id}_datamining.json
71
+ └── multimodal/{doc_id}/
72
+   ├── {doc_id}_index.json
73
+   ├── fig_n.md
74
+   └── assets/
75
+ ```
76
+
77
+ ## Validation
78
+
79
+ Audit structured outputs:
80
+
81
+ ```bash
82
+ porosdata-designer audit structured --root_dir "path/to/output_dir"
83
+ ```
84
+
85
+ Validate text outputs:
86
+
87
+ ```bash
88
+ porosdata-designer validate structured --output_dir "path/to/output_dir/full_text"
89
+ ```
90
+
91
+ Validate multimodal outputs:
92
+
93
+ ```bash
94
+ porosdata-designer validate multimodal --output_dir "path/to/output_dir/multimodal"
95
+ ```
96
+
97
+ Run final acceptance validation:
98
+
99
+ ```bash
100
+ porosdata-designer validate acceptance --output_dir "path/to/output_dir/multimodal"
101
+ ```
102
+
103
+ ## Python Usage
104
+
105
+ You can also use the package directly in Python:
106
+
107
+ ```python
108
+ from porosdata_designer import DataMiningMapper, MultimodalInterleaver, TextAggregator
109
+
110
+ aggregator = TextAggregator()
111
+ mapper = DataMiningMapper()
112
+ interleaver = MultimodalInterleaver()
113
+ ```
114
+
115
+ A more complete text-side example:
116
+
117
+ ```python
118
+ from porosdata_designer import DataMiningMapper, TextAggregator
119
+
120
+ content_list = [
121
+ {"type": "text", "text_level": 1, "text": "Abstract", "page_idx": 0},
122
+ {"type": "text", "text": "This work studies a Cu-Zr metallic glass system.", "page_idx": 0},
123
+ {"type": "text", "text_level": 1, "text": "Results and Discussion", "page_idx": 1},
124
+ {"type": "text", "text": "Figure 1 shows the microstructure evolution at 700 K.", "page_idx": 1},
125
+ ]
126
+
127
+ aggregator = TextAggregator()
128
+ structured_text = aggregator.aggregate(content_list)
129
+
130
+ mapper = DataMiningMapper()
131
+ datamining_view = mapper.map(structured_text, {"doc_id": "demo-0001"})
132
+
133
+ print(structured_text)
134
+ print(datamining_view.pure_text_stream)
135
+ print(datamining_view.structured_json["sections"])
136
+ ```
137
+
138
+ Expected outcome:
139
+
140
+ - `structured_text` contains Poros tags such as `<poros_doc>`, `<poros_section_*>`, and `<poros_paragraph>`.
141
+ - `pure_text_stream` removes the structure tags while keeping readable text.
142
+ - `structured_json` exposes mined fields such as `sections`, `formulas`, `chemical_formulas`, and `asset_refs`.
143
+
@@ -0,0 +1,143 @@
1
+ # PorosData-Designer
2
+
3
+ `PorosData-Designer` 用来把 MinerU 生成的文档解析结果转换成三类稳定交付物:`full_text`、`datamining` 和 `multimodal`。
4
+
5
+ 它面向科学数据处理、结构感知训练准备,以及围绕段落、公式、化学式和图表资产展开的原子化文档设计。
6
+
7
+ ## 它解决什么问题
8
+
9
+ - 把 `*_content_list.json` 构造成适合训练和审阅的结构化全文视图。
10
+ - 把章节、公式、化学式和资产引用整理成稳定的数据挖掘视图。
11
+ - 把图像、图注、正文提及和复制后的图片资产组织成多模态输出。
12
+
13
+ ## 安装
14
+
15
+ ```bash
16
+ pip install porosdata-designer
17
+ ```
18
+
19
+ ## 快速开始
20
+
21
+ `--input_dir` 应指向包含 MinerU 输出结果的目录树。通常要求:
22
+
23
+ - 存在一个或多个 `*_content_list.json`
24
+ - 对应图片资源能通过与输入文件的相对路径被正确定位
25
+
26
+ 作为 PyPI 安装用户,建议显式传入输出目录和日志目录。
27
+
28
+ 安装后可直接运行:
29
+
30
+ ```bash
31
+ porosdata-designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
32
+ ```
33
+
34
+ 等价的模块调用方式:
35
+
36
+ ```bash
37
+ python -m porosdata_designer run all --input_dir "path/to/input_dir" --output_dir "path/to/output_dir" --log_dir "path/to/log_dir"
38
+ ```
39
+
40
+ ## 分阶段运行
41
+
42
+ 只运行文本结构化:
43
+
44
+ ```bash
45
+ porosdata-designer run text --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
46
+ ```
47
+
48
+ 只运行多模态抽取:
49
+
50
+ ```bash
51
+ porosdata-designer run multimodal --input_dir "path/to/input_dir" --output_dir "path/to/output_dir"
52
+ ```
53
+
54
+ ## 输出结果
55
+
56
+ Designer 默认会生成三类交付结果:
57
+
58
+ - `full_text`:适合结构感知训练和人工审阅。
59
+ - `datamining`:适合检索、抽取、入库和后续数据分析。
60
+ - `multimodal`:适合图文关联、多模态索引和图片资产交付。
61
+
62
+ 典型输出目录如下:
63
+
64
+ ```text
65
+ path/to/output_dir/
66
+ ├── full_text/{doc_id}/
67
+ │ ├── {doc_id}_structured.json
68
+ │ └── {doc_id}_structured.txt
69
+ ├── datamining/{doc_id}/
70
+ │ └── {doc_id}_datamining.json
71
+ └── multimodal/{doc_id}/
72
+   ├── {doc_id}_index.json
73
+   ├── fig_n.md
74
+   └── assets/
75
+ ```
76
+
77
+ ## 校验与验收
78
+
79
+ 审计结构化输出:
80
+
81
+ ```bash
82
+ porosdata-designer audit structured --root_dir "path/to/output_dir"
83
+ ```
84
+
85
+ 校验文本输出:
86
+
87
+ ```bash
88
+ porosdata-designer validate structured --output_dir "path/to/output_dir/full_text"
89
+ ```
90
+
91
+ 校验多模态输出:
92
+
93
+ ```bash
94
+ porosdata-designer validate multimodal --output_dir "path/to/output_dir/multimodal"
95
+ ```
96
+
97
+ 执行最终验收校验:
98
+
99
+ ```bash
100
+ porosdata-designer validate acceptance --output_dir "path/to/output_dir/multimodal"
101
+ ```
102
+
103
+ ## Python 用法
104
+
105
+ 你也可以直接在 Python 中使用这些核心组件:
106
+
107
+ ```python
108
+ from porosdata_designer import DataMiningMapper, MultimodalInterleaver, TextAggregator
109
+
110
+ aggregator = TextAggregator()
111
+ mapper = DataMiningMapper()
112
+ interleaver = MultimodalInterleaver()
113
+ ```
114
+
115
+ 下面是一个更完整的文本侧示例:
116
+
117
+ ```python
118
+ from porosdata_designer import DataMiningMapper, TextAggregator
119
+
120
+ content_list = [
121
+ {"type": "text", "text_level": 1, "text": "Abstract", "page_idx": 0},
122
+ {"type": "text", "text": "This work studies a Cu-Zr metallic glass system.", "page_idx": 0},
123
+ {"type": "text", "text_level": 1, "text": "Results and Discussion", "page_idx": 1},
124
+ {"type": "text", "text": "Figure 1 shows the microstructure evolution at 700 K.", "page_idx": 1},
125
+ ]
126
+
127
+ aggregator = TextAggregator()
128
+ structured_text = aggregator.aggregate(content_list)
129
+
130
+ mapper = DataMiningMapper()
131
+ datamining_view = mapper.map(structured_text, {"doc_id": "demo-0001"})
132
+
133
+ print(structured_text)
134
+ print(datamining_view.pure_text_stream)
135
+ print(datamining_view.structured_json["sections"])
136
+ ```
137
+
138
+ 这个例子执行后,你可以看到:
139
+
140
+ - `structured_text` 会生成带 Poros 标签的结构化文本,例如 `<poros_doc>`、`<poros_section_*>`、`<poros_paragraph>`。
141
+ - `pure_text_stream` 会去掉结构标签,保留适合继续训练或检索的纯文本内容。
142
+ - `structured_json` 会暴露 `sections`、`formulas`、`chemical_formulas`、`asset_refs` 等数据挖掘字段。
143
+
@@ -0,0 +1,64 @@
1
+ [project]
2
+ name = "PorosData-Designer"
3
+ version = "0.1.0"
4
+ description = "Structured delivery toolkit for converting MinerU parses into full_text, datamining, and multimodal outputs."
5
+ readme = "README.md"
6
+ requires-python = ">=3.8"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ {name = "Kivent", email = "72405514@cityu-dg.edu.cn"},
10
+ ]
11
+ keywords = ["mineru", "document-processing", "scientific-data", "multimodal", "structured-output"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Intended Audience :: Developers",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.8",
17
+ "Programming Language :: Python :: 3.9",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ ]
22
+
23
+ dependencies = [
24
+ # Original dependencies
25
+ "typing-extensions>=4.0; python_version<'3.10'",
26
+ "dataclasses>=0.6; python_version<'3.7'", # NOTE(review): marker never matches under requires-python >=3.8
27
+ "pydantic>=2.0",
28
+ "loguru>=0.7.0",
29
+
30
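+ # Newly added required dependencies
31
+ "pathlib2>=2.3.0; python_version<'3.4'", # Path-handling compatibility (NOTE(review): marker never matches under requires-python >=3.8)
32
+ "dataclasses-json>=0.5.0", # JSON serialization support
33
+ "tqdm>=4.64.0", # Progress bar display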
34
+ ]
35
+
36
+ [project.optional-dependencies]
37
+ dev = [
38
+ "pytest>=7.0",
39
+ "pytest-cov",
40
+ "black",
41
+ "mypy",
42
+ ]
43
+ optional = [
44
+ "cleanlit>=0.2.0",
45
+ ]
46
+
47
+ [build-system]
48
+ requires = ["setuptools>=61.0", "wheel"]
49
+ build-backend = "setuptools.build_meta"
50
+
51
+ [tool.setuptools.packages.find]
52
+ where = ["src"]
53
+ include = ["porosdata_designer*"]
54
+
55
+ [tool.setuptools.package-data]
56
+ porosdata_designer = ["py.typed"]
57
+
58
+ [project.scripts]
59
+ porosdata-designer = "porosdata_designer.cli:main"
60
+
61
+ [tool.pytest.ini_options]
62
+ testpaths = ["tests"]
63
+ pythonpath = ["src"]
64
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+