PorosData-Designer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. porosdata_designer/__init__.py +89 -0
  2. porosdata_designer/__main__.py +9 -0
  3. porosdata_designer/adapters/__init__.py +10 -0
  4. porosdata_designer/adapters/content_list_adapter.py +121 -0
  5. porosdata_designer/cli.py +95 -0
  6. porosdata_designer/config.py +1 -0
  7. porosdata_designer/mappers/__init__.py +3 -0
  8. porosdata_designer/mappers/asset_anchoring.py +143 -0
  9. porosdata_designer/mappers/data_mining_mapper.py +295 -0
  10. porosdata_designer/plugin_system.py +1 -0
  11. porosdata_designer/py.typed +2 -0
  12. porosdata_designer/reorganizers/__init__.py +17 -0
  13. porosdata_designer/reorganizers/content_filter.py +150 -0
  14. porosdata_designer/reorganizers/multimodal_interleaver.py +1006 -0
  15. porosdata_designer/reorganizers/paragraph_classifier.py +162 -0
  16. porosdata_designer/reorganizers/text_aggregator.py +829 -0
  17. porosdata_designer/reorganizers/token_marker.py +69 -0
  18. porosdata_designer/runtime/__init__.py +3 -0
  19. porosdata_designer/runtime/commands.py +584 -0
  20. porosdata_designer/runtime/config.py +112 -0
  21. porosdata_designer/runtime/pipelines.py +391 -0
  22. porosdata_designer/runtime/plugin_system.py +273 -0
  23. porosdata_designer/utils/__init__.py +1 -0
  24. porosdata_designer/validators/__init__.py +3 -0
  25. porosdata_designer/validators/latex_validator.py +114 -0
  26. porosdata_designer/validators/schema_validator.py +140 -0
  27. porosdata_designer-0.1.0.dist-info/METADATA +177 -0
  28. porosdata_designer-0.1.0.dist-info/RECORD +32 -0
  29. porosdata_designer-0.1.0.dist-info/WHEEL +5 -0
  30. porosdata_designer-0.1.0.dist-info/entry_points.txt +2 -0
  31. porosdata_designer-0.1.0.dist-info/licenses/LICENSE +21 -0
  32. porosdata_designer-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,89 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ PorosData-Designer 包初始化模块
4
+
5
+ 核心能力:
6
+ - 段落类型识别: ParagraphClassifier 智能分类文档结构
7
+ - 文本结构化: TextAggregator 全文本聚合引擎
8
+ - 多模态关联: MultimodalInterleaver 图文交织引擎
9
+ - 标记与格式化: TokenMarker 添加特殊标记
10
+ - Schema 校验: SchemaValidator / LaTeXValidator
11
+ - 资产穿透: AssetAnchoringEngine (Fig/Table → UUID)
12
+ - 数据挖掘: DataMiningMapper 双视图映射
13
+ """
14
+
15
+ import sys
16
+ import os
17
+ from typing import Dict, List
18
+
19
# Windows consoles default to a legacy code page; force UTF-8 so the
# package's non-ASCII (Chinese) output renders correctly.
if sys.platform == "win32":
    os.environ["PYTHONIOENCODING"] = "utf-8"
    try:
        import subprocess

        # Best-effort switch of the console code page to UTF-8 (65001).
        subprocess.run(["chcp", "65001"], shell=True, capture_output=True)
    except Exception:
        # Console tuning must never prevent the package from importing.
        # NOTE: was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit — narrowed to Exception.
        pass
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    if hasattr(sys.stderr, "reconfigure"):
        sys.stderr.reconfigure(encoding="utf-8")
30
+
31
+ from .reorganizers.content_filter import ContentFilter, ContentType
32
+ from .reorganizers.paragraph_classifier import ParagraphClassifier, ParagraphType
33
+ from .reorganizers.token_marker import TokenMarker
34
+ from .reorganizers.text_aggregator import TextAggregator
35
+ from .reorganizers.multimodal_interleaver import MultimodalInterleaver
36
+ from .adapters.content_list_adapter import ContentListAdapter
37
+ from .runtime.plugin_system import PluginRegistry
38
+ from .validators.schema_validator import SchemaValidator, SchemaValidationResult, SchemaIssue
39
+ from .validators.latex_validator import LaTeXValidator, LaTeXValidationResult, FormulaQuality
40
+ from .mappers.asset_anchoring import AssetAnchoringEngine
41
+ from .mappers.data_mining_mapper import DataMiningMapper, DataMiningView
42
+
43
+ from . import adapters
44
+ from . import mappers
45
+ from . import reorganizers
46
+ from . import runtime
47
+ from . import validators
48
+
49
+
50
def aggregate_text(content_list: List[Dict]) -> str:
    """Convenience wrapper that aggregates a MinerU content list into XML.

    Args:
        content_list: Parsed MinerU content blocks.

    Returns:
        Structured XML text produced by a fresh :class:`TextAggregator`.

    Example:
        >>> from porosdata_designer import aggregate_text
        >>> xml_text = aggregate_text(content_list)
    """
    return TextAggregator().aggregate(content_list)
65
+
66
+
67
# Package version; keep aligned with the wheel / dist-info metadata.
__version__ = "0.1.0"
# Public API surface re-exported at package level.
__all__ = [
    # Convenience function
    "aggregate_text",

    # Reorganizers
    "ContentFilter",
    "ContentType",
    "ParagraphClassifier",
    "ParagraphType",
    "TokenMarker",
    "TextAggregator",
    "MultimodalInterleaver",
    # Adapters / runtime
    "ContentListAdapter",
    "PluginRegistry",
    # Validators
    "SchemaValidator",
    "SchemaValidationResult",
    "SchemaIssue",
    "LaTeXValidator",
    "LaTeXValidationResult",
    "FormulaQuality",
    # Mappers
    "AssetAnchoringEngine",
    "DataMiningMapper",
    "DataMiningView",
]
@@ -0,0 +1,9 @@
1
# -*- coding: utf-8 -*-
"""Module entry point forwarding to the unified CLI."""

from .cli import main


if __name__ == "__main__":
    # Propagate the CLI's integer exit code to the shell via SystemExit.
    raise SystemExit(main())
@@ -0,0 +1,10 @@
1
# -*- coding: utf-8 -*-
"""Adapter module - supports multiple input formats."""

from .content_list_adapter import ContentListAdapter

from . import content_list_adapter

# Only the adapter class is part of the public API.
__all__ = [
    "ContentListAdapter",
]
@@ -0,0 +1,121 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 内容列表适配器 (ContentListAdapter) - 数据重组流水线的输入适配组件
4
+
5
+ 详细描述:
6
+ 专门处理MinerU输出的content_list.json格式数据,提供标准化的
7
+ 数据访问接口。负责文件读取、格式验证和数据预处理。
8
+
9
+ 核心功能:
10
+ - 文件读取: 从JSON文件加载MinerU content_list数据
11
+ - 格式验证: 检查数据结构的完整性和正确性
12
+ - 数据标准化: 统一不同版本MinerU输出的数据格式(含 processed 的 image 优化格式)
13
+ - 错误处理: 完善的异常捕获和错误报告机制
14
+
15
+ 处理格式:
16
+ - 输入: MinerU content_list.json文件路径
17
+ - 输出: 标准化的List[Dict]数据结构
18
+ - 支持类型: text、image、table等MinerU元素类型
19
+ - image: image_caption / image_footnote 支持旧版 [str] 与新版 [{"text": "...", "original_text": "..."}]
20
+ """
21
+
22
+ import json
23
+ from pathlib import Path
24
+ from typing import List, Dict
25
+ from ..runtime.plugin_system import PluginRegistry
26
+
27
+
28
+ def _normalize_caption_footnote_list(raw_list: List) -> List[str]:
29
+ """将 image_caption 或 image_footnote 的原始列表规范为清理后的文本字符串列表。
30
+
31
+ 兼容两种格式:
32
+ - 旧版/原始 MinerU:列表元素为 str
33
+ - 新版 processed:列表元素为 dict,含 "text"(清理后)与 "original_text"
34
+ 统一返回清理后的文本列表,供下游解析与展示使用。
35
+ """
36
+ if not raw_list:
37
+ return []
38
+ result = []
39
+ for elem in raw_list:
40
+ if isinstance(elem, dict):
41
+ result.append(elem.get("text", "").strip() or "")
42
+ elif isinstance(elem, str):
43
+ result.append(elem.strip())
44
+ else:
45
+ result.append(str(elem).strip())
46
+ return [s for s in result if s]
47
+
48
+
49
class ContentListAdapter:
    """Adapter for MinerU ``content_list.json`` files."""

    @staticmethod
    def load(file_path: str) -> List[Dict]:
        """Read a content_list.json file.

        Args:
            file_path: Path to the JSON file.

        Returns:
            The list of content blocks.

        Raises:
            ValueError: If the top-level JSON value is not a list.
        """
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = handle.read()
        content_list = json.loads(payload)
        # The file must contain a JSON array of content blocks.
        if not isinstance(content_list, list):
            raise ValueError(
                f"content_list.json should be a list, got {type(content_list)}"
            )
        return content_list

    @staticmethod
    def get_image_caption_texts(image_item: Dict) -> List[str]:
        """Return the cleaned caption texts of an image item.

        Compatible with the processed format where ``image_caption`` is
        ``[{"text": "...", "original_text": "..."}]`` (only ``"text"`` is
        returned) as well as the legacy ``[str]`` form (stripped as-is).

        Args:
            image_item: A ``type == "image"`` entry from the content list.

        Returns:
            Cleaned caption strings.
        """
        return _normalize_caption_footnote_list(image_item.get("image_caption", []))

    @staticmethod
    def get_image_footnote_texts(image_item: Dict) -> List[str]:
        """Return the cleaned footnote texts of an image item.

        Format conventions mirror :meth:`get_image_caption_texts`.

        Args:
            image_item: A ``type == "image"`` entry from the content list.

        Returns:
            Cleaned footnote strings.
        """
        return _normalize_caption_footnote_list(image_item.get("image_footnote", []))

    @staticmethod
    def save(data: List[Dict], file_path: str) -> None:
        """Write content blocks back out in content_list.json format.

        Parent directories are created on demand.

        Args:
            data: Content block list.
            file_path: Destination file path.
        """
        target = Path(file_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open('w', encoding='utf-8') as handle:
            json.dump(data, handle, ensure_ascii=False, indent=2)
115
+
116
+
117
# Registered with the plugin system so pipelines can resolve the adapter by name.
@PluginRegistry.register("content_list_adapter")
def load_content_list(file_path: str) -> List[Dict]:
    """content_list.json adapter (delegates to ContentListAdapter.load)."""
    return ContentListAdapter.load(file_path)
121
+
@@ -0,0 +1,95 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ from typing import Optional, Sequence
6
+
7
+ from .runtime.commands import (
8
+ run_final_acceptance_validation,
9
+ run_structured_audit,
10
+ validate_delivery_outputs,
11
+ validate_multimodal_outputs,
12
+ validate_structured_outputs,
13
+ )
14
+ from .runtime.pipelines import ensure_console_encoding, run_full_pipeline, run_multimodal_pipeline, run_text_pipeline
15
+
16
+
17
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level argument parser for the unified CLI.

    Sub-commands: ``run`` (all/text/multimodal pipelines), ``audit``
    (structured outputs) and ``validate`` (structured / multimodal /
    acceptance / delivery checks).
    """
    parser = argparse.ArgumentParser(
        prog="porosdata-designer",
        description="PorosData Designer unified command line interface",
    )
    subparsers = parser.add_subparsers(dest="command")

    # run: pipeline stages share the same three directory options.
    run_parser = subparsers.add_parser("run", help="run pipeline stages")
    run_subparsers = run_parser.add_subparsers(dest="run_command")
    for stage in ("all", "text", "multimodal"):
        stage_cmd = run_subparsers.add_parser(stage, help=f"run {stage} pipeline")
        stage_cmd.add_argument(
            "--input_dir", type=Path, required=True,
            help="Input data directory, for example data/processed",
        )
        stage_cmd.add_argument(
            "--output_dir", type=Path, default=None,
            help="Structured output base directory",
        )
        stage_cmd.add_argument("--log_dir", type=Path, default=None, help="Log directory")

    # audit: currently only the structured outputs audit.
    audit_parser = subparsers.add_parser("audit", help="run audits")
    audit_subparsers = audit_parser.add_subparsers(dest="audit_command")
    audit_structured = audit_subparsers.add_parser(
        "structured", help="audit data/structured outputs"
    )
    audit_structured.add_argument(
        "--root_dir", type=Path, default=None, help="Structured root directory"
    )

    # validate: every sub-command takes one directory option plus --log_dir.
    validate_parser = subparsers.add_parser("validate", help="run validations")
    validate_subparsers = validate_parser.add_subparsers(dest="validate_command")
    validation_specs = (
        ("structured", "validate full_text structured outputs",
         "--output_dir", "full_text output directory"),
        ("multimodal", "validate multimodal outputs",
         "--output_dir", "multimodal output directory"),
        ("acceptance", "run final acceptance validation",
         "--output_dir", "multimodal output directory"),
        ("delivery", "validate outputs against delivery standard",
         "--root_dir", "structured root directory"),
    )
    for name, cmd_help, dir_flag, dir_help in validation_specs:
        cmd = validate_subparsers.add_parser(name, help=cmd_help)
        cmd.add_argument(dir_flag, type=Path, default=None, help=dir_help)
        cmd.add_argument("--log_dir", type=Path, default=None, help="Log directory")
    return parser
53
+
54
+
55
def _resolve_subparser(parser: argparse.ArgumentParser, command: str) -> argparse.ArgumentParser:
    """Return the sub-parser registered under *command* (for help output).

    NOTE: argparse exposes no public API for retrieving an existing
    sub-parser, so this walks the private ``_SubParsersAction`` — previously
    duplicated inline three times in ``dispatch``.
    """
    subparsers_action = next(
        action for action in parser._actions
        if isinstance(action, argparse._SubParsersAction)
    )
    return subparsers_action.choices[command]


def dispatch(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
    """Route parsed CLI arguments to the matching runtime command.

    Args:
        args: Namespace produced by :func:`build_parser`.
        parser: The parser itself, used to print contextual help.

    Returns:
        Exit code from the selected command; 0 (after printing help) when a
        command or sub-command is missing.
    """
    if args.command == "run":
        if args.run_command == "all":
            return run_full_pipeline(args.input_dir, output_dir=args.output_dir, log_dir=args.log_dir)
        if args.run_command == "text":
            return run_text_pipeline(args.input_dir, output_dir=args.output_dir, log_dir=args.log_dir)
        if args.run_command == "multimodal":
            return run_multimodal_pipeline(args.input_dir, output_dir=args.output_dir, log_dir=args.log_dir)
        _resolve_subparser(parser, "run").print_help()
        return 0

    if args.command == "audit":
        if args.audit_command == "structured":
            return run_structured_audit(structured_root=args.root_dir)
        _resolve_subparser(parser, "audit").print_help()
        return 0

    if args.command == "validate":
        if args.validate_command == "structured":
            return validate_structured_outputs(output_dir=args.output_dir, log_dir=args.log_dir)
        if args.validate_command == "multimodal":
            return validate_multimodal_outputs(output_dir=args.output_dir, log_dir=args.log_dir)
        if args.validate_command == "acceptance":
            return run_final_acceptance_validation(multimodal_dir=args.output_dir, log_dir=args.log_dir)
        if args.validate_command == "delivery":
            return validate_delivery_outputs(structured_root=args.root_dir, log_dir=args.log_dir)
        _resolve_subparser(parser, "validate").print_help()
        return 0

    # No (known) top-level command: show general usage.
    parser.print_help()
    return 0
89
+
90
+
91
def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point: parse *argv* (defaults to ``sys.argv[1:]``) and dispatch.

    Returns:
        The process exit code produced by the selected command.
    """
    ensure_console_encoding()  # make stdout/stderr UTF-8 safe before any output
    parser = build_parser()
    args = parser.parse_args(argv)
    return dispatch(args, parser)
@@ -0,0 +1 @@
1
+ from .runtime.config import * # noqa: F401,F403
@@ -0,0 +1,3 @@
1
+ """Output mapping and asset anchoring helpers."""
2
+ from .asset_anchoring import * # noqa: F401,F403
3
+ from .data_mining_mapper import * # noqa: F401,F403
@@ -0,0 +1,143 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 资产穿透引擎 (Asset Anchoring Engine) - Fig/Table 与物理资产硬链接
4
+
5
+ 核心职责:
6
+ - 自动扫描文本中的 Fig. n 或 Table n 关键字
7
+ - 将其替换为包含 UUID 的资产标签 <poros_asset uuid="xxx">
8
+ - 直接关联 fig_n.md 中的图像元数据
9
+ """
10
+
11
+ import re
12
+ import uuid
13
+ from typing import Dict, List, Optional, Tuple
14
+ from pathlib import Path
15
+
16
+ from ..runtime.config import TAG_PREFIX
17
+ from ..adapters.content_list_adapter import ContentListAdapter
18
+
19
+ # 使用 uuid5 确保同一 doc+fig 始终得到相同 UUID,便于跨任务关联
20
+ def _make_asset_uuid(doc_id: str, asset_key: str) -> str:
21
+ namespace = uuid.NAMESPACE_DNS
22
+ name = f"{doc_id}:{asset_key}"
23
+ return str(uuid.uuid5(namespace, name))
24
+
25
+
26
class AssetAnchoringEngine:
    """Asset anchoring engine.

    Rewrites in-text ``Fig. n`` / ``Table n`` references as UUID-bearing
    asset tags, hard-linking prose to the physical assets (fig_n.md files,
    images).
    """

    def __init__(self):
        # Figure references: "Fig. 1", "Fig 1", "Figure 1", "Fig 1a", "Fig 2.1"
        self._fig_pattern = re.compile(
            r'\b(?:Fig\.?|Figure|FIG\.?)\s+(\d+(?:\.\d+|[a-zA-Z])?)\b',
            re.IGNORECASE
        )
        # Table references: "Table 1", "Table 2.1", "TABLE 3a"
        self._table_pattern = re.compile(
            r'\b(?:Table|TABLE)\s+(\d+(?:\.\d+|[a-zA-Z])?)\b',
            re.IGNORECASE
        )

    def build_asset_registry(
        self,
        content_list: List[Dict],
        doc_id: str,
        output_dir: Optional[Path] = None
    ) -> Dict[str, str]:
        """Build the Fig/Table ID -> UUID registry for one document.

        uuid5 keeps the mapping stable for a given doc+asset pair, so the
        full_text and multimodal tasks resolve the same asset to one UUID.

        Args:
            content_list: MinerU content blocks.
            doc_id: Document identifier.
            output_dir: Optional multimodal output directory (reserved for
                later fig_n.md generation).

        Returns:
            Mapping like ``{"fig_1": "uuid-xxx", "table_1": "uuid-yyy"}``.
        """
        registry: Dict[str, str] = {}

        for block in content_list:
            if block.get("type") == "image":
                fig_no = self._parse_fig_id_from_image(block)
                if fig_no:
                    key = f"fig_{fig_no}"
                    # uuid5 is deterministic, so setdefault never changes an entry.
                    registry.setdefault(key, _make_asset_uuid(doc_id, key))

            # Tables may appear via type == "table" or a raw table_body key.
            if block.get("type") == "table" or "table_body" in block:
                table_no = self._parse_table_id(block)
                if table_no:
                    key = f"table_{table_no}"
                    registry.setdefault(key, _make_asset_uuid(doc_id, key))

        return registry

    def _parse_fig_id_from_image(self, image_item: Dict) -> Optional[str]:
        """Extract the figure number from an image item's cleaned captions
        (compatible with the processed image format)."""
        for caption in ContentListAdapter.get_image_caption_texts(image_item):
            match = re.search(
                r'\b(?:Fig\.?|Figure|FIG\.?)\s+(\d+(?:\.\d+|[a-zA-Z])?)',
                caption,
                re.IGNORECASE,
            )
            if match:
                return match.group(1)
        return None

    def _parse_table_id(self, item: Dict) -> Optional[str]:
        """Extract the table number from a table item's text, if present."""
        match = re.search(
            r'\b(?:Table|TABLE)\s+(\d+(?:\.\d+|[a-zA-Z])?)',
            item.get("text", "") or "",
            re.IGNORECASE,
        )
        return match.group(1) if match else None

    def anchor_text(
        self,
        text: str,
        registry: Dict[str, str],
        replace_fig: bool = True,
        replace_table: bool = True
    ) -> Tuple[str, List[Dict]]:
        """Replace Fig/Table references in *text* with UUID asset tags.

        Args:
            text: Source text.
            registry: ``{"fig_1": "uuid-xxx", "table_1": "uuid-yyy"}``.
            replace_fig: Whether figure references are rewritten.
            replace_table: Whether table references are rewritten.

        Returns:
            Tuple of (rewritten text, list of replacement records).
        """
        replacements: List[Dict] = []

        def _make_replacer(kind: str, label: str):
            # References absent from the registry are left untouched.
            def _replace(match):
                ref_no = match.group(1)
                uid = registry.get(f"{kind}_{ref_no}")
                if uid is None:
                    return match.group(0)
                replacements.append({"type": kind, "id": ref_no, "uuid": uid})
                return (
                    f'<{TAG_PREFIX}asset uuid="{uid}" type="{kind}" '
                    f'ref="{ref_no}">{label} {ref_no}</{TAG_PREFIX}asset>'
                )
            return _replace

        anchored = text
        if replace_fig:
            anchored = self._fig_pattern.sub(_make_replacer("fig", "Fig."), anchored)
        if replace_table:
            anchored = self._table_pattern.sub(_make_replacer("table", "Table"), anchored)

        return anchored, replacements