triclick-doc-toolset 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. triclick_doc_toolset-1.0.0/.github/workflows/publish.yml +49 -0
  2. triclick_doc_toolset-1.0.0/.gitignore +22 -0
  3. triclick_doc_toolset-1.0.0/.python-version +1 -0
  4. triclick_doc_toolset-1.0.0/PKG-INFO +10 -0
  5. triclick_doc_toolset-1.0.0/README.md +0 -0
  6. triclick_doc_toolset-1.0.0/pipelines/generation.yaml +36 -0
  7. triclick_doc_toolset-1.0.0/pipelines/review.yaml +35 -0
  8. triclick_doc_toolset-1.0.0/pipelines/title_table_footnote_patterns.yaml +19 -0
  9. triclick_doc_toolset-1.0.0/pyproject.toml +50 -0
  10. triclick_doc_toolset-1.0.0/src/__init__.py +13 -0
  11. triclick_doc_toolset-1.0.0/src/commands/__init__.py +5 -0
  12. triclick_doc_toolset-1.0.0/src/commands/docx_file_parse_cmd.py +54 -0
  13. triclick_doc_toolset-1.0.0/src/commands/docx_file_partition_cmd.py +101 -0
  14. triclick_doc_toolset-1.0.0/src/commands/file_type_identification_cmd.py +72 -0
  15. triclick_doc_toolset-1.0.0/src/commands/rtf_file_parse_cmd.py +126 -0
  16. triclick_doc_toolset-1.0.0/src/common/__init__.py +0 -0
  17. triclick_doc_toolset-1.0.0/src/common/models/__init__.py +5 -0
  18. triclick_doc_toolset-1.0.0/src/common/models/table_item.py +41 -0
  19. triclick_doc_toolset-1.0.0/src/common/rules/rule_same_as_util.py +171 -0
  20. triclick_doc_toolset-1.0.0/src/common/utils/title_table_footnote_patterns_loader.py +57 -0
  21. triclick_doc_toolset-1.0.0/src/common/word/docx_file_parse_util.py +173 -0
  22. triclick_doc_toolset-1.0.0/src/common/word/docx_file_partition_util.py +127 -0
  23. triclick_doc_toolset-1.0.0/src/framework/__init__.py +16 -0
  24. triclick_doc_toolset-1.0.0/src/framework/command.py +33 -0
  25. triclick_doc_toolset-1.0.0/src/framework/command_registry.py +40 -0
  26. triclick_doc_toolset-1.0.0/src/framework/context.py +135 -0
  27. triclick_doc_toolset-1.0.0/src/framework/pipeline.py +132 -0
  28. triclick_doc_toolset-1.0.0/src/framework/strategy.py +62 -0
  29. triclick_doc_toolset-1.0.0/src/service.py +134 -0
  30. triclick_doc_toolset-1.0.0/tests/data.input/generation_shell.docx +0 -0
  31. triclick_doc_toolset-1.0.0/tests/data.input/generation_shell0.docx +0 -0
  32. triclick_doc_toolset-1.0.0/tests/data.input/generation_shell1.docx +0 -0
  33. triclick_doc_toolset-1.0.0/tests/data.input/generation_shell2.docx +0 -0
  34. triclick_doc_toolset-1.0.0/tests/data.input/generation_shell3.docx +0 -0
  35. triclick_doc_toolset-1.0.0/tests/data.input/generation_shell5.docx +0 -0
  36. triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_1_Summary_of_Patient_Disposition_Enrolled_Analysis_Set.docx +0 -0
  37. triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_2_Summary_of_Demographic_and_Baseline_Characteristics.docx +0 -0
  38. triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_3_Study_Treatment_Exposure_FAS.docx +0 -0
  39. triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_4_Summary_of_Serum_Pharmacokinetic_Concentrations.docx +0 -0
  40. triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_5_Summary_of_ROC_Analysis_for_PK_Parameters_PKAS.docx +0 -0
  41. triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_6_Analysis_of_PK_Parameter_Estimates_PartA_PKAS.docx +0 -0
  42. triclick_doc_toolset-1.0.0/tests/data.input/test/generation_shell.docx +0 -0
  43. triclick_doc_toolset-1.0.0/tests/data.input/test/generation_shell0.docx +0 -0
  44. triclick_doc_toolset-1.0.0/tests/test_run_review.py +12 -0
  45. triclick_doc_toolset-1.0.0/tests/test_service.py +92 -0
  46. triclick_doc_toolset-1.0.0/uv.lock +796 -0
@@ -0,0 +1,49 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - main
7
+ types: [closed]
8
+
9
+ jobs:
10
+ publish:
11
+ # 只在PR合并到main分支时执行
12
+ if: github.event_name == 'push' || (github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main')
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Checkout code
17
+ uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v4
21
+ with:
22
+ python-version: '3.12'
23
+
24
+ - name: Install uv
25
+ uses: astral-sh/setup-uv@v3
26
+
27
+ - name: Install dependencies
28
+ run: |
29
+ uv sync
30
+ uv add --group dev build twine
31
+
32
+ - name: Clean previous builds
33
+ run: |
34
+ rm -rf dist/ build/ *.egg-info/
35
+
36
+ - name: Build package
37
+ run: |
38
+ uv run python -m build
39
+
40
+ - name: Check package
41
+ run: |
42
+ uv run twine check dist/*
43
+
44
+ - name: Publish to PyPI
45
+ env:
46
+ TWINE_USERNAME: __token__
47
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
48
+ run: |
49
+ uv run twine upload dist/*
@@ -0,0 +1,22 @@
1
+ # os
2
+ .DS_Store
3
+
4
+ # IDE
5
+ .idea
6
+ .vscode
7
+
8
+ # Virtual environments
9
+ .venv
10
+
11
+ # Python-generated files
12
+ __pycache__/
13
+ *.py[oc]
14
+ build/
15
+ dist/
16
+ wheels/
17
+ output
18
+ data.output
19
+ *.egg-info
20
+
21
+ # project-specific files
22
+ data.output
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: triclick-doc-toolset
3
+ Version: 1.0.0
4
+ Summary: Advanced document partitioning and processing toolset
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: pydantic>=2.12.3
7
+ Requires-Dist: python-docx>=1.2.0
8
+ Requires-Dist: pyyaml>=6.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest<9.0,>=8.0; extra == 'dev'
File without changes
@@ -0,0 +1,36 @@
1
+ # Generation流水线:文档类型识别 -> 解析 -> 拆分
2
+
3
+ pipeline:
4
+ strategies:
5
+ # 文件类型识别
6
+ - name: detect_file_type
7
+ exec_mode: sequential
8
+ priority: 1
9
+ commands:
10
+ - type: FileTypeIdentificationCommand
11
+ name: detect_file_type
12
+ priority: 1
13
+
14
+ # 按类型解析
15
+ - name: parse_by_type
16
+ exec_mode: sequential
17
+ priority: 2
18
+ commands:
19
+ - type: DocxFileParseCommand
20
+ name: parse_docx_title_table_footnote
21
+ priority: 1
22
+ condition: "doc_type == 'docx'"
23
+ - type: RTFFileParseCommand
24
+ name: parse_rtf_title_table_footnote
25
+ priority: 2
26
+ condition: "doc_type == 'rtf'"
27
+
28
+ # 文档拆分
29
+ - name: split_docx_tables
30
+ exec_mode: sequential
31
+ priority: 3
32
+ commands:
33
+ - type: DocxFilePartitionCommand
34
+ name: split_docx_to_single_tables
35
+ priority: 3
36
+ condition: "doc_type == 'docx'"
@@ -0,0 +1,35 @@
1
+ # Review流水线:文档类型识别 -> 解析 -> 拆分(小优先级先执行)
2
+ pipeline:
3
+ strategies:
4
+ # 文件类型识别,单独策略
5
+ - name: detect_file_type
6
+ exec_mode: sequential
7
+ priority: 1
8
+ commands:
9
+ - type: FileTypeIdentificationCommand
10
+ name: detect_file_type
11
+ priority: 1
12
+
13
+ # 按类型解析,单独策略
14
+ - name: parse_by_type
15
+ exec_mode: sequential
16
+ priority: 2
17
+ commands:
18
+ - type: DocxFileParseCommand
19
+ name: parse_docx_title_table_footnote
20
+ priority: 1
21
+ condition: "doc_type == 'docx'"
22
+ - type: RTFFileParseCommand
23
+ name: parse_rtf_title_table_footnote
24
+ priority: 2
25
+ condition: "doc_type == 'rtf'"
26
+
27
+ # DOCX拆分,单独策略
28
+ - name: split_docx_tables
29
+ exec_mode: sequential
30
+ priority: 3
31
+ commands:
32
+ - type: DocxFilePartitionCommand
33
+ name: split_docx_tables_with_copy
34
+ priority: 3
35
+ condition: "doc_type == 'docx'"
@@ -0,0 +1,19 @@
1
+ # 用于从标题文本中提取用于命名的最小标签(前缀+编号),保持最小可表达
2
+ # 例如:"Table 2.3"、"Listing 1"。不包含后续描述。
3
+ # 参考上面的列表样式进行拆分
4
+ label_patterns:
5
+ - '^\s*(?:Table)\s+[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*(?=\s|$|[::])'
6
+ - '^\s*(?:Listing)\s+[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*(?=\s|$|[::])'
7
+
8
+ footnote_patterns:
9
+ - '^same\s+as\s+table\s+[0-9]+(\.[0-9]+)*'
10
+
11
+ # 可配置的标题最小正则列表
12
+ # 你可以根据文档风格增删这些规则;按序匹配,忽略大小写。
13
+ # 示例:匹配以 Table 或 Listing 开头,并跟随编号或编号+字母等形式。
14
+ title_patterns:
15
+ - '^(?:Table)\s+\S+'
16
+ - '^(?:Listing)\s+\S+'
17
+ - '^\s*(?:Table|Listing)\s+[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*(?=\s|$|[::])'
18
+
19
+
@@ -0,0 +1,50 @@
1
+ [project]
2
+ name = "triclick-doc-toolset"
3
+ version = "1.0.0"
4
+ description = "Advanced document partitioning and processing toolset"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "pyyaml>=6.0",
9
+ "pydantic>=2.12.3",
10
+ "python-docx>=1.2.0",
11
+ ]
12
+
13
+ [project.optional-dependencies]
14
+ dev = [
15
+ "pytest>=8.0,<9.0"
16
+ ]
17
+
18
+ [tool.setuptools]
19
+ package-dir = {"" = "src"}
20
+
21
+ [tool.setuptools.packages.find]
22
+ where = ["src"]
23
+
24
+
25
+ [build-system]
26
+ requires = ["hatchling>=1.26"]
27
+ build-backend = "hatchling.build"
28
+
29
+ [tool.hatch.build.targets.wheel]
30
+ packages = ["src"]
31
+
32
+ [[tool.uv.index]]
33
+ url = "https://pypi.org/simple/"
34
+ default = true
35
+
36
+ [tool.uv]
37
+ index-url = "https://pypi.org/simple/"
38
+
39
+ [tool.ruff]
40
+ line-length = 100
41
+
42
+ [tool.pyright]
43
+ pythonVersion = "3.12"
44
+ typeCheckingMode = "standard"
45
+
46
+ [dependency-groups]
47
+ dev = [
48
+ "build>=1.3.0",
49
+ "twine>=6.2.0",
50
+ ]
@@ -0,0 +1,13 @@
1
+ from .framework import Pipeline, Strategy, Command, Context
2
+ from .service import run_pipeline, run_generation, run_review
3
+
4
+
5
+ __all__ = [
6
+ "Pipeline",
7
+ "Strategy",
8
+ "Command",
9
+ "Context",
10
+ "run_pipeline",
11
+ "run_generation",
12
+ "run_review",
13
+ ]
@@ -0,0 +1,5 @@
1
+ # 确保命令模块被导入,从而完成注册到 CommandRegistry
2
+ from .docx_file_parse_cmd import DocxFileParseCommand # noqa: F401
3
+ from .docx_file_partition_cmd import DocxFilePartitionCommand # noqa: F401
4
+ from .file_type_identification_cmd import FileTypeIdentificationCommand # noqa: F401
5
+ from .rtf_file_parse_cmd import RTFFileParseCommand # noqa: F401
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Dict
4
+ from dataclasses import asdict
5
+
6
+ # 框架依赖
7
+ from ..framework.command import Command
8
+ from ..framework.context import Context
9
+ from ..framework.command_registry import CommandRegistry
10
+
11
+ # 解析工具方法(仅保留元数据解析)
12
+ from ..common.word.docx_file_parse_util import (
13
+ extract_docx_content_with_metadata,
14
+ )
15
+
16
+
17
class DocxFileParseCommand(Command):
    """Parse DOCX inputs into "title / table / footnote" sections (metadata only).

    All parsing is delegated to ``extract_docx_content_with_metadata``; this
    command only resolves the input paths, converts every parsed item into a
    plain dict tagged with its source file, and stores the result on the
    context.
    """

    def is_satisfied(self, context: Context) -> bool:
        """Return whatever ``context.has_document()`` reports."""
        return context.has_document()

    def execute(self, context: Context) -> Context:
        """Parse every resolved ``*.docx`` path and write the sections to *context*.

        When no DOCX path can be resolved an error is recorded on the context
        and it is returned unchanged otherwise. On success ``doc_type``,
        ``sections`` and the ``title_table_footnote_partition`` summary entry
        are overwritten.
        """
        docx_paths = context.resolve_document_paths(patterns=["*.docx"])
        if not docx_paths:
            context.add_error("No DOCX files resolved from context")
            return context

        # Flatten the per-file results into one list of dicts, remembering
        # which file each section came from.
        collected: List[Dict] = []
        for docx_path in docx_paths:
            source = str(docx_path)
            collected.extend(
                {**asdict(item), "source_file": source}
                for item in extract_docx_content_with_metadata(source)
            )

        context.doc_type = "docx"
        context.sections = collected
        context.processing_summary["title_table_footnote_partition"] = {
            "files_processed": len(docx_paths),
            "sections_extracted": len(collected),
            "mode": "metadata_only",
        }
        return context


# Register with the command registry so a Pipeline can create it from YAML.
CommandRegistry.register("DocxFileParseCommand", DocxFileParseCommand)
@@ -0,0 +1,101 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import List, Dict
5
+ from dataclasses import asdict
6
+
7
+ # 框架依赖
8
+ from ..framework.command import Command
9
+ from ..framework.context import Context
10
+ from ..framework.command_registry import CommandRegistry
11
+
12
+ # 仅对接拆分工具方法
13
+ from ..common.word.docx_file_partition_util import (
14
+ split_docx_into_tables_with_copy,
15
+ )
16
+ from ..common.models import TableItem
17
+
18
+
19
class DocxFilePartitionCommand(Command):
    """Split DOCX files using the parsed "title / table / footnote" metadata.

    The command reads the parsed sections from ``Context.sections``, hands
    them (grouped per source file) to ``split_docx_into_tables_with_copy`` and
    then overwrites ``Context.sections`` with the items that call returns.
    The output directory is taken only from ``Context.metadata['output_dir']``.
    """

    def is_satisfied(self, context: Context) -> bool:
        """Return whatever ``context.has_document()`` reports."""
        return context.has_document()

    def _to_table_item(self, d: Dict) -> TableItem:
        """Rebuild a ``TableItem`` from its dict form, tolerating missing keys."""

        def _int_entries(key: str) -> List[int]:
            # Missing/None lists become empty; non-int entries (e.g. None) are dropped.
            return [int(value) for value in (d.get(key) or []) if isinstance(value, int)]

        return TableItem(
            level=int(d.get("level", 0) or 0),
            table_index=d.get("table_index"),
            label=d.get("label"),
            title=d.get("title"),
            title_indices=_int_entries("title_indices"),
            footnote=d.get("footnote"),
            footnote_indices=_int_entries("footnote_indices"),
            local_path=d.get("local_path"),
        )

    def execute(self, context: Context) -> Context:
        """Split every source file referenced by the parsed sections.

        Records an error and returns early when there are no parsed sections
        or when ``metadata['output_dir']`` is not set. Otherwise updates
        ``doc_type``, ``sections``, the ``title_table_footnote_split`` summary
        entry and appends the generated file paths to ``generated_files``.
        """
        parsed = context.sections or []
        if not parsed:
            context.add_error("No parsed sections found in context; run partition first")
            return context

        # Group the table items by the file they were parsed from; entries
        # without a source file cannot be split and are ignored.
        by_source: Dict[str, List[TableItem]] = {}
        for entry in parsed:
            source = entry.get("source_file")
            if source:
                by_source.setdefault(source, []).append(self._to_table_item(entry))

        used_dirs: List[str] = []
        produced: List[str] = []
        rewritten: List[dict] = []

        for source, table_items in by_source.items():
            target_root = context.metadata.get("output_dir")
            if not target_root:
                context.add_error("DocxFilePartitionCommand requires context.metadata['output_dir']")
                return context
            target_dir = str(target_root)

            split_items = split_docx_into_tables_with_copy(
                str(Path(source)), table_items, output_dir=target_dir
            )
            used_dirs.append(target_dir)

            # Convert the returned items back into dicts tagged with their source.
            for split_item in split_items:
                as_dict = asdict(split_item)
                as_dict["source_file"] = source
                rewritten.append(as_dict)
                if as_dict.get("local_path"):
                    produced.append(as_dict["local_path"])

        context.doc_type = "docx"
        context.sections = rewritten
        # Every section with a local_path contributed exactly one entry to
        # `produced`, so both counters below share the same value.
        context.processing_summary["title_table_footnote_split"] = {
            "files_processed": len(by_source),
            "tables_processed": len(produced),
            "generated_files_count": len(produced),
            "output_dirs": used_dirs,
            "mode": "copy_elements",
        }
        # Expose the generated files for later output or verification steps.
        context.generated_files.extend(produced)
        return context


# Register with the command registry so a Pipeline can create it from YAML.
CommandRegistry.register("DocxFilePartitionCommand", DocxFilePartitionCommand)
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional
5
+
6
+ from ..framework.command import Command
7
+ from ..framework.context import Context
8
+ from ..framework.command_registry import CommandRegistry
9
+
10
+
11
class FileTypeIdentificationCommand(Command):
    """Identify the document type of the input file or folder.

    Rules:
    - Single file: the type is taken from the extension (docx/doc/rtf);
      any other extension yields ``None``.
    - Anything else (folder or list): the resolved paths are counted per
      supported extension and the first type with at least one file wins,
      in the fixed order docx > rtf > doc.
    - The result is written to ``context.doc_type`` and recorded in
      ``processing_summary['file_type_identification']``.
    """

    SUPPORTED_EXTS = {"docx": "docx", "doc": "doc", "rtf": "rtf"}

    def is_satisfied(self, context: Context) -> bool:
        """Run only when a document URI has been set on the context."""
        return context.document_uri is not None

    def _count_types(self, paths: List[Path]) -> Dict[str, int]:
        """Count how many of *paths* carry each supported extension."""
        tally: Dict[str, int] = dict.fromkeys(("docx", "doc", "rtf"), 0)
        for path in paths:
            extension = path.suffix.lower().lstrip(".")
            if extension in tally:
                tally[extension] += 1
        return tally

    def execute(self, context: Context) -> Context:
        """Detect the document type and store it on *context*."""
        uri = context.document_uri
        if uri is None:
            context.add_error("No document_uri set in context")
            return context

        # Single existing file: the extension alone decides.
        if isinstance(uri, (str, Path)) and Path(uri).is_file():
            file_path = Path(uri)
            detected = self.SUPPORTED_EXTS.get(file_path.suffix.lower().lstrip("."), None)
            context.doc_type = detected
            context.processing_summary["file_type_identification"] = {
                "source": str(file_path),
                "doc_type": detected or "unknown",
            }
            return context

        # Folder or list input: count the supported files, then pick by
        # fixed priority (docx first, then rtf, then doc).
        counts = self._count_types(
            context.resolve_document_paths(patterns=["*.docx", "*.doc", "*.rtf"])
        )
        chosen: Optional[str] = next(
            (kind for kind in ("docx", "rtf", "doc") if counts[kind] > 0),
            None,
        )
        context.doc_type = chosen
        context.processing_summary["file_type_identification"] = {
            "counts": counts,
            "doc_type": chosen or "unknown",
        }
        return context


# Register with the command registry.
CommandRegistry.register("FileTypeIdentificationCommand", FileTypeIdentificationCommand)
@@ -0,0 +1,126 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import asdict
5
+ from pathlib import Path
6
+ from typing import List, Dict, Optional
7
+
8
+ from ..framework.command import Command
9
+ from ..framework.context import Context
10
+ from ..framework.command_registry import CommandRegistry
11
+
12
+ from ..common.models import TableItem
13
+ from ..common.utils.title_table_footnote_patterns_loader import (
14
+ load_title_patterns, load_label_patterns, load_footnote_patterns
15
+ )
16
+
17
+
18
+ TITLE_PATTERNS = load_title_patterns()
19
+ LABEL_PATTERNS = load_label_patterns()
20
+ FOOTNOTE_PATTERNS = load_footnote_patterns()
21
+
22
+
23
def _extract_label(text: str) -> Optional[str]:
    """Return the minimal label (prefix plus number) found at the start of *text*.

    The first entry of ``LABEL_PATTERNS`` that matches the stripped text wins.
    Whitespace runs inside the matched label are collapsed to a single space
    and a lowercase leading ``table``/``listing`` is capitalised. ``None`` is
    returned when no pattern matches.
    """
    candidate = (text or "").strip()
    for pattern in LABEL_PATTERNS:
        found = pattern.match(candidate)
        if not found:
            continue
        label = re.sub(r"\s+", " ", found.group(0).strip())
        # Only an all-lowercase prefix is touched, so upper-casing its first
        # character is the same as capitalising the prefix.
        if label.startswith(("table", "listing")):
            label = label[0].upper() + label[1:]
        return label
    return None
33
+
34
+
35
def _compute_level(label: Optional[str]) -> int:
    """Return the nesting level of *label*, i.e. the number of dots in its number part.

    The number part is the text after the first whitespace-separated word
    (``"Table 2.3"`` -> ``"2.3"`` -> level 1). A label consisting of a single
    word is treated as the number part itself. ``None``, empty and
    whitespace-only labels yield level 0.
    """
    if not label:
        return 0
    parts = label.split(None, 1)
    if not parts:
        # Whitespace-only label: split() yields nothing, so there is no
        # number part to inspect (indexing here used to raise IndexError).
        return 0
    # With two parts the number is the second one; with one part it is that part.
    return parts[-1].count(".")
41
+
42
+
43
def _rtf_to_text(rtf_path: Path) -> str:
    """Extract plain text from an RTF file with a deliberately naive converter.

    Steps: decode the bytes, blank out ``\\'hh`` hex escapes, turn the
    ``\\par``/``\\line`` control words into newlines and ``\\tab`` into a tab,
    drop every other control word, remove group braces, normalise line
    endings and collapse runs of three or more newlines into two.

    Args:
        rtf_path: Path of the RTF file to read.

    Returns:
        The extracted text; paragraph breaks are represented by ``"\\n"``.
    """
    data = rtf_path.read_bytes()
    try:
        # Strict decoding so that the latin-1 fallback is actually reachable
        # (decoding with errors="ignore" can never raise).
        s = data.decode("utf-8")
    except UnicodeDecodeError:
        s = data.decode("latin-1")

    # Replace escaped hex characters with a space (simplified handling).
    s = re.sub(r"\\'[0-9a-fA-F]{2}", " ", s)

    breaks = {"par": "\n", "line": "\n", "tab": "\t"}

    def _replace_control_word(match):
        # Whole control words are matched, so "\pard" or "\pararsid123" are
        # removed as ordinary control words instead of being mistaken for
        # "\par" followed by stray letters.
        replacement = breaks.get(match.group(1))
        if replacement is None:
            return ""
        return replacement + match.group(2)

    # A single pass handles break words and ordinary control words together;
    # this way a newline produced for "\par" can no longer be swallowed as the
    # optional trailing whitespace of a preceding control word.
    s = re.sub(r"\\([a-zA-Z]+)(?:-?\d+)?(\s?)", _replace_control_word, s)

    # Remove group braces.
    s = s.replace("{", "").replace("}", "")
    # Normalise line endings.
    s = re.sub(r"\r\n|\r", "\n", s)
    # Collapse long runs of blank lines.
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s
64
+
65
+
66
class RTFFileParseCommand(Command):
    """Simple RTF parsing command.

    - Every extracted text line is matched against the title patterns; each
      match starts a new ``TableItem``.
    - Lines after a title that match a footnote pattern are collected as that
      item's footnote until the next title starts.
    - Table indices are not supported (``table_index`` is never set).
    """

    def is_satisfied(self, context: Context) -> bool:
        """Return whatever ``context.has_document()`` reports."""
        return context.has_document()

    def execute(self, context: Context) -> Context:
        """Parse every resolved ``*.rtf`` path and write the sections to *context*.

        When no RTF path is resolved the context is returned untouched and no
        error is recorded (whether the command runs is left to its condition).
        """
        rtf_paths = context.resolve_document_paths(patterns=["*.rtf"])
        if not rtf_paths:
            return context

        collected: List[Dict] = []
        for rtf_path in rtf_paths:
            stripped_lines = [raw.strip() for raw in _rtf_to_text(Path(rtf_path)).splitlines()]

            # Items of this file in document order; the last one is the
            # section that footnote lines are currently attached to.
            items: List[TableItem] = []
            for line_no, content in enumerate(stripped_lines):
                if not content:
                    continue
                if any(pattern.match(content) for pattern in TITLE_PATTERNS):
                    # A title line opens a new section.
                    label = _extract_label(content)
                    items.append(
                        TableItem(
                            label=label,
                            title=content,
                            level=_compute_level(label),
                            title_indices=[line_no],
                        )
                    )
                elif items and any(pattern.search(content) for pattern in FOOTNOTE_PATTERNS):
                    # Footnotes are only collected once a title has been seen.
                    items[-1].add_footnote_index(line_no)
                    items[-1].append_footnote_text(content)

            source = str(rtf_path)
            for item in items:
                as_dict = asdict(item)
                as_dict["source_file"] = source
                collected.append(as_dict)

        context.doc_type = "rtf"
        context.sections = collected
        context.processing_summary["rtf_title_table_footnote_partition"] = {
            "files_processed": len(rtf_paths),
            "sections_extracted": len(collected),
            "mode": "rtf_text_only",
        }
        return context


# Register with the command registry.
CommandRegistry.register("RTFFileParseCommand", RTFFileParseCommand)
File without changes
@@ -0,0 +1,5 @@
1
+ """Common data models for the triclick document toolset."""
2
+
3
+ from .table_item import TableItem
4
+
5
+ __all__ = ['TableItem']
@@ -0,0 +1,41 @@
1
+ """Data models for table parsing and processing."""
2
+
3
+ from __future__ import annotations
4
+ from typing import List, Optional
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass
class TableItem:
    """One parsed section: its title, nesting level, indices and table information."""

    level: int = 0
    table_index: Optional[int] = None
    label: Optional[str] = None
    title: Optional[str] = None
    title_indices: List[int] = field(default_factory=list)
    footnote: Optional[str] = None
    footnote_indices: List[int] = field(default_factory=list)
    local_path: Optional[str] = None

    def set_table(self, table_index: int):
        """Store the table index of this section."""
        self.table_index = table_index

    def add_footnote_index(self, para_index: int):
        """Record one more footnote paragraph index for this section."""
        self.footnote_indices.append(para_index)

    def append_footnote_text(self, text: str):
        """Accumulate footnote text paragraph by paragraph, newline separated.

        Empty, ``None`` and whitespace-only text is ignored; the text is
        stripped before it is stored.
        """
        cleaned = (text or "").strip()
        if not cleaned:
            return
        if not self.footnote:
            self.footnote = cleaned
            return
        # Keep paragraph boundaries: insert a newline unless one is already there.
        separator = "" if self.footnote.endswith("\n") else "\n"
        self.footnote = f"{self.footnote}{separator}{cleaned}"

    def get_section_label(self) -> str:
        """Return the label used for file naming, or ``'Table'`` when there is none."""
        return self.label if self.label else 'Table'