triclick-doc-toolset 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- triclick_doc_toolset-1.0.0/.github/workflows/publish.yml +49 -0
- triclick_doc_toolset-1.0.0/.gitignore +22 -0
- triclick_doc_toolset-1.0.0/.python-version +1 -0
- triclick_doc_toolset-1.0.0/PKG-INFO +10 -0
- triclick_doc_toolset-1.0.0/README.md +0 -0
- triclick_doc_toolset-1.0.0/pipelines/generation.yaml +36 -0
- triclick_doc_toolset-1.0.0/pipelines/review.yaml +35 -0
- triclick_doc_toolset-1.0.0/pipelines/title_table_footnote_patterns.yaml +19 -0
- triclick_doc_toolset-1.0.0/pyproject.toml +50 -0
- triclick_doc_toolset-1.0.0/src/__init__.py +13 -0
- triclick_doc_toolset-1.0.0/src/commands/__init__.py +5 -0
- triclick_doc_toolset-1.0.0/src/commands/docx_file_parse_cmd.py +54 -0
- triclick_doc_toolset-1.0.0/src/commands/docx_file_partition_cmd.py +101 -0
- triclick_doc_toolset-1.0.0/src/commands/file_type_identification_cmd.py +72 -0
- triclick_doc_toolset-1.0.0/src/commands/rtf_file_parse_cmd.py +126 -0
- triclick_doc_toolset-1.0.0/src/common/__init__.py +0 -0
- triclick_doc_toolset-1.0.0/src/common/models/__init__.py +5 -0
- triclick_doc_toolset-1.0.0/src/common/models/table_item.py +41 -0
- triclick_doc_toolset-1.0.0/src/common/rules/rule_same_as_util.py +171 -0
- triclick_doc_toolset-1.0.0/src/common/utils/title_table_footnote_patterns_loader.py +57 -0
- triclick_doc_toolset-1.0.0/src/common/word/docx_file_parse_util.py +173 -0
- triclick_doc_toolset-1.0.0/src/common/word/docx_file_partition_util.py +127 -0
- triclick_doc_toolset-1.0.0/src/framework/__init__.py +16 -0
- triclick_doc_toolset-1.0.0/src/framework/command.py +33 -0
- triclick_doc_toolset-1.0.0/src/framework/command_registry.py +40 -0
- triclick_doc_toolset-1.0.0/src/framework/context.py +135 -0
- triclick_doc_toolset-1.0.0/src/framework/pipeline.py +132 -0
- triclick_doc_toolset-1.0.0/src/framework/strategy.py +62 -0
- triclick_doc_toolset-1.0.0/src/service.py +134 -0
- triclick_doc_toolset-1.0.0/tests/data.input/generation_shell.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/generation_shell0.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/generation_shell1.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/generation_shell2.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/generation_shell3.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/generation_shell5.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_1_Summary_of_Patient_Disposition_Enrolled_Analysis_Set.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_2_Summary_of_Demographic_and_Baseline_Characteristics.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_3_Study_Treatment_Exposure_FAS.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_4_Summary_of_Serum_Pharmacokinetic_Concentrations.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_5_Summary_of_ROC_Analysis_for_PK_Parameters_PKAS.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/review/review_Table_6_Analysis_of_PK_Parameter_Estimates_PartA_PKAS.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/test/generation_shell.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/data.input/test/generation_shell0.docx +0 -0
- triclick_doc_toolset-1.0.0/tests/test_run_review.py +12 -0
- triclick_doc_toolset-1.0.0/tests/test_service.py +92 -0
- triclick_doc_toolset-1.0.0/uv.lock +796 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
types: [closed]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
publish:
|
|
11
|
+
# 只在PR合并到main分支时执行
|
|
12
|
+
if: github.event_name == 'push' || (github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main')
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout code
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v4
|
|
21
|
+
with:
|
|
22
|
+
python-version: '3.12'
|
|
23
|
+
|
|
24
|
+
- name: Install uv
|
|
25
|
+
uses: astral-sh/setup-uv@v3
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: |
|
|
29
|
+
uv sync
|
|
30
|
+
uv add --group dev build twine
|
|
31
|
+
|
|
32
|
+
- name: Clean previous builds
|
|
33
|
+
run: |
|
|
34
|
+
rm -rf dist/ build/ *.egg-info/
|
|
35
|
+
|
|
36
|
+
- name: Build package
|
|
37
|
+
run: |
|
|
38
|
+
uv run python -m build
|
|
39
|
+
|
|
40
|
+
- name: Check package
|
|
41
|
+
run: |
|
|
42
|
+
uv run twine check dist/*
|
|
43
|
+
|
|
44
|
+
- name: Publish to PyPI
|
|
45
|
+
env:
|
|
46
|
+
TWINE_USERNAME: __token__
|
|
47
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
|
|
48
|
+
run: |
|
|
49
|
+
uv run twine upload dist/*
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# os
|
|
2
|
+
.DS_Store
|
|
3
|
+
|
|
4
|
+
# IDE
|
|
5
|
+
.idea
|
|
6
|
+
.vscode
|
|
7
|
+
|
|
8
|
+
# Virtual environments
|
|
9
|
+
.venv
|
|
10
|
+
|
|
11
|
+
# Python-generated files
|
|
12
|
+
__pycache__/
|
|
13
|
+
*.py[oc]
|
|
14
|
+
build/
|
|
15
|
+
dist/
|
|
16
|
+
wheels/
|
|
17
|
+
output
|
|
18
|
+
data.output
|
|
19
|
+
*.egg-info
|
|
20
|
+
|
|
21
|
+
# project-specific files
|
|
22
|
+
data.output
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: triclick-doc-toolset
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Advanced document partitioning and processing toolset
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: pydantic>=2.12.3
|
|
7
|
+
Requires-Dist: python-docx>=1.2.0
|
|
8
|
+
Requires-Dist: pyyaml>=6.0
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: pytest<9.0,>=8.0; extra == 'dev'
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Generation流水线:文档类型识别 -> 解析 -> 拆分
|
|
2
|
+
|
|
3
|
+
pipeline:
|
|
4
|
+
strategies:
|
|
5
|
+
# 文件类型识别
|
|
6
|
+
- name: detect_file_type
|
|
7
|
+
exec_mode: sequential
|
|
8
|
+
priority: 1
|
|
9
|
+
commands:
|
|
10
|
+
- type: FileTypeIdentificationCommand
|
|
11
|
+
name: detect_file_type
|
|
12
|
+
priority: 1
|
|
13
|
+
|
|
14
|
+
# 按类型解析
|
|
15
|
+
- name: parse_by_type
|
|
16
|
+
exec_mode: sequential
|
|
17
|
+
priority: 2
|
|
18
|
+
commands:
|
|
19
|
+
- type: DocxFileParseCommand
|
|
20
|
+
name: parse_docx_title_table_footnote
|
|
21
|
+
priority: 1
|
|
22
|
+
condition: "doc_type == 'docx'"
|
|
23
|
+
- type: RTFFileParseCommand
|
|
24
|
+
name: parse_rtf_title_table_footnote
|
|
25
|
+
priority: 2
|
|
26
|
+
condition: "doc_type == 'rtf'"
|
|
27
|
+
|
|
28
|
+
# 文档拆分
|
|
29
|
+
- name: split_docx_tables
|
|
30
|
+
exec_mode: sequential
|
|
31
|
+
priority: 3
|
|
32
|
+
commands:
|
|
33
|
+
- type: DocxFilePartitionCommand
|
|
34
|
+
name: split_docx_to_single_tables
|
|
35
|
+
priority: 3
|
|
36
|
+
condition: "doc_type == 'docx'"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Review流水线:文档类型识别 -> 解析 -> 拆分(小优先级先执行)
|
|
2
|
+
pipeline:
|
|
3
|
+
strategies:
|
|
4
|
+
# 文件类型识别,单独策略
|
|
5
|
+
- name: detect_file_type
|
|
6
|
+
exec_mode: sequential
|
|
7
|
+
priority: 1
|
|
8
|
+
commands:
|
|
9
|
+
- type: FileTypeIdentificationCommand
|
|
10
|
+
name: detect_file_type
|
|
11
|
+
priority: 1
|
|
12
|
+
|
|
13
|
+
# 按类型解析,单独策略
|
|
14
|
+
- name: parse_by_type
|
|
15
|
+
exec_mode: sequential
|
|
16
|
+
priority: 2
|
|
17
|
+
commands:
|
|
18
|
+
- type: DocxFileParseCommand
|
|
19
|
+
name: parse_docx_title_table_footnote
|
|
20
|
+
priority: 1
|
|
21
|
+
condition: "doc_type == 'docx'"
|
|
22
|
+
- type: RTFFileParseCommand
|
|
23
|
+
name: parse_rtf_title_table_footnote
|
|
24
|
+
priority: 2
|
|
25
|
+
condition: "doc_type == 'rtf'"
|
|
26
|
+
|
|
27
|
+
# DOCX拆分,单独策略
|
|
28
|
+
- name: split_docx_tables
|
|
29
|
+
exec_mode: sequential
|
|
30
|
+
priority: 3
|
|
31
|
+
commands:
|
|
32
|
+
- type: DocxFilePartitionCommand
|
|
33
|
+
name: split_docx_tables_with_copy
|
|
34
|
+
priority: 3
|
|
35
|
+
condition: "doc_type == 'docx'"
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# 用于从标题文本中提取用于命名的最小标签(前缀+编号),保持最小可表达
|
|
2
|
+
# 例如:"Table 2.3"、"Listing 1"。不包含后续描述。
|
|
3
|
+
# 参考上面的列表样式进行拆分
|
|
4
|
+
label_patterns:
|
|
5
|
+
- '^\s*(?:Table)\s+[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*(?=\s|$|[::])'
|
|
6
|
+
- '^\s*(?:Listing)\s+[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*(?=\s|$|[::])'
|
|
7
|
+
|
|
8
|
+
footnote_patterns:
|
|
9
|
+
- '^same\s+as\s+table\s+[0-9]+(\.[0-9]+)*'
|
|
10
|
+
|
|
11
|
+
# 可配置的标题最小正则列表
|
|
12
|
+
# 你可以根据文档风格增删这些规则;按序匹配,忽略大小写。
|
|
13
|
+
# 示例:匹配以 Table 或 Listing 开头,并跟随编号或编号+字母等形式。
|
|
14
|
+
title_patterns:
|
|
15
|
+
- '^(?:Table)\s+\S+'
|
|
16
|
+
- '^(?:Listing)\s+\S+'
|
|
17
|
+
- '^\s*(?:Table|Listing)\s+[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*(?=\s|$|[::])'
|
|
18
|
+
|
|
19
|
+
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "triclick-doc-toolset"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "Advanced document partitioning and processing toolset"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"pyyaml>=6.0",
|
|
9
|
+
"pydantic>=2.12.3",
|
|
10
|
+
"python-docx>=1.2.0",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[project.optional-dependencies]
|
|
14
|
+
dev = [
|
|
15
|
+
"pytest>=8.0,<9.0"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[tool.setuptools]
|
|
19
|
+
package-dir = {"" = "src"}
|
|
20
|
+
|
|
21
|
+
[tool.setuptools.packages.find]
|
|
22
|
+
where = ["src"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
[build-system]
|
|
26
|
+
requires = ["hatchling>=1.26"]
|
|
27
|
+
build-backend = "hatchling.build"
|
|
28
|
+
|
|
29
|
+
[tool.hatch.build.targets.wheel]
|
|
30
|
+
packages = ["src"]
|
|
31
|
+
|
|
32
|
+
[[tool.uv.index]]
|
|
33
|
+
url = "https://pypi.org/simple/"
|
|
34
|
+
default = true
|
|
35
|
+
|
|
36
|
+
[tool.uv]
|
|
37
|
+
index-url = "https://pypi.org/simple/"
|
|
38
|
+
|
|
39
|
+
[tool.ruff]
|
|
40
|
+
line-length = 100
|
|
41
|
+
|
|
42
|
+
[tool.pyright]
|
|
43
|
+
pythonVersion = "3.12"
|
|
44
|
+
typeCheckingMode = "standard"
|
|
45
|
+
|
|
46
|
+
[dependency-groups]
|
|
47
|
+
dev = [
|
|
48
|
+
"build>=1.3.0",
|
|
49
|
+
"twine>=6.2.0",
|
|
50
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .framework import Pipeline, Strategy, Command, Context
|
|
2
|
+
from .service import run_pipeline, run_generation, run_review
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"Pipeline",
|
|
7
|
+
"Strategy",
|
|
8
|
+
"Command",
|
|
9
|
+
"Context",
|
|
10
|
+
"run_pipeline",
|
|
11
|
+
"run_generation",
|
|
12
|
+
"run_review",
|
|
13
|
+
]
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# 确保命令模块被导入,从而完成注册到 CommandRegistry
|
|
2
|
+
from .docx_file_parse_cmd import DocxFileParseCommand # noqa: F401
|
|
3
|
+
from .docx_file_partition_cmd import DocxFilePartitionCommand # noqa: F401
|
|
4
|
+
from .file_type_identification_cmd import FileTypeIdentificationCommand # noqa: F401
|
|
5
|
+
from .rtf_file_parse_cmd import RTFFileParseCommand # noqa: F401
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Dict
|
|
4
|
+
from dataclasses import asdict
|
|
5
|
+
|
|
6
|
+
# 框架依赖
|
|
7
|
+
from ..framework.command import Command
|
|
8
|
+
from ..framework.context import Context
|
|
9
|
+
from ..framework.command_registry import CommandRegistry
|
|
10
|
+
|
|
11
|
+
# 解析工具方法(仅保留元数据解析)
|
|
12
|
+
from ..common.word.docx_file_parse_util import (
|
|
13
|
+
extract_docx_content_with_metadata,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DocxFileParseCommand(Command):
    """
    Framework command that parses DOCX files into title/table/footnote sections.

    Parsing is delegated entirely to `extract_docx_content_with_metadata`
    (metadata only); the resulting items are stored on the context as plain
    dictionaries.
    """

    def is_satisfied(self, context: Context) -> bool:
        """Return whatever `context.has_document()` reports."""
        return context.has_document()

    def execute(self, context: Context) -> Context:
        """
        Parse every resolved ``*.docx`` path and publish the sections on the context.

        When no DOCX path can be resolved, an error is recorded on the context
        and it is returned untouched otherwise. On success the context gets
        ``doc_type = "docx"``, ``sections`` (one dict per parsed item, each
        carrying a ``source_file`` key) and a
        ``processing_summary["title_table_footnote_partition"]`` entry.
        """
        docx_paths = context.resolve_document_paths(patterns=["*.docx"])
        if not docx_paths:
            context.add_error("No DOCX files resolved from context")
            return context

        # Flatten the per-file parse results into dictionaries tagged with
        # the file they came from.
        records: List[Dict] = []
        for docx_path in docx_paths:
            origin = str(docx_path)
            records.extend(
                {**asdict(item), "source_file": origin}
                for item in extract_docx_content_with_metadata(origin)
            )

        context.doc_type = "docx"
        context.sections = records
        context.processing_summary["title_table_footnote_partition"] = {
            "files_processed": len(docx_paths),
            "sections_extracted": len(records),
            "mode": "metadata_only",
        }
        return context
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# 注册到命令注册表,便于 Pipeline 通过 YAML 创建
|
|
54
|
+
CommandRegistry.register("DocxFileParseCommand", DocxFileParseCommand)
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Dict
|
|
5
|
+
from dataclasses import asdict
|
|
6
|
+
|
|
7
|
+
# 框架依赖
|
|
8
|
+
from ..framework.command import Command
|
|
9
|
+
from ..framework.context import Context
|
|
10
|
+
from ..framework.command_registry import CommandRegistry
|
|
11
|
+
|
|
12
|
+
# 仅对接拆分工具方法
|
|
13
|
+
from ..common.word.docx_file_partition_util import (
|
|
14
|
+
split_docx_into_tables_with_copy,
|
|
15
|
+
)
|
|
16
|
+
from ..common.models import TableItem
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DocxFilePartitionCommand(Command):
    """
    Split DOCX files using the title/table/footnote metadata already parsed.

    The command only talks to `split_docx_into_tables_with_copy`: it reads the
    parsed sections from ``Context.sections``, runs the split per source file
    and overwrites ``Context.sections`` with the updated items.
    The output directory is read exclusively from
    ``Context.metadata['output_dir']``.
    """

    def is_satisfied(self, context: Context) -> bool:
        """Return whatever `context.has_document()` reports."""
        return context.has_document()

    def _to_table_item(self, d: Dict) -> TableItem:
        """Rebuild a `TableItem` from its dictionary form, tolerating missing keys."""

        def _only_ints(values) -> List[int]:
            # Drops None and any other non-int entry; a missing list becomes [].
            return [int(v) for v in (values or []) if isinstance(v, int)]

        return TableItem(
            level=int(d.get("level", 0) or 0),
            table_index=d.get("table_index"),
            label=d.get("label"),
            title=d.get("title"),
            title_indices=_only_ints(d.get("title_indices")),
            footnote=d.get("footnote"),
            footnote_indices=_only_ints(d.get("footnote_indices")),
            local_path=d.get("local_path"),
        )

    def execute(self, context: Context) -> Context:
        """
        Split each source document and write the results back to the context.

        Records an error and returns early when the context holds no sections,
        or when ``context.metadata['output_dir']`` is missing at the time a
        source file is about to be processed. Sections without a
        ``source_file`` value are skipped.
        """
        parsed = context.sections or []
        if not parsed:
            context.add_error("No parsed sections found in context; run partition first")
            return context

        # Group the parsed sections by the file they came from.
        by_source: Dict[str, List[TableItem]] = {}
        for record in parsed:
            origin = record.get("source_file")
            if origin:
                by_source.setdefault(origin, []).append(self._to_table_item(record))

        used_dirs: List[str] = []
        new_files: List[str] = []
        result_sections: List[dict] = []

        for origin, table_items in by_source.items():
            target_root = context.metadata.get("output_dir")
            if not target_root:
                context.add_error("DocxFilePartitionCommand requires context.metadata['output_dir']")
                return context
            target_dir = str(target_root)

            split_items = split_docx_into_tables_with_copy(
                str(Path(origin)), table_items, output_dir=target_dir
            )
            used_dirs.append(target_dir)

            # Store the updated items as dicts, tagged with their source file.
            for item in split_items:
                record = {**asdict(item), "source_file": origin}
                result_sections.append(record)
                produced = record.get("local_path")
                if produced:
                    new_files.append(produced)

        context.doc_type = "docx"
        # Overwrite the context sections with the post-split ones.
        context.sections = result_sections
        context.processing_summary["title_table_footnote_split"] = {
            "files_processed": len(by_source),
            "tables_processed": sum(1 for rec in result_sections if rec.get("local_path")),
            "generated_files_count": len(new_files),
            "output_dirs": used_dirs,
            "mode": "copy_elements",
        }
        # Expose the generated files for later output or verification steps.
        context.generated_files.extend(new_files)
        return context
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# 注册到命令注册表,便于 Pipeline 通过 YAML 创建
|
|
101
|
+
CommandRegistry.register("DocxFilePartitionCommand", DocxFilePartitionCommand)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ..framework.command import Command
|
|
7
|
+
from ..framework.context import Context
|
|
8
|
+
from ..framework.command_registry import CommandRegistry
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FileTypeIdentificationCommand(Command):
    """
    Identify the main document type of the input file or folder.

    Rules:
    - Single file: decided by its extension (docx/doc/rtf); any other
      extension yields no type.
    - Otherwise (folder or list): the resolved paths are counted per
      extension and the first type with at least one file wins, using the
      fixed precedence docx > rtf > doc.
    - The result is written to `context.doc_type`, and statistics are stored
      in `processing_summary["file_type_identification"]`.
    """

    SUPPORTED_EXTS = {"docx": "docx", "doc": "doc", "rtf": "rtf"}

    def is_satisfied(self, context: Context) -> bool:
        """Run only when the context has a `document_uri`."""
        return context.document_uri is not None

    def _count_types(self, paths: List[Path]) -> Dict[str, int]:
        """Count the given paths per supported extension (case-insensitive)."""
        tally: Dict[str, int] = dict.fromkeys(("docx", "doc", "rtf"), 0)
        for candidate in paths:
            suffix = candidate.suffix.lower().lstrip(".")
            if suffix in tally:
                tally[suffix] += 1
        return tally

    def execute(self, context: Context) -> Context:
        """
        Detect the document type and record it on the context.

        An error is recorded when `context.document_uri` is None. An
        unrecognised input leaves `context.doc_type` as None and reports
        "unknown" in the processing summary.
        """
        source = context.document_uri
        if source is None:
            context.add_error("No document_uri set in context")
            return context

        # Single existing file: the extension alone decides.
        if isinstance(source, (str, Path)) and Path(source).is_file():
            single = Path(source)
            detected = self.SUPPORTED_EXTS.get(single.suffix.lower().lstrip("."), None)
            context.doc_type = detected
            context.processing_summary["file_type_identification"] = {
                "source": str(single),
                "doc_type": detected or "unknown",
            }
            return context

        # Folder or list: count by extension, then apply docx > rtf > doc.
        candidates = context.resolve_document_paths(patterns=["*.docx", "*.doc", "*.rtf"])
        counts = self._count_types(candidates)
        detected: Optional[str] = next(
            (kind for kind in ("docx", "rtf", "doc") if counts[kind] > 0),
            None,
        )
        context.doc_type = detected
        context.processing_summary["file_type_identification"] = {
            "counts": counts,
            "doc_type": detected or "unknown",
        }
        return context
|
|
70
|
+
|
|
71
|
+
# 注册到命令注册表
|
|
72
|
+
CommandRegistry.register("FileTypeIdentificationCommand", FileTypeIdentificationCommand)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import asdict
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List, Dict, Optional
|
|
7
|
+
|
|
8
|
+
from ..framework.command import Command
|
|
9
|
+
from ..framework.context import Context
|
|
10
|
+
from ..framework.command_registry import CommandRegistry
|
|
11
|
+
|
|
12
|
+
from ..common.models import TableItem
|
|
13
|
+
from ..common.utils.title_table_footnote_patterns_loader import (
|
|
14
|
+
load_title_patterns, load_label_patterns, load_footnote_patterns
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
TITLE_PATTERNS = load_title_patterns()
|
|
19
|
+
LABEL_PATTERNS = load_label_patterns()
|
|
20
|
+
FOOTNOTE_PATTERNS = load_footnote_patterns()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _extract_label(text: str) -> Optional[str]:
    """
    Extract the minimal naming label (prefix + number) from a title line.

    The stripped text is tried against each entry of ``LABEL_PATTERNS`` in
    order and the first match wins. Runs of whitespace inside the match are
    collapsed to a single space and a leading "table"/"listing" prefix is
    normalised to "Table"/"Listing" whatever its original casing.

    Returns None when no pattern matches (including empty or None input).
    """
    text = (text or "").strip()
    for pat in LABEL_PATTERNS:
        m = pat.match(text)
        if not m:
            continue
        label = re.sub(r"\s+", " ", m.group(0).strip())
        # Normalise the prefix case-insensitively: without IGNORECASE only an
        # all-lowercase prefix was rewritten, so "TABLE 1.2" kept its casing.
        return re.sub(
            r"^(table|listing)",
            lambda prefix: prefix.group(1).capitalize(),
            label,
            flags=re.IGNORECASE,
        )
    return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _compute_level(label: Optional[str]) -> int:
|
|
36
|
+
if not label:
|
|
37
|
+
return 0
|
|
38
|
+
parts = label.split(None, 1)
|
|
39
|
+
number_part = parts[1] if len(parts) == 2 else parts[0]
|
|
40
|
+
return number_part.count('.')
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _rtf_to_text(rtf_path: Path) -> str:
|
|
44
|
+
# 朴素的 RTF 文本提取:去除控制词与分组,保留 \par 作为换行
|
|
45
|
+
data = rtf_path.read_bytes()
|
|
46
|
+
try:
|
|
47
|
+
s = data.decode('utf-8', errors='ignore')
|
|
48
|
+
except Exception:
|
|
49
|
+
s = data.decode('latin-1', errors='ignore')
|
|
50
|
+
# 将段落分隔转换为换行
|
|
51
|
+
s = s.replace("\\par", "\n").replace("\\line", "\n")
|
|
52
|
+
s = s.replace("\\tab", "\t")
|
|
53
|
+
# 移除转义的 16 进制字符(简化处理)
|
|
54
|
+
s = re.sub(r"\\'[0-9a-fA-F]{2}", " ", s)
|
|
55
|
+
# 移除控制词(如 \\b、\\fs24 等)
|
|
56
|
+
s = re.sub(r"\\[a-zA-Z]+(?:-?\d+)?\s?", "", s)
|
|
57
|
+
# 移除分组的大括号
|
|
58
|
+
s = s.replace('{', '').replace('}', '')
|
|
59
|
+
# 归一化换行
|
|
60
|
+
s = re.sub(r"\r\n|\r", "\n", s)
|
|
61
|
+
# 压缩多行空白
|
|
62
|
+
s = re.sub(r"\n{3,}", "\n\n", s)
|
|
63
|
+
return s
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class RTFFileParseCommand(Command):
    """
    Simple RTF parsing command.

    - Matches title patterns line by line and creates a TableItem per title.
    - Collects lines matching the footnote patterns that follow a title,
      until the next title is reached.
    - Table indices are not supported (table_index is never set here).
    """

    def is_satisfied(self, context: Context) -> bool:
        """Return whatever `context.has_document()` reports."""
        return context.has_document()

    def execute(self, context: Context) -> Context:
        """
        Parse every resolved ``*.rtf`` path into title/footnote sections.

        When no RTF path is resolved the context is returned unchanged and no
        error is recorded. Otherwise the context gets ``doc_type = "rtf"``,
        ``sections`` and a
        ``processing_summary["rtf_title_table_footnote_partition"]`` entry.
        """
        rtf_paths = context.resolve_document_paths(patterns=["*.rtf"])
        if not rtf_paths:
            return context

        collected: List[Dict] = []

        def _flush(item, source) -> None:
            # Store a finished section as a dict tagged with its source file.
            if item:
                record = asdict(item)
                record["source_file"] = str(source)
                collected.append(record)

        for rtf_path in rtf_paths:
            text = _rtf_to_text(Path(rtf_path))
            open_item: Optional[TableItem] = None
            # Indices refer to the position in the full line list, blanks included.
            for line_no, raw_line in enumerate(text.splitlines()):
                line = raw_line.strip()
                if not line:
                    continue

                if any(pat.match(line) for pat in TITLE_PATTERNS):
                    # A new title closes the section that was open so far.
                    _flush(open_item, rtf_path)
                    label = _extract_label(line)
                    open_item = TableItem(
                        label=label,
                        title=line,
                        level=_compute_level(label),
                        title_indices=[line_no],
                    )
                elif open_item and any(pat.search(line) for pat in FOOTNOTE_PATTERNS):
                    # Footnotes are only collected once a title has been seen.
                    open_item.add_footnote_index(line_no)
                    open_item.append_footnote_text(line)

            # End of file: store the last open section, if any.
            _flush(open_item, rtf_path)

        context.doc_type = "rtf"
        context.sections = collected
        context.processing_summary["rtf_title_table_footnote_partition"] = {
            "files_processed": len(rtf_paths),
            "sections_extracted": len(collected),
            "mode": "rtf_text_only",
        }
        return context
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# 注册到命令注册表
|
|
126
|
+
CommandRegistry.register("RTFFileParseCommand", RTFFileParseCommand)
|
|
File without changes
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Data models for table parsing and processing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class TableItem:
    """One parsed table section: title, level, indices, footnote and output path."""

    level: int = 0
    table_index: Optional[int] = None
    label: Optional[str] = None
    title: Optional[str] = None
    title_indices: List[int] = field(default_factory=list)
    footnote: Optional[str] = None
    footnote_indices: List[int] = field(default_factory=list)
    local_path: Optional[str] = None

    def set_table(self, table_index: int) -> None:
        """Set the table index of this section."""
        self.table_index = table_index

    def add_footnote_index(self, para_index: int) -> None:
        """Append a footnote paragraph index to this section."""
        self.footnote_indices.append(para_index)

    def append_footnote_text(self, text: str) -> None:
        """Append footnote text, one paragraph per line; blank text is ignored."""
        cleaned = (text or "").strip()
        if not cleaned:
            return
        if not self.footnote:
            self.footnote = cleaned
            return
        # Separate paragraphs with exactly one newline, reusing a trailing one.
        separator = "" if self.footnote.endswith("\n") else "\n"
        self.footnote = f"{self.footnote}{separator}{cleaned}"

    def get_section_label(self) -> str:
        """Return the label used for file naming, or 'Table' when there is none."""
        return self.label if self.label else 'Table'
|