cantonbio-ra-doc-mcp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cantonbio_ra_doc_mcp-0.1.0.dist-info/METADATA +10 -0
- cantonbio_ra_doc_mcp-0.1.0.dist-info/RECORD +7 -0
- cantonbio_ra_doc_mcp-0.1.0.dist-info/WHEEL +4 -0
- src/__init__.py +158 -0
- src/extract_quality_standards.py +342 -0
- src/fill_quality_standards.py +536 -0
- src/server.py +157 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cantonbio-ra-doc-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: RA注册文档助手,协助处理注册文档的编写和校验工作
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Requires-Dist: mcp[cli]>=1.15.0
|
|
7
|
+
Requires-Dist: python-docx>=1.1.0
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
10
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
src/__init__.py,sha256=rk3ufImwp6WyVkeYJUbgj7IDfXBzCYgnchd0lvKjOzc,4979
|
|
2
|
+
src/extract_quality_standards.py,sha256=TR0Yf060P956ksZSK6OPz9OSbRcNeNjPwVvtyb_n4L8,14721
|
|
3
|
+
src/fill_quality_standards.py,sha256=vqqBCkZ0WPSICXVj4WchTXn6B0-TPDsSLszY8P_pI3M,20779
|
|
4
|
+
src/server.py,sha256=12b8hYTEa9l7kVT80bZBPGc0s4OvxlEmrBlMBbCJvIA,4985
|
|
5
|
+
cantonbio_ra_doc_mcp-0.1.0.dist-info/METADATA,sha256=WrTwfxsNshT5fLRqXJDH9MBuTsbPF5UtgkFbjN_wxFE,349
|
|
6
|
+
cantonbio_ra_doc_mcp-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
7
|
+
cantonbio_ra_doc_mcp-0.1.0.dist-info/RECORD,,
|
src/__init__.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RA Document MCP Server
|
|
3
|
+
|
|
4
|
+
Run:
|
|
5
|
+
uv run src.server fastmcp_quickstart stdio
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from mcp.server.fastmcp import FastMCP
|
|
9
|
+
from src import extract_quality_standards
|
|
10
|
+
from src import fill_quality_standards
|
|
11
|
+
import os
|
|
12
|
+
import logging
|
|
13
|
+
import traceback
|
|
14
|
+
# Create an MCP server
|
|
15
|
+
mcp = FastMCP("ra-doc-mcp")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Resolve the repository root and keep the log file there
|
|
19
|
+
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
20
|
+
LOG_FILE = os.path.join(ROOT_DIR, "ra-file-mcp.log")
|
|
21
|
+
|
|
22
|
+
# Configure logging
|
|
23
|
+
logging.basicConfig(
|
|
24
|
+
level=logging.DEBUG,
|
|
25
|
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
26
|
+
handlers=[
|
|
27
|
+
logging.FileHandler(LOG_FILE, encoding='utf-8'),
|
|
28
|
+
logging.StreamHandler()
|
|
29
|
+
]
|
|
30
|
+
)
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
# DOCX_FILES_PATH is set externally in SSE mode; None means stdio mode (absolute paths required)
|
|
34
|
+
DOCX_FILES_PATH = None
|
|
35
|
+
|
|
36
|
+
def get_docx_path(filename: str) -> str:
    """Resolve a filename to a full path for a Word (.docx) file.

    Args:
        filename: Absolute path, or a name resolved against DOCX_FILES_PATH
            when running in SSE mode.

    Returns:
        Full path to the .docx file.

    Raises:
        ValueError: If ``filename`` is relative and DOCX_FILES_PATH is not
            configured (i.e. not running in SSE mode).
    """
    # Absolute paths are always accepted as-is.
    if os.path.isabs(filename):
        return filename

    # DOCX_FILES_PATH is only configured in SSE mode; without it a relative
    # name cannot be resolved, so require an absolute path instead.
    # (The original message was an f-string with no placeholder and never
    # reported the offending filename.)
    if DOCX_FILES_PATH is None:
        raise ValueError(
            f"Invalid filename: {filename}, must be an absolute path when not in SSE mode"
        )

    # SSE mode: resolve relative names against the configured base directory.
    return os.path.join(DOCX_FILES_PATH, filename)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Add an addition tool
|
|
59
|
+
@mcp.tool()
def add(a: int, b: int) -> int:
    """Add two numbers and return their sum."""
    # Previously returned a + b + 1, which contradicted the documented
    # contract of adding the two numbers.
    return a + b
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Add an addition tool
|
|
66
|
+
@mcp.tool()
def extract_quality_standards_table(doc_path: str) -> str:
    """Extract the quality standards table from a .docx file.

    Args:
        doc_path: Absolute path, or a path relative to DOCX_FILES_PATH.

    Returns:
        Markdown-formatted table on success, or an "Error: ..." string.
    """
    try:
        logger.info(f"Starting quality standards extraction for: {doc_path}")

        # Resolve the incoming path before touching the filesystem.
        resolved = get_docx_path(doc_path)
        logger.debug(f"Resolved full path: {resolved}")

        # Delegate the parsing/formatting to the extraction module.
        markdown = extract_quality_standards.extract_quality_standards_table_from_docx(resolved)
        logger.info(f"Successfully extracted quality standards from: {resolved}")
        return markdown

    except Exception as exc:
        error_msg = f"Error in extract_quality_standards_table: {str(exc)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return f"Error: {error_msg}"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@mcp.tool()
def fill_quality_standards_table(doc_path: str, markdown_content: str, table_index: int = None) -> str:
    """Fill Word document table with quality standards data from markdown format (modifies file in-place)

    Args:
        doc_path: Path to Word document to modify (absolute path or relative to DOCX_FILES_PATH)
        markdown_content: Markdown table content as string
        table_index: Specific table index to fill (None for auto-detection)

    Returns:
        Success message or error description
    """
    try:
        logger.info(f"Starting in-place table filling for: {doc_path}")

        # Resolve the path (absolute, or relative to the SSE base dir).
        resolved_path = get_docx_path(doc_path)
        logger.debug(f"Resolved document path: {resolved_path}")

        # Delegate the in-place modification to the fill module.
        outcome = fill_quality_standards.fill_quality_standards_inplace(
            resolved_path,
            markdown_content,
            table_index,
            auto_merge=True,
        )
        logger.info(f"Successfully filled quality standards table in-place: {outcome}")
        return outcome

    except Exception as exc:
        error_msg = f"Error in fill_quality_standards_table: {str(exc)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return f"Error: {error_msg}"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# @mcp.tool()
|
|
120
|
+
# def extract_quality_standards_from_content(file_content_base64: str) -> str:
|
|
121
|
+
# """
|
|
122
|
+
# Extract quality standards table from Word document content (not file path)
|
|
123
|
+
|
|
124
|
+
# Args:
|
|
125
|
+
# file_content_base64: Base64 encoded Word document content
|
|
126
|
+
|
|
127
|
+
# Returns:
|
|
128
|
+
# Markdown formatted quality standards table
|
|
129
|
+
# """
|
|
130
|
+
# try:
|
|
131
|
+
# file_bytes = base64.b64decode(file_content_base64)
|
|
132
|
+
# return extract_quality_standards.extract_quality_standards_table_from_bytes(file_bytes)
|
|
133
|
+
# except Exception as e:
|
|
134
|
+
# return f"Error decoding or processing file content: {e}"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# Add a dynamic greeting resource
|
|
138
|
+
@mcp.resource("greeting://{name}")
def get_greeting(name: str) -> str:
    """Return a personalized greeting for *name*."""
    greeting = f"Hello, {name}!"
    return greeting
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Add a prompt
|
|
145
|
+
@mcp.prompt()
def greet_user(name: str, style: str = "friendly") -> str:
    """Build a prompt asking for a greeting in the requested style.

    Unknown styles fall back to the "friendly" wording.
    """
    prompts = {
        "friendly": "Please write a warm, friendly greeting",
        "formal": "Please write a formal, professional greeting",
        "casual": "Please write a casual, relaxed greeting",
    }
    instruction = prompts.get(style, prompts["friendly"])
    return f"{instruction} for someone named {name}."
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def main() -> None:
    """Entry point: serve the MCP server over the stdio transport."""
    mcp.run(transport='stdio')
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extract quality standards table from SPE-原液质量标准.docx
|
|
3
|
+
Specifically looking for section 4.3 检验项目、方法和标准
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from docx import Document
|
|
7
|
+
from docx.oxml.ns import qn
|
|
8
|
+
import re
|
|
9
|
+
import io
|
|
10
|
+
import logging
|
|
11
|
+
import traceback
|
|
12
|
+
|
|
13
|
+
# Get logger for this module
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
def extract_text_with_formatting(cell):
    """
    Extract text from a table cell while preserving superscript and subscript
    formatting, with defensive error handling.

    Superscript/subscript runs are rendered with Unicode super-/subscript
    characters where available, falling back to ``^c`` / ``_c`` notation.
    Text from separate paragraphs is joined with a single space.

    Args:
        cell: python-docx table cell (any object exposing ``paragraphs`` of
            runs with ``.text`` and ``.font.superscript/.subscript``).

    Returns:
        The flattened, stripped cell text; "" if nothing could be read.
    """
    # Invariant lookup tables — built once per call instead of once per run
    # as the original did.
    superscript_map = {
        '0': '⁰', '1': '¹', '2': '²', '3': '³', '4': '⁴', '5': '⁵',
        '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹', '+': '⁺', '-': '⁻',
        '=': '⁼', '(': '⁽', ')': '⁾', 'n': 'ⁿ'
    }
    subscript_map = {
        '0': '₀', '1': '₁', '2': '₂', '3': '₃', '4': '₄', '5': '₅',
        '6': '₆', '7': '₇', '8': '₈', '9': '₉', '+': '₊', '-': '₋',
        '=': '₌', '(': '₍', ')': '₎', 'a': 'ₐ', 'e': 'ₑ', 'i': 'ᵢ',
        'o': 'ₒ', 'u': 'ᵤ', 'x': 'ₓ', 'h': 'ₕ', 'k': 'ₖ', 'l': 'ₗ',
        'm': 'ₘ', 'n': 'ₙ', 'p': 'ₚ', 's': 'ₛ', 't': 'ₜ'
    }

    result = ""

    try:
        logger.debug(f"Processing cell with {len(cell.paragraphs)} paragraphs")

        for paragraph_idx, paragraph in enumerate(cell.paragraphs):
            # BUG FIX: the original appended a single trailing space *after*
            # the paragraph loop, which strip() then removed — so texts of
            # multiple paragraphs were fused together. Insert the separator
            # *between* paragraphs instead.
            if paragraph_idx > 0:
                result += " "
            try:
                logger.debug(f"Processing paragraph {paragraph_idx} with {len(paragraph.runs)} runs")

                for run_idx, run in enumerate(paragraph.runs):
                    try:
                        text = run.text
                        if not text:
                            continue
                        logger.debug(f"Processing run {run_idx}: '{text[:50]}...' (length: {len(text)})")

                        if run.font.superscript:
                            logger.debug(f"Detected superscript in run {run_idx}")
                            # Convert to superscript Unicode characters when possible.
                            result += ''.join(
                                superscript_map.get(char, f"^{char}") for char in text
                            )
                        elif run.font.subscript:
                            logger.debug(f"Detected subscript in run {run_idx}")
                            # Convert to subscript Unicode characters when possible.
                            result += ''.join(
                                subscript_map.get(char, f"_{char}") for char in text
                            )
                        else:
                            result += text
                    except Exception as e:
                        logger.error(f"Error processing run {run_idx} in paragraph {paragraph_idx}: {str(e)}")
                        # Add the text as-is if formatting extraction fails.
                        try:
                            if run.text:
                                result += run.text
                        except Exception:
                            logger.error(f"Failed to get text from run {run_idx}")
                        continue

            except Exception as e:
                logger.error(f"Error processing paragraph {paragraph_idx}: {str(e)}")
                # Fall back to the paragraph's plain text.
                try:
                    if paragraph.text:
                        result += paragraph.text
                except Exception:
                    logger.error(f"Failed to get text from paragraph {paragraph_idx}")
                continue

    except Exception as e:
        logger.error(f"Critical error in extract_text_with_formatting: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        # Last resort: the cell's unformatted text.
        try:
            result = cell.text if hasattr(cell, 'text') else ""
        except Exception:
            result = ""

    final_result = result.strip()
    logger.debug(f"Cell extraction result: '{final_result[:100]}...' (length: {len(final_result)})")
    return final_result
|
|
101
|
+
|
|
102
|
+
def extract_quality_standards_table(docx_path):
    """
    Extract the quality standards table from a Word document, with detailed logging.

    Scans paragraphs for a section heading matching "4.3 ... 检验项目 ... 方法 ... 标准",
    then scans every table and returns the first one whose header row contains at
    least two quality-standard keywords.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        List of rows (each a list of cell strings), or [] when no matching
        table is found.

    Raises:
        Re-raises any error from loading/parsing the document.
    """
    logger.info(f"Loading document: {docx_path}")

    try:
        # Load the document (raises if the file is missing/corrupt).
        doc = Document(docx_path)
        logger.debug(f"Document loaded successfully, contains {len(doc.paragraphs)} paragraphs")

        # Flag recording whether the section-4.3 heading was located.
        found_section_43 = False
        table_data = []

        logger.info("Searching for section 4.3 in document content...")

        # First, examine all paragraphs to find the section 4.3 heading.
        # NOTE(review): found_section_43 is logged but never used to narrow
        # the table search below — table selection is keyword-based only.
        for i, paragraph in enumerate(doc.paragraphs):
            try:
                text = paragraph.text.strip()
                if text:
                    logger.debug(f"Paragraph {i}: {text[:100]}...")

                    # Heading pattern: "4.3" followed by the expected Chinese terms.
                    if re.search(r'4\.3.*检验项目.*方法.*标准', text, re.IGNORECASE):
                        logger.info(f"Found section 4.3 at paragraph {i}: {text}")
                        found_section_43 = True
                        break
            except Exception as e:
                logger.error(f"Error processing paragraph {i}: {str(e)}")
                continue

        logger.debug(f"Section 4.3 search completed. Found: {found_section_43}")

        # Now scan the document's tables for the quality standards table.
        logger.info(f"Found {len(doc.tables)} tables in the document")

        for table_idx, table in enumerate(doc.tables):
            try:
                logger.debug(f"Processing table {table_idx + 1}")
                logger.debug(f"Table dimensions: {len(table.rows)} rows, {len(table.columns)} columns")

                # Extract all cell text (preserving super-/subscripts).
                table_content = []
                for row_idx, row in enumerate(table.rows):
                    try:
                        row_data = []
                        for cell_idx, cell in enumerate(row.cells):
                            try:
                                # Flatten the cell text with formatting preserved.
                                cell_text = extract_text_with_formatting(cell)
                                # Collapse runs of whitespace into single spaces.
                                cell_text = ' '.join(cell_text.split())
                                row_data.append(cell_text)
                            except Exception as e:
                                logger.error(f"Error processing cell [{row_idx}, {cell_idx}]: {str(e)}")
                                row_data.append("")  # Placeholder for failed cells keeps columns aligned.

                        table_content.append(row_data)
                        logger.debug(f"Row {row_idx}: {row_data}")
                    except Exception as e:
                        logger.error(f"Error processing row {row_idx} in table {table_idx + 1}: {str(e)}")
                        continue

                # Decide whether this table is the quality standards table.
                if table_content:
                    header_row = table_content[0] if table_content else []
                    header_text = ' '.join(header_row).lower()

                    # Keywords expected in the header of the quality standards table.
                    quality_keywords = ['检验项目', '检验方法', '质量标准', '类型', '项目', '方法', '标准']
                    keyword_count = sum(1 for keyword in quality_keywords if keyword in header_text)

                    logger.debug(f"Table {table_idx + 1} header keywords found: {keyword_count}")
                    if keyword_count >= 2:  # At least 2 keyword hits counts as a match.
                        logger.info(f"Table {table_idx + 1} appears to be the quality standards table (keyword matches: {keyword_count})")
                        table_data = table_content
                        break
                else:
                    logger.warning(f"Table {table_idx + 1} has no content")

            except Exception as e:
                logger.error(f"Error processing table {table_idx + 1}: {str(e)}")
                logger.error(f"Table processing traceback: {traceback.format_exc()}")
                continue

        if not table_data:
            logger.warning("No quality standards table found in document")

        return table_data

    except Exception as e:
        logger.error(f"Critical error in extract_quality_standards_table: {str(e)}")
        logger.error(f"Full traceback: {traceback.format_exc()}")
        raise
|
|
198
|
+
|
|
199
|
+
def format_as_markdown_table(table_data, target_columns=('类型', '检验项目', '检验方法', '质量标准')):
    """
    Convert extracted table data to a markdown table.

    Args:
        table_data: List of rows (lists of cell strings); row 0 is treated
            as the source header and is skipped in the output.
        target_columns: Column headers for the generated markdown table.
            Each data row is padded/truncated to this width.
            (Changed from a mutable list default to an immutable tuple.)

    Returns:
        Markdown table string, or "No table data found" for empty input.
    """
    if not table_data:
        return "No table data found"

    # Log (rather than print) the source header — the rest of this module
    # reports diagnostics through `logger`, and library code should not
    # write to stdout.
    header_row = table_data[0]
    logger.debug(f"Original headers: {header_row}")

    markdown_lines = []

    # Header and separator rows.
    markdown_lines.append(f"| {' | '.join(target_columns)} |")
    markdown_lines.append(f"| {' | '.join(['---'] * len(target_columns))} |")

    # Data rows: normalize each row to exactly len(target_columns) cells.
    for row in table_data[1:]:  # Skip the source header row.
        padded_row = row + [''] * (len(target_columns) - len(row))
        padded_row = padded_row[:len(target_columns)]
        markdown_lines.append(f"| {' | '.join(padded_row)} |")

    return '\n'.join(markdown_lines)
|
|
227
|
+
|
|
228
|
+
# def extract_quality_standards_table_from_bytes(file_bytes: bytes) -> str:
|
|
229
|
+
# """
|
|
230
|
+
# Extract quality standards table from Word document bytes
|
|
231
|
+
|
|
232
|
+
# Args:
|
|
233
|
+
# file_bytes: Binary content of the Word document
|
|
234
|
+
|
|
235
|
+
# Returns:
|
|
236
|
+
# Markdown formatted table string
|
|
237
|
+
# """
|
|
238
|
+
# print("Extracting quality standards table from Word document bytes...")
|
|
239
|
+
|
|
240
|
+
# try:
|
|
241
|
+
# doc = Document(io.BytesIO(file_bytes))
|
|
242
|
+
|
|
243
|
+
# found_section_43 = False
|
|
244
|
+
# table_data = []
|
|
245
|
+
|
|
246
|
+
# print("Searching through document content...")
|
|
247
|
+
|
|
248
|
+
# for i, paragraph in enumerate(doc.paragraphs):
|
|
249
|
+
# text = paragraph.text.strip()
|
|
250
|
+
# if text:
|
|
251
|
+
# print(f"Paragraph {i}: {text[:100]}...")
|
|
252
|
+
|
|
253
|
+
# if re.search(r'4\.3.*检验项目.*方法.*标准', text, re.IGNORECASE):
|
|
254
|
+
# print(f"Found section 4.3 at paragraph {i}: {text}")
|
|
255
|
+
# found_section_43 = True
|
|
256
|
+
# break
|
|
257
|
+
|
|
258
|
+
# print(f"\nFound {len(doc.tables)} tables in the document")
|
|
259
|
+
|
|
260
|
+
# for table_idx, table in enumerate(doc.tables):
|
|
261
|
+
# print(f"\n--- Table {table_idx + 1} ---")
|
|
262
|
+
# print(f"Rows: {len(table.rows)}, Columns: {len(table.columns)}")
|
|
263
|
+
|
|
264
|
+
# table_content = []
|
|
265
|
+
# for row_idx, row in enumerate(table.rows):
|
|
266
|
+
# row_data = []
|
|
267
|
+
# for cell in row.cells:
|
|
268
|
+
# cell_text = extract_text_with_formatting(cell)
|
|
269
|
+
# cell_text = ' '.join(cell_text.split())
|
|
270
|
+
# row_data.append(cell_text)
|
|
271
|
+
# table_content.append(row_data)
|
|
272
|
+
# print(f"Row {row_idx}: {row_data}")
|
|
273
|
+
|
|
274
|
+
# if table_content:
|
|
275
|
+
# header_row = table_content[0] if table_content else []
|
|
276
|
+
# header_text = ' '.join(header_row).lower()
|
|
277
|
+
|
|
278
|
+
# quality_keywords = ['检验项目', '检验方法', '质量标准', '类型', '项目', '方法', '标准']
|
|
279
|
+
# keyword_count = sum(1 for keyword in quality_keywords if keyword in header_text)
|
|
280
|
+
|
|
281
|
+
# if keyword_count >= 2:
|
|
282
|
+
# print(f"Table {table_idx + 1} appears to be the quality standards table (keyword matches: {keyword_count})")
|
|
283
|
+
# table_data = table_content
|
|
284
|
+
# break
|
|
285
|
+
|
|
286
|
+
# if table_data:
|
|
287
|
+
# print(f"\n=== Extracted Table Data ({len(table_data)} rows) ===")
|
|
288
|
+
# for i, row in enumerate(table_data):
|
|
289
|
+
# print(f"Row {i}: {row}")
|
|
290
|
+
|
|
291
|
+
# print(f"\n=== Markdown Format ===")
|
|
292
|
+
# markdown_table = format_as_markdown_table(table_data)
|
|
293
|
+
# return markdown_table
|
|
294
|
+
# else:
|
|
295
|
+
# return "No quality standards table found in the document"
|
|
296
|
+
|
|
297
|
+
# except Exception as e:
|
|
298
|
+
# error_msg = f"Error processing document: {e}"
|
|
299
|
+
# print(error_msg)
|
|
300
|
+
# import traceback
|
|
301
|
+
# traceback.print_exc()
|
|
302
|
+
# return error_msg
|
|
303
|
+
|
|
304
|
+
def extract_quality_standards_table_from_docx(docx_path: str):
    """
    Extract the quality standards table from a Word document and render it
    as markdown, with comprehensive error handling and debug logging.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        Markdown table string on success; a "Warning: ..." string when no
        table was found; an "Error: ..." string on failure.
    """
    logger.info(f"Starting extraction from document: {docx_path}")

    try:
        import os

        # Bail out early when the file does not exist.
        if not os.path.exists(docx_path):
            error_msg = f"File not found: {docx_path}"
            logger.error(error_msg)
            return f"Error: {error_msg}"

        logger.debug(f"File exists, size: {os.path.getsize(docx_path)} bytes")

        # Pull the raw rows out of the document.
        table_data = extract_quality_standards_table(docx_path)

        if not table_data:
            error_msg = "No quality standards table found in the document"
            logger.warning(error_msg)
            return f"Warning: {error_msg}"

        logger.info(f"Successfully extracted table with {len(table_data)} rows")
        logger.debug("Extracted table content:")
        for i, row in enumerate(table_data):
            logger.debug(f"Row {i}: {row}")

        # Render the rows as a markdown table for the caller.
        markdown_table = format_as_markdown_table(table_data)
        logger.debug(f"Generated markdown table:\n{markdown_table}")
        return markdown_table

    except Exception as e:
        error_msg = f"Error processing document {docx_path}: {str(e)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return f"Error: {error_msg}"
|
|
@@ -0,0 +1,536 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fill Word document table with quality standards data from markdown format
|
|
3
|
+
基于fill_word_table.py和merge_word_cells.py的功能,实现将markdown格式的质量标准表格填写到指定docx文件中
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from docx import Document
|
|
7
|
+
from docx.shared import Pt
|
|
8
|
+
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
9
|
+
import re
|
|
10
|
+
import logging
|
|
11
|
+
import traceback
|
|
12
|
+
from typing import List, Tuple, Optional, Dict
|
|
13
|
+
|
|
14
|
+
# Get logger for this module
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
def parse_markdown_table_from_string(markdown_content: str) -> List[List[str]]:
    """
    Parse a markdown table from string content and extract its data rows.

    Args:
        markdown_content: Markdown table content as string.

    Returns:
        List of data rows (the header and separator rows are excluded);
        each row is a list of cell values. Rows whose cells are all empty
        are skipped.
    """
    logger.debug(f"Parsing markdown table from string content")

    data = []
    lines = markdown_content.strip().split('\n')

    # The first pipe-delimited, non-separator line is treated as the header.
    # (The original also kept an unused `table_started` flag — removed.)
    header_found = False

    for line_num, line in enumerate(lines):
        line = line.strip()
        logger.debug(f"Processing line {line_num}: '{line}'")

        # Only lines framed by pipes look like table rows.
        if not (line.startswith('|') and line.endswith('|')):
            continue

        if '---' in line:
            # Header/body separator line — carries no data.
            logger.debug(f"Skipping separator line: {line}")
            continue

        # Split on '|' and drop the empty leading/trailing fragments.
        cells = [cell.strip() for cell in line.split('|')[1:-1]]

        if not header_found:
            # Record that the header has been consumed; it is not returned.
            logger.debug(f"Found header row: {cells}")
            header_found = True
            continue

        # Keep only rows with at least one non-empty cell.
        if cells and any(cell.strip() for cell in cells):
            logger.debug(f"Found data row: {cells}")
            data.append(cells)

    logger.info(f"Parsed {len(data)} data rows from markdown table")
    return data
|
|
63
|
+
|
|
64
|
+
def parse_markdown_table_from_file(md_file_path: str) -> List[List[str]]:
    """
    Read a markdown file and parse the table it contains.

    Args:
        md_file_path: Path to markdown file.

    Returns:
        List of rows, each row is a list of cell values.

    Raises:
        Re-raises any error from reading or parsing the file (after logging).
    """
    logger.info(f"Reading markdown table from file: {md_file_path}")

    try:
        with open(md_file_path, 'r', encoding='utf-8') as fh:
            return parse_markdown_table_from_string(fh.read())
    except Exception as e:
        logger.error(f"Error reading markdown file {md_file_path}: {str(e)}")
        raise
|
|
83
|
+
|
|
84
|
+
def restore_formatting_to_cell(cell, text: str):
    """
    Restore superscript and subscript formatting from Unicode characters to
    real Word run formatting.

    Recognizes three input notations: Unicode super-/subscript characters
    (e.g. '²', '₂'), brace notation ('^{...}', '_{...}'), and single-char
    notation ('^c', '_c'). Falls back to writing the raw text if anything
    goes wrong.

    Args:
        cell: Word table cell (modified in place).
        text: Text with Unicode super/subscript characters.
    """
    try:
        # Clear existing content; writing runs resumes on the first paragraph.
        cell.text = ""
        paragraph = cell.paragraphs[0]

        # Maps for converting Unicode super-/subscript glyphs back to the
        # plain characters that get superscript/subscript run formatting.
        superscript_map = {
            '⁰': '0', '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
            '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁺': '+', '⁻': '-',
            '⁼': '=', '⁽': '(', '⁾': ')', 'ⁿ': 'n'
        }

        subscript_map = {
            '₀': '0', '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
            '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₊': '+', '₋': '-',
            '₌': '=', '₍': '(', '₎': ')', 'ₐ': 'a', 'ₑ': 'e', 'ᵢ': 'i',
            'ₒ': 'o', 'ᵤ': 'u', 'ₓ': 'x', 'ₕ': 'h', 'ₖ': 'k', 'ₗ': 'l',
            'ₘ': 'm', 'ₙ': 'n', 'ₚ': 'p', 'ₛ': 's', 'ₜ': 't'
        }

        # Patterns for the brace notations ^{text} and _{text}.
        super_pattern = r'\^\{([^}]+)\}'
        sub_pattern = r'_\{([^}]+)\}'

        # Expand brace notation first. NOTE(review): the maps here go
        # Unicode->plain, so plain characters inside the braces miss the
        # lookup and become '^c' / '_c' sequences — which the per-character
        # loop below then turns into formatted runs. Roundabout but works.
        def replace_super(match):
            return ''.join(superscript_map.get(c, f'^{c}') for c in match.group(1))

        def replace_sub(match):
            return ''.join(subscript_map.get(c, f'_{c}') for c in match.group(1))

        text = re.sub(super_pattern, replace_super, text)
        text = re.sub(sub_pattern, replace_sub, text)

        # Process character by character, emitting one run per character so
        # each can carry its own superscript/subscript flag.
        i = 0
        while i < len(text):
            char = text[i]

            if char in superscript_map:
                # Unicode superscript glyph -> plain char with superscript run.
                run = paragraph.add_run(superscript_map[char])
                run.font.superscript = True
            elif char in subscript_map:
                # Unicode subscript glyph -> plain char with subscript run.
                run = paragraph.add_run(subscript_map[char])
                run.font.subscript = True
            elif char == '^' and i + 1 < len(text):
                # '^c' notation: next character becomes a superscript run.
                i += 1
                next_char = text[i]
                run = paragraph.add_run(next_char)
                run.font.superscript = True
            elif char == '_' and i + 1 < len(text):
                # '_c' notation: next character becomes a subscript run.
                i += 1
                next_char = text[i]
                run = paragraph.add_run(next_char)
                run.font.subscript = True
            else:
                # Normal character — plain run.
                run = paragraph.add_run(char)

            i += 1

    except Exception as e:
        logger.warning(f"Error restoring formatting to cell, using plain text: {str(e)}")
        # Fallback: write the raw text with no run-level formatting.
        cell.text = text
|
|
162
|
+
|
|
163
|
+
def clear_table_content(table, keep_header: bool = True):
    """
    Remove every row of *table*, optionally preserving the first (header) row.

    Args:
        table: Word table object (modified in place).
        keep_header: Whether to keep the first row (header).
    """
    keep_count = 1 if keep_header else 0
    rows_to_remove = len(table.rows) - keep_count

    logger.debug(f"Clearing {rows_to_remove} existing rows (keeping header: {keep_header})...")

    # Always delete from the tail so earlier indices stay valid.
    while len(table.rows) > keep_count:
        last_row = table.rows[-1]
        table._tbl.remove(last_row._tr)

    logger.debug(f"Table now has {len(table.rows)} rows")
|
|
183
|
+
|
|
184
|
+
def insert_table_rows(table, num_rows: int):
    """
    Append *num_rows* blank rows to the end of *table*.

    Args:
        table: Word table object (modified in place).
        num_rows: Number of rows to insert.
    """
    logger.debug(f"Inserting {num_rows} new empty rows...")

    for _ in range(num_rows):
        # add_row() appends at the bottom; blank out each freshly created cell.
        appended_row = table.add_row()
        for new_cell in appended_row.cells:
            new_cell.text = ""

    logger.debug(f"Table now has {len(table.rows)} total rows")
|
|
202
|
+
|
|
203
|
+
def find_quality_standards_table(doc: Document) -> Optional[int]:
    """
    Locate the quality standards table inside *doc*.

    A table qualifies when its header row contains at least two of the
    known quality-standard keywords.

    Args:
        doc: Word document object.

    Returns:
        Table index if found, None otherwise.
    """
    logger.debug(f"Searching for quality standards table in {len(doc.tables)} tables")

    # Keywords expected somewhere in the header row of the target table.
    quality_keywords = ['检验项目', '检验方法', '质量标准', '类型', '项目', '方法', '标准']

    for table_idx, table in enumerate(doc.tables):
        try:
            # Tables without rows cannot carry a header — skip them.
            if len(table.rows) == 0:
                continue

            header_cells = table.rows[0].cells
            header_text = ' '.join(cell.text for cell in header_cells).lower()
            keyword_count = sum(1 for keyword in quality_keywords if keyword in header_text)

            logger.debug(f"Table {table_idx}: header keywords found: {keyword_count}")

            # Two or more keyword hits is treated as a positive match.
            if keyword_count >= 2:
                logger.info(f"Found quality standards table at index {table_idx}")
                return table_idx

        except Exception as e:
            logger.warning(f"Error checking table {table_idx}: {str(e)}")
            continue

    logger.warning("No quality standards table found")
    return None
|
|
237
|
+
|
|
238
|
+
def merge_cells_in_column(table, col_index: int, start_row: int, end_row: int) -> None:
    """
    Merge cells in a column from start_row to end_row (inclusive).

    Note: This function prevents duplicate content by clearing subsequent cells
    before merging. The order matters: clearing happens in a separate pass
    before any merge call, then the first cell's text is restored if the merge
    changed it.

    Args:
        table: Word table object
        col_index: Column index to merge
        start_row: Starting row index
        end_row: Ending row index
    """
    # Nothing to do for a single-row "group" or an out-of-range end row
    if start_row >= end_row or end_row >= len(table.rows):
        logger.debug(f"Skipping merge for column {col_index}, rows {start_row}-{end_row} (invalid range)")
        return

    try:
        logger.debug(f"Merging column {col_index}, rows {start_row}-{end_row}")

        # Get the first cell to merge into
        first_cell = table.rows[start_row].cells[col_index]

        # Store the original content from the first cell so it can be
        # restored after merging (merging may concatenate cell texts)
        original_content = first_cell.text.strip()

        # Clear content from cells to be merged to prevent duplication
        for row_idx in range(start_row + 1, end_row + 1):
            if row_idx < len(table.rows):
                cell = table.rows[row_idx].cells[col_index]
                cell.text = ""  # Clear content before merging

        # Now merge the cells (they're empty, so no duplicate content)
        # NOTE(review): repeatedly merging into first_cell assumes python-docx
        # tolerates chained merges of adjacent cells — appears to work here,
        # confirm against the python-docx version in use
        for row_idx in range(start_row + 1, end_row + 1):
            if row_idx < len(table.rows):
                cell_to_merge = table.rows[row_idx].cells[col_index]
                first_cell.merge(cell_to_merge)

        # Ensure the merged cell has the correct content
        if first_cell.text.strip() != original_content:
            first_cell.text = original_content

        logger.debug(f"Successfully merged column {col_index}, rows {start_row}-{end_row}")

    except Exception as e:
        logger.warning(f"Could not merge cells {start_row}-{end_row} in column {col_index}: {str(e)}")
        # If merging fails, at least clear the duplicate text in subsequent cells
        # so the rendered table does not repeat the group label on every row
        try:
            for row_idx in range(start_row + 1, end_row + 1):
                if row_idx < len(table.rows):
                    cell = table.rows[row_idx].cells[col_index]
                    cell.text = ""
        except Exception as e2:
            logger.warning(f"Could not clear duplicate text: {str(e2)}")
|
|
291
|
+
|
|
292
|
+
def auto_merge_duplicate_cells(table, target_columns: Optional[List[str]] = None) -> None:
    """
    Automatically merge cells with duplicate content in specified columns.

    Rows sharing the same 类型 value are merged vertically in the 类型 column;
    within a single 类型 group, consecutive rows sharing the same 检验项目
    value are merged in the 检验项目 column.

    Args:
        table: Word table object
        target_columns: List of column names to check for merging
            (defaults to ['类型', '检验项目'])
    """
    # Fix: resolve the default per call instead of using a mutable default
    # argument, which would be a single list shared across all invocations.
    if target_columns is None:
        target_columns = ['类型', '检验项目']

    if len(table.rows) <= 1:
        logger.debug("Not enough rows for merging")
        return

    # Get header row to find column indices (substring match, first hit wins)
    header_row = table.rows[0]
    column_mapping = {}

    for col_idx, cell in enumerate(header_row.cells):
        header_text = cell.text.strip()
        for target_col in target_columns:
            if target_col in header_text:
                column_mapping[target_col] = col_idx
                break

    logger.debug(f"Column mapping for merging: {column_mapping}")

    # Merge 类型 column: group consecutive rows with identical text
    if '类型' in column_mapping:
        col_idx = column_mapping['类型']
        logger.debug(f"Processing 类型 column (index {col_idx}) for merging")

        current_type = ""
        merge_start = -1  # row index where the current run began (-1: none yet)

        for row_idx in range(1, len(table.rows)):  # Skip header
            cell_text = table.rows[row_idx].cells[col_idx].text.strip()

            if cell_text != current_type:
                # Different type found, merge previous group if it spans >1 row
                if merge_start != -1 and row_idx - merge_start > 1:
                    merge_cells_in_column(table, col_idx, merge_start, row_idx - 1)

                current_type = cell_text
                merge_start = row_idx

        # Handle last group (run that extends to the final row)
        if merge_start != -1 and len(table.rows) - merge_start > 1:
            merge_cells_in_column(table, col_idx, merge_start, len(table.rows) - 1)

    # Merge 检验项目 column, but only within runs of the same 类型
    if '检验项目' in column_mapping and '类型' in column_mapping:
        type_col_idx = column_mapping['类型']
        item_col_idx = column_mapping['检验项目']

        logger.debug(f"Processing 检验项目 column (index {item_col_idx}) for merging within same 类型")

        current_type = ""
        current_item = ""
        item_start = -1

        for row_idx in range(1, len(table.rows)):
            type_text = table.rows[row_idx].cells[type_col_idx].text.strip()
            item_text = table.rows[row_idx].cells[item_col_idx].text.strip()

            if type_text != current_type:
                # Different type, merge previous item group if needed
                # (empty current_item never forms a merge group)
                if item_start != -1 and row_idx - item_start > 1 and current_item:
                    merge_cells_in_column(table, item_col_idx, item_start, row_idx - 1)

                current_type = type_text
                current_item = item_text
                item_start = row_idx
            elif item_text == current_item and item_text != "" and current_item != "":
                # Same item in same type - continue the group
                continue
            else:
                # Different item in same type, merge previous group if needed
                if item_start != -1 and row_idx - item_start > 1 and current_item:
                    merge_cells_in_column(table, item_col_idx, item_start, row_idx - 1)

                current_item = item_text
                item_start = row_idx

        # Handle last group
        if item_start != -1 and len(table.rows) - item_start > 1 and current_item:
            merge_cells_in_column(table, item_col_idx, item_start, len(table.rows) - 1)
|
|
377
|
+
|
|
378
|
+
def fill_word_document_table(doc_path: str, output_path: str, table_data: List[List[str]],
                             table_index: Optional[int] = None,
                             target_columns: Optional[List[str]] = None,
                             auto_merge: bool = True) -> str:
    """
    Fill Word document table with quality standards data.

    Clears the existing table body (header preserved), inserts one row per
    data row, writes the cell values, optionally merges duplicate group
    cells, then saves the document.

    Args:
        doc_path: Path to input Word document
        output_path: Path to save output document
        table_data: List of rows, each row is a list of cell values
        table_index: Specific table index to fill (None for auto-detection)
        target_columns: Expected column order (defaults to
            ['类型', '检验项目', '检验方法', '质量标准'])
        auto_merge: Whether to automatically merge duplicate cells

    Returns:
        Success message or error description (errors are returned as strings,
        not raised)
    """
    # Fix: resolve the default per call instead of using a mutable default
    # argument, which would be a single list shared across all invocations.
    if target_columns is None:
        target_columns = ['类型', '检验项目', '检验方法', '质量标准']

    try:
        logger.info(f"Loading document: {doc_path}")
        doc = Document(doc_path)

        # Find target table (auto-detect by header keywords when not given)
        if table_index is None:
            table_index = find_quality_standards_table(doc)
            if table_index is None:
                return "Error: No quality standards table found in document"

        if table_index >= len(doc.tables):
            return f"Error: Table index {table_index} not found in document (only {len(doc.tables)} tables)"

        target_table = doc.tables[table_index]
        logger.info(f"Using table {table_index} with {len(target_table.rows)} rows and {len(target_table.columns)} columns")

        # Validate data
        if not table_data:
            return "Error: No table data provided"

        logger.info(f"Filling table with {len(table_data)} data rows")

        # Step 1: Clear existing table content (except header)
        clear_table_content(target_table, keep_header=True)

        # Step 2: Insert required number of rows
        num_data_rows = len(table_data)
        insert_table_rows(target_table, num_data_rows)

        # Step 3: Fill the table with data
        for i, row_data in enumerate(table_data):
            row_index = i + 1  # Skip header row
            if row_index < len(target_table.rows):
                row = target_table.rows[row_index]

                # Fill cells based on available data and columns
                max_cols = min(len(row_data), len(row.cells), len(target_columns))

                for col_idx in range(max_cols):
                    cell_text = row_data[col_idx] if col_idx < len(row_data) else ""
                    cell = row.cells[col_idx]

                    # Restore formatting for the cell content
                    restore_formatting_to_cell(cell, cell_text)

                logger.debug(f"Filled row {row_index}: {row_data[:max_cols]}")
            else:
                logger.warning(f"Skipping row {row_index}, table doesn't have enough rows")

        # Step 4: Auto-merge duplicate cells if requested
        if auto_merge:
            logger.info("Performing automatic cell merging...")
            auto_merge_duplicate_cells(target_table, target_columns[:2])  # Only merge 类型 and 检验项目

        # Step 5: Save the modified document
        doc.save(output_path)
        logger.info(f"Document saved to: {output_path}")

        return f"Successfully filled table with {len(table_data)} rows and saved to {output_path}"

    except Exception as e:
        error_msg = f"Error filling Word document table: {str(e)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return error_msg
|
|
461
|
+
|
|
462
|
+
def fill_quality_standards_from_markdown(doc_path: str, output_path: str, markdown_content: str,
                                         table_index: Optional[int] = None, auto_merge: bool = True) -> str:
    """
    Populate a document's quality standards table from a markdown table string.

    Args:
        doc_path: Path to input Word document
        output_path: Path to save output document
        markdown_content: Markdown table content as string
        table_index: Specific table index to fill (None for auto-detection)
        auto_merge: Whether to automatically merge duplicate cells

    Returns:
        Success message or error description
    """
    try:
        logger.info("Parsing markdown table content...")
        parsed_rows = parse_markdown_table_from_string(markdown_content)

        if not parsed_rows:
            return "Error: No valid table data found in markdown content"

        # Delegate the actual document manipulation to the shared fill routine
        return fill_word_document_table(doc_path, output_path, parsed_rows,
                                        table_index, auto_merge=auto_merge)

    except Exception as e:
        error_msg = f"Error processing markdown content: {str(e)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return error_msg
|
|
491
|
+
|
|
492
|
+
def fill_quality_standards_from_file(doc_path: str, output_path: str, markdown_file_path: str,
                                     table_index: Optional[int] = None, auto_merge: bool = True) -> str:
    """
    Populate a document's quality standards table from a markdown file on disk.

    Args:
        doc_path: Path to input Word document
        output_path: Path to save output document
        markdown_file_path: Path to markdown file containing table data
        table_index: Specific table index to fill (None for auto-detection)
        auto_merge: Whether to automatically merge duplicate cells

    Returns:
        Success message or error description
    """
    try:
        logger.info(f"Reading markdown file: {markdown_file_path}")
        parsed_rows = parse_markdown_table_from_file(markdown_file_path)

        if not parsed_rows:
            return "Error: No valid table data found in markdown file"

        # Delegate the actual document manipulation to the shared fill routine
        return fill_word_document_table(doc_path, output_path, parsed_rows,
                                        table_index, auto_merge=auto_merge)

    except Exception as e:
        error_msg = f"Error processing markdown file: {str(e)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return error_msg
|
|
521
|
+
|
|
522
|
+
def fill_quality_standards_inplace(doc_path: str, markdown_content: str,
                                   table_index: Optional[int] = None, auto_merge: bool = True) -> str:
    """
    Overwrite a Word document's quality standards table in place from markdown.

    Args:
        doc_path: Path to Word document to modify in-place
        markdown_content: Markdown table content as string
        table_index: Specific table index to fill (None for auto-detection)
        auto_merge: Whether to automatically merge duplicate cells

    Returns:
        Success message or error description
    """
    # An in-place update is simply a fill where source and destination coincide.
    return fill_quality_standards_from_markdown(
        doc_path,
        doc_path,
        markdown_content,
        table_index,
        auto_merge,
    )
|
src/server.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RA Document MCP Server
|
|
3
|
+
|
|
4
|
+
Run:
|
|
5
|
+
uv run src.server fastmcp_quickstart stdio
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from mcp.server.fastmcp import FastMCP
|
|
9
|
+
from src import extract_quality_standards
|
|
10
|
+
from src import fill_quality_standards
|
|
11
|
+
import os
|
|
12
|
+
import logging
|
|
13
|
+
import traceback
|
|
14
|
+
# Create the MCP server instance; "ra-doc-mcp" is the server name shown to clients
mcp = FastMCP("ra-doc-mcp")


# Log file path: three directory levels above this file
# NOTE(review): verify this resolves to the intended location once installed
# as a package (site-packages layout differs from the source tree)
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
LOG_FILE = os.path.join(ROOT_DIR, "ra-file-mcp.log")

# Configure logging: DEBUG level, written both to the log file and to the
# console (StreamHandler defaults to stderr)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Base directory for resolving relative .docx paths. Left as None here;
# when None, get_docx_path only accepts absolute paths (non-SSE mode).
DOCX_FILES_PATH = None
|
|
35
|
+
|
|
36
|
+
def get_docx_path(filename: str) -> str:
    """Resolve a filename to a full path to a Word (.docx) document.

    Args:
        filename: Name of the Word document — an absolute path, or a path
            relative to DOCX_FILES_PATH when running in SSE mode.

    Returns:
        Full path to the Word document.

    Raises:
        ValueError: If filename is relative and DOCX_FILES_PATH is not set
            (i.e. not running in SSE mode).
    """
    # If filename is already an absolute path, return it unchanged
    if os.path.isabs(filename):
        return filename

    # Check if in SSE mode (DOCX_FILES_PATH is not None)
    if DOCX_FILES_PATH is None:
        # Fix: the original f-string had no placeholder, so the error never
        # reported which filename was rejected. Include it for diagnosability.
        raise ValueError(f"Invalid filename: {filename}, must be an absolute path when not in SSE mode")

    # In SSE mode, resolve relative paths against DOCX_FILES_PATH
    return os.path.join(DOCX_FILES_PATH, filename)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Add an addition tool
@mcp.tool()
def add(a: int, b: int) -> int:
    """Add two numbers"""
    # Fix: previously returned a + b + 1, contradicting the documented contract
    return a + b
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Tool: extract the quality standards table from a Word document as markdown
@mcp.tool()
def extract_quality_standards_table(doc_path: str) -> str:
    """Extract quality standards table from docx file"""
    try:
        logger.info(f"Starting quality standards extraction for: {doc_path}")
        full_path = get_docx_path(doc_path)
        logger.debug(f"Resolved full path: {full_path}")

        # Delegate the actual parsing to the extraction module
        extracted = extract_quality_standards.extract_quality_standards_table_from_docx(full_path)
        logger.info(f"Successfully extracted quality standards from: {full_path}")
        return extracted

    except Exception as e:
        error_msg = f"Error in extract_quality_standards_table: {str(e)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return f"Error: {error_msg}"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@mcp.tool()
def fill_quality_standards_table(doc_path: str, markdown_content: str, table_index: int = None) -> str:
    """Fill Word document table with quality standards data from markdown format (modifies file in-place)

    Args:
        doc_path: Path to Word document to modify (absolute path or relative to DOCX_FILES_PATH)
        markdown_content: Markdown table content as string
        table_index: Specific table index to fill (None for auto-detection)

    Returns:
        Success message or error description
    """
    try:
        logger.info(f"Starting in-place table filling for: {doc_path}")

        # Resolve path
        resolved_path = get_docx_path(doc_path)
        logger.debug(f"Resolved document path: {resolved_path}")

        # Delegate the in-place rewrite to the fill module; duplicate group
        # cells are always merged for this tool
        outcome = fill_quality_standards.fill_quality_standards_inplace(
            resolved_path, markdown_content, table_index, auto_merge=True
        )

        logger.info(f"Successfully filled quality standards table in-place: {outcome}")
        return outcome

    except Exception as e:
        error_msg = f"Error in fill_quality_standards_table: {str(e)}"
        logger.error(error_msg)
        logger.error(f"Full traceback: {traceback.format_exc()}")
        return f"Error: {error_msg}"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# @mcp.tool()
|
|
120
|
+
# def extract_quality_standards_from_content(file_content_base64: str) -> str:
|
|
121
|
+
# """
|
|
122
|
+
# Extract quality standards table from Word document content (not file path)
|
|
123
|
+
|
|
124
|
+
# Args:
|
|
125
|
+
# file_content_base64: Base64 encoded Word document content
|
|
126
|
+
|
|
127
|
+
# Returns:
|
|
128
|
+
# Markdown formatted quality standards table
|
|
129
|
+
# """
|
|
130
|
+
# try:
|
|
131
|
+
# file_bytes = base64.b64decode(file_content_base64)
|
|
132
|
+
# return extract_quality_standards.extract_quality_standards_table_from_bytes(file_bytes)
|
|
133
|
+
# except Exception as e:
|
|
134
|
+
# return f"Error decoding or processing file content: {e}"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# Dynamic resource: greeting addressable as greeting://<name>
@mcp.resource("greeting://{name}")
def get_greeting(name: str) -> str:
    """Get a personalized greeting"""
    # Built by concatenation; output identical to f"Hello, {name}!"
    return "Hello, " + name + "!"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Prompt: greeting generator with a selectable tone
@mcp.prompt()
def greet_user(name: str, style: str = "friendly") -> str:
    """Generate a greeting prompt"""
    tone_instructions = {
        "friendly": "Please write a warm, friendly greeting",
        "formal": "Please write a formal, professional greeting",
        "casual": "Please write a casual, relaxed greeting",
    }

    # Unknown styles silently fall back to the friendly tone
    instruction = tone_instructions.get(style, tone_instructions["friendly"])
    return f"{instruction} for someone named {name}."
|
|
155
|
+
|
|
156
|
+
# Entry point: serve MCP requests over the stdio transport when run directly
if __name__ == "__main__":
    mcp.run(transport='stdio')
|