ima-claude 2.9.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -15
- package/dist/cli.js +385 -17
- package/package.json +1 -1
- package/platforms/gemini/adapter.ts +443 -0
- package/platforms/gemini/gemini-extension.json +17 -0
- package/platforms/gemini/hooks-translator.py +66 -0
- package/platforms/shared/detector.ts +5 -1
- package/plugins/ima-claude/.claude-plugin/plugin.json +2 -2
- package/plugins/ima-claude/skills/gh-cli/SKILL.md +286 -0
- package/plugins/ima-claude/skills/ima-doc2pdf/SKILL.md +242 -0
- package/plugins/ima-claude/skills/ima-doc2pdf/references/formatting-spec.md +88 -0
- package/plugins/ima-claude/skills/ima-doc2pdf/scripts/docx_utils.py +21 -0
- package/plugins/ima-claude/skills/ima-doc2pdf/scripts/extract_docx.py +384 -0
- package/plugins/ima-claude/skills/ima-doc2pdf/scripts/generate_pdf.py +663 -0
- package/plugins/ima-claude/skills/mcp-gitea/SKILL.md +358 -0
- package/plugins/ima-claude/skills/mcp-github/SKILL.md +200 -0
- package/plugins/ima-claude/skills/mcp-qdrant/SKILL.md +21 -10
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extract structured text from a Word document for IMA branded document generation.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python3 extract_docx.py <path_to_docx> [--json]
|
|
6
|
+
|
|
7
|
+
Outputs a structured representation of the document with:
|
|
8
|
+
- Heading hierarchy (H1, H2, H3)
|
|
9
|
+
- Body paragraphs with bold run markers
|
|
10
|
+
- Reference citations
|
|
11
|
+
- Q&A detection
|
|
12
|
+
- Word comments (if any @SLOT markers exist)
|
|
13
|
+
|
|
14
|
+
Required: pip install python-docx
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import sys
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from docx import Document
|
|
22
|
+
from docx.oxml.ns import qn
|
|
23
|
+
|
|
24
|
+
sys.path.insert(0, str(Path(__file__).parent))
|
|
25
|
+
from docx_utils import has_page_break
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def extract_comments(doc):
    """Collect ``@SLOT:`` markers from the document's Word comments.

    The comments part is looked up through the main document part
    (``doc.part``), because the comments relationship is declared by the
    document part rather than by the package root.

    Args:
        doc: The python-docx document object returned by ``Document(...)``.

    Returns:
        A dict mapping comment id to slot name for every comment whose text
        starts with ``@SLOT:``. The dict is empty when the document has no
        comments part, and may be partial when the part cannot be parsed.
    """
    comments_reltype = (
        'http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments'
    )
    slot_marker = '@SLOT:'
    comments = {}

    try:
        comments_part = doc.part.part_related_by(comments_reltype)
    except (KeyError, ValueError):
        # KeyError: no comments relationship; ValueError: ambiguous (several).
        # Either way there is no single comments part to read.
        return comments
    if comments_part is None:
        return comments

    try:
        from lxml import etree

        tree = etree.fromstring(comments_part.blob)
        ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

        for comment in tree.findall('.//w:comment', ns):
            comment_id = comment.get(qn('w:id'))
            # A comment's text may be split over several w:t elements.
            text = ''.join(
                node.text for node in comment.findall('.//w:t', ns) if node.text
            ).strip()
            if text.startswith(slot_marker):
                # Drop only the leading marker; keep any later occurrence.
                comments[comment_id] = text[len(slot_marker):].strip()
    except Exception as exc:
        # Best effort: comments are optional, so report and carry on instead
        # of aborting the whole extraction. stderr keeps --json output clean.
        print(f"Warning: could not parse Word comments: {exc}", file=sys.stderr)

    return comments
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_comment_refs(paragraph):
    """Return the ids of the comment ranges that start inside *paragraph*.

    Walks every element under the paragraph's XML node and collects the
    ``w:id`` attribute of each ``commentRangeStart`` element, skipping
    elements whose id is missing or empty.
    """
    id_attr = qn('w:id')
    candidate_ids = (
        node.get(id_attr)
        for node in paragraph._element.iter()
        if node.tag.endswith('commentRangeStart')
    )
    return [cid for cid in candidate_ids if cid]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def extract_runs_with_formatting(paragraph):
    """Describe each non-empty run of *paragraph* as a small dict.

    Returns:
        A list of ``{'text', 'bold', 'italic'}`` dicts, one per run that has
        text. A falsy ``bold``/``italic`` value (including ``None``) is
        reported as ``False``.
    """
    return [
        {
            'text': run.text,
            'bold': run.bold or False,
            'italic': run.italic or False,
        }
        for run in paragraph.runs
        if run.text
    ]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_num_info(paragraph):
    """Read the Word numbering properties of *paragraph*.

    Returns:
        A ``(numId, ilvl)`` tuple. Both are ``None`` when the paragraph has no
        ``w:pPr``/``w:numPr``. Otherwise ``numId`` is the raw ``w:val`` string
        (or ``None`` if the element is absent) and ``ilvl`` is an int that
        defaults to 0 when the level is absent, has no value, or is not a
        valid integer.
    """
    pPr = paragraph._element.find(qn('w:pPr'))
    numPr = pPr.find(qn('w:numPr')) if pPr is not None else None
    if numPr is None:
        return None, None

    val_attr = qn('w:val')
    ilvl_el = numPr.find(qn('w:ilvl'))
    numId_el = numPr.find(qn('w:numId'))

    numId = numId_el.get(val_attr) if numId_el is not None else None
    raw_level = ilvl_el.get(val_attr) if ilvl_el is not None else None
    try:
        # A level element without w:val would otherwise reach int(None).
        ilvl = int(raw_level) if raw_level is not None else 0
    except ValueError:
        ilvl = 0
    return numId, ilvl
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def is_all_bold(para):
    """Tell whether every run of *para* that has visible text is bold.

    Whitespace-only runs are ignored. Returns ``False`` when no run has
    visible text, or when any such run has a falsy ``bold`` value.
    """
    found_text = False
    for run in para.runs:
        if not run.text.strip():
            continue
        if not run.bold:
            return False
        found_text = True
    return found_text
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Characters that render as nothing; a paragraph made only of these is empty.
_INVISIBLE_CHARS = '\u200b\u200c\u200d\ufeff\u00a0'

_DATE_RE = re.compile(r'^Updated\s+\w+\s+(\d{1,2}\w{0,2},?\s+)?\d{4}')
_ANSWER_RE = re.compile(r'^(YES|NO)[.\s]')
_TABLE_RE = re.compile(r'^Table\s+\d+')
_FIGURE_RE = re.compile(r'^Figure\s+\d+')
_QUESTION_RE = re.compile(r'^Q[:\.]?\s')
_NUMBERED_ITEM_RE = re.compile(r'^\d+\.\s+\w')
_CREDENTIAL_RE = re.compile(
    r'\b(M\.?D\.?|Ph\.?D\.?|D\.?O\.?|FCCM|FCCP|R\.?N\.?|N\.?P\.?|D\.?N\.?P\.?|FACP|FACEP)\b'
)

_CITATION_MARKERS = ('et al', ' pp.', ' vol.', ' doi:', '(19', '(20', 'j.', 'epub')
_DISCLAIMER_PATTERNS = (
    'not intended as',
    'does not constitute',
    'medical advice',
    'consult your',
    'disclaimer',
)
_STRONG_WARNING_PATTERNS = ('warning:', 'caution:', 'important safety')
_WEAK_WARNING_PATTERNS = ('should not treat themselves',)


def classify_paragraph(para, runs_info):
    """Classify a paragraph's role in the document.

    Checks run in a fixed order and the first match wins, so earlier rules
    take precedence (e.g. dates and YES/NO answers are tested before the
    "short all-bold paragraph is a heading" rule).

    Args:
        para: Paragraph object exposing ``text``, ``style``, ``runs`` and
            ``_element``.
        runs_info: Run dicts as produced by ``extract_runs_with_formatting``;
            only the ``bold`` flag of the first run is consulted.

    Returns:
        A ``(type, meta)`` tuple. ``type`` is one of ``'empty'``, ``'h<N>'``,
        ``'date'``, ``'answer_start'``, ``'table_caption'``,
        ``'heading_bold'``, ``'question'``, ``'reference'``,
        ``'figure_caption'``, ``'disclaimer'``, ``'warning'``, ``'author'``,
        ``'bullet'`` or ``'body'``. ``meta`` is ``'YES'`` or ``'NO'`` for
        ``'answer_start'`` and ``None`` otherwise.
    """
    text = para.text.strip()
    # Guard against a style object whose name is None.
    style_name = (para.style.name if para.style else '') or ''

    if not text or not text.strip(_INVISIBLE_CHARS):
        return 'empty', None

    # Word heading styles ("Heading 1", "Heading 2", ...).
    if style_name.startswith('Heading'):
        try:
            level = int(style_name.replace('Heading ', '').strip())
        except ValueError:
            # Heading-like style without a numeric level.
            level = 1
        return f'h{level}', None

    if style_name == 'Title':
        return 'h1', None

    # Dates can be bold, so this must precede the bold-heading rule.
    if _DATE_RE.match(text):
        return 'date', None

    # YES/NO answer lines are bold too; also tested before the heading rule.
    answer = _ANSWER_RE.match(text)
    if answer:
        # Use the captured word: slicing text[:3] left "NO " with a trailing
        # space whenever "NO" was followed by whitespace.
        return 'answer_start', answer.group(1)

    all_bold = is_all_bold(para)

    # Short all-bold paragraphs act as section headings, except "Table N"
    # lines, which are table captions.
    if all_bold and len(text) < 120:
        if _TABLE_RE.match(text):
            return 'table_caption', None
        return 'heading_bold', None

    # Q&A: explicit "Q" prefix, or a paragraph with a question mark whose
    # first run is bold.
    if _QUESTION_RE.match(text) or (
        runs_info and runs_info[0].get('bold') and '?' in text
    ):
        return 'question', None

    text_lower = text.lower()

    # A numbered item counts as a reference only if it looks like a citation;
    # otherwise it falls through and is treated as ordinary content.
    if _NUMBERED_ITEM_RE.match(text) and any(
        marker in text_lower for marker in _CITATION_MARKERS
    ):
        return 'reference', None

    if _FIGURE_RE.match(text):
        return 'figure_caption', None

    # Table captions that were not caught by the short all-bold rule above.
    if _TABLE_RE.match(text):
        return 'table_caption', None

    if any(pattern in text_lower for pattern in _DISCLAIMER_PATTERNS):
        return 'disclaimer', None

    # Explicit signal words always mark a warning; the vaguer phrases only
    # count when the paragraph is also all-bold and short.
    if any(pattern in text_lower for pattern in _STRONG_WARNING_PATTERNS):
        return 'warning', None
    if all_bold and len(text) < 200 and any(
        pattern in text_lower for pattern in _WEAK_WARNING_PATTERNS
    ):
        return 'warning', None

    # Short text containing a professional credential is an author line.
    if _CREDENTIAL_RE.search(text) and len(text) < 100:
        return 'author', None

    # Lists: either a Word list style or explicit numbering properties.
    style_lower = style_name.lower()
    if 'list bullet' in style_lower or 'list number' in style_lower:
        return 'bullet', None
    pPr = para._element.find(qn('w:pPr'))
    if pPr is not None and pPr.find(qn('w:numPr')) is not None:
        return 'bullet', None

    return 'body', None
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def extract_document(docx_path):
    """Parse a Word file into sections, Q&A pairs and references.

    Paragraphs are visited in document order and routed as follows:

    * Blank paragraphs are dropped, except that one carrying a page break
      becomes a ``'page_break'`` section entry.
    * A paragraph whose text is exactly "references" (any case) opens the
      reference list. While it is open, ``'reference'`` paragraphs go to
      ``references``; once at least one has been collected, every later
      paragraph not classified ``'empty'`` is appended there as well. If a
      different paragraph arrives before any reference, the list is closed.
    * A ``'question'`` paragraph opens a Q&A pair; following
      ``'answer_start'``/``'body'`` paragraphs are its answer parts, and a
      paragraph whose type starts with ``'h'`` closes the pair.
    * Everything else is appended to ``sections``.

    Returns:
        A dict with ``source_file``, ``total_paragraphs``, ``sections``,
        ``qa_pairs``, ``references`` and ``slot_overrides`` (slot name mapped
        to comment id).
    """
    doc = Document(docx_path)
    slot_by_comment = extract_comments(doc)
    paragraphs = doc.paragraphs

    sections, references, qa_pairs = [], [], []
    open_qa = None
    inside_refs = False

    for idx, para in enumerate(paragraphs):
        text = para.text.strip()

        if not text:
            # Keep only the explicit page break (w:br type=page), if any.
            if has_page_break(para):
                sections.append({
                    'index': idx, 'type': 'page_break', 'text': '',
                    'runs': [], 'style': '',
                })
            continue

        runs = extract_runs_with_formatting(para)
        kind, meta = classify_paragraph(para, runs)

        # First @SLOT comment anchored in this paragraph, if any.
        slot = next(
            (
                slot_by_comment[ref]
                for ref in get_comment_refs(para)
                if ref in slot_by_comment
            ),
            None,
        )
        num_id, level = get_num_info(para)

        entry = {
            'index': idx,
            'type': kind,
            'text': text,
            'runs': runs,
            'style': para.style.name if para.style else '',
        }
        # Optional keys are added in a fixed order so output stays stable.
        if has_page_break(para):
            entry['page_break'] = True
        if num_id is not None:
            entry['numId'] = num_id
            entry['ilvl'] = level
        if slot:
            entry['slot_override'] = slot
        if meta:
            entry['meta'] = meta

        # --- reference section tracking ---
        if text.lower() == 'references':
            inside_refs = True
            entry['type'] = 'ref_heading'
            sections.append(entry)
            continue

        if inside_refs:
            if kind == 'reference':
                references.append(entry)
                continue
            if kind != 'empty':
                # May be a reference without a number prefix, but only once
                # the list has actually started.
                if references:
                    references.append(entry)
                    continue
                inside_refs = False

        # --- Q&A pairing ---
        if kind == 'question':
            if open_qa is not None:
                qa_pairs.append(open_qa)
            open_qa = {'question': entry, 'answer_parts': []}
            continue

        if open_qa is not None:
            if kind in ('answer_start', 'body'):
                open_qa['answer_parts'].append(entry)
                continue
            if kind.startswith('h'):
                qa_pairs.append(open_qa)
                open_qa = None

        sections.append(entry)

    # Flush the pair that was still open at the end of the document.
    if open_qa is not None:
        qa_pairs.append(open_qa)

    return {
        'source_file': str(docx_path),
        'total_paragraphs': len(paragraphs),
        'sections': sections,
        'qa_pairs': qa_pairs,
        'references': references,
        'slot_overrides': {
            slot_name: comment_id
            for comment_id, slot_name in slot_by_comment.items()
        },
    }
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# Marker printed in front of a section line, keyed by paragraph type.
_READABLE_PREFIXES = {
    'h1': '# ',
    'h2': '## ',
    'h3': '### ',
    'heading_bold': '** ',
    'disclaimer': '[DISCLAIMER] ',
    'warning': '[WARNING] ',
    'date': '[DATE] ',
    'author': '[AUTHOR] ',
    'figure_caption': '[FIGURE] ',
    'ref_heading': '[REFERENCES] ',
}


def _preview(text, limit):
    """Return *text* cut to *limit* characters, adding '...' only if cut."""
    return text[:limit] + '...' if len(text) > limit else text


def print_readable(data):
    """Print a human-readable summary of an extraction result to stdout.

    Args:
        data: The dict returned by ``extract_document``. Reads
            ``source_file``, ``total_paragraphs``, ``sections``,
            ``qa_pairs``, ``references`` and ``slot_overrides``.

    Output is a count header, one line per section (with a type marker and
    any ``@SLOT`` override), then the Q&A pairs and the first three
    references when present. Long texts are shortened and marked with
    ``...``; texts that fit are printed unchanged.
    """
    sections = data['sections']
    qa_pairs = data['qa_pairs']
    references = data['references']

    print(f"Source: {data['source_file']}")
    print(f"Total paragraphs: {data['total_paragraphs']}")
    print(f"Sections: {len(sections)}")
    print(f"Q&A pairs: {len(qa_pairs)}")
    print(f"References: {len(references)}")
    print(f"@SLOT overrides: {len(data['slot_overrides'])}")
    print()

    print("=== DOCUMENT STRUCTURE ===")
    for entry in sections:
        prefix = _READABLE_PREFIXES.get(entry['type'], '')
        slot = entry.get('slot_override', '')
        slot_str = f' → @SLOT:{slot}' if slot else ''
        print(f" {prefix}{_preview(entry['text'], 120)}{slot_str}")

    if qa_pairs:
        print()
        print("=== Q&A PAIRS ===")
        for qa in qa_pairs:
            print(f" Q: {_preview(qa['question']['text'], 100)}")
            print(f" ({len(qa['answer_parts'])} answer paragraph(s))")

    if references:
        print()
        print(f"=== REFERENCES ({len(references)} total) ===")
        # Only the first three are shown; the rest are summarized.
        for ref in references[:3]:
            print(f" {_preview(ref['text'], 100)}")
        if len(references) > 3:
            print(f" ... and {len(references) - 3} more")
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
if __name__ == '__main__':
    import io

    # Re-wrap stdout as UTF-8; characters that cannot be encoded are replaced
    # instead of raising.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

    cli_args = sys.argv
    if len(cli_args) < 2:
        print("Usage: python extract_docx.py <path_to_docx> [--json]")
        sys.exit(1)

    docx_path = Path(cli_args[1])
    if not docx_path.exists():
        print(f"Error: File not found: {docx_path}")
        sys.exit(1)

    data = extract_document(docx_path)

    if '--json' in cli_args:
        # Machine-readable output (consumed by Claude).
        print(json.dumps(data, indent=2, ensure_ascii=False))
    else:
        # Summary for a human reader.
        print_readable(data)
|