ima-claude 2.9.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ """
2
+ Extract structured text from a Word document for IMA branded document generation.
3
+
4
+ Usage:
5
+ python3 extract_docx.py <path_to_docx> [--json]
6
+
7
+ Outputs a structured representation of the document with:
8
+ - Heading hierarchy (H1, H2, H3)
9
+ - Body paragraphs with bold run markers
10
+ - Reference citations
11
+ - Q&A detection
12
+ - Word comments (if any @SLOT markers exist)
13
+
14
+ Required: pip install python-docx
15
+ """
16
+
17
+ import sys
18
+ import json
19
+ import re
20
+ from pathlib import Path
21
+ from docx import Document
22
+ from docx.oxml.ns import qn
23
+
24
+ sys.path.insert(0, str(Path(__file__).parent))
25
+ from docx_utils import has_page_break
26
+
27
+
28
def extract_comments(doc):
    """Collect ``@SLOT:`` markers from the document's Word comments.

    Args:
        doc: A python-docx ``Document``.

    Returns:
        dict mapping comment id (the ``w:id`` attribute, a string) to the
        slot name that follows the ``@SLOT:`` prefix. Comments that do not
        start with ``@SLOT:`` are ignored. An empty dict is returned when the
        document has no comments part or the part cannot be parsed.
    """
    comments = {}

    # word/comments.xml is a relationship of the main document part
    # (word/_rels/document.xml.rels), not of the package itself, so the
    # lookup has to go through doc.part rather than doc.part.package.
    try:
        comments_part = doc.part.part_related_by(
            'http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments'
        )
    except (KeyError, ValueError):
        # No comments relationship (or an ambiguous one): nothing to extract.
        return comments
    if comments_part is None:
        return comments

    from lxml import etree
    try:
        tree = etree.fromstring(comments_part.blob)
    except etree.XMLSyntaxError:
        # Best effort: a malformed comments part must not abort extraction.
        return comments

    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    marker = '@SLOT:'
    id_attr = qn('w:id')

    for comment in tree.findall('.//w:comment', ns):
        # A comment body may be split across several w:t text nodes.
        text = ''.join(
            node.text for node in comment.findall('.//w:t', ns) if node.text
        ).strip()
        if text.startswith(marker):
            # Strip only the leading marker; keep the rest of the text intact.
            comments[comment.get(id_attr)] = text[len(marker):].strip()

    return comments
57
+
58
+
59
def get_comment_refs(paragraph):
    """Return the ids of all comments whose range starts in *paragraph*.

    Walks every element under the paragraph's XML node, picks those whose tag
    ends with ``commentRangeStart`` and collects their non-empty ``w:id``
    attribute values in document order.
    """
    id_attr = qn('w:id')
    range_starts = (
        node for node in paragraph._element.iter()
        if node.tag.endswith('commentRangeStart')
    )
    candidate_ids = (node.get(id_attr) for node in range_starts)
    return [cid for cid in candidate_ids if cid]
68
+
69
+
70
def extract_runs_with_formatting(paragraph):
    """Return one dict per non-empty run with its text and bold/italic flags.

    ``bold`` and ``italic`` are normalised so that a falsy value on the run
    (including ``None``) is reported as ``False``.
    """
    return [
        {
            'text': run.text,
            'bold': run.bold or False,
            'italic': run.italic or False,
        }
        for run in paragraph.runs
        if run.text
    ]
83
+
84
+
85
def get_num_info(paragraph):
    """Return ``(numId, ilvl)`` from the paragraph's ``w:numPr``, if any.

    ``(None, None)`` is returned when the paragraph has no ``w:pPr`` or no
    ``w:numPr``. Otherwise ``numId`` is the raw ``w:val`` string (or ``None``
    when the ``w:numId`` element is absent) and ``ilvl`` is an int that
    defaults to 0 when the ``w:ilvl`` element is absent.
    """
    para_props = paragraph._element.find(qn('w:pPr'))
    num_props = para_props.find(qn('w:numPr')) if para_props is not None else None
    if num_props is None:
        return None, None

    level_el = num_props.find(qn('w:ilvl'))
    num_id_el = num_props.find(qn('w:numId'))

    level = '0' if level_el is None else level_el.get(qn('w:val'))
    num_id = None if num_id_el is None else num_id_el.get(qn('w:val'))
    return num_id, int(level)
98
+
99
+
100
def is_all_bold(para):
    """Return True when the paragraph has visible text and every such run is bold.

    Runs whose text is empty or whitespace-only are ignored; a paragraph with
    no remaining runs is not considered bold.
    """
    visible_runs = [run for run in para.runs if run.text.strip()]
    return bool(visible_runs) and all(run.bold for run in visible_runs)
106
+
107
+
108
def classify_paragraph(para, runs_info):
    """Classify a paragraph's role in the document.

    The checks run in a fixed priority order and the first match wins, so the
    order below is significant (e.g. dates and YES/NO answers are tested
    before the "short all-bold paragraph is a heading" heuristic).

    Args:
        para: A python-docx paragraph (needs ``text``, ``style``, ``runs`` and
            ``_element``).
        runs_info: Run dicts as produced by ``extract_runs_with_formatting``;
            only the ``bold`` flag of the first run is consulted.

    Returns:
        ``(type, meta)``. ``type`` is one of ``'empty'``, ``'h<N>'``,
        ``'date'``, ``'answer_start'``, ``'table_caption'``,
        ``'heading_bold'``, ``'question'``, ``'reference'``,
        ``'figure_caption'``, ``'disclaimer'``, ``'warning'``, ``'author'``,
        ``'bullet'`` or ``'body'``. ``meta`` is ``'YES'`` or ``'NO'`` for
        ``'answer_start'`` and ``None`` otherwise.
    """
    text = para.text.strip()
    style_name = para.style.name if para.style else ''

    # Paragraphs made up solely of zero-width / no-break characters count as empty.
    if not text or not text.strip('\u200b\u200c\u200d\ufeff\u00a0'):
        return 'empty', None

    # Heading detection — Word heading styles
    if style_name.startswith('Heading'):
        try:
            level = int(style_name.replace('Heading ', '').strip())
        except ValueError:
            # Heading style without a parseable level: treat as top level.
            level = 1
        return f'h{level}', None

    # Title style
    if style_name == 'Title':
        return 'h1', None

    # Date detection — must come before bold heading check since dates can be bold
    if re.match(r'^Updated\s+\w+\s+(\d{1,2}\w{0,2},?\s+)?\d{4}', text):
        return 'date', None

    # YES/NO answer lines — must come before bold heading check.
    # These are bold answer lines under numbered Q&A headings.
    answer = re.match(r'^(YES|NO)[.\s]', text)
    if answer:
        # Use the captured word so "NO ..." yields 'NO', not 'NO ' with the
        # separator still attached.
        return 'answer_start', answer.group(1)

    # Bold body text = heading (common in IMA medical docs)
    # Short all-bold Body/Default paragraphs are treated as section headings
    if is_all_bold(para) and len(text) < 120:
        # "Table N." lines are table captions, not headings
        if re.match(r'^Table\s+\d+', text):
            return 'table_caption', None
        return 'heading_bold', None

    # Q&A detection
    if re.match(r'^Q[:\.]?\s', text) or (runs_info and runs_info[0].get('bold') and '?' in text):
        return 'question', None

    # Reference detection — numbered list item that looks like a citation
    if re.match(r'^\d+\.\s+\w', text):
        citation_markers = ('et al', ' pp.', ' vol.', ' doi:', '(19', '(20', 'j.', 'epub')
        if any(m in text.lower() for m in citation_markers):
            return 'reference', None
        # Fall through to body if no citation markers — likely a numbered list item

    # Figure caption
    if re.match(r'^Figure\s+\d+', text):
        return 'figure_caption', None

    # Table caption that was not caught by the short all-bold branch above
    if re.match(r'^Table\s+\d+', text):
        return 'table_caption', None

    # Disclaimer detection
    DISCLAIMER_PATTERNS = (
        'not intended as',
        'does not constitute',
        'medical advice',
        'consult your',
        'disclaimer',
    )
    text_lower = text.lower()
    if any(p in text_lower for p in DISCLAIMER_PATTERNS):
        return 'disclaimer', None

    # Warning box detection — explicit signal words always match; vague patterns
    # only match when the paragraph is also all-bold and short (formatting signal).
    STRONG_WARNING_PATTERNS = ('warning:', 'caution:', 'important safety')
    WEAK_WARNING_PATTERNS = ('should not treat themselves',)
    if any(p in text_lower for p in STRONG_WARNING_PATTERNS):
        return 'warning', None
    if is_all_bold(para) and len(text) < 200 and any(p in text_lower for p in WEAK_WARNING_PATTERNS):
        return 'warning', None

    # Author detection — presence of professional credentials in short text
    CREDENTIAL_PATTERN = r'\b(M\.?D\.?|Ph\.?D\.?|D\.?O\.?|FCCM|FCCP|R\.?N\.?|N\.?P\.?|D\.?N\.?P\.?|FACP|FACEP)\b'
    if re.search(CREDENTIAL_PATTERN, text) and len(text) < 100:
        return 'author', None

    # Bullet/list paragraph detection — Word List Bullet style or numPr XML
    style_lower = style_name.lower()
    if 'list bullet' in style_lower or 'list number' in style_lower:
        return 'bullet', None
    pPr = para._element.find(qn('w:pPr'))
    if pPr is not None and pPr.find(qn('w:numPr')) is not None:
        return 'bullet', None

    return 'body', None
200
+
201
+
202
def extract_document(docx_path):
    """Parse a .docx file into sections, Q&A pairs, references and slot overrides.

    Each non-empty paragraph is classified and turned into an entry dict.
    Entries are routed to exactly one of three buckets:

    * ``references`` — paragraphs following a literal "References" line,
    * ``qa_pairs`` — a question entry plus the answer/body entries after it,
    * ``sections`` — everything else, in document order (including
      ``page_break`` markers for empty paragraphs that carry a page break).

    Returns a dict with ``source_file``, ``total_paragraphs``, ``sections``,
    ``qa_pairs``, ``references`` and ``slot_overrides`` (slot name -> comment id).
    """
    doc = Document(docx_path)
    slot_by_comment = extract_comments(doc)

    sections, references, qa_pairs = [], [], []
    in_references = False
    open_qa = None

    for idx, para in enumerate(doc.paragraphs):
        stripped = para.text.strip()

        # Empty paragraphs are dropped unless they carry an explicit page break.
        if not stripped:
            if has_page_break(para):
                sections.append({
                    'index': idx, 'type': 'page_break', 'text': '',
                    'runs': [], 'style': ''
                })
            continue

        runs = extract_runs_with_formatting(para)
        para_type, meta = classify_paragraph(para, runs)

        # First comment anchored in this paragraph that carries an @SLOT marker.
        slot_override = next(
            (slot_by_comment[ref] for ref in get_comment_refs(para)
             if ref in slot_by_comment),
            None,
        )

        numId, ilvl = get_num_info(para)

        # Build the entry; optional keys are added only when they have a value.
        entry = {
            'index': idx,
            'type': para_type,
            'text': stripped,
            'runs': runs,
            'style': para.style.name if para.style else '',
        }
        if has_page_break(para):
            entry['page_break'] = True
        if numId is not None:
            entry['numId'] = numId
            entry['ilvl'] = ilvl
        if slot_override:
            entry['slot_override'] = slot_override
        if meta:
            entry['meta'] = meta

        # A literal "References" line opens the reference list.
        if stripped.lower() == 'references':
            in_references = True
            entry['type'] = 'ref_heading'
            sections.append(entry)
            continue

        if in_references:
            if para_type == 'reference':
                references.append(entry)
                continue
            if para_type != 'empty':
                # Once at least one reference has been seen, later paragraphs
                # are taken to be references lacking the number prefix.
                if references:
                    references.append(entry)
                    continue
                in_references = False

        # Q&A tracking: a question opens a pair, answer/body lines extend it,
        # and any type starting with 'h' closes it.
        if para_type == 'question':
            if open_qa:
                qa_pairs.append(open_qa)
            open_qa = {'question': entry, 'answer_parts': []}
            continue
        if open_qa:
            if para_type in ('answer_start', 'body'):
                open_qa['answer_parts'].append(entry)
                continue
            if para_type.startswith('h'):
                qa_pairs.append(open_qa)
                open_qa = None

        sections.append(entry)

    # Flush a pair still open at end of document.
    if open_qa:
        qa_pairs.append(open_qa)

    slot_overrides = {slot: cid for cid, slot in slot_by_comment.items()}
    return {
        'source_file': str(docx_path),
        'total_paragraphs': len(doc.paragraphs),
        'sections': sections,
        'qa_pairs': qa_pairs,
        'references': references,
        'slot_overrides': slot_overrides,
    }
302
+
303
+
304
def print_readable(data):
    """Print a human-readable summary of an ``extract_document`` result.

    Emits the headline counts, then one line per section entry (with a
    type-specific prefix, a 120-character text preview and any slot override),
    then the Q&A pairs and the first three references when present.
    """
    # Prefix shown in front of each section line, keyed by entry type.
    type_prefixes = {
        'h1': '# ',
        'h2': '## ',
        'h3': '### ',
        'heading_bold': '** ',
        'disclaimer': '[DISCLAIMER] ',
        'warning': '[WARNING] ',
        'date': '[DATE] ',
        'author': '[AUTHOR] ',
        'figure_caption': '[FIGURE] ',
        'ref_heading': '[REFERENCES] ',
    }

    print(f"Source: {data['source_file']}")
    print(f"Total paragraphs: {data['total_paragraphs']}")
    print(f"Sections: {len(data['sections'])}")
    print(f"Q&A pairs: {len(data['qa_pairs'])}")
    print(f"References: {len(data['references'])}")
    print(f"@SLOT overrides: {len(data['slot_overrides'])}")
    print()

    print("=== DOCUMENT STRUCTURE ===")
    for entry in data['sections']:
        prefix = type_prefixes.get(entry['type'], '')

        slot = entry.get('slot_override', '')
        slot_str = f' → @SLOT:{slot}' if slot else ''

        full_text = entry['text']
        text_preview = full_text[:120]
        if len(full_text) > 120:
            text_preview += '...'
        print(f" {prefix}{text_preview}{slot_str}")

    qa_pairs = data['qa_pairs']
    if qa_pairs:
        print()
        print("=== Q&A PAIRS ===")
        for pair in qa_pairs:
            q_text = pair['question']['text'][:100]
            a_count = len(pair['answer_parts'])
            print(f" Q: {q_text}")
            print(f" ({a_count} answer paragraph(s))")

    refs = data['references']
    if refs:
        total = len(refs)
        print()
        print(f"=== REFERENCES ({total} total) ===")
        for ref in refs[:3]:
            print(f" {ref['text'][:100]}...")
        if total > 3:
            print(f" ... and {total - 3} more")
360
+
361
+
362
if __name__ == '__main__':
    import io

    # Re-wrap stdout as UTF-8 with replacement so non-ASCII document text
    # can be printed without raising an encoding error.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

    cli_args = sys.argv
    if len(cli_args) < 2:
        print("Usage: python extract_docx.py <path_to_docx> [--json]")
        sys.exit(1)

    docx_path = Path(cli_args[1])
    if not docx_path.exists():
        print(f"Error: File not found: {docx_path}")
        sys.exit(1)

    # The flag may appear anywhere on the command line.
    output_json = '--json' in cli_args

    data = extract_document(docx_path)

    if not output_json:
        # Human-readable output
        print_readable(data)
    else:
        # JSON output for Claude to consume
        print(json.dumps(data, indent=2, ensure_ascii=False))