docx-to-builder 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +177 -0
- package/cli.js +38 -0
- package/docx-to-builder.py +1268 -0
- package/examples/sample-builder.js +943 -0
- package/examples/sample-template.docx +0 -0
- package/package.json +45 -0
|
@@ -0,0 +1,1268 @@
|
|
|
1
|
+
"""
|
|
2
|
+
docx-to-builder.py
|
|
3
|
+
|
|
4
|
+
Parses any .docx template and generates a complete, ready-to-run JavaScript
|
|
5
|
+
builder using the `docx` npm package (https://docx.js.org).
|
|
6
|
+
|
|
7
|
+
The generated builder:
|
|
8
|
+
- Reproduces the exact layout, colors, fonts, spacing, borders, and tables
|
|
9
|
+
- Replaces bracketed template placeholders like [Client Name] with data.fieldName
|
|
10
|
+
- Supports multi-section documents (cover page + body with separate margins)
|
|
11
|
+
- Preserves hyperlinks as ExternalHyperlink in the output
|
|
12
|
+
- Outputs a properly branded .docx when run with Node.js
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
python3 docx-to-builder.py <template.docx> [--output <builder.js>]
|
|
16
|
+
python3 docx-to-builder.py <template.docx> --list
|
|
17
|
+
|
|
18
|
+
Options:
|
|
19
|
+
--output <path> Write the generated builder to this path instead of next to the template
|
|
20
|
+
--list Print a summary of what was found in the template and exit (no file written)
|
|
21
|
+
|
|
22
|
+
Examples:
|
|
23
|
+
python3 docx-to-builder.py sample-template.docx
|
|
24
|
+
python3 docx-to-builder.py my-proposal.docx --output proposal-builder.js
|
|
25
|
+
python3 docx-to-builder.py my-proposal.docx --list
|
|
26
|
+
|
|
27
|
+
Requirements:
|
|
28
|
+
- Python 3.8+ (standard library only — no pip installs needed)
|
|
29
|
+
- Node.js + docx npm package in the output project: npm install docx
|
|
30
|
+
|
|
31
|
+
The generated file will be placed next to the input .docx unless --output is specified.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import sys
|
|
35
|
+
import os
|
|
36
|
+
import zipfile
|
|
37
|
+
import xml.etree.ElementTree as ET
|
|
38
|
+
import re
|
|
39
|
+
import colorsys
|
|
40
|
+
from dataclasses import dataclass, field
|
|
41
|
+
from typing import Optional
|
|
42
|
+
|
|
43
|
+
# ─── XML namespaces ───────────────────────────────────────────────────────────
|
|
44
|
+
# Prefix -> namespace URI for the OOXML vocabularies this script reads.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
}


def qn(tag):
    """Expand a ``prefix:local`` tag name into ElementTree's ``{uri}local`` form.

    Raises ValueError when ``tag`` does not contain exactly one colon and
    KeyError when the prefix is not registered in ``NS``.
    """
    prefix, local_name = tag.split(":")
    namespace_uri = NS[prefix]
    return "{" + namespace_uri + "}" + local_name
|
|
56
|
+
|
|
57
|
+
def _bool_prop(rPr, tag):
    """Report whether a toggle property such as ``<w:b/>`` is switched on.

    Args:
        rPr: the properties element to search (direct children only).
        tag: prefixed tag name of the toggle, e.g. ``'w:b'``.

    Returns:
        False when the element is absent or its ``w:val`` is one of the
        explicit "off" spellings (``0``, ``false``, ``off``); True otherwise,
        including when the element is present with no ``w:val`` at all.

    Word uses the explicit-off form to reset a property inherited from a
    style on one specific run.
    """
    el = rPr.find(qn(tag))
    if el is None:
        return False
    val = el.get(qn('w:val'))
    # "off" is the third false spelling allowed for on/off attributes; the
    # original only recognised "0" and "false" and so read it as enabled.
    return val not in ('0', 'false', 'off')
|
|
71
|
+
|
|
72
|
+
# ─── Data structures ─────────────────────────────────────────────────────────
|
|
73
|
+
@dataclass
class RunFormat:
    """Content and character formatting captured for a single text run."""
    text: str = ''                        # literal text, '\t' for a tab, or the field keyword for page fields
    bold: bool = False
    italic: bool = False
    allCaps: bool = False
    font: str = 'Calibri'                 # fallback used when the run names no ASCII font
    size: Optional[int] = None            # stored as half-points (Word XML unit)
    color: Optional[str] = None           # color value as written in the XML; None when absent or 'auto'
    is_page_number: bool = False          # PAGE or NUMPAGES field
    is_tab: bool = False                  # run holds a tab character rather than text
    is_cached_field: bool = False         # stale cached value between fldChar begin/end — skip
    hyperlink_url: Optional[str] = None   # set if this run is inside a w:hyperlink
|
|
86
|
+
|
|
87
|
+
@dataclass
class ParaFormat:
    """Paragraph-level layout plus the runs the paragraph contains."""
    runs: list = field(default_factory=list)   # RunFormat records in document order
    alignment: Optional[str] = None            # 'LEFT' / 'CENTER' / 'RIGHT' / 'BOTH'; None when not specified
    spacing_before: int = 0                    # value of w:spacing w:before (0 when missing)
    spacing_after: int = 0                     # value of w:spacing w:after (0 when missing)
    border_top: Optional[dict] = None          # dict with keys 'style', 'size', 'color', 'space'
    border_bottom: Optional[dict] = None       # same shape as border_top
    style: Optional[str] = None                # paragraph style id from w:pStyle
    is_bullet: bool = False                    # paragraph carries numbering properties (w:numPr)
    is_page_break: bool = False                # paragraph contains a page-type w:br
    tab_stop_right: Optional[int] = None       # position of a right-aligned tab stop, if any
|
|
99
|
+
|
|
100
|
+
@dataclass
class CellFormat:
    """Layout and content captured for one table cell."""
    paragraphs: list = field(default_factory=list)   # records parsed from the cell's direct w:p children
    width_dxa: Optional[int] = None                  # w:tcW width value
    shading: Optional[str] = None                    # upper-cased fill; left None for 'auto' and white
    border_color: Optional[str] = None               # color of the top border only
    margin_top: int = 0                              # cell margins from w:tcMar; 0 when not specified
    margin_bottom: int = 0
    margin_left: int = 0
    margin_right: int = 0
    valign: Optional[str] = None                     # raw w:vAlign value
|
|
111
|
+
|
|
112
|
+
@dataclass
class TableFormat:
    """Table-level layout plus its rows."""
    rows: list = field(default_factory=list)   # one list of CellFormat per table row
    width_dxa: Optional[int] = None            # w:tblW width value (units depend on width_type)
    width_type: Optional[str] = None           # w:tblW type attribute, e.g. 'dxa' or 'pct'
    alignment: Optional[str] = None            # upper-cased w:jc value of the table
|
|
118
|
+
|
|
119
|
+
@dataclass
class ImageRef:
    """Reference to an embedded picture, with the layout of its host paragraph."""
    rel_id: str                        # relationship id taken from the picture's r:embed attribute
    width_emu: int                     # extent cx, in EMU
    height_emu: int                    # extent cy, in EMU
    para_align: Optional[str] = None   # alignment copied from the paragraph holding the image
    spacing_before: int = 0            # spacing copied from the paragraph holding the image
    spacing_after: int = 0
|
|
127
|
+
|
|
128
|
+
@dataclass
class SectionBreak:
    """Page margins of a section, emitted where a section's properties appear."""
    # Defaults apply when the section has no w:pgMar or an attribute is absent.
    # NOTE(review): values are presumably twips (1440 = 1 inch) — confirm.
    margin_top: int = 1440
    margin_bottom: int = 1440
    margin_left: int = 1440
    margin_right: int = 1440
|
|
134
|
+
|
|
135
|
+
# ─── Color naming ─────────────────────────────────────────────────────────────
|
|
136
|
+
# Well-known brand colors get readable names. Everything else is auto-named by
|
|
137
|
+
# hue + lightness so you get e.g. "navyDark" instead of "color2C4A6E".
|
|
138
|
+
# Upper-case 6-digit hex -> hand-picked JS constant name. Looked up before any
# automatic hue/lightness naming is attempted, so these names always win.
_KNOWN_COLORS = {
    '973A38': 'accent',
    '59575A': 'body',
    '8C8C8C': 'subtle',
    'DDDDDD': 'rule',
    'CCCCCC': 'border',
    'F0E8E8': 'tintLight',
    'F2F2F2': 'offWhite',
    'D9D9D9': 'lightGray',
    'E8E8E8': 'veryLightGray',
    'D0A0A0': 'tintMid',
    'F9F0F0': 'tintFaint',
    '1A1A1A': 'nearBlack',
    'FFFFFF': 'white',
    '000000': 'black',
}
|
|
154
|
+
|
|
155
|
+
def _auto_color_name(hex6: str) -> str:
|
|
156
|
+
"""Derive a readable JS identifier from an unknown hex color using hue + lightness."""
|
|
157
|
+
try:
|
|
158
|
+
r = int(hex6[0:2], 16) / 255
|
|
159
|
+
g = int(hex6[2:4], 16) / 255
|
|
160
|
+
b = int(hex6[4:6], 16) / 255
|
|
161
|
+
except ValueError:
|
|
162
|
+
return f'color{hex6}'
|
|
163
|
+
|
|
164
|
+
h, l, s = colorsys.rgb_to_hls(r, g, b)
|
|
165
|
+
|
|
166
|
+
# Lightness buckets
|
|
167
|
+
if l >= 0.92: tone = 'Faint'
|
|
168
|
+
elif l >= 0.80: tone = 'Light'
|
|
169
|
+
elif l >= 0.60: tone = 'Mid'
|
|
170
|
+
elif l >= 0.40: tone = 'Base'
|
|
171
|
+
elif l >= 0.20: tone = 'Dark'
|
|
172
|
+
else: tone = 'Deep'
|
|
173
|
+
|
|
174
|
+
# If very desaturated treat as gray
|
|
175
|
+
if s < 0.08:
|
|
176
|
+
return f'gray{tone}'
|
|
177
|
+
|
|
178
|
+
hue_deg = h * 360
|
|
179
|
+
if hue_deg < 20: hue = 'red'
|
|
180
|
+
elif hue_deg < 45: hue = 'orange'
|
|
181
|
+
elif hue_deg < 70: hue = 'yellow'
|
|
182
|
+
elif hue_deg < 150: hue = 'green'
|
|
183
|
+
elif hue_deg < 195: hue = 'teal'
|
|
184
|
+
elif hue_deg < 255: hue = 'blue'
|
|
185
|
+
elif hue_deg < 285: hue = 'indigo'
|
|
186
|
+
elif hue_deg < 330: hue = 'purple'
|
|
187
|
+
else: hue = 'red'
|
|
188
|
+
|
|
189
|
+
# Special-case common hues people actually name
|
|
190
|
+
if hue == 'blue' and hue_deg >= 210 and hue_deg <= 245 and l < 0.45:
|
|
191
|
+
hue = 'navy'
|
|
192
|
+
|
|
193
|
+
return f'{hue}{tone}'
|
|
194
|
+
|
|
195
|
+
def _name_color(hex6: str, existing_names: set) -> str:
    """Pick a JS-safe name for ``hex6`` that is not already in ``existing_names``.

    Hand-named colors from ``_KNOWN_COLORS`` are returned as-is. Any other
    color gets its automatic hue/tone name, with 2, 3, ... appended until the
    result is unused. ``existing_names`` is only read, never modified.
    """
    key = hex6.upper()
    known = _KNOWN_COLORS.get(key)
    if known is not None:
        return known

    base = _auto_color_name(key)
    if base not in existing_names:
        return base

    suffix = 2
    while f'{base}{suffix}' in existing_names:
        suffix += 1
    return f'{base}{suffix}'
|
|
207
|
+
|
|
208
|
+
def build_color_map(colors: list) -> dict:
    """Assign each hex color string a readable JS constant name.

    Returns a dict keyed by the color strings exactly as given. Names are
    allocated in input order, so earlier colors get the unsuffixed name when
    two colors would otherwise share one.
    """
    assigned = {}
    taken = set()
    for hex_color in colors:
        label = _name_color(hex_color, taken)
        taken.add(label)
        assigned[hex_color] = label
    return assigned
|
|
217
|
+
|
|
218
|
+
# ─── Bracket placeholder → data field name ───────────────────────────────────
|
|
219
|
+
# Lower-cased placeholder text -> JS data reference. Order matters for the
# fuzzy lookup in bracket_to_field: more specific phrases come before the
# shorter phrases they contain.
BRACKET_FIELD_MAP = {
    'client name': 'data.clientName',
    'client': 'data.clientName',
    'proposal title': 'data.title',
    'document title': 'data.title',
    'title': 'data.title',
    'month day, year': 'data.date',
    'date': 'data.date',
    'draft / final': 'data.status',
    'status': 'data.status',
    'prepared by': 'data.preparedBy',
    'contact name': 'data.contactName',
    'company name': 'data.companyName',
    'brief description of services': 'data.serviceDescription',
    'brief description': 'data.description',
}


def bracket_to_field(text):
    """Map a single ``[Placeholder]`` to a JS data reference like ``data.clientName``.

    Args:
        text: candidate text; surrounding whitespace is ignored.

    Returns:
        The ``data.<field>`` reference, or None when the text should be kept
        as a literal.

    Rules:
      - Must be a single [bracket] — multi-bracket text is handled by
        split_bracket_runs.
      - Instructional text (> 5 words or > 50 chars inside the brackets) is
        treated as a static literal, not a data field.
      - Known phrases from BRACKET_FIELD_MAP win: first by exact match, then
        when the phrase appears as whole words inside the placeholder.
      - Anything else becomes a camelCase name built from its letters and
        digits.
    """
    stripped = text.strip()
    if not (stripped.startswith('[') and stripped.endswith(']')):
        return None
    if stripped.count('[') != 1:
        return None
    inner = stripped[1:-1].strip()
    # Long instructional text — keep as literal
    if len(inner) > 50 or len(inner.split()) > 5:
        return None
    inner_lower = inner.lower()
    if inner_lower in BRACKET_FIELD_MAP:
        return BRACKET_FIELD_MAP[inner_lower]
    for key, val in BRACKET_FIELD_MAP.items():
        # Whole-word match only: a plain substring test mapped "[Update Notes]"
        # to data.date and "[Subtitle]" to data.title.
        if re.search(r'\b' + re.escape(key) + r'\b', inner_lower):
            return val
    # Auto-generate camelCase field name
    words = re.sub(r'[^a-zA-Z0-9 ]', '', inner).split()
    if not words:
        return None
    camel = words[0].lower() + ''.join(w.title() for w in words[1:])
    # "data.2024Budget" is a JS syntax error; an underscore keeps the
    # generated member access valid.
    if camel[0].isdigit():
        camel = '_' + camel
    return f'data.{camel}'
|
|
268
|
+
|
|
269
|
+
def split_bracket_runs(text):
    """Break text into ``(segment, is_placeholder)`` tuples around ``[bracket]`` tokens.

    Segments appear in their original order and empty segments are dropped,
    so joining the segment strings reproduces ``text``.

    e.g. '[Foo] — [Bar]' → [('[Foo]', True), (' — ', False), ('[Bar]', True)]
    """
    pieces = re.split(r'(\[[^\[\]]+\])', text)
    # With a single capturing group, re.split alternates literal text (even
    # positions) with the captured placeholders (odd positions).
    return [(piece, position % 2 == 1) for position, piece in enumerate(pieces) if piece]
|
|
283
|
+
|
|
284
|
+
# ─── Parser ───────────────────────────────────────────────────────────────────
|
|
285
|
+
class DocxParser:
    """Read a .docx archive and expose its content as plain format records.

    Constructing the parser opens the archive and parses it immediately.
    Afterwards these attributes are populated:

    - ``body_elements``: ParaFormat / TableFormat / ImageRef / SectionBreak
      records for the direct children of the document body, in order.
    - ``header_elements`` / ``footer_elements``: records parsed from the
      top-level paragraphs of the first header / footer part (by sorted name).
    - ``media_files``: archive paths under ``word/media/``.
    - ``media_bytes``: file name -> raw bytes for each media file.
    - ``relationships``: relationship id -> target, for every relationship.
    - ``hyperlink_rels``: relationship id -> URL, for hyperlink relationships.
    - ``colors``: sorted, upper-cased, de-duplicated colors used by runs,
      paragraph borders, cell shading and cell borders.
    """

    def __init__(self, docx_path):
        self.path = docx_path
        self.body_elements = []
        self.header_elements = []
        self.footer_elements = []
        self.media_files = []      # zip paths, e.g. ['word/media/image1.png']
        self.media_bytes = {}      # filename -> bytes, written to assets/ by main()
        self.relationships = {}    # rId -> target path
        self.hyperlink_rels = {}   # rId -> URL (from document.xml.rels where Type=hyperlink)
        self.colors = []
        self._parse()

    # ── small helpers ────────────────────────────────────────────────────

    @staticmethod
    def _to_int(raw, default=0):
        """Return ``int(raw)``, or ``default`` when raw is None or not an integer."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _field_keyword(instr_text):
        """Return 'PAGE' or 'NUMPAGES' if the field instruction starts with it, else None.

        Only the first whitespace-separated token is examined, so trailing
        switches such as ``\\* MERGEFORMAT`` do not hide the field type.
        """
        tokens = (instr_text or '').split()
        if not tokens:
            return None
        keyword = tokens[0].upper()
        return keyword if keyword in ('PAGE', 'NUMPAGES') else None

    def _image_for_para(self, drawing_el, para):
        """Parse a drawing and stamp it with the host paragraph's layout; None if unusable."""
        img = self._parse_drawing(drawing_el)
        if img:
            img.para_align = para.alignment
            img.spacing_before = para.spacing_before
            img.spacing_after = para.spacing_after
        return img

    def _iter_paragraphs(self, elements):
        """Yield every ParaFormat in ``elements`` in order, descending into table cells."""
        for el in elements:
            if isinstance(el, ParaFormat):
                yield el
            elif isinstance(el, TableFormat):
                for row in el.rows:
                    for cell in row:
                        yield from self._iter_paragraphs(cell.paragraphs)

    def _parse_first_part(self, z, names, pattern):
        """Parse the first archive member (by sorted name) matching ``pattern``; [] if none."""
        matches = sorted(n for n in names if re.match(pattern, n))
        if not matches:
            return []
        with z.open(matches[0]) as f:
            root = ET.parse(f).getroot()
        return self._parse_paragraphs_from(root)

    # ── archive level ────────────────────────────────────────────────────

    def _parse(self):
        """Open the archive and populate every public attribute."""
        with zipfile.ZipFile(self.path, 'r') as z:
            names = z.namelist()
            self.media_files = [n for n in names if 'word/media/' in n and os.path.basename(n)]

            # Extract image bytes while the zip is open
            for media_path in self.media_files:
                self.media_bytes[os.path.basename(media_path)] = z.read(media_path)

            # Relationships (images + hyperlinks)
            rels_name = 'word/_rels/document.xml.rels'
            if rels_name in names:
                with z.open(rels_name) as f:
                    rels_root = ET.parse(f).getroot()
                for rel in rels_root:
                    rid = rel.get('Id', '')
                    target = rel.get('Target', '')
                    self.relationships[rid] = target
                    if 'hyperlink' in rel.get('Type', ''):
                        self.hyperlink_rels[rid] = target

            # Document body
            with z.open('word/document.xml') as f:
                doc_root = ET.parse(f).getroot()
            self.body_elements = self._parse_body(doc_root.find(qn('w:body')))

            # Header / footer (use first one found)
            self.header_elements = self._parse_first_part(z, names, r'word/header\d+\.xml')
            self.footer_elements = self._parse_first_part(z, names, r'word/footer\d+\.xml')

        self._extract_colors()

    def _parse_body(self, body):
        """Convert the direct children of ``w:body`` into records, in document order."""
        elements = []
        if body is None:
            # A document part without a body has nothing to emit.
            return elements
        for child in body:
            tag = child.tag
            if tag == qn('w:p'):
                parsed = self._parse_para(child)
                if parsed is not None:
                    elements.append(parsed)
            elif tag == qn('w:tbl'):
                elements.append(self._parse_table(child))
            elif tag == qn('w:sectPr'):
                elements.append(self._parse_section(child))
        return elements

    def _parse_paragraphs_from(self, root):
        """Parse only the direct ``w:p`` children of ``root`` (used for headers/footers)."""
        elements = []
        for child in root:
            if child.tag == qn('w:p'):
                parsed = self._parse_para(child)
                if parsed is not None:
                    elements.append(parsed)
        return elements

    # ── paragraph level ──────────────────────────────────────────────────

    def _apply_para_props(self, para, pPr):
        """Copy alignment, spacing, borders, style, numbering and tab stops from ``pPr``."""
        jc = pPr.find(qn('w:jc'))
        if jc is not None:
            v = jc.get(qn('w:val'), '')
            para.alignment = {'center': 'CENTER', 'right': 'RIGHT', 'both': 'BOTH'}.get(v, 'LEFT')

        sp = pPr.find(qn('w:spacing'))
        if sp is not None:
            b = sp.get(qn('w:before'), '0')
            a = sp.get(qn('w:after'), '0')
            para.spacing_before = int(b) if str(b).isdigit() else 0
            para.spacing_after = int(a) if str(a).isdigit() else 0

        pBdr = pPr.find(qn('w:pBdr'))
        if pBdr is not None:
            for side in ('top', 'bottom'):
                bdr = pBdr.find(qn(f'w:{side}'))
                if bdr is None:
                    continue
                binfo = {
                    'style': bdr.get(qn('w:val'), 'single'),
                    'size': self._to_int(bdr.get(qn('w:sz')), 4),
                    'color': bdr.get(qn('w:color'), '000000'),
                    'space': self._to_int(bdr.get(qn('w:space')), 0),
                }
                if side == 'top':
                    para.border_top = binfo
                else:
                    para.border_bottom = binfo

        pStyle = pPr.find(qn('w:pStyle'))
        if pStyle is not None:
            para.style = pStyle.get(qn('w:val'))

        if pPr.find(qn('w:numPr')) is not None:
            para.is_bullet = True

        tabs = pPr.find(qn('w:tabs'))
        if tabs is not None:
            for tab in tabs.findall(qn('w:tab')):
                if tab.get(qn('w:val')) != 'right':
                    continue
                pos = self._to_int(tab.get(qn('w:pos')), None)
                if pos is not None:
                    para.tab_stop_right = pos

    def _parse_para(self, p_el):
        """Parse one ``w:p`` element.

        Returns:
            - a SectionBreak when the paragraph properties carry a ``w:sectPr``
              (the paragraph's own runs are not emitted in that case);
            - an ImageRef as soon as a usable drawing is found;
            - otherwise a ParaFormat with the paragraph's runs.
        """
        pPr = p_el.find(qn('w:pPr'))
        para = ParaFormat()

        if pPr is not None:
            # Inline section break
            sect = pPr.find(qn('w:sectPr'))
            if sect is not None:
                return self._parse_section(sect)
            self._apply_para_props(para, pPr)

        # Walk children: runs, hyperlinks, drawings
        inside_cached_field = False
        for child in p_el:
            tag = child.tag

            if tag == qn('w:r'):
                fld = child.find(qn('w:fldChar'))
                if fld is not None:
                    # Runs between a field's begin and end markers hold the
                    # instruction and Word's cached result, not real text.
                    ftype = fld.get(qn('w:fldCharType'), '')
                    if ftype == 'begin':
                        inside_cached_field = True
                    elif ftype == 'end':
                        inside_cached_field = False
                    continue

                drawing = child.find(qn('w:drawing'))
                if drawing is not None:
                    img = self._image_for_para(drawing, para)
                    if img:
                        return img
                    continue

                run = self._parse_run(child, is_cached=inside_cached_field)
                if run:
                    para.runs.append(run)

            elif tag == qn('w:hyperlink'):
                # Resolve URL from relationship ID or inline anchor
                rid = child.get(qn('r:id'), '')
                url = self.hyperlink_rels.get(rid, child.get(qn('w:anchor'), ''))
                for r_el in child.findall(qn('w:r')):
                    run = self._parse_run(r_el, is_cached=inside_cached_field)
                    if run:
                        run.hyperlink_url = url
                        para.runs.append(run)

            elif tag == qn('w:drawing'):
                img = self._image_for_para(child, para)
                if img:
                    return img

        for br in p_el.findall(f'.//{qn("w:br")}'):
            if br.get(qn('w:type')) == 'page':
                para.is_page_break = True

        return para

    def _parse_run(self, r_el, is_cached=False):
        """Parse one ``w:r`` element into a RunFormat.

        Returns None for runs that hold a drawing, a field instruction other
        than PAGE/NUMPAGES, or no text at all.
        """
        if r_el.find(qn('w:drawing')) is not None:
            return None

        run = RunFormat()
        run.is_cached_field = is_cached

        rPr = r_el.find(qn('w:rPr'))
        if rPr is not None:
            run.bold = _bool_prop(rPr, 'w:b')
            run.italic = _bool_prop(rPr, 'w:i')
            run.allCaps = _bool_prop(rPr, 'w:caps')

            fonts_el = rPr.find(qn('w:rFonts'))
            if fonts_el is not None:
                run.font = fonts_el.get(qn('w:ascii'), 'Calibri')

            sz = rPr.find(qn('w:sz'))
            if sz is not None:
                hp = sz.get(qn('w:val'))
                if hp:
                    run.size = self._to_int(hp, None)

            color_el = rPr.find(qn('w:color'))
            if color_el is not None:
                c = color_el.get(qn('w:val'))
                if c and c != 'auto':
                    run.color = c

        # Field instruction (PAGE / NUMPAGES)
        instr = r_el.find(qn('w:instrText'))
        if instr is not None:
            keyword = self._field_keyword(instr.text)
            if keyword is None:
                return None
            run.is_page_number = True
            # The instruction run sits inside the field, but it must be kept.
            run.is_cached_field = False
            run.text = keyword
            return run

        if r_el.find(qn('w:tab')) is not None:
            run.is_tab = True
            run.text = '\t'
            return run

        t = r_el.find(qn('w:t'))
        if t is not None and t.text:
            run.text = t.text
            return run

        return None

    def _parse_drawing(self, drawing_el):
        """Return an ImageRef for a drawing, or None if it lacks an extent or embedded blip."""
        extent = drawing_el.find(f'.//{qn("wp:extent")}')
        if extent is None:
            return None
        cx = self._to_int(extent.get('cx'), 0)
        cy = self._to_int(extent.get('cy'), 0)

        blip = drawing_el.find(f'.//{qn("a:blip")}')
        if blip is None:
            return None
        rel_id = blip.get(qn('r:embed'))
        if not rel_id:
            return None

        return ImageRef(rel_id=rel_id, width_emu=cx, height_emu=cy)

    # ── tables and sections ──────────────────────────────────────────────

    def _parse_table(self, tbl_el):
        """Parse a ``w:tbl`` element; rows without any cell are omitted."""
        table = TableFormat()
        tblPr = tbl_el.find(qn('w:tblPr'))

        if tblPr is not None:
            tblW = tblPr.find(qn('w:tblW'))
            if tblW is not None:
                table.width_dxa = self._to_int(tblW.get(qn('w:w')), 0)
                table.width_type = tblW.get(qn('w:type'), 'dxa')
            jc = tblPr.find(qn('w:jc'))
            if jc is not None:
                table.alignment = jc.get(qn('w:val'), 'left').upper()

        for tr in tbl_el.findall(qn('w:tr')):
            row = [self._parse_cell(tc) for tc in tr.findall(qn('w:tc'))]
            if row:
                table.rows.append(row)

        return table

    def _parse_cell(self, tc_el):
        """Parse a ``w:tc`` element: width, shading, margins, border color, valign, paragraphs."""
        cell = CellFormat()
        tcPr = tc_el.find(qn('w:tcPr'))

        if tcPr is not None:
            tcW = tcPr.find(qn('w:tcW'))
            if tcW is not None:
                cell.width_dxa = self._to_int(tcW.get(qn('w:w')), 0)

            shd = tcPr.find(qn('w:shd'))
            if shd is not None:
                fill = shd.get(qn('w:fill'))
                if fill and fill.upper() not in ('AUTO', 'FFFFFF'):
                    cell.shading = fill.upper()

            tcMar = tcPr.find(qn('w:tcMar'))
            if tcMar is not None:
                # Horizontal margins may be written as left/right or as the
                # direction-neutral start/end; accept either spelling.
                margin_sources = (
                    ('margin_top', ('top',)),
                    ('margin_bottom', ('bottom',)),
                    ('margin_left', ('left', 'start')),
                    ('margin_right', ('right', 'end')),
                )
                for attr_name, sides in margin_sources:
                    for side in sides:
                        m = tcMar.find(qn(f'w:{side}'))
                        if m is not None:
                            setattr(cell, attr_name, self._to_int(m.get(qn('w:w')), 0))
                            break

            tcBdr = tcPr.find(qn('w:tcBorders'))
            if tcBdr is not None:
                top_bdr = tcBdr.find(qn('w:top'))
                if top_bdr is not None:
                    cell.border_color = top_bdr.get(qn('w:color'))

            vAlign = tcPr.find(qn('w:vAlign'))
            if vAlign is not None:
                cell.valign = vAlign.get(qn('w:val'))

        for p in tc_el.findall(qn('w:p')):
            parsed = self._parse_para(p)
            if parsed is not None:
                cell.paragraphs.append(parsed)

        return cell

    def _parse_section(self, sect_el):
        """Read page margins from a ``w:sectPr``; missing values keep the 1440 default."""
        s = SectionBreak()
        pgMar = sect_el.find(qn('w:pgMar'))
        if pgMar is not None:
            s.margin_top = self._to_int(pgMar.get(qn('w:top')), 1440)
            s.margin_bottom = self._to_int(pgMar.get(qn('w:bottom')), 1440)
            s.margin_left = self._to_int(pgMar.get(qn('w:left')), 1440)
            s.margin_right = self._to_int(pgMar.get(qn('w:right')), 1440)
        return s

    # ── derived data ─────────────────────────────────────────────────────

    def _extract_colors(self):
        """Collect every color used in body, header and footer into ``self.colors``."""
        colors = set()

        def scan(elements):
            for el in elements:
                if isinstance(el, ParaFormat):
                    for r in el.runs:
                        if r.color:
                            colors.add(r.color.upper())
                    if el.border_bottom:
                        colors.add(el.border_bottom['color'].upper())
                    if el.border_top:
                        colors.add(el.border_top['color'].upper())
                elif isinstance(el, TableFormat):
                    for row in el.rows:
                        for cell in row:
                            if cell.shading:
                                colors.add(cell.shading.upper())
                            if cell.border_color:
                                colors.add(cell.border_color.upper())
                            scan(cell.paragraphs)

        scan(self.body_elements)
        scan(self.header_elements)
        scan(self.footer_elements)
        self.colors = sorted(colors)

    def list_summary(self):
        """Return a human-readable summary of what was found in the template."""
        sections = [el for el in self.body_elements if isinstance(el, SectionBreak)]
        images = [el for el in self.body_elements if isinstance(el, ImageRef)]
        tables = [el for el in self.body_elements if isinstance(el, TableFormat)]
        paras = [el for el in self.body_elements if isinstance(el, ParaFormat)]

        hyperlinks = set()
        placeholders = {}   # field name without the 'data.' prefix -> raw '[...]' text
        for para in self._iter_paragraphs(self.body_elements):
            for r in para.runs:
                if r.hyperlink_url:
                    hyperlinks.add(r.hyperlink_url)
                if r.text:
                    for seg, is_ph in split_bracket_runs(r.text):
                        if is_ph:
                            f = bracket_to_field(seg)
                            if f:
                                placeholders[f.replace('data.', '')] = seg

        image_names = [os.path.basename(self.relationships.get(i.rel_id, '?')) for i in images]
        lines = [
            f"Template: {os.path.basename(self.path)}",
            f" Sections : {len(sections) + 1} (section breaks found: {len(sections)})",
            f" Body paras : {len(paras)}",
            f" Tables : {len(tables)}",
            f" Images : {len(images)} {image_names}",
            f" Colors : {self.colors}",
            f" Header : {len(self.header_elements)} paragraph(s)",
            f" Footer : {len(self.footer_elements)} paragraph(s)",
        ]
        if placeholders:
            lines.append(" Placeholders detected:")
            for name, raw in placeholders.items():
                lines.append(f" data.{name} <- {raw}")
        if hyperlinks:
            lines.append(" Hyperlinks:")
            for url in sorted(hyperlinks):
                lines.append(f" {url}")
        return '\n'.join(lines)
|
|
682
|
+
|
|
683
|
+
# ─── Code Generator ──────────────────────────────────────────────────────────
|
|
684
|
+
class JSGenerator:
|
|
685
|
+
def __init__(self, parser: DocxParser, template_name: str):
|
|
686
|
+
self.p = parser
|
|
687
|
+
self.name = template_name
|
|
688
|
+
self.color_map = build_color_map(parser.colors)
|
|
689
|
+
self.inferred_data_fields = {} # field_name -> example value
|
|
690
|
+
|
|
691
|
+
def _color_ref(self, hex_color):
|
|
692
|
+
if not hex_color:
|
|
693
|
+
return "'000000'"
|
|
694
|
+
key = hex_color.upper()
|
|
695
|
+
return f"COLOR.{self.color_map[key]}" if key in self.color_map else f"'{hex_color}'"
|
|
696
|
+
|
|
697
|
+
def _emu_to_inch(self, emu):
|
|
698
|
+
return round(emu / 914400, 3)
|
|
699
|
+
|
|
700
|
+
def _run_to_js_parts(self, r: RunFormat, indent: str) -> list:
|
|
701
|
+
"""
|
|
702
|
+
Convert a RunFormat to one or more JS TextRun/ExternalHyperlink strings.
|
|
703
|
+
Returns a list (usually one item) because a run with multiple [brackets]
|
|
704
|
+
gets split into multiple TextRuns.
|
|
705
|
+
"""
|
|
706
|
+
if r.is_cached_field:
|
|
707
|
+
return []
|
|
708
|
+
|
|
709
|
+
if r.is_tab:
|
|
710
|
+
return [f"{indent}new TextRun({{ text: '\\t', font: FONT, size: {r.size or 22}, color: {self._color_ref(r.color)} }}),"]
|
|
711
|
+
|
|
712
|
+
if r.is_page_number:
|
|
713
|
+
field = 'PageNumber.CURRENT' if r.text == 'PAGE' else 'PageNumber.TOTAL_PAGES'
|
|
714
|
+
return [f"{indent}new TextRun({{ children: [{field}], font: FONT, size: {r.size or 22}, color: {self._color_ref(r.color)} }}),"]
|
|
715
|
+
|
|
716
|
+
if not r.text:
|
|
717
|
+
return []
|
|
718
|
+
|
|
719
|
+
# Split into bracket segments so '[Foo] — [Bar]' emits two TextRuns
|
|
720
|
+
segments = split_bracket_runs(r.text)
|
|
721
|
+
results = []
|
|
722
|
+
for seg_text, is_placeholder in segments:
|
|
723
|
+
field_ref = bracket_to_field(seg_text) if is_placeholder else None
|
|
724
|
+
if field_ref:
|
|
725
|
+
field_name = field_ref.replace('data.', '')
|
|
726
|
+
if field_name not in self.inferred_data_fields:
|
|
727
|
+
self.inferred_data_fields[field_name] = seg_text.strip('[]')
|
|
728
|
+
text_expr = field_ref
|
|
729
|
+
else:
|
|
730
|
+
escaped = seg_text.replace('\\', '\\\\').replace("'", "\\'")
|
|
731
|
+
text_expr = f"'{escaped}'"
|
|
732
|
+
|
|
733
|
+
opts = [f"text: {text_expr}", "font: FONT"]
|
|
734
|
+
if r.size: opts.append(f"size: {r.size}")
|
|
735
|
+
if r.bold: opts.append("bold: true")
|
|
736
|
+
if r.italic: opts.append("italics: true")
|
|
737
|
+
if r.allCaps: opts.append("allCaps: true")
|
|
738
|
+
if r.color: opts.append(f"color: {self._color_ref(r.color)}")
|
|
739
|
+
|
|
740
|
+
run_js = f"new TextRun({{ {', '.join(opts)} }})"
|
|
741
|
+
|
|
742
|
+
if r.hyperlink_url:
|
|
743
|
+
escaped_url = r.hyperlink_url.replace("'", "\\'")
|
|
744
|
+
results.append(f"{indent}new ExternalHyperlink({{ link: '{escaped_url}', children: [{run_js}] }}),")
|
|
745
|
+
else:
|
|
746
|
+
results.append(f"{indent}{run_js},")
|
|
747
|
+
|
|
748
|
+
return results
|
|
749
|
+
|
|
750
|
+
def _para_to_js(self, para: ParaFormat, indent: str) -> Optional[str]:
|
|
751
|
+
if not isinstance(para, ParaFormat):
|
|
752
|
+
return None
|
|
753
|
+
if para.is_page_break:
|
|
754
|
+
return f"{indent}new Paragraph({{ pageBreakBefore: true }}),"
|
|
755
|
+
|
|
756
|
+
lines = [f"{indent}new Paragraph({{"]
|
|
757
|
+
|
|
758
|
+
run_lines = []
|
|
759
|
+
for r in para.runs:
|
|
760
|
+
run_lines.extend(self._run_to_js_parts(r, indent + ' '))
|
|
761
|
+
|
|
762
|
+
if run_lines:
|
|
763
|
+
lines.append(f"{indent} children: [")
|
|
764
|
+
lines.extend(run_lines)
|
|
765
|
+
lines.append(f"{indent} ],")
|
|
766
|
+
else:
|
|
767
|
+
lines.append(f"{indent} children: [],")
|
|
768
|
+
|
|
769
|
+
if para.alignment and para.alignment != 'LEFT':
|
|
770
|
+
lines.append(f"{indent} alignment: AlignmentType.{para.alignment},")
|
|
771
|
+
|
|
772
|
+
if para.spacing_before or para.spacing_after:
|
|
773
|
+
lines.append(f"{indent} spacing: {{ before: {para.spacing_before}, after: {para.spacing_after} }},")
|
|
774
|
+
|
|
775
|
+
borders = {}
|
|
776
|
+
if para.border_top:
|
|
777
|
+
b = para.border_top
|
|
778
|
+
borders['top'] = f"{{ style: BorderStyle.SINGLE, size: {b['size']}, color: {self._color_ref(b['color'])}, space: {b['space']} }}"
|
|
779
|
+
if para.border_bottom:
|
|
780
|
+
b = para.border_bottom
|
|
781
|
+
borders['bottom'] = f"{{ style: BorderStyle.SINGLE, size: {b['size']}, color: {self._color_ref(b['color'])}, space: {b['space']} }}"
|
|
782
|
+
if borders:
|
|
783
|
+
bstr = ', '.join(f"{k}: {v}" for k, v in borders.items())
|
|
784
|
+
lines.append(f"{indent} border: {{ {bstr} }},")
|
|
785
|
+
|
|
786
|
+
if para.tab_stop_right:
|
|
787
|
+
lines.append(f"{indent} tabStops: [{{ type: TabStopType.RIGHT, position: TabStopPosition.MAX }}],")
|
|
788
|
+
|
|
789
|
+
if para.is_bullet:
|
|
790
|
+
lines.append(f"{indent} bullet: {{ level: 0 }},")
|
|
791
|
+
|
|
792
|
+
lines.append(f"{indent}}})," )
|
|
793
|
+
return '\n'.join(lines)
|
|
794
|
+
|
|
795
|
+
def _image_to_js(self, img: ImageRef, indent: str) -> str:
|
|
796
|
+
target = self.p.relationships.get(img.rel_id, '')
|
|
797
|
+
fname = os.path.basename(target) or 'image.png'
|
|
798
|
+
var_name = re.sub(r'[^a-zA-Z0-9]', '_', os.path.splitext(fname)[0]).upper() + '_PATH'
|
|
799
|
+
ext = os.path.splitext(fname)[1].lstrip('.').lower() or 'png'
|
|
800
|
+
w_pt = round(self._emu_to_inch(img.width_emu) * 72)
|
|
801
|
+
h_pt = round(self._emu_to_inch(img.height_emu) * 72)
|
|
802
|
+
|
|
803
|
+
lines = [
|
|
804
|
+
f"{indent}new Paragraph({{",
|
|
805
|
+
f"{indent} children: [",
|
|
806
|
+
f"{indent} new ImageRun({{",
|
|
807
|
+
f"{indent} data: readFileSync({var_name}),",
|
|
808
|
+
f"{indent} transformation: {{ width: {w_pt}, height: {h_pt} }},",
|
|
809
|
+
f"{indent} type: '{ext}',",
|
|
810
|
+
f"{indent} }}),",
|
|
811
|
+
f"{indent} ],",
|
|
812
|
+
]
|
|
813
|
+
if img.para_align and img.para_align != 'LEFT':
|
|
814
|
+
lines.append(f"{indent} alignment: AlignmentType.{img.para_align},")
|
|
815
|
+
if img.spacing_before or img.spacing_after:
|
|
816
|
+
lines.append(f"{indent} spacing: {{ before: {img.spacing_before}, after: {img.spacing_after} }},")
|
|
817
|
+
lines.append(f"{indent}}})," )
|
|
818
|
+
return '\n'.join(lines)
|
|
819
|
+
|
|
820
|
+
def _table_to_js(self, table: TableFormat, indent: str) -> str:
|
|
821
|
+
lines = [f"{indent}new Table({{"]
|
|
822
|
+
|
|
823
|
+
if table.width_type == 'pct' or (table.width_dxa and table.width_dxa > 8000):
|
|
824
|
+
lines.append(f"{indent} width: {{ size: 100, type: WidthType.PERCENTAGE }},")
|
|
825
|
+
elif table.width_dxa:
|
|
826
|
+
lines.append(f"{indent} width: {{ size: {table.width_dxa}, type: WidthType.DXA }},")
|
|
827
|
+
|
|
828
|
+
if table.alignment:
|
|
829
|
+
lines.append(f"{indent} alignment: AlignmentType.{table.alignment},")
|
|
830
|
+
|
|
831
|
+
lines.append(f"{indent} rows: [")
|
|
832
|
+
for row in table.rows:
|
|
833
|
+
lines.append(f"{indent} new TableRow({{")
|
|
834
|
+
lines.append(f"{indent} children: [")
|
|
835
|
+
total_w = sum(c.width_dxa or 0 for c in row)
|
|
836
|
+
for cell in row:
|
|
837
|
+
lines += self._cell_to_js(cell, total_w, indent + ' ')
|
|
838
|
+
lines.append(f"{indent} ],")
|
|
839
|
+
lines.append(f"{indent} }}),")
|
|
840
|
+
lines.append(f"{indent} ],")
|
|
841
|
+
lines.append(f"{indent}}})," )
|
|
842
|
+
return '\n'.join(lines)
|
|
843
|
+
|
|
844
|
+
def _cell_to_js(self, cell: CellFormat, total_width: int, indent: str) -> list:
|
|
845
|
+
lines = [f"{indent}new TableCell({{"]
|
|
846
|
+
|
|
847
|
+
if cell.width_dxa and total_width:
|
|
848
|
+
pct = round(cell.width_dxa / total_width * 100)
|
|
849
|
+
lines.append(f"{indent} width: {{ size: {pct}, type: WidthType.PERCENTAGE }},")
|
|
850
|
+
|
|
851
|
+
if cell.shading:
|
|
852
|
+
lines.append(f"{indent} shading: {{ fill: {self._color_ref(cell.shading)} }},")
|
|
853
|
+
|
|
854
|
+
bc = self._color_ref(cell.border_color) if cell.border_color else "'DDDDDD'"
|
|
855
|
+
lines += [
|
|
856
|
+
f"{indent} borders: {{",
|
|
857
|
+
f"{indent} top: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
|
|
858
|
+
f"{indent} bottom: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
|
|
859
|
+
f"{indent} left: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
|
|
860
|
+
f"{indent} right: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
|
|
861
|
+
f"{indent} }},",
|
|
862
|
+
]
|
|
863
|
+
|
|
864
|
+
if any([cell.margin_top, cell.margin_bottom, cell.margin_left, cell.margin_right]):
|
|
865
|
+
lines.append(f"{indent} margins: {{")
|
|
866
|
+
for side, val in [('top',cell.margin_top),('bottom',cell.margin_bottom),
|
|
867
|
+
('left',cell.margin_left),('right',cell.margin_right)]:
|
|
868
|
+
if val: lines.append(f"{indent} {side}: {val},")
|
|
869
|
+
lines.append(f"{indent} }},")
|
|
870
|
+
|
|
871
|
+
lines.append(f"{indent} children: [")
|
|
872
|
+
for para in cell.paragraphs:
|
|
873
|
+
pjs = self._para_to_js(para, indent + ' ')
|
|
874
|
+
if pjs:
|
|
875
|
+
lines.append(pjs)
|
|
876
|
+
lines.append(f"{indent} ],")
|
|
877
|
+
|
|
878
|
+
if cell.valign:
|
|
879
|
+
lines.append(f"{indent} verticalAlign: '{cell.valign}',")
|
|
880
|
+
|
|
881
|
+
lines.append(f"{indent}}})," )
|
|
882
|
+
return lines
|
|
883
|
+
|
|
884
|
+
# ── Section generators ───────────────────────────────────────────────────
|
|
885
|
+
|
|
886
|
+
def _gen_file_header(self):
|
|
887
|
+
# Check whether any hyperlinks exist so we know whether to import ExternalHyperlink
|
|
888
|
+
has_hyperlinks = any(
|
|
889
|
+
r.hyperlink_url
|
|
890
|
+
for el in self.p.body_elements + self.p.header_elements + self.p.footer_elements
|
|
891
|
+
if isinstance(el, ParaFormat)
|
|
892
|
+
for r in el.runs
|
|
893
|
+
)
|
|
894
|
+
self._has_hyperlinks = has_hyperlinks
|
|
895
|
+
|
|
896
|
+
basename = os.path.basename(self.p.path)
|
|
897
|
+
hyperlink_import = ', ExternalHyperlink' if has_hyperlinks else ''
|
|
898
|
+
return [
|
|
899
|
+
f'/**',
|
|
900
|
+
f' * {self.name.title()} Document Builder',
|
|
901
|
+
f' * Auto-generated by docx-to-builder https://github.com/jermorrison22/docx-to-builder',
|
|
902
|
+
f' * Source template: {basename}',
|
|
903
|
+
f' * Generated: {__import__("datetime").date.today().isoformat()}',
|
|
904
|
+
f' *',
|
|
905
|
+
f' * HOW TO USE:',
|
|
906
|
+
f' * 1. Fill in the data object below (or pass your own)',
|
|
907
|
+
f' * 2. Run: node {self.name}-builder.js',
|
|
908
|
+
f' * 3. Find the output at: output/{self.name}-output.docx',
|
|
909
|
+
f' *',
|
|
910
|
+
f' * To regenerate this file from a new template:',
|
|
911
|
+
f' * python3 docx-to-builder.py your-template.docx',
|
|
912
|
+
f' */',
|
|
913
|
+
'',
|
|
914
|
+
"import {",
|
|
915
|
+
f" Document, Packer, Paragraph, TextRun, ImageRun{hyperlink_import},",
|
|
916
|
+
" Table, TableRow, TableCell, WidthType, AlignmentType,",
|
|
917
|
+
" BorderStyle, Header, Footer, PageNumber,",
|
|
918
|
+
" TabStopType, TabStopPosition, convertInchesToTwip,",
|
|
919
|
+
"} from 'docx';",
|
|
920
|
+
"import { readFileSync, writeFileSync, mkdirSync } from 'fs';",
|
|
921
|
+
"import { fileURLToPath } from 'url';",
|
|
922
|
+
"import { dirname, join } from 'path';",
|
|
923
|
+
"",
|
|
924
|
+
"const __dirname = dirname(fileURLToPath(import.meta.url));",
|
|
925
|
+
"",
|
|
926
|
+
]
|
|
927
|
+
|
|
928
|
+
def _gen_constants(self):
|
|
929
|
+
lines = [
|
|
930
|
+
"// ─── Brand colors extracted from template ────────────────────────────────────",
|
|
931
|
+
"const COLOR = {",
|
|
932
|
+
]
|
|
933
|
+
for hex_color, name in sorted(self.color_map.items(), key=lambda x: x[1]):
|
|
934
|
+
lines.append(f" {name}: '{hex_color}',")
|
|
935
|
+
lines += ["};", ""]
|
|
936
|
+
|
|
937
|
+
fonts = {}
|
|
938
|
+
def scan(els):
|
|
939
|
+
for el in els:
|
|
940
|
+
if isinstance(el, ParaFormat):
|
|
941
|
+
for r in el.runs:
|
|
942
|
+
fonts[r.font] = fonts.get(r.font, 0) + 1
|
|
943
|
+
elif isinstance(el, TableFormat):
|
|
944
|
+
for row in el.rows:
|
|
945
|
+
for cell in row:
|
|
946
|
+
scan(cell.paragraphs)
|
|
947
|
+
scan(self.p.body_elements)
|
|
948
|
+
main_font = max(fonts, key=fonts.get) if fonts else 'Calibri'
|
|
949
|
+
lines += [f"const FONT = '{main_font}';", ""]
|
|
950
|
+
|
|
951
|
+
if self.p.media_files:
|
|
952
|
+
lines.append("// ─── Asset paths — update these if you move the image files ─────────────────")
|
|
953
|
+
for media in self.p.media_files:
|
|
954
|
+
fname = os.path.basename(media)
|
|
955
|
+
if not fname:
|
|
956
|
+
continue
|
|
957
|
+
var = re.sub(r'[^a-zA-Z0-9]', '_', os.path.splitext(fname)[0]).upper() + '_PATH'
|
|
958
|
+
lines.append(f"const {var} = join(__dirname, 'assets/{fname}');")
|
|
959
|
+
lines.append("")
|
|
960
|
+
|
|
961
|
+
return lines
|
|
962
|
+
|
|
963
|
+
def _gen_header_fn(self):
|
|
964
|
+
lines = [
|
|
965
|
+
"// ─── Header ───────────────────────────────────────────────────────────────────",
|
|
966
|
+
"function buildHeader(data) {",
|
|
967
|
+
" return new Header({",
|
|
968
|
+
" children: [",
|
|
969
|
+
]
|
|
970
|
+
for el in self.p.header_elements:
|
|
971
|
+
if isinstance(el, ParaFormat):
|
|
972
|
+
pjs = self._para_to_js(el, ' ')
|
|
973
|
+
if pjs:
|
|
974
|
+
lines.append(pjs)
|
|
975
|
+
lines += [" ],", " });", "}", ""]
|
|
976
|
+
return lines
|
|
977
|
+
|
|
978
|
+
def _gen_footer_fn(self):
|
|
979
|
+
lines = [
|
|
980
|
+
"// ─── Footer ───────────────────────────────────────────────────────────────────",
|
|
981
|
+
"function buildFooter() {",
|
|
982
|
+
" return new Footer({",
|
|
983
|
+
" children: [",
|
|
984
|
+
]
|
|
985
|
+
for el in self.p.footer_elements:
|
|
986
|
+
if isinstance(el, ParaFormat):
|
|
987
|
+
pjs = self._para_to_js(el, ' ')
|
|
988
|
+
if pjs:
|
|
989
|
+
lines.append(pjs)
|
|
990
|
+
lines += [" ],", " });", "}", ""]
|
|
991
|
+
return lines
|
|
992
|
+
|
|
993
|
+
def _elements_to_js(self, elements, indent=' ', is_first_section=True) -> list:
|
|
994
|
+
"""Convert a list of body elements to JS lines (used per-section)."""
|
|
995
|
+
# Strip trailing page-break paragraphs — the section boundary itself
|
|
996
|
+
# already forces a new page, so a pageBreakBefore at the end of a section
|
|
997
|
+
# produces a blank page.
|
|
998
|
+
trimmed = list(elements)
|
|
999
|
+
while trimmed and isinstance(trimmed[-1], ParaFormat) and trimmed[-1].is_page_break:
|
|
1000
|
+
trimmed.pop()
|
|
1001
|
+
|
|
1002
|
+
# Also strip leading page-break paragraphs from non-first sections —
|
|
1003
|
+
# the section start already puts us on a new page.
|
|
1004
|
+
if not is_first_section:
|
|
1005
|
+
while trimmed and isinstance(trimmed[0], ParaFormat) and trimmed[0].is_page_break:
|
|
1006
|
+
trimmed.pop(0)
|
|
1007
|
+
|
|
1008
|
+
elements = trimmed
|
|
1009
|
+
|
|
1010
|
+
lines = []
|
|
1011
|
+
for el in elements:
|
|
1012
|
+
if isinstance(el, SectionBreak):
|
|
1013
|
+
continue # handled at the section boundary level
|
|
1014
|
+
|
|
1015
|
+
if isinstance(el, ImageRef):
|
|
1016
|
+
target = self.p.relationships.get(el.rel_id, '')
|
|
1017
|
+
fname = os.path.basename(target) or 'image.png'
|
|
1018
|
+
w_in = self._emu_to_inch(el.width_emu)
|
|
1019
|
+
h_in = self._emu_to_inch(el.height_emu)
|
|
1020
|
+
lines.append(f"{indent}// Image: {fname} ({w_in}\" x {h_in}\")")
|
|
1021
|
+
lines.append(f"{indent}elements.push(")
|
|
1022
|
+
lines.append(self._image_to_js(el, indent + ' '))
|
|
1023
|
+
lines.append(f"{indent});")
|
|
1024
|
+
lines.append("")
|
|
1025
|
+
|
|
1026
|
+
elif isinstance(el, ParaFormat):
|
|
1027
|
+
if el.is_page_break:
|
|
1028
|
+
lines.append(f"{indent}elements.push(new Paragraph({{ pageBreakBefore: true }}));")
|
|
1029
|
+
lines.append("")
|
|
1030
|
+
continue
|
|
1031
|
+
pjs = self._para_to_js(el, indent + ' ')
|
|
1032
|
+
if pjs:
|
|
1033
|
+
preview = ' '.join(
|
|
1034
|
+
r.text for r in el.runs
|
|
1035
|
+
if r.text and not r.is_tab and not r.is_page_number and not r.is_cached_field
|
|
1036
|
+
)[:60]
|
|
1037
|
+
if preview:
|
|
1038
|
+
lines.append(f"{indent}// {repr(preview)}")
|
|
1039
|
+
lines.append(f"{indent}elements.push(")
|
|
1040
|
+
lines.append(pjs)
|
|
1041
|
+
lines.append(f"{indent});")
|
|
1042
|
+
lines.append("")
|
|
1043
|
+
|
|
1044
|
+
elif isinstance(el, TableFormat):
|
|
1045
|
+
n_rows = len(el.rows)
|
|
1046
|
+
n_cols = max((len(r) for r in el.rows), default=0)
|
|
1047
|
+
lines.append(f"{indent}// Table ({n_rows} rows x {n_cols} cols)")
|
|
1048
|
+
lines.append(f"{indent}elements.push(")
|
|
1049
|
+
lines.append(self._table_to_js(el, indent + ' '))
|
|
1050
|
+
lines.append(f"{indent});")
|
|
1051
|
+
lines.append("")
|
|
1052
|
+
|
|
1053
|
+
return lines
|
|
1054
|
+
|
|
1055
|
+
def _gen_content_fn(self):
|
|
1056
|
+
"""
|
|
1057
|
+
Split body elements at SectionBreak boundaries and generate one
|
|
1058
|
+
buildSection_N() function per section, plus a buildContent() that
|
|
1059
|
+
returns the full array.
|
|
1060
|
+
"""
|
|
1061
|
+
# Partition body_elements into sections
|
|
1062
|
+
sections_data = [] # list of (section_break, [elements before next break])
|
|
1063
|
+
current = []
|
|
1064
|
+
current_break = None
|
|
1065
|
+
for el in self.p.body_elements:
|
|
1066
|
+
if isinstance(el, SectionBreak):
|
|
1067
|
+
sections_data.append((current_break, current))
|
|
1068
|
+
current = []
|
|
1069
|
+
current_break = el
|
|
1070
|
+
else:
|
|
1071
|
+
current.append(el)
|
|
1072
|
+
# Don't forget trailing elements after last break
|
|
1073
|
+
sections_data.append((current_break, current))
|
|
1074
|
+
|
|
1075
|
+
lines = [
|
|
1076
|
+
"// ─── Document content ────────────────────────────────────────────────────────",
|
|
1077
|
+
"// Bracketed placeholders (e.g. [Client Name]) have been mapped to data.fieldName.",
|
|
1078
|
+
"// Static template text is kept as-is. Replace any literal string with a data",
|
|
1079
|
+
"// field if you need it to be dynamic.",
|
|
1080
|
+
"",
|
|
1081
|
+
]
|
|
1082
|
+
|
|
1083
|
+
fn_names = []
|
|
1084
|
+
for i, (sec_break, elements) in enumerate(sections_data):
|
|
1085
|
+
fn = f'buildSection{i + 1}'
|
|
1086
|
+
fn_names.append((fn, sec_break))
|
|
1087
|
+
comment = ''
|
|
1088
|
+
if sec_break:
|
|
1089
|
+
comment = f' // margins: T={sec_break.margin_top} B={sec_break.margin_bottom} L={sec_break.margin_left} R={sec_break.margin_right} (twips)'
|
|
1090
|
+
lines.append(f"function {fn}(data) {{{comment}")
|
|
1091
|
+
lines.append(" const elements = [];")
|
|
1092
|
+
lines.append("")
|
|
1093
|
+
lines.extend(self._elements_to_js(elements, ' ', is_first_section=(i == 0)))
|
|
1094
|
+
lines.append(" return elements;")
|
|
1095
|
+
lines.append("}")
|
|
1096
|
+
lines.append("")
|
|
1097
|
+
|
|
1098
|
+
return lines, fn_names
|
|
1099
|
+
|
|
1100
|
+
def _gen_builder_fn(self, fn_names):
|
|
1101
|
+
"""Generate the main buildDocument() that wires sections together."""
|
|
1102
|
+
lines = [
|
|
1103
|
+
"// ─── Main builder ────────────────────────────────────────────────────────────",
|
|
1104
|
+
"export async function buildDocument(data, outputPath) {",
|
|
1105
|
+
" const doc = new Document({",
|
|
1106
|
+
" sections: [",
|
|
1107
|
+
]
|
|
1108
|
+
|
|
1109
|
+
for fn, sec_break in fn_names:
|
|
1110
|
+
sb = sec_break if sec_break else SectionBreak()
|
|
1111
|
+
lines += [
|
|
1112
|
+
" {",
|
|
1113
|
+
" properties: {",
|
|
1114
|
+
" page: {",
|
|
1115
|
+
" margin: {",
|
|
1116
|
+
f" top: convertInchesToTwip({sb.margin_top / 1440:.3f}),",
|
|
1117
|
+
f" bottom: convertInchesToTwip({sb.margin_bottom / 1440:.3f}),",
|
|
1118
|
+
f" left: convertInchesToTwip({sb.margin_left / 1440:.3f}),",
|
|
1119
|
+
f" right: convertInchesToTwip({sb.margin_right / 1440:.3f}),",
|
|
1120
|
+
" },",
|
|
1121
|
+
" },",
|
|
1122
|
+
" },",
|
|
1123
|
+
" headers: { default: buildHeader(data) },",
|
|
1124
|
+
" footers: { default: buildFooter() },",
|
|
1125
|
+
f" children: {fn}(data),",
|
|
1126
|
+
" },",
|
|
1127
|
+
]
|
|
1128
|
+
|
|
1129
|
+
lines += [
|
|
1130
|
+
" ],",
|
|
1131
|
+
" });",
|
|
1132
|
+
"",
|
|
1133
|
+
" const buffer = await Packer.toBuffer(doc);",
|
|
1134
|
+
" writeFileSync(outputPath, buffer);",
|
|
1135
|
+
" console.log('Document saved to: ' + outputPath);",
|
|
1136
|
+
"}",
|
|
1137
|
+
"",
|
|
1138
|
+
]
|
|
1139
|
+
return lines
|
|
1140
|
+
|
|
1141
|
+
def _gen_example_data(self):
|
|
1142
|
+
lines = [
|
|
1143
|
+
"// ─── Data object ─────────────────────────────────────────────────────────────",
|
|
1144
|
+
"// Fields below were inferred from bracketed placeholders in your template.",
|
|
1145
|
+
"// Add, remove, or rename fields to match your use case.",
|
|
1146
|
+
"const data = {",
|
|
1147
|
+
]
|
|
1148
|
+
if self.inferred_data_fields:
|
|
1149
|
+
for field_name, example in self.inferred_data_fields.items():
|
|
1150
|
+
escaped = example.replace("'", "\\'")
|
|
1151
|
+
lines.append(f" {field_name}: '{escaped}',")
|
|
1152
|
+
else:
|
|
1153
|
+
lines.append(" title: 'Document Title',")
|
|
1154
|
+
lines.append(" clientName: 'Client Name',")
|
|
1155
|
+
lines.append(" date: new Date().toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' }),")
|
|
1156
|
+
lines += ["};", ""]
|
|
1157
|
+
return lines
|
|
1158
|
+
|
|
1159
|
+
def _gen_cli(self):
|
|
1160
|
+
return [
|
|
1161
|
+
"// ─── Run from command line ────────────────────────────────────────────────────",
|
|
1162
|
+
"if (process.argv[1] === fileURLToPath(import.meta.url)) {",
|
|
1163
|
+
" const outputArg = process.argv.indexOf('--output');",
|
|
1164
|
+
" const outputPath = outputArg !== -1",
|
|
1165
|
+
" ? process.argv[outputArg + 1]",
|
|
1166
|
+
f" : join(__dirname, 'output/{self.name}-output.docx');",
|
|
1167
|
+
"",
|
|
1168
|
+
" mkdirSync(join(__dirname, 'output'), { recursive: true });",
|
|
1169
|
+
" buildDocument(data, outputPath).catch(console.error);",
|
|
1170
|
+
"}",
|
|
1171
|
+
]
|
|
1172
|
+
|
|
1173
|
+
def generate(self):
    """Assemble the complete generated builder and return it as one string.

    The parts are produced in a fixed order: the file header comes first
    because it sets `_has_hyperlinks`, and the content functions come
    before the example data because content generation populates
    `inferred_data_fields` as a side effect.

    Returns:
        The generated JavaScript source, lines joined with newlines.
    """
    parts = [
        self._gen_file_header(),
        self._gen_constants(),
        self._gen_header_fn(),
        self._gen_footer_fn(),
    ]

    content_lines, fn_names = self._gen_content_fn()
    parts.append(content_lines)
    parts.append(self._gen_builder_fn(fn_names))
    parts.append(self._gen_example_data())
    parts.append(self._gen_cli())

    return '\n'.join(line for part in parts for line in part)
|
|
1196
|
+
|
|
1197
|
+
|
|
1198
|
+
# ─── Main ─────────────────────────────────────────────────────────────────────
|
|
1199
|
+
def main():
    """Command-line entry point: parse a .docx template and write its JS builder.

    Reads `sys.argv`: the first argument is the template path, `--list`
    prints the parser summary and exits without writing anything, and
    `--output <path>` overrides the default `<name>-builder.js` written
    next to the template. Embedded images are extracted into an `assets/`
    directory next to the builder.

    Exits with status 1 when no template is given, the template does not
    exist, or `--output` is not followed by a path.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    docx_path = sys.argv[1]
    if not os.path.exists(docx_path):
        print(f"Error: file not found: {docx_path}")
        sys.exit(1)

    list_mode = '--list' in sys.argv

    # Determine output path
    output_path = None
    if '--output' in sys.argv:
        idx = sys.argv.index('--output')
        if idx + 1 >= len(sys.argv):
            # Report a usage error instead of failing with an IndexError.
            print("Error: --output requires a path argument")
            sys.exit(1)
        output_path = sys.argv[idx + 1]

    # Derive a clean document name from the filename
    basename = os.path.basename(docx_path)
    name = re.sub(r'(?i)template\s*[-\u2013]\s*', '', basename)
    name = os.path.splitext(name)[0].strip().lower()
    name = re.sub(r'\s+', '-', name)
    name = re.sub(r'[^a-z0-9-]', '', name)
    if not name:
        # A filename with no ASCII letters or digits sanitizes to nothing,
        # which would produce "-builder.js" and "output/-output.docx".
        name = 'document'

    if not output_path:
        output_path = os.path.join(os.path.dirname(os.path.abspath(docx_path)), f'{name}-builder.js')

    print(f"Parsing: {docx_path}")
    parser = DocxParser(docx_path)

    if list_mode:
        print()
        print(parser.list_summary())
        return

    sections = [el for el in parser.body_elements if isinstance(el, SectionBreak)]
    print(f" {len(parser.body_elements)} body elements | "
          f"{len(parser.header_elements)} header | "
          f"{len(parser.footer_elements)} footer | "
          f"{len(parser.media_files)} image(s) | "
          f"{len(sections) + 1} section(s)")
    print(f" Colors: {parser.colors}")

    print("Generating JS builder...")
    gen = JSGenerator(parser, name)
    js_code = gen.generate()

    output_dir = os.path.dirname(os.path.abspath(output_path))
    os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(js_code)

    print(f"[OK] {output_path}")

    # Extract embedded images into assets/ next to the builder
    if parser.media_bytes:
        assets_dir = os.path.join(output_dir, 'assets')
        os.makedirs(assets_dir, exist_ok=True)
        for fname, data in parser.media_bytes.items():
            # Keep only the file name: the generated constants point at
            # assets/<basename>, and an archive-supplied name must not be
            # able to place files outside assets/.
            safe_name = os.path.basename(fname)
            if not safe_name:
                continue
            dest = os.path.join(assets_dir, safe_name)
            with open(dest, 'wb') as f:
                f.write(data)
            print(f" [image] assets/{safe_name}")

    if gen.inferred_data_fields:
        print(f" Inferred data fields: {list(gen.inferred_data_fields.keys())}")
    print(f"\nNext: run node {os.path.basename(output_path)}")
|
|
1266
|
+
|
|
1267
|
+
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
|