docx-to-builder 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1268 @@
1
+ """
2
+ docx-to-builder.py
3
+
4
+ Parses any .docx template and generates a complete, ready-to-run JavaScript
5
+ builder using the `docx` npm package (https://docx.js.org).
6
+
7
+ The generated builder:
8
+ - Reproduces the exact layout, colors, fonts, spacing, borders, and tables
9
+ - Replaces bracketed template placeholders like [Client Name] with data.fieldName
10
+ - Supports multi-section documents (cover page + body with separate margins)
11
+ - Preserves hyperlinks as ExternalHyperlink in the output
12
+ - Outputs a properly branded .docx when run with Node.js
13
+
14
+ Usage:
15
+ python3 docx-to-builder.py <template.docx> [--output <builder.js>]
16
+ python3 docx-to-builder.py <template.docx> --list
17
+
18
+ Options:
19
+ --output <path> Write the generated builder to this path instead of next to the template
20
+ --list Print a summary of what was found in the template and exit (no file written)
21
+
22
+ Examples:
23
+ python3 docx-to-builder.py sample-template.docx
24
+ python3 docx-to-builder.py my-proposal.docx --output proposal-builder.js
25
+ python3 docx-to-builder.py my-proposal.docx --list
26
+
27
+ Requirements:
28
+ - Python 3.8+ (standard library only — no pip installs needed)
29
+ - Node.js + docx npm package in the output project: npm install docx
30
+
31
+ The generated file will be placed next to the input .docx unless --output is specified.
32
+ """
33
+
34
+ import sys
35
+ import os
36
+ import zipfile
37
+ import xml.etree.ElementTree as ET
38
+ import re
39
+ import colorsys
40
+ from dataclasses import dataclass, field
41
+ from typing import Optional
42
+
43
+ # ─── XML namespaces ───────────────────────────────────────────────────────────
44
# Prefix -> namespace URI table. qn() looks prefixes up here to expand names
# such as 'w:p' into the '{uri}local' form that ElementTree uses for tags and
# attributes.
NS = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',  # WordprocessingML
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',  # drawing anchors/extents
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',  # DrawingML (a:blip)
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',  # DrawingML pictures
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',  # r:id / r:embed
}
51
+
52
def qn(tag):
    """Expand a prefixed name such as ``w:p`` into Clark notation (``{uri}p``).

    The prefix is resolved through the module-level ``NS`` table. An unknown
    prefix raises ``KeyError``; a tag that does not contain exactly one colon
    raises ``ValueError`` from the tuple unpacking.
    """
    ns_prefix, local_name = tag.split(':')
    namespace_uri = NS[ns_prefix]
    return '{' + namespace_uri + '}' + local_name
56
+
57
def _bool_prop(rPr, tag):
    """Return whether a toggle property such as ``<w:b/>`` is switched on.

    Args:
        rPr: The ``w:rPr`` element to inspect.
        tag: Prefixed property name, e.g. ``'w:b'``.

    Returns:
        False when the element is absent or carries an explicit "off" value
        in ``w:val``; True otherwise (including a bare ``<w:b/>``).

    Word writes an explicit off value when a property inherited from a style
    has to be reset on one specific run.
    """
    el = rPr.find(qn(tag))
    if el is None:
        return False
    val = el.get(qn('w:val'))
    # The on/off attribute type accepts 0/1, false/true and off/on; the
    # original check missed "off" and reported such runs as switched on.
    return val not in ('0', 'false', 'off')
71
+
72
+ # ─── Data structures ─────────────────────────────────────────────────────────
73
@dataclass
class RunFormat:
    """Content and character formatting of one run, as filled in by DocxParser._parse_run."""
    text: str = ''  # run text; '\t' for tab runs, 'PAGE'/'NUMPAGES' for page-number fields
    bold: bool = False  # w:b toggle
    italic: bool = False  # w:i toggle
    allCaps: bool = False  # w:caps toggle
    font: str = 'Calibri'  # w:rFonts/@w:ascii, 'Calibri' when not specified
    size: Optional[int] = None  # stored as half-points (Word XML unit)
    color: Optional[str] = None  # w:color value; left None when absent or 'auto'
    is_page_number: bool = False  # PAGE or NUMPAGES field
    is_tab: bool = False  # run holds a w:tab
    is_cached_field: bool = False  # stale cached value between fldChar begin/end — skip
    hyperlink_url: Optional[str] = None  # set if this run is inside a w:hyperlink
86
+
87
@dataclass
class ParaFormat:
    """Paragraph-level formatting plus its runs, as filled in by DocxParser._parse_para."""
    runs: list = field(default_factory=list)  # RunFormat objects in document order
    alignment: Optional[str] = None  # 'LEFT'/'CENTER'/'RIGHT'/'BOTH'; None when no w:jc is present
    spacing_before: int = 0  # w:spacing/@w:before (0 when missing or not all digits)
    spacing_after: int = 0  # w:spacing/@w:after (0 when missing or not all digits)
    border_top: Optional[dict] = None  # dict with keys 'style', 'size', 'color', 'space'
    border_bottom: Optional[dict] = None  # same shape as border_top
    style: Optional[str] = None  # w:pStyle value
    is_bullet: bool = False  # True when the paragraph has a w:numPr
    is_page_break: bool = False  # True when any w:br with type="page" occurs in the paragraph
    tab_stop_right: Optional[int] = None  # w:pos of a right-aligned tab stop (last one wins)
99
+
100
@dataclass
class CellFormat:
    """One table cell, as filled in by DocxParser._parse_cell."""
    paragraphs: list = field(default_factory=list)  # non-None results of _parse_para for the cell's w:p children
    width_dxa: Optional[int] = None  # w:tcW/@w:w
    shading: Optional[str] = None  # upper-cased w:shd/@w:fill; None for 'auto' or white
    border_color: Optional[str] = None  # color of the top border only (w:tcBorders/w:top)
    margin_top: int = 0  # w:tcMar/w:top/@w:w
    margin_bottom: int = 0  # w:tcMar/w:bottom/@w:w
    margin_left: int = 0  # w:tcMar/w:left/@w:w
    margin_right: int = 0  # w:tcMar/w:right/@w:w
    valign: Optional[str] = None  # raw w:vAlign value
111
+
112
@dataclass
class TableFormat:
    """One table, as filled in by DocxParser._parse_table."""
    rows: list = field(default_factory=list)  # list of rows; each row is a list of CellFormat
    width_dxa: Optional[int] = None  # w:tblW/@w:w (interpretation depends on width_type)
    width_type: Optional[str] = None  # w:tblW/@w:type, 'dxa' when the attribute is missing
    alignment: Optional[str] = None  # upper-cased w:jc value of the table
118
+
119
@dataclass
class ImageRef:
    """Reference to an embedded picture, produced by DocxParser._parse_drawing."""
    rel_id: str  # a:blip/@r:embed; resolved through DocxParser.relationships
    width_emu: int  # wp:extent/@cx
    height_emu: int  # wp:extent/@cy
    para_align: Optional[str] = None  # alignment copied from the containing paragraph
    spacing_before: int = 0  # spacing copied from the containing paragraph
    spacing_after: int = 0  # spacing copied from the containing paragraph
127
+
128
@dataclass
class SectionBreak:
    """Page margins of one section, read from w:sectPr/w:pgMar by DocxParser._parse_section."""
    # Each margin defaults to 1440 when w:pgMar or the attribute is missing
    # (presumably twips, i.e. one inch — confirm against the generator).
    margin_top: int = 1440
    margin_bottom: int = 1440
    margin_left: int = 1440
    margin_right: int = 1440
134
+
135
+ # ─── Color naming ─────────────────────────────────────────────────────────────
136
+ # Well-known brand colors get readable names. Everything else is auto-named by
137
+ # hue + lightness so you get e.g. "navyDark" instead of "color2C4A6E".
138
# Hex color -> fixed JS constant name. Keys are upper-case 6-digit hex strings;
# _name_color() upper-cases its input before looking it up here and only falls
# back to an auto-generated name when the color is not listed.
_KNOWN_COLORS = {
    '973A38': 'accent',
    '59575A': 'body',
    '8C8C8C': 'subtle',
    'DDDDDD': 'rule',
    'CCCCCC': 'border',
    'F0E8E8': 'tintLight',
    'F2F2F2': 'offWhite',
    'D9D9D9': 'lightGray',
    'E8E8E8': 'veryLightGray',
    'D0A0A0': 'tintMid',
    'F9F0F0': 'tintFaint',
    '1A1A1A': 'nearBlack',
    'FFFFFF': 'white',
    '000000': 'black',
}
154
+
155
+ def _auto_color_name(hex6: str) -> str:
156
+ """Derive a readable JS identifier from an unknown hex color using hue + lightness."""
157
+ try:
158
+ r = int(hex6[0:2], 16) / 255
159
+ g = int(hex6[2:4], 16) / 255
160
+ b = int(hex6[4:6], 16) / 255
161
+ except ValueError:
162
+ return f'color{hex6}'
163
+
164
+ h, l, s = colorsys.rgb_to_hls(r, g, b)
165
+
166
+ # Lightness buckets
167
+ if l >= 0.92: tone = 'Faint'
168
+ elif l >= 0.80: tone = 'Light'
169
+ elif l >= 0.60: tone = 'Mid'
170
+ elif l >= 0.40: tone = 'Base'
171
+ elif l >= 0.20: tone = 'Dark'
172
+ else: tone = 'Deep'
173
+
174
+ # If very desaturated treat as gray
175
+ if s < 0.08:
176
+ return f'gray{tone}'
177
+
178
+ hue_deg = h * 360
179
+ if hue_deg < 20: hue = 'red'
180
+ elif hue_deg < 45: hue = 'orange'
181
+ elif hue_deg < 70: hue = 'yellow'
182
+ elif hue_deg < 150: hue = 'green'
183
+ elif hue_deg < 195: hue = 'teal'
184
+ elif hue_deg < 255: hue = 'blue'
185
+ elif hue_deg < 285: hue = 'indigo'
186
+ elif hue_deg < 330: hue = 'purple'
187
+ else: hue = 'red'
188
+
189
+ # Special-case common hues people actually name
190
+ if hue == 'blue' and hue_deg >= 210 and hue_deg <= 245 and l < 0.45:
191
+ hue = 'navy'
192
+
193
+ return f'{hue}{tone}'
194
+
195
def _name_color(hex6: str, existing_names: set) -> str:
    """Pick a JS constant name for ``hex6`` that is not in ``existing_names``.

    Colors listed in ``_KNOWN_COLORS`` keep their fixed name. Any other color
    gets the name from ``_auto_color_name``; if that is already taken, a
    numeric suffix starting at 2 is appended until the name is free.
    """
    normalized = hex6.upper()
    if normalized in _KNOWN_COLORS:
        return _KNOWN_COLORS[normalized]

    stem = _auto_color_name(normalized)
    if stem not in existing_names:
        return stem

    suffix = 2
    while f'{stem}{suffix}' in existing_names:
        suffix += 1
    return f'{stem}{suffix}'
207
+
208
def build_color_map(colors: list) -> dict:
    """Assign every hex color in ``colors`` a unique, readable JS constant name.

    Returns a dict keyed by the hex strings exactly as given; names are
    allocated in list order, so earlier colors get the unsuffixed name.
    """
    mapping = {}
    taken = set()
    for hex_value in colors:
        chosen = _name_color(hex_value, taken)
        taken.add(chosen)
        mapping[hex_value] = chosen
    return mapping
217
+
218
+ # ─── Bracket placeholder → data field name ───────────────────────────────────
219
# Placeholder text (lower-case, without brackets) -> JS data field reference.
# bracket_to_field() tries an exact lookup first and then scans these keys in
# insertion order, so more specific phrases are listed before the shorter
# keys they contain (e.g. 'client name' before 'client').
BRACKET_FIELD_MAP = {
    'client name': 'data.clientName',
    'client': 'data.clientName',
    'proposal title': 'data.title',
    'document title': 'data.title',
    'title': 'data.title',
    'month day, year': 'data.date',
    'date': 'data.date',
    'draft / final': 'data.status',
    'status': 'data.status',
    'prepared by': 'data.preparedBy',
    'contact name': 'data.contactName',
    'company name': 'data.companyName',
    'brief description of services': 'data.serviceDescription',
    'brief description': 'data.description',
}
235
+
236
def bracket_to_field(text, field_map=None):
    """Map a single ``[Placeholder]`` to a JS data field reference.

    Args:
        text: Candidate text, e.g. ``'[Client Name]'``.
        field_map: Optional mapping of lower-case placeholder phrases to
            field references. Defaults to the module-level
            ``BRACKET_FIELD_MAP``.

    Returns:
        A reference such as ``'data.clientName'``, or ``None`` when the text
        should stay a literal.

    Rules:
        - Must be a single [bracket] — multi-bracket text is handled by
          split_bracket_runs.
        - Instructional text (> 5 words or > 50 chars inside the brackets) is
          treated as a static literal, not a data field.
        - Known phrases are matched exactly first, then as whole words inside
          the placeholder; anything else gets an auto-generated camelCase name.
    """
    if field_map is None:
        field_map = BRACKET_FIELD_MAP

    stripped = text.strip()
    if not (stripped.startswith('[') and stripped.endswith(']')):
        return None
    if stripped.count('[') != 1:
        return None

    inner = stripped[1:-1].strip()
    # Long instructional text — keep as literal
    if len(inner) > 50 or len(inner.split()) > 5:
        return None

    inner_lower = inner.lower()
    if inner_lower in field_map:
        return field_map[inner_lower]

    # Whole-word match only: a plain substring test maps '[Update Notes]' to
    # the 'date' field and '[Subtitle]' to the 'title' field.
    for key, val in field_map.items():
        if re.search(rf'(?<!\w){re.escape(key)}(?!\w)', inner_lower):
            return val

    # Auto-generate camelCase field name
    words = re.sub(r'[^a-zA-Z0-9 ]', '', inner).split()
    if not words:
        return None
    camel = words[0].lower() + ''.join(w.title() for w in words[1:])
    # 'data.2ndContact' is a JS syntax error; keep the member name an identifier.
    if camel[0].isdigit():
        camel = '_' + camel
    return f'data.{camel}'
268
+
269
def split_bracket_runs(text):
    """Break ``text`` into ``(segment, is_placeholder)`` tuples.

    A placeholder is a ``[...]`` group with at least one character inside and
    no nested brackets. Text between placeholders is returned as non-placeholder
    segments; empty segments are omitted.

    e.g. '[Foo] — [Bar]' → [('[Foo]', True), (' — ', False), ('[Bar]', True)]
    """
    segments = []
    cursor = 0
    for match in re.finditer(r'\[[^\[\]]+\]', text):
        if match.start() > cursor:
            segments.append((text[cursor:match.start()], False))
        segments.append((match.group(0), True))
        cursor = match.end()
    if cursor < len(text):
        segments.append((text[cursor:], False))
    return segments
283
+
284
+ # ─── Parser ───────────────────────────────────────────────────────────────────
285
+ class DocxParser:
286
def __init__(self, docx_path):
    """Create the parser and immediately parse the archive at ``docx_path``."""
    self.path = docx_path

    # Parsed content of the three document parts.
    self.body_elements, self.header_elements, self.footer_elements = [], [], []

    # Embedded media: zip paths (e.g. 'word/media/image1.png') and
    # filename -> bytes, written to assets/ by main().
    self.media_files, self.media_bytes = [], {}

    # rId -> target path, and rId -> URL for relationships whose Type is a hyperlink.
    self.relationships, self.hyperlink_rels = {}, {}

    self.colors = []
    self._parse()
297
+
298
def _parse(self):
    """Read the .docx archive and populate the parser's attributes.

    Fills ``media_files``/``media_bytes``, ``relationships``/``hyperlink_rels``,
    ``body_elements``, ``header_elements`` and ``footer_elements``, then
    collects the colors in use via ``_extract_colors``.

    Raises:
        KeyError: if the archive has no ``word/document.xml``.
        zipfile.BadZipFile: if ``self.path`` is not a zip archive.
    """
    def first_numbered_part(kind, names):
        # Pick the lowest-numbered part. A plain string sort would put
        # 'word/header10.xml' ahead of 'word/header2.xml'.
        pattern = re.compile(rf'word/{kind}(\d+)\.xml')
        numbered = [(int(m.group(1)), name)
                    for name in names
                    for m in (pattern.match(name),) if m]
        return min(numbered)[1] if numbered else None

    with zipfile.ZipFile(self.path, 'r') as z:
        names = z.namelist()
        self.media_files = [f for f in names if 'word/media/' in f and os.path.basename(f)]

        # Extract image bytes while the zip is open
        for mf in self.media_files:
            self.media_bytes[os.path.basename(mf)] = z.read(mf)

        # Relationships (images + hyperlinks)
        if 'word/_rels/document.xml.rels' in names:
            with z.open('word/_rels/document.xml.rels') as f:
                rels_root = ET.parse(f).getroot()
            for rel in rels_root:
                rid = rel.get('Id', '')
                target = rel.get('Target', '')
                self.relationships[rid] = target
                if 'hyperlink' in rel.get('Type', ''):
                    self.hyperlink_rels[rid] = target

        # Document body
        with z.open('word/document.xml') as f:
            body = ET.parse(f).getroot().find(qn('w:body'))
        # A document part without w:body yields no elements instead of
        # failing on iteration over None.
        self.body_elements = self._parse_body(body) if body is not None else []

        # Header (use the lowest-numbered one)
        header_name = first_numbered_part('header', names)
        if header_name is not None:
            with z.open(header_name) as f:
                self.header_elements = self._parse_paragraphs_from(ET.parse(f).getroot())

        # Footer (use the lowest-numbered one)
        footer_name = first_numbered_part('footer', names)
        if footer_name is not None:
            with z.open(footer_name) as f:
                self.footer_elements = self._parse_paragraphs_from(ET.parse(f).getroot())

    self._extract_colors()
342
+
343
def _parse_body(self, body):
    """Parse the direct children of ``w:body`` into a flat element list.

    Paragraphs go through ``_parse_para`` (a ``None`` result is dropped),
    tables through ``_parse_table`` and a trailing ``w:sectPr`` through
    ``_parse_section``. Any other child is ignored.
    """
    para_tag, table_tag, section_tag = qn('w:p'), qn('w:tbl'), qn('w:sectPr')
    collected = []
    for node in body:
        if node.tag == para_tag:
            item = self._parse_para(node)
            if item is None:
                continue
        elif node.tag == table_tag:
            item = self._parse_table(node)
        elif node.tag == section_tag:
            item = self._parse_section(node)
        else:
            continue
        collected.append(item)
    return collected
356
+
357
def _parse_paragraphs_from(self, root):
    """Parse every direct ``w:p`` child of ``root`` (used for headers and footers).

    Returns the non-``None`` results of ``_parse_para`` in document order.
    """
    para_tag = qn('w:p')
    parsed_children = (self._parse_para(node) for node in root if node.tag == para_tag)
    return [item for item in parsed_children if item is not None]
365
+
366
def _parse_para(self, p_el):
    """Parse one ``w:p`` element.

    Returns one of three things:
      - a SectionBreak, when the paragraph properties contain an inline
        ``w:sectPr`` (the paragraph's own content is not parsed in that case);
      - an ImageRef, as soon as a drawing with a resolvable image is met
        (runs collected before it are not returned);
      - otherwise a ParaFormat holding the paragraph properties and its runs.
    """
    pPr = p_el.find(qn('w:pPr'))
    para = ParaFormat()

    # Inline section break
    if pPr is not None:
        sect = pPr.find(qn('w:sectPr'))
        if sect is not None:
            return self._parse_section(sect)

    if pPr is not None:
        jc = pPr.find(qn('w:jc'))
        if jc is not None:
            v = jc.get(qn('w:val'), '')
            # Any value other than center/right/both is treated as LEFT.
            para.alignment = {'center': 'CENTER', 'right': 'RIGHT', 'both': 'BOTH'}.get(v, 'LEFT')

        sp = pPr.find(qn('w:spacing'))
        if sp is not None:
            b = sp.get(qn('w:before'), '0')
            a = sp.get(qn('w:after'), '0')
            # Non-numeric (or negative) values fall back to 0.
            para.spacing_before = int(b) if str(b).isdigit() else 0
            para.spacing_after = int(a) if str(a).isdigit() else 0

        pBdr = pPr.find(qn('w:pBdr'))
        if pBdr is not None:
            # Only top and bottom borders are captured.
            for side in ['top', 'bottom']:
                bdr = pBdr.find(qn(f'w:{side}'))
                if bdr is not None:
                    binfo = {
                        'style': bdr.get(qn('w:val'), 'single'),
                        'size': int(bdr.get(qn('w:sz'), '4')),
                        'color': bdr.get(qn('w:color'), '000000'),
                        'space': int(bdr.get(qn('w:space'), '0')),
                    }
                    if side == 'top': para.border_top = binfo
                    else: para.border_bottom = binfo

        pStyle = pPr.find(qn('w:pStyle'))
        if pStyle is not None:
            para.style = pStyle.get(qn('w:val'))

        # Any numbering properties mark the paragraph as a bullet.
        if pPr.find(qn('w:numPr')) is not None:
            para.is_bullet = True

        tabs = pPr.find(qn('w:tabs'))
        if tabs is not None:
            for tab in tabs.findall(qn('w:tab')):
                if tab.get(qn('w:val')) == 'right':
                    pos = tab.get(qn('w:pos'))
                    if pos:
                        para.tab_stop_right = int(pos)

    # Walk children: runs, hyperlinks, drawings
    # inside_cached_field is True between a fldChar "begin" and its "end";
    # runs parsed in that window are flagged as cached field content.
    inside_cached_field = False
    for child in p_el:
        tag = child.tag

        if tag == qn('w:r'):
            fld = child.find(qn('w:fldChar'))
            if fld is not None:
                ftype = fld.get(qn('w:fldCharType'), '')
                if ftype == 'begin': inside_cached_field = True
                elif ftype == 'end': inside_cached_field = False
                # fldChar runs carry no text of their own.
                continue

            drawing = child.find(qn('w:drawing'))
            if drawing is not None:
                img = self._parse_drawing(drawing)
                if img:
                    # The image inherits the paragraph's alignment and spacing.
                    img.para_align = para.alignment
                    img.spacing_before = para.spacing_before
                    img.spacing_after = para.spacing_after
                    return img
                continue

            run = self._parse_run(child, is_cached=inside_cached_field)
            if run:
                para.runs.append(run)

        elif tag == qn('w:hyperlink'):
            # Resolve URL from relationship ID or inline anchor
            rid = child.get(qn('r:id'), '')
            url = self.hyperlink_rels.get(rid, child.get(qn('w:anchor'), ''))
            for r_el in child.findall(qn('w:r')):
                run = self._parse_run(r_el, is_cached=inside_cached_field)
                if run:
                    run.hyperlink_url = url
                    para.runs.append(run)

        elif tag == qn('w:drawing'):
            img = self._parse_drawing(child)
            if img:
                img.para_align = para.alignment
                img.spacing_before = para.spacing_before
                img.spacing_after = para.spacing_after
                return img

    # A page break anywhere inside the paragraph flags the whole paragraph.
    for br in p_el.findall(f'.//{qn("w:br")}'):
        if br.get(qn('w:type')) == 'page':
            para.is_page_break = True

    return para
468
+
469
def _parse_run(self, r_el, is_cached=False):
    """Parse one ``w:r`` element into a RunFormat.

    Args:
        r_el: The run element.
        is_cached: True when the run sits between a field's begin and end
            markers; copied to ``is_cached_field`` so the generator can skip
            stale cached field values.

    Returns:
        A RunFormat for a PAGE/NUMPAGES field instruction, a tab, or text.
        ``None`` for drawing runs, other field instructions and runs with
        no text.
    """
    if r_el.find(qn('w:drawing')) is not None:
        return None

    run = RunFormat()
    run.is_cached_field = is_cached

    rPr = r_el.find(qn('w:rPr'))
    if rPr is not None:
        run.bold = _bool_prop(rPr, 'w:b')
        run.italic = _bool_prop(rPr, 'w:i')
        run.allCaps = _bool_prop(rPr, 'w:caps')

        fonts_el = rPr.find(qn('w:rFonts'))
        if fonts_el is not None:
            run.font = fonts_el.get(qn('w:ascii'), 'Calibri')

        sz = rPr.find(qn('w:sz'))
        if sz is not None:
            hp = sz.get(qn('w:val'))
            if hp:
                run.size = int(hp)

        color_el = rPr.find(qn('w:color'))
        if color_el is not None:
            c = color_el.get(qn('w:val'))
            if c and c != 'auto':
                run.color = c

    # Field instruction (PAGE / NUMPAGES)
    instr = r_el.find(qn('w:instrText'))
    if instr is not None:
        # Word usually stores the instruction with switches and padding,
        # e.g. ' PAGE   \* MERGEFORMAT ', so compare only the field keyword.
        tokens = (instr.text or '').split()
        keyword = tokens[0].upper() if tokens else ''
        if keyword in ('PAGE', 'NUMPAGES'):
            run.is_page_number = True
            run.is_cached_field = False
            run.text = keyword
            return run
        return None

    # NOTE: a run holding both a tab and text is still reported as a tab only.
    if r_el.find(qn('w:tab')) is not None:
        run.is_tab = True
        run.text = '\t'
        return run

    # A run may contain several w:t elements; keep all of their text.
    pieces = [t.text for t in r_el.findall(qn('w:t')) if t.text]
    if pieces:
        run.text = ''.join(pieces)
        return run

    return None
520
+
521
def _parse_drawing(self, drawing_el):
    """Turn a ``w:drawing`` element into an ImageRef.

    Returns ``None`` when the drawing has no ``wp:extent``, no ``a:blip``, or
    the blip has no ``r:embed`` relationship id.
    """
    extent = drawing_el.find('.//' + qn('wp:extent'))
    if extent is None:
        return None
    width, height = (int(extent.get(attr, 0)) for attr in ('cx', 'cy'))

    blip = drawing_el.find('.//' + qn('a:blip'))
    embed_id = blip.get(qn('r:embed')) if blip is not None else None
    if not embed_id:
        return None

    return ImageRef(rel_id=embed_id, width_emu=width, height_emu=height)
536
+
537
def _parse_table(self, tbl_el):
    """Parse a ``w:tbl`` element into a TableFormat.

    Reads the table width and alignment from ``w:tblPr`` when present, then
    parses each ``w:tr`` into a list of cells. Rows without cells are dropped.
    """
    table = TableFormat()

    props = tbl_el.find(qn('w:tblPr'))
    width_el = props.find(qn('w:tblW')) if props is not None else None
    if width_el is not None:
        table.width_dxa = int(width_el.get(qn('w:w'), 0))
        table.width_type = width_el.get(qn('w:type'), 'dxa')

    align_el = props.find(qn('w:jc')) if props is not None else None
    if align_el is not None:
        table.alignment = align_el.get(qn('w:val'), 'left').upper()

    for row_el in tbl_el.findall(qn('w:tr')):
        cells = [self._parse_cell(cell_el) for cell_el in row_el.findall(qn('w:tc'))]
        if cells:
            table.rows.append(cells)

    return table
556
+
557
def _parse_cell(self, tc_el):
    """Parse a ``w:tc`` element into a CellFormat.

    Captures width, shading (ignoring 'auto' and white), the four cell
    margins, the top border color and vertical alignment from ``w:tcPr``,
    then parses the cell's direct ``w:p`` children.
    """
    cell = CellFormat()
    props = tc_el.find(qn('w:tcPr'))

    if props is not None:
        width_el = props.find(qn('w:tcW'))
        if width_el is not None:
            cell.width_dxa = int(width_el.get(qn('w:w'), 0))

        shade_el = props.find(qn('w:shd'))
        fill = shade_el.get(qn('w:fill')) if shade_el is not None else None
        if fill and fill.upper() not in ('AUTO', 'FFFFFF'):
            cell.shading = fill.upper()

        margins_el = props.find(qn('w:tcMar'))
        if margins_el is not None:
            for side in ('top', 'bottom', 'left', 'right'):
                side_el = margins_el.find(qn(f'w:{side}'))
                if side_el is not None:
                    setattr(cell, f'margin_{side}', int(side_el.get(qn('w:w'), '0')))

        # Only the top border's color is recorded for the whole cell.
        borders_el = props.find(qn('w:tcBorders'))
        top_border = borders_el.find(qn('w:top')) if borders_el is not None else None
        if top_border is not None:
            cell.border_color = top_border.get(qn('w:color'))

        valign_el = props.find(qn('w:vAlign'))
        if valign_el is not None:
            cell.valign = valign_el.get(qn('w:val'))

    parsed = (self._parse_para(p_el) for p_el in tc_el.findall(qn('w:p')))
    cell.paragraphs.extend(item for item in parsed if item is not None)

    return cell
596
+
597
def _parse_section(self, sect_el):
    """Read the page margins of a ``w:sectPr`` element into a SectionBreak.

    Each margin missing from ``w:pgMar`` (or all of them, when ``w:pgMar``
    itself is missing) keeps the default of 1440.
    """
    section = SectionBreak()
    margins = sect_el.find(qn('w:pgMar'))
    if margins is None:
        return section
    for side in ('top', 'bottom', 'left', 'right'):
        setattr(section, f'margin_{side}', int(margins.get(qn(f'w:{side}'), 1440)))
    return section
606
+
607
def _extract_colors(self):
    """Collect every color used in body, header and footer into ``self.colors``.

    Looks at run colors, paragraph top/bottom border colors, cell shading and
    cell border colors (descending into table cells). The result is a sorted
    list of upper-cased values without duplicates.
    """
    found = set()
    pending = [*self.body_elements, *self.header_elements, *self.footer_elements]
    while pending:
        el = pending.pop()
        if isinstance(el, ParaFormat):
            found.update(r.color.upper() for r in el.runs if r.color)
            found.update(b['color'].upper() for b in (el.border_bottom, el.border_top) if b)
        elif isinstance(el, TableFormat):
            for row in el.rows:
                for cell in row:
                    found.update(c.upper() for c in (cell.shading, cell.border_color) if c)
                    # Cell paragraphs are processed by the same loop.
                    pending.extend(cell.paragraphs)
    self.colors = sorted(found)
626
+
627
def list_summary(self):
    """Build the multi-line, human-readable report printed for ``--list``.

    Counts are taken from the top-level body elements; placeholders and
    hyperlinks are also looked up inside table cells of the body.
    """
    body = self.body_elements

    def of_kind(kind):
        return [el for el in body if isinstance(el, kind)]

    def iter_paragraphs(elements):
        # Depth-first, in document order, descending into table cells.
        for el in elements:
            if isinstance(el, ParaFormat):
                yield el
            elif isinstance(el, TableFormat):
                for row in el.rows:
                    for cell in row:
                        yield from iter_paragraphs(cell.paragraphs)

    sections = of_kind(SectionBreak)
    images = of_kind(ImageRef)
    tables = of_kind(TableFormat)
    paras = of_kind(ParaFormat)

    all_runs = [r for para in iter_paragraphs(body) for r in para.runs]
    hyperlinks = {r.hyperlink_url for r in all_runs if r.hyperlink_url}

    # field name -> raw placeholder text, in order of first appearance
    placeholders = {}
    for r in all_runs:
        if not r.text:
            continue
        for seg, is_ph in split_bracket_runs(r.text):
            field_ref = bracket_to_field(seg) if is_ph else None
            if field_ref:
                placeholders[field_ref.replace('data.', '')] = seg

    image_names = [os.path.basename(self.relationships.get(i.rel_id, '?')) for i in images]
    report = [
        f"Template: {os.path.basename(self.path)}",
        f" Sections : {len(sections) + 1} (section breaks found: {len(sections)})",
        f" Body paras : {len(paras)}",
        f" Tables : {len(tables)}",
        f" Images : {len(images)} {image_names}",
        f" Colors : {self.colors}",
        f" Header : {len(self.header_elements)} paragraph(s)",
        f" Footer : {len(self.footer_elements)} paragraph(s)",
    ]
    if placeholders:
        report.append(" Placeholders detected:")
        report.extend(f" data.{name} <- {raw}" for name, raw in placeholders.items())
    if hyperlinks:
        report.append(" Hyperlinks:")
        report.extend(f" {url}" for url in sorted(hyperlinks))
    return '\n'.join(report)
682
+
683
+ # ─── Code Generator ──────────────────────────────────────────────────────────
684
+ class JSGenerator:
685
def __init__(self, parser: DocxParser, template_name: str):
    """Bind the generator to a parsed template.

    ``template_name`` is used in the generated file's header comment and
    file names; the color constants are derived from the parser's colors.
    """
    self.name = template_name
    self.p = parser
    self.inferred_data_fields = {}  # field_name -> example value
    self.color_map = build_color_map(parser.colors)
690
+
691
+ def _color_ref(self, hex_color):
692
+ if not hex_color:
693
+ return "'000000'"
694
+ key = hex_color.upper()
695
+ return f"COLOR.{self.color_map[key]}" if key in self.color_map else f"'{hex_color}'"
696
+
697
+ def _emu_to_inch(self, emu):
698
+ return round(emu / 914400, 3)
699
+
700
def _run_to_js_parts(self, r: RunFormat, indent: str) -> list:
    """Convert a RunFormat to JS ``TextRun`` / ``ExternalHyperlink`` source lines.

    Args:
        r: The run to emit.
        indent: Prefix put in front of every generated line.

    Returns:
        A list of source lines. It is empty for cached field values and runs
        without text, holds one line for tabs and page-number fields, and one
        line per segment for text (a run with several [brackets] is split so
        every placeholder gets its own TextRun).

    Side effect: every placeholder met is recorded in
    ``self.inferred_data_fields`` with the bracket text as example value.
    """
    def js_literal(value):
        # A single-quoted JS literal cannot contain raw line terminators, so
        # escape them along with backslashes and quotes.
        escaped = value.replace('\\', '\\\\').replace("'", "\\'")
        for raw, replacement in (('\r', '\\r'), ('\n', '\\n'),
                                 ('\u2028', '\\u2028'), ('\u2029', '\\u2029')):
            escaped = escaped.replace(raw, replacement)
        return f"'{escaped}'"

    if r.is_cached_field:
        return []

    if r.is_tab:
        return [f"{indent}new TextRun({{ text: '\\t', font: FONT, size: {r.size or 22}, color: {self._color_ref(r.color)} }}),"]

    if r.is_page_number:
        page_field = 'PageNumber.CURRENT' if r.text == 'PAGE' else 'PageNumber.TOTAL_PAGES'
        return [f"{indent}new TextRun({{ children: [{page_field}], font: FONT, size: {r.size or 22}, color: {self._color_ref(r.color)} }}),"]

    if not r.text:
        return []

    results = []
    # Split into bracket segments so '[Foo] — [Bar]' emits two TextRuns
    for seg_text, is_placeholder in split_bracket_runs(r.text):
        field_ref = bracket_to_field(seg_text) if is_placeholder else None
        if field_ref:
            # Remember the first bracket text seen for this field as its example.
            self.inferred_data_fields.setdefault(field_ref.replace('data.', ''), seg_text.strip('[]'))
            text_expr = field_ref
        else:
            text_expr = js_literal(seg_text)

        opts = [f"text: {text_expr}", "font: FONT"]
        if r.size: opts.append(f"size: {r.size}")
        if r.bold: opts.append("bold: true")
        if r.italic: opts.append("italics: true")
        if r.allCaps: opts.append("allCaps: true")
        if r.color: opts.append(f"color: {self._color_ref(r.color)}")

        run_js = f"new TextRun({{ {', '.join(opts)} }})"

        if r.hyperlink_url:
            # The URL gets the same escaping as text (backslashes included).
            results.append(f"{indent}new ExternalHyperlink({{ link: {js_literal(r.hyperlink_url)}, children: [{run_js}] }}),")
        else:
            results.append(f"{indent}{run_js},")

    return results
749
+
750
def _para_to_js(self, para: ParaFormat, indent: str) -> Optional[str]:
    """Render a ParaFormat as the source of a JS ``new Paragraph({...})``.

    Returns ``None`` for anything that is not a ParaFormat. A paragraph
    flagged as a page break is rendered as a bare ``pageBreakBefore``
    paragraph without its runs.
    """
    if not isinstance(para, ParaFormat):
        return None
    if para.is_page_break:
        return f"{indent}new Paragraph({{ pageBreakBefore: true }}),"

    out = [f"{indent}new Paragraph({{"]

    children = [line for r in para.runs for line in self._run_to_js_parts(r, indent + ' ')]
    if children:
        out += [f"{indent} children: [", *children, f"{indent} ],"]
    else:
        out.append(f"{indent} children: [],")

    if para.alignment and para.alignment != 'LEFT':
        out.append(f"{indent} alignment: AlignmentType.{para.alignment},")

    if para.spacing_before or para.spacing_after:
        out.append(f"{indent} spacing: {{ before: {para.spacing_before}, after: {para.spacing_after} }},")

    # Borders are always emitted as SINGLE, top before bottom.
    border_parts = [
        f"{side}: {{ style: BorderStyle.SINGLE, size: {b['size']}, color: {self._color_ref(b['color'])}, space: {b['space']} }}"
        for side, b in (('top', para.border_top), ('bottom', para.border_bottom))
        if b
    ]
    if border_parts:
        out.append(f"{indent} border: {{ {', '.join(border_parts)} }},")

    if para.tab_stop_right:
        out.append(f"{indent} tabStops: [{{ type: TabStopType.RIGHT, position: TabStopPosition.MAX }}],")

    if para.is_bullet:
        out.append(f"{indent} bullet: {{ level: 0 }},")

    out.append(f"{indent}}}),")
    return '\n'.join(out)
794
+
795
+ def _image_to_js(self, img: ImageRef, indent: str) -> str:
796
+ target = self.p.relationships.get(img.rel_id, '')
797
+ fname = os.path.basename(target) or 'image.png'
798
+ var_name = re.sub(r'[^a-zA-Z0-9]', '_', os.path.splitext(fname)[0]).upper() + '_PATH'
799
+ ext = os.path.splitext(fname)[1].lstrip('.').lower() or 'png'
800
+ w_pt = round(self._emu_to_inch(img.width_emu) * 72)
801
+ h_pt = round(self._emu_to_inch(img.height_emu) * 72)
802
+
803
+ lines = [
804
+ f"{indent}new Paragraph({{",
805
+ f"{indent} children: [",
806
+ f"{indent} new ImageRun({{",
807
+ f"{indent} data: readFileSync({var_name}),",
808
+ f"{indent} transformation: {{ width: {w_pt}, height: {h_pt} }},",
809
+ f"{indent} type: '{ext}',",
810
+ f"{indent} }}),",
811
+ f"{indent} ],",
812
+ ]
813
+ if img.para_align and img.para_align != 'LEFT':
814
+ lines.append(f"{indent} alignment: AlignmentType.{img.para_align},")
815
+ if img.spacing_before or img.spacing_after:
816
+ lines.append(f"{indent} spacing: {{ before: {img.spacing_before}, after: {img.spacing_after} }},")
817
+ lines.append(f"{indent}}})," )
818
+ return '\n'.join(lines)
819
+
820
+ def _table_to_js(self, table: TableFormat, indent: str) -> str:
821
+ lines = [f"{indent}new Table({{"]
822
+
823
+ if table.width_type == 'pct' or (table.width_dxa and table.width_dxa > 8000):
824
+ lines.append(f"{indent} width: {{ size: 100, type: WidthType.PERCENTAGE }},")
825
+ elif table.width_dxa:
826
+ lines.append(f"{indent} width: {{ size: {table.width_dxa}, type: WidthType.DXA }},")
827
+
828
+ if table.alignment:
829
+ lines.append(f"{indent} alignment: AlignmentType.{table.alignment},")
830
+
831
+ lines.append(f"{indent} rows: [")
832
+ for row in table.rows:
833
+ lines.append(f"{indent} new TableRow({{")
834
+ lines.append(f"{indent} children: [")
835
+ total_w = sum(c.width_dxa or 0 for c in row)
836
+ for cell in row:
837
+ lines += self._cell_to_js(cell, total_w, indent + ' ')
838
+ lines.append(f"{indent} ],")
839
+ lines.append(f"{indent} }}),")
840
+ lines.append(f"{indent} ],")
841
+ lines.append(f"{indent}}})," )
842
+ return '\n'.join(lines)
843
+
844
+ def _cell_to_js(self, cell: CellFormat, total_width: int, indent: str) -> list:
845
+ lines = [f"{indent}new TableCell({{"]
846
+
847
+ if cell.width_dxa and total_width:
848
+ pct = round(cell.width_dxa / total_width * 100)
849
+ lines.append(f"{indent} width: {{ size: {pct}, type: WidthType.PERCENTAGE }},")
850
+
851
+ if cell.shading:
852
+ lines.append(f"{indent} shading: {{ fill: {self._color_ref(cell.shading)} }},")
853
+
854
+ bc = self._color_ref(cell.border_color) if cell.border_color else "'DDDDDD'"
855
+ lines += [
856
+ f"{indent} borders: {{",
857
+ f"{indent} top: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
858
+ f"{indent} bottom: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
859
+ f"{indent} left: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
860
+ f"{indent} right: {{ style: BorderStyle.SINGLE, size: 4, color: {bc} }},",
861
+ f"{indent} }},",
862
+ ]
863
+
864
+ if any([cell.margin_top, cell.margin_bottom, cell.margin_left, cell.margin_right]):
865
+ lines.append(f"{indent} margins: {{")
866
+ for side, val in [('top',cell.margin_top),('bottom',cell.margin_bottom),
867
+ ('left',cell.margin_left),('right',cell.margin_right)]:
868
+ if val: lines.append(f"{indent} {side}: {val},")
869
+ lines.append(f"{indent} }},")
870
+
871
+ lines.append(f"{indent} children: [")
872
+ for para in cell.paragraphs:
873
+ pjs = self._para_to_js(para, indent + ' ')
874
+ if pjs:
875
+ lines.append(pjs)
876
+ lines.append(f"{indent} ],")
877
+
878
+ if cell.valign:
879
+ lines.append(f"{indent} verticalAlign: '{cell.valign}',")
880
+
881
+ lines.append(f"{indent}}})," )
882
+ return lines
883
+
884
+ # ── Section generators ───────────────────────────────────────────────────
885
+
886
def _gen_file_header(self):
    """Return the lines of the generated file's header comment and imports.

    ``ExternalHyperlink`` is imported only when the template actually
    contains hyperlinks. The result of that check is also stored in
    ``self._has_hyperlinks``.
    """
    from datetime import date  # used only for the "Generated:" stamp

    def contains_link(elements):
        # Runs inside table cells are emitted as ExternalHyperlink too, so
        # cells must be searched as well; looking only at top-level
        # paragraphs would leave the import out.
        for el in elements:
            if isinstance(el, ParaFormat):
                if any(r.hyperlink_url for r in el.runs):
                    return True
            elif isinstance(el, TableFormat):
                if any(contains_link(cell.paragraphs) for row in el.rows for cell in row):
                    return True
        return False

    has_hyperlinks = any(
        contains_link(part)
        for part in (self.p.body_elements, self.p.header_elements, self.p.footer_elements)
    )
    self._has_hyperlinks = has_hyperlinks

    basename = os.path.basename(self.p.path)
    hyperlink_import = ', ExternalHyperlink' if has_hyperlinks else ''
    return [
        '/**',
        f' * {self.name.title()} Document Builder',
        ' * Auto-generated by docx-to-builder https://github.com/jermorrison22/docx-to-builder',
        f' * Source template: {basename}',
        f' * Generated: {date.today().isoformat()}',
        ' *',
        ' * HOW TO USE:',
        ' * 1. Fill in the data object below (or pass your own)',
        f' * 2. Run: node {self.name}-builder.js',
        f' * 3. Find the output at: output/{self.name}-output.docx',
        ' *',
        ' * To regenerate this file from a new template:',
        ' * python3 docx-to-builder.py your-template.docx',
        ' */',
        '',
        "import {",
        f" Document, Packer, Paragraph, TextRun, ImageRun{hyperlink_import},",
        " Table, TableRow, TableCell, WidthType, AlignmentType,",
        " BorderStyle, Header, Footer, PageNumber,",
        " TabStopType, TabStopPosition, convertInchesToTwip,",
        "} from 'docx';",
        "import { readFileSync, writeFileSync, mkdirSync } from 'fs';",
        "import { fileURLToPath } from 'url';",
        "import { dirname, join } from 'path';",
        "",
        "const __dirname = dirname(fileURLToPath(import.meta.url));",
        "",
    ]
927
+
928
def _gen_constants(self):
    """Return the JS lines declaring ``COLOR``, ``FONT`` and the image path constants.

    * ``COLOR`` lists every entry of ``self.color_map`` (hex -> name), ordered
      by name.
    * ``FONT`` is the font name used by the most runs in the body, including
      runs inside table cells. Runs without a font name are ignored so that a
      template whose runs mostly inherit their font does not yield
      ``'None'`` or an empty string. ``'Calibri'`` is used when no run names
      a font.
    * One ``<NAME>_PATH`` constant is emitted per entry of
      ``self.p.media_files``, pointing into ``assets/`` next to the builder.

    Returns:
        A list of JavaScript source lines.
    """
    lines = [
        "// ─── Brand colors extracted from template ────────────────────────────────────",
        "const COLOR = {",
    ]
    for hex_color, name in sorted(self.color_map.items(), key=lambda x: x[1]):
        lines.append(f" {name}: '{hex_color}',")
    lines += ["};", ""]

    fonts = {}

    def scan(els):
        """Tally explicit font names of all runs under *els*."""
        for el in els:
            if isinstance(el, ParaFormat):
                for r in el.runs:
                    if r.font:  # skip runs with no explicit font name
                        fonts[r.font] = fonts.get(r.font, 0) + 1
            elif isinstance(el, TableFormat):
                for row in el.rows:
                    for cell in row:
                        scan(cell.paragraphs)

    scan(self.p.body_elements)
    main_font = max(fonts, key=fonts.get) if fonts else 'Calibri'
    lines += [f"const FONT = '{main_font}';", ""]

    if self.p.media_files:
        lines.append("// ─── Asset paths — update these if you move the image files ─────────────────")
        for media in self.p.media_files:
            fname = os.path.basename(media)
            if not fname:
                continue  # directory-style entry with no file name
            # Constant name: file stem with non-alphanumerics replaced, upper-cased.
            var = re.sub(r'[^a-zA-Z0-9]', '_', os.path.splitext(fname)[0]).upper() + '_PATH'
            lines.append(f"const {var} = join(__dirname, 'assets/{fname}');")
        lines.append("")

    return lines
962
+
963
def _gen_header_fn(self):
    """Return the JS lines defining ``buildHeader(data)``.

    Each ``ParaFormat`` in ``self.p.header_elements`` is rendered through
    ``_para_to_js``; elements of any other type, and paragraphs that render
    to an empty value, are left out.
    """
    rendered = []
    for element in self.p.header_elements:
        if not isinstance(element, ParaFormat):
            continue
        paragraph_js = self._para_to_js(element, ' ')
        if paragraph_js:
            rendered.append(paragraph_js)

    opening = [
        "// ─── Header ───────────────────────────────────────────────────────────────────",
        "function buildHeader(data) {",
        " return new Header({",
        " children: [",
    ]
    closing = [" ],", " });", "}", ""]
    return opening + rendered + closing
977
+
978
def _gen_footer_fn(self):
    """Return the JS lines defining ``buildFooter()``.

    Each ``ParaFormat`` in ``self.p.footer_elements`` is rendered through
    ``_para_to_js``; elements of any other type, and paragraphs that render
    to an empty value, are left out.
    """
    rendered = []
    for element in self.p.footer_elements:
        if not isinstance(element, ParaFormat):
            continue
        paragraph_js = self._para_to_js(element, ' ')
        if paragraph_js:
            rendered.append(paragraph_js)

    opening = [
        "// ─── Footer ───────────────────────────────────────────────────────────────────",
        "function buildFooter() {",
        " return new Footer({",
        " children: [",
    ]
    closing = [" ],", " });", "}", ""]
    return opening + rendered + closing
992
+
993
def _elements_to_js(self, elements, indent=' ', is_first_section=True) -> list:
    """Turn one section's body elements into ``elements.push(...)`` JS lines.

    Page-break paragraphs at the end of the section are always removed, and
    those at the start are removed for every section but the first: the
    section boundary already starts a new page, so keeping them would add a
    blank one. ``SectionBreak`` entries are ignored here.

    Args:
        elements: Iterable of body elements for a single section.
        indent: Prefix placed in front of each emitted statement.
        is_first_section: When False, leading page breaks are dropped too.

    Returns:
        A list of JavaScript source lines.
    """
    def is_break(candidate):
        return isinstance(candidate, ParaFormat) and candidate.is_page_break

    items = list(elements)
    first, last = 0, len(items)
    while last > first and is_break(items[last - 1]):
        last -= 1
    if not is_first_section:
        while first < last and is_break(items[first]):
            first += 1

    out = []

    def push(comment, body_js):
        """Append an optional comment followed by one elements.push(...) statement."""
        if comment:
            out.append(comment)
        out.extend([f"{indent}elements.push(", body_js, f"{indent});", ""])

    for item in items[first:last]:
        if isinstance(item, SectionBreak):
            continue  # handled at the section boundary level

        if isinstance(item, ImageRef):
            rel_target = self.p.relationships.get(item.rel_id, '')
            image_name = os.path.basename(rel_target) or 'image.png'
            width_in = self._emu_to_inch(item.width_emu)
            height_in = self._emu_to_inch(item.height_emu)
            push(
                f"{indent}// Image: {image_name} ({width_in}\" x {height_in}\")",
                self._image_to_js(item, indent + ' '),
            )
        elif isinstance(item, ParaFormat):
            if item.is_page_break:
                out.append(f"{indent}elements.push(new Paragraph({{ pageBreakBefore: true }}));")
                out.append("")
                continue
            paragraph_js = self._para_to_js(item, indent + ' ')
            if not paragraph_js:
                continue
            # Comment preview: visible run text only, capped at 60 characters.
            visible = [
                run.text for run in item.runs
                if run.text and not (run.is_tab or run.is_page_number or run.is_cached_field)
            ]
            snippet = ' '.join(visible)[:60]
            push(f"{indent}// {repr(snippet)}" if snippet else None, paragraph_js)
        elif isinstance(item, TableFormat):
            row_count = len(item.rows)
            col_count = max((len(row) for row in item.rows), default=0)
            push(
                f"{indent}// Table ({row_count} rows x {col_count} cols)",
                self._table_to_js(item, indent + ' '),
            )

    return out
1054
+
1055
def _gen_content_fn(self):
    """Emit one ``buildSectionN(data)`` JS function per document section.

    ``self.p.body_elements`` is cut at every ``SectionBreak``. Each chunk is
    paired with the break that precedes it (``None`` for the first chunk)
    and rendered through ``_elements_to_js``.

    Returns:
        A ``(lines, fn_names)`` tuple: the JS source lines, and a list of
        ``(function_name, section_break_or_None)`` pairs in document order.
    """
    # Split the body into (preceding_break, elements) chunks.
    chunks = []
    bucket = []
    preceding_break = None
    for element in self.p.body_elements:
        if not isinstance(element, SectionBreak):
            bucket.append(element)
            continue
        chunks.append((preceding_break, bucket))
        bucket = []
        preceding_break = element
    # Whatever follows the last break forms the final section.
    chunks.append((preceding_break, bucket))

    out = [
        "// ─── Document content ────────────────────────────────────────────────────────",
        "// Bracketed placeholders (e.g. [Client Name]) have been mapped to data.fieldName.",
        "// Static template text is kept as-is. Replace any literal string with a data",
        "// field if you need it to be dynamic.",
        "",
    ]

    fn_names = []
    for number, (sec_break, section_elements) in enumerate(chunks, start=1):
        fn = f'buildSection{number}'
        fn_names.append((fn, sec_break))
        if sec_break:
            note = (
                f' // margins: T={sec_break.margin_top} B={sec_break.margin_bottom}'
                f' L={sec_break.margin_left} R={sec_break.margin_right} (twips)'
            )
        else:
            note = ''
        out += [f"function {fn}(data) {{{note}", " const elements = [];", ""]
        out += self._elements_to_js(section_elements, ' ', is_first_section=(number == 1))
        out += [" return elements;", "}", ""]

    return out, fn_names
1099
+
1100
def _gen_builder_fn(self, fn_names):
    """Emit the exported ``buildDocument(data, outputPath)`` JS function.

    Args:
        fn_names: ``(function_name, section_break)`` pairs. One docx section
            is emitted per pair, with its children produced by calling the
            named function. A falsy break is replaced by a default-constructed
            ``SectionBreak``.

    Returns:
        A list of JavaScript source lines.
    """
    out = [
        "// ─── Main builder ────────────────────────────────────────────────────────────",
        "export async function buildDocument(data, outputPath) {",
        " const doc = new Document({",
        " sections: [",
    ]

    for fn, sec_break in fn_names:
        sb = sec_break or SectionBreak()
        # Margins are stored in twips; 1440 twips make one inch.
        inches = {
            'top': sb.margin_top / 1440,
            'bottom': sb.margin_bottom / 1440,
            'left': sb.margin_left / 1440,
            'right': sb.margin_right / 1440,
        }
        out.extend([" {", " properties: {", " page: {", " margin: {"])
        out.extend(
            f" {side}: convertInchesToTwip({value:.3f}),"
            for side, value in inches.items()
        )
        out.extend([
            " },",
            " },",
            " },",
            " headers: { default: buildHeader(data) },",
            " footers: { default: buildFooter() },",
            f" children: {fn}(data),",
            " },",
        ])

    out.extend([
        " ],",
        " });",
        "",
        " const buffer = await Packer.toBuffer(doc);",
        " writeFileSync(outputPath, buffer);",
        " console.log('Document saved to: ' + outputPath);",
        "}",
        "",
    ])
    return out
1140
+
1141
def _gen_example_data(self):
    """Return the JS lines declaring the example ``data`` object.

    One property is written per entry of ``self.inferred_data_fields``
    (field name -> example text). When that mapping is empty, a small default
    object with ``title``, ``clientName`` and ``date`` is written instead.

    Example values are emitted as single-quoted JS string literals.
    Backslashes, single quotes and line breaks are escaped so that template
    text containing them cannot produce an unterminated or corrupted literal.

    Returns:
        A list of JavaScript source lines.
    """
    def js_escape(text):
        """Escape *text* for use inside a single-quoted JS string literal."""
        # Backslash must be handled first so later escapes are not doubled.
        for raw, escaped in (("\\", "\\\\"), ("'", "\\'"), ("\r", "\\r"), ("\n", "\\n")):
            text = text.replace(raw, escaped)
        return text

    lines = [
        "// ─── Data object ─────────────────────────────────────────────────────────────",
        "// Fields below were inferred from bracketed placeholders in your template.",
        "// Add, remove, or rename fields to match your use case.",
        "const data = {",
    ]
    if self.inferred_data_fields:
        for field_name, example in self.inferred_data_fields.items():
            lines.append(f" {field_name}: '{js_escape(example)}',")
    else:
        lines.append(" title: 'Document Title',")
        lines.append(" clientName: 'Client Name',")
        lines.append(" date: new Date().toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' }),")
    lines += ["};", ""]
    return lines
1158
+
1159
def _gen_cli(self):
    """Return the JS lines that let the generated builder run as a script.

    The emitted block runs only when the file is the Node entry point. It
    reads an optional ``--output <path>`` argument, defaulting to
    ``output/<name>-output.docx`` next to the builder, creates the directory
    that will hold the output file, and calls ``buildDocument``.

    The directory created is the parent of the chosen output path, not a
    fixed ``output/`` folder, so a custom ``--output`` in a directory that
    does not exist yet still works. A ``--output`` flag with no value after
    it falls back to the default path. ``dirname`` comes from the ``path``
    import emitted in the file header.

    Returns:
        A list of JavaScript source lines.
    """
    return [
        "// ─── Run from command line ────────────────────────────────────────────────────",
        "if (process.argv[1] === fileURLToPath(import.meta.url)) {",
        " const outputArg = process.argv.indexOf('--output');",
        " const outputPath = outputArg !== -1 && process.argv[outputArg + 1]",
        " ? process.argv[outputArg + 1]",
        f" : join(__dirname, 'output/{self.name}-output.docx');",
        "",
        " mkdirSync(dirname(outputPath), { recursive: true });",
        " buildDocument(data, outputPath).catch(console.error);",
        "}",
    ]
1172
+
1173
def generate(self):
    """Assemble the complete JavaScript builder source and return it as one string.

    The pieces are produced in a fixed order because some generators rely on
    state set by earlier ones: the file header sets ``_has_hyperlinks``, and
    the example data is generated after the content so that the inferred
    data fields collected along the way are available to it.
    """
    # Header comes first so _has_hyperlinks is set before anything else runs.
    parts = [
        self._gen_file_header(),
        self._gen_constants(),
        self._gen_header_fn(),
        self._gen_footer_fn(),
    ]

    # Content generation populates inferred_data_fields as a side-effect,
    # so it must run before the example data object is produced.
    content_lines, fn_names = self._gen_content_fn()
    parts.append(content_lines)
    parts.append(self._gen_builder_fn(fn_names))
    parts.append(self._gen_example_data())
    parts.append(self._gen_cli())

    return '\n'.join(line for part in parts for line in part)
1196
+
1197
+
1198
+ # ─── Main ─────────────────────────────────────────────────────────────────────
1199
def main():
    """Command-line entry point: parse a .docx template and write the JS builder.

    Reads ``sys.argv``: the first argument is the template path, ``--list``
    prints the parser summary and exits without writing anything, and
    ``--output <path>`` overrides where the builder is written (default:
    ``<name>-builder.js`` next to the template). Images found by the parser
    are written to an ``assets/`` directory beside the builder.

    Exits with status 1 when no arguments are given, when the template does
    not exist, or when ``--output`` is not followed by a path.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    docx_path = sys.argv[1]
    if not os.path.exists(docx_path):
        print(f"Error: file not found: {docx_path}", file=sys.stderr)
        sys.exit(1)

    list_mode = '--list' in sys.argv

    # Determine output path
    output_path = None
    if '--output' in sys.argv:
        idx = sys.argv.index('--output')
        if idx + 1 >= len(sys.argv):
            # Without this check a trailing --output raises IndexError.
            print("Error: --output requires a path argument", file=sys.stderr)
            sys.exit(1)
        output_path = sys.argv[idx + 1]

    # Derive a clean document name from the filename
    basename = os.path.basename(docx_path)
    name = re.sub(r'(?i)template\s*[-\u2013]\s*', '', basename)
    name = os.path.splitext(name)[0].strip().lower()
    name = re.sub(r'\s+', '-', name)
    name = re.sub(r'[^a-z0-9-]', '', name)
    # A filename with no ASCII letters or digits would leave the name empty
    # and produce "-builder.js"; fall back to a generic name instead.
    name = name or 'document'

    if not output_path:
        output_path = os.path.join(os.path.dirname(os.path.abspath(docx_path)), f'{name}-builder.js')

    print(f"Parsing: {docx_path}")
    parser = DocxParser(docx_path)

    if list_mode:
        print()
        print(parser.list_summary())
        return

    sections = [el for el in parser.body_elements if isinstance(el, SectionBreak)]
    print(f" {len(parser.body_elements)} body elements | "
          f"{len(parser.header_elements)} header | "
          f"{len(parser.footer_elements)} footer | "
          f"{len(parser.media_files)} image(s) | "
          f"{len(sections) + 1} section(s)")
    print(f" Colors: {parser.colors}")

    print("Generating JS builder...")
    gen = JSGenerator(parser, name)
    js_code = gen.generate()

    out_dir = os.path.dirname(os.path.abspath(output_path))
    os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(js_code)

    print(f"[OK] {output_path}")

    # Extract embedded images into assets/ next to the builder
    if parser.media_bytes:
        assets_dir = os.path.join(out_dir, 'assets')
        os.makedirs(assets_dir, exist_ok=True)
        for fname, data in parser.media_bytes.items():
            # Names come from inside the archive; keep only the final path
            # component so an entry cannot be written outside assets/.
            safe_name = os.path.basename(fname)
            if not safe_name:
                continue
            dest = os.path.join(assets_dir, safe_name)
            with open(dest, 'wb') as f:
                f.write(data)
            print(f" [image] assets/{safe_name}")

    if gen.inferred_data_fields:
        print(f" Inferred data fields: {list(gen.inferred_data_fields.keys())}")
    print(f"\nNext: run node {os.path.basename(output_path)}")


if __name__ == '__main__':
    main()