@heylemon/lemonade 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/build-info.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
ae5dd78f56173dbfc66b8b6f4a72fb917e3108dcb10745fd17c25536ee4bd709
|
package/package.json
CHANGED
package/skills/docx/SKILL.md
CHANGED
|
@@ -147,6 +147,27 @@ new Paragraph({
|
|
|
147
147
|
```
|
|
148
148
|
|
|
149
149
|
Also note: use `italics: true` (with 's'), not `italic: true`. The latter silently does nothing in docx-js.
|
|
150
|
+
|
|
151
|
+
### Prevent Duplicate Text Rendering (CRITICAL)
|
|
152
|
+
|
|
153
|
+
**Use exactly one text source per paragraph.** In docx-js, a paragraph can get text from:
|
|
154
|
+
- `text: "..."` on `new Paragraph(...)`, or
|
|
155
|
+
- `children: [new TextRun({ text: "..." })]`
|
|
156
|
+
|
|
157
|
+
If you set both, the same title/subtitle/date can render twice (common on cover pages).
|
|
158
|
+
|
|
159
|
+
```javascript
|
|
160
|
+
// ✅ Correct: one source (children only)
|
|
161
|
+
new Paragraph({
|
|
162
|
+
spacing: { after: 160 },
|
|
163
|
+
children: [new TextRun({ text: "February 22, 2026", size: 28, color: COLORS.muted })]
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
// ❌ Wrong: duplicates text (text + children)
|
|
167
|
+
new Paragraph({
|
|
168
|
+
text: "February 22, 2026",
|
|
169
|
+
children: [new TextRun({ text: "February 22, 2026", size: 28, color: COLORS.muted })]
|
|
170
|
+
});
|
|
150
171
|
```
|
|
151
172
|
|
|
152
173
|
### Lists: Never Use Unicode Bullets
|
|
@@ -346,6 +367,7 @@ Or use a more sophisticated approach with field codes (requires XML manipulation
|
|
|
346
367
|
- Use `HeadingLevel.HEADING_1`, etc., with `outlineLevel` for TOC
|
|
347
368
|
- Use `children` with TextRun for headings, NOT the `run` property (Pages ignores `run`)
|
|
348
369
|
- Use `italics: true` (with 's'), not `italic: true`
|
|
370
|
+
- Use exactly one text source per paragraph: either `text` OR `children`, never both
|
|
349
371
|
- Set `spacing.before` and `spacing.after` explicitly on headings
|
|
350
372
|
|
|
351
373
|
**DON'T:**
|
|
@@ -359,6 +381,7 @@ Or use a more sophisticated approach with field codes (requires XML manipulation
|
|
|
359
381
|
- Assume A4 page size (always set explicitly to US Letter)
|
|
360
382
|
- Use `run` property on headings (use `children` with TextRun instead — Pages ignores `run`)
|
|
361
383
|
- Use `italic: true` without the 's' (use `italics: true`)
|
|
384
|
+
- Set both `text` and `children` on one Paragraph (the text can render twice — use exactly one)
|
|
362
385
|
- Mix font styles — stick to 2 fonts (heading + body)
|
|
363
386
|
|
|
364
387
|
---
|
|
@@ -450,6 +473,7 @@ The validator checks:
|
|
|
450
473
|
- Paragraph length (warns if > 500 chars)
|
|
451
474
|
- Empty headings
|
|
452
475
|
- Manual bullet characters (should use List APIs)
|
|
476
|
+
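- Duplicated text in the generated document: a phrase repeated back-to-back within one paragraph, or two identical consecutive paragraphs (typically caused by setting both `text` and `children` on the same `new Paragraph(...)`)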
|
|
453
477
|
|
|
454
478
|
### 2. Fix Tables for Cross-Platform (MANDATORY if document contains tables)
|
|
455
479
|
|
|
@@ -495,6 +519,8 @@ pdftoppm -jpeg -r 150 report.pdf page
|
|
|
495
519
|
# View: page-1.jpg, page-2.jpg, etc.
|
|
496
520
|
```
|
|
497
521
|
|
|
522
|
+
During this visual pass, explicitly check the title block (title, subtitle, date) for ghosted/double text. If duplicated, inspect the corresponding `new Paragraph(...)` and remove either `text` or `children` so only one text source remains.
|
|
523
|
+
|
|
498
524
|
### 5. Fix & Re-validate
|
|
499
525
|
|
|
500
526
|
If the validator reports issues or the PDF looks off:
|
|
@@ -2,6 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
Each template shows the structure and a docx-js code snippet for that document type.
|
|
4
4
|
|
|
5
|
+
## Important Implementation Notes
|
|
6
|
+
|
|
7
|
+
- Prefer the deterministic Python engine (`scripts/create_doc.py`) for production output when possible.
|
|
8
|
+
- If using docx-js directly, use exactly one paragraph text source:
|
|
9
|
+
- `text: "..."` **or** `children: [new TextRun({ text: "..." })]`
|
|
10
|
+
- never both on the same `new Paragraph(...)`.
|
|
11
|
+
- Run both post-processors after generation:
|
|
12
|
+
- `python scripts/fix_tables.py output.docx`
|
|
13
|
+
- `python scripts/fix_styles.py output.docx`
|
|
14
|
+
- Run `python scripts/validate.py output.docx` before delivery to catch duplicate render patterns early.
|
|
15
|
+
|
|
5
16
|
## Report
|
|
6
17
|
|
|
7
18
|
The most versatile template. Use for analytical, strategic, or informational documents.
|
|
@@ -8,6 +8,7 @@ Checks for:
|
|
|
8
8
|
- Paragraph length and structure
|
|
9
9
|
- Empty headings
|
|
10
10
|
- Manual bullet characters (should use List APIs)
|
|
11
|
+
- Duplicate title/content rendering (a phrase repeated back-to-back within a paragraph, or identical consecutive paragraphs)
|
|
11
12
|
- Table consistency
|
|
12
13
|
- Page setup
|
|
13
14
|
|
|
@@ -18,6 +19,7 @@ import sys
|
|
|
18
19
|
import os
|
|
19
20
|
import zipfile
|
|
20
21
|
import xml.etree.ElementTree as ET
|
|
22
|
+
import re
|
|
21
23
|
from pathlib import Path
|
|
22
24
|
|
|
23
25
|
try:
|
|
@@ -90,10 +92,35 @@ def validate_document(docx_path):
|
|
|
90
92
|
|
|
91
93
|
# Unicode bullet characters
|
|
92
94
|
bullet_chars = {'•', '◦', '▪', '●', '○', '■', '–', '—', '-', '*'}
|
|
95
|
+
non_empty_paragraphs = []
|
|
96
|
+
|
|
97
|
+
def normalize_text(text):
    """Normalize visible text for duplicate detection.

    The text is case-folded, every run of characters that is not a letter
    or digit (punctuation, symbols, underscores, whitespace) is replaced by
    a single space, and leading/trailing whitespace is dropped.

    Letters and digits from any script are kept. An ASCII-only filter would
    erase non-Latin text and split accented words, which makes unrelated
    paragraphs compare equal once only their shared ASCII remains.

    Args:
        text: Raw paragraph text. Empty or falsy input yields "".

    Returns:
        The normalized string: lowercase words separated by single spaces.
    """
    if not text:
        return ""
    # `\W` is Unicode-aware for str patterns; `_` counts as a word character
    # there, so it is listed explicitly to keep treating it as a separator.
    cleaned = re.sub(r"[\W_]+", " ", text.casefold())
    return " ".join(cleaned.split())
|
|
102
|
+
|
|
103
|
+
def find_adjacent_repeat_phrase(text, min_words=3, max_words=16):
    """Find a phrase that occurs twice back-to-back within one paragraph.

    The text is first passed through ``normalize_text`` and split into
    words; the comparison is done on those normalized words.

    Example: "The Great War ... The Great War ..."

    Args:
        text: Paragraph text to scan.
        min_words: Shortest phrase length, in words, that counts as a repeat.
        max_words: Longest phrase length, in words, that is tried.

    Returns:
        The repeated phrase (normalized words joined by spaces), or None
        when no phrase of an allowed length is immediately followed by
        itself.
    """
    tokens = normalize_text(text).split()
    total = len(tokens)

    # A repeat needs room for two copies of the shortest allowed phrase.
    if total < 2 * min_words:
        return None

    longest = min(max_words, total // 2)

    # Try the longest phrases first so the widest repeat is the one reported.
    for width in reversed(range(min_words, longest + 1)):
        for start in range(total - 2 * width + 1):
            middle = start + width
            first_half = tokens[start:middle]
            if first_half == tokens[middle:middle + width]:
                return " ".join(first_half)
    return None
|
|
93
119
|
|
|
94
120
|
for p in doc.paragraphs:
|
|
95
121
|
paragraph_count += 1
|
|
96
122
|
text = p.text.strip()
|
|
123
|
+
norm_text = normalize_text(text)
|
|
97
124
|
|
|
98
125
|
# Track empty paragraphs
|
|
99
126
|
if not text:
|
|
@@ -102,6 +129,7 @@ def validate_document(docx_path):
|
|
|
102
129
|
max_consecutive_empty = max(max_consecutive_empty, consecutive_empty)
|
|
103
130
|
else:
|
|
104
131
|
consecutive_empty = 0
|
|
132
|
+
non_empty_paragraphs.append((paragraph_count, text, norm_text))
|
|
105
133
|
|
|
106
134
|
# Check headings
|
|
107
135
|
if p.style.name.startswith("Heading"):
|
|
@@ -137,6 +165,14 @@ def validate_document(docx_path):
|
|
|
137
165
|
word_count = len(text.split())
|
|
138
166
|
warnings.append(f"Very long paragraph ({len(text)} chars, {word_count} words): \"{text[:50]}...\" — consider breaking up")
|
|
139
167
|
|
|
168
|
+
# Detect duplicated phrase inside the same paragraph
|
|
169
|
+
if len(text) >= 20:
|
|
170
|
+
repeated = find_adjacent_repeat_phrase(text)
|
|
171
|
+
if repeated:
|
|
172
|
+
issues.append(
|
|
173
|
+
f"Possible duplicated text in paragraph {paragraph_count}: repeated phrase \"{repeated[:60]}\""
|
|
174
|
+
)
|
|
175
|
+
|
|
140
176
|
# Check for excessive spacing/empty paragraphs
|
|
141
177
|
if len(text) == 0 and paragraph_count > 1:
|
|
142
178
|
# This is tracked separately
|
|
@@ -181,6 +217,15 @@ def validate_document(docx_path):
|
|
|
181
217
|
if paragraph_count < 3:
|
|
182
218
|
warnings.append("Document is very short — consider if this is complete")
|
|
183
219
|
|
|
220
|
+
# Detect duplicate consecutive non-empty paragraphs (common in title blocks)
|
|
221
|
+
for idx in range(len(non_empty_paragraphs) - 1):
|
|
222
|
+
curr_num, curr_raw, curr_norm = non_empty_paragraphs[idx]
|
|
223
|
+
next_num, next_raw, next_norm = non_empty_paragraphs[idx + 1]
|
|
224
|
+
if curr_norm and curr_norm == next_norm:
|
|
225
|
+
issues.append(
|
|
226
|
+
f"Consecutive duplicate paragraphs at {curr_num} and {next_num}: \"{curr_raw[:80]}\""
|
|
227
|
+
)
|
|
228
|
+
|
|
184
229
|
# ─── Summary info ───
|
|
185
230
|
info.append(f"Paragraphs: {paragraph_count}")
|
|
186
231
|
info.append(f"Headings: {heading_count}")
|