report-compiler 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- report_compiler-0.1.0/PKG-INFO +330 -0
- report_compiler-0.1.0/README.md +313 -0
- report_compiler-0.1.0/pyproject.toml +29 -0
- report_compiler-0.1.0/setup.cfg +4 -0
- report_compiler-0.1.0/src/report_compiler/__init__.py +14 -0
- report_compiler-0.1.0/src/report_compiler/cli.py +327 -0
- report_compiler-0.1.0/src/report_compiler/core/__init__.py +1 -0
- report_compiler-0.1.0/src/report_compiler/core/compiler.py +414 -0
- report_compiler-0.1.0/src/report_compiler/core/config.py +74 -0
- report_compiler-0.1.0/src/report_compiler/document/__init__.py +1 -0
- report_compiler-0.1.0/src/report_compiler/document/docx_processor.py +224 -0
- report_compiler-0.1.0/src/report_compiler/document/libreoffice_converter.py +44 -0
- report_compiler-0.1.0/src/report_compiler/document/placeholder_parser.py +202 -0
- report_compiler-0.1.0/src/report_compiler/document/word_converter.py +140 -0
- report_compiler-0.1.0/src/report_compiler/pdf/__init__.py +1 -0
- report_compiler-0.1.0/src/report_compiler/pdf/content_analyzer.py +239 -0
- report_compiler-0.1.0/src/report_compiler/pdf/marker_remover.py +147 -0
- report_compiler-0.1.0/src/report_compiler/pdf/merge_processor.py +247 -0
- report_compiler-0.1.0/src/report_compiler/pdf/overlay_processor.py +168 -0
- report_compiler-0.1.0/src/report_compiler/utils/__init__.py +1 -0
- report_compiler-0.1.0/src/report_compiler/utils/conversions.py +12 -0
- report_compiler-0.1.0/src/report_compiler/utils/file_manager.py +208 -0
- report_compiler-0.1.0/src/report_compiler/utils/logging_config.py +181 -0
- report_compiler-0.1.0/src/report_compiler/utils/page_selector.py +182 -0
- report_compiler-0.1.0/src/report_compiler/utils/pdf_to_svg.py +116 -0
- report_compiler-0.1.0/src/report_compiler/utils/validators.py +287 -0
- report_compiler-0.1.0/src/report_compiler.egg-info/PKG-INFO +330 -0
- report_compiler-0.1.0/src/report_compiler.egg-info/SOURCES.txt +30 -0
- report_compiler-0.1.0/src/report_compiler.egg-info/dependency_links.txt +1 -0
- report_compiler-0.1.0/src/report_compiler.egg-info/entry_points.txt +2 -0
- report_compiler-0.1.0/src/report_compiler.egg-info/requires.txt +8 -0
- report_compiler-0.1.0/src/report_compiler.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: report_compiler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A tool for compiling reports from various sources.
|
|
5
|
+
Author-email: YOUR NAME <your@email.com>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.7
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: comtypes>=1.2.1
|
|
12
|
+
Requires-Dist: Pillow>=10.2.0
|
|
13
|
+
Requires-Dist: python-docx>=1.1.0
|
|
14
|
+
Requires-Dist: PyMuPDF>=1.26.3
|
|
15
|
+
Requires-Dist: typer>=0.9.0
|
|
16
|
+
Requires-Dist: pywin32; sys_platform == "win32"
|
|
17
|
+
|
|
18
|
+
# Report Compiler
|
|
19
|
+
|
|
20
|
+
A Python-based automated DOCX and PDF report compiler for engineering teams. This tool allows engineers to write reports in Word, use placeholders to insert external PDFs, and compile everything into a professional PDF with a single command.
|
|
21
|
+
|
|
22
|
+
## Overview
|
|
23
|
+
|
|
24
|
+
The Report Compiler automates the creation of comprehensive PDF reports by:
|
|
25
|
+
|
|
26
|
+
1. **Finding PDF placeholders** in Word documents using two types of tags:
|
|
27
|
+
- `[[OVERLAY: path/to/file.pdf, page=5]]` for table-based overlays
|
|
28
|
+
- `[[INSERT: path/to/file.pdf]]` for paragraph-based insertions
|
|
29
|
+
2. **Modifying the Word document** to create markers and page breaks
|
|
30
|
+
3. **Converting to PDF** using Word automation (win32com)
|
|
31
|
+
4. **Processing PDF insertions** with overlays and merges using PyMuPDF
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- ✅ **Two insertion types** - Table-based overlays and paragraph-based merges
|
|
36
|
+
- ✅ **Relative path support** - PDF paths resolved relative to the input Word document
|
|
37
|
+
- ✅ **Page selection support** - Specify which pages to include from source PDFs using flexible syntax
|
|
38
|
+
- ✅ **Multi-page PDF support** - Automatic cell replication for multi-page table overlays
|
|
39
|
+
- ✅ **Annotation preservation** - PDF annotations automatically baked into content during processing
|
|
40
|
+
- ✅ **Marker removal** - Automatic removal of placement markers from final PDF
|
|
41
|
+
- ✅ **Robust page breaks** - Proper page breaks for paragraph-based insertions
|
|
42
|
+
- ✅ **Error handling** - Comprehensive error reporting and validation
|
|
43
|
+
- ✅ **Debug support** - `--keep-temp` flag to retain temporary files for debugging
|
|
44
|
+
- ✅ **Table-based overlay** - Precise PDF placement using table dimensions and marker positioning
|
|
45
|
+
- ✅ **Cell replication** - Multi-page PDFs create consecutive table cells automatically
|
|
46
|
+
- ✅ **Intelligent positioning** - Uses table properties for automatic overlay rectangle calculation
|
|
47
|
+
- ✅ **Modular architecture** - Clean separation of concerns with focused classes and modules
|
|
48
|
+
|
|
49
|
+
## Architecture
|
|
50
|
+
|
|
51
|
+
The Report Compiler uses a modular architecture with clear separation of responsibilities:
|
|
52
|
+
|
|
53
|
+
### Core Modules
|
|
54
|
+
|
|
55
|
+
- **`report_compiler.core`** - Main orchestration and configuration
|
|
56
|
+
- `ReportCompiler` - Main orchestrator class
|
|
57
|
+
- `Config` - Configuration management and constants
|
|
58
|
+
|
|
59
|
+
- **`report_compiler.document`** - Word document processing
|
|
60
|
+
- `PlaceholderParser` - Detects and parses PDF placeholders
|
|
61
|
+
- `DocxProcessor` - Modifies DOCX files (markers, page breaks, cell replication)
|
|
62
|
+
- `WordConverter` - Converts DOCX to PDF using Word automation
|
|
63
|
+
|
|
64
|
+
- **`report_compiler.pdf`** - PDF processing and manipulation
|
|
65
|
+
- `ContentAnalyzer` - Analyzes PDF content and structure
|
|
66
|
+
- `OverlayProcessor` - Handles table-based PDF overlays
|
|
67
|
+
- `MergeProcessor` - Handles paragraph-based PDF merges
|
|
68
|
+
- `MarkerRemover` - Removes placement markers from final PDF
|
|
69
|
+
|
|
70
|
+
- **`report_compiler.utils`** - Utility classes and helpers
|
|
71
|
+
- `FileManager` - Temporary file management and cleanup
|
|
72
|
+
- `Validators` - Input validation and PDF verification
|
|
73
|
+
- `PageSelector` - Page selection parsing and processing
|
|
74
|
+
|
|
75
|
+
### Usage as a Library
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from report_compiler.core.compiler import ReportCompiler
|
|
79
|
+
|
|
80
|
+
# Basic usage
|
|
81
|
+
compiler = ReportCompiler("input.docx", "output.pdf")
|
|
82
|
+
compiler.compile()
|
|
83
|
+
|
|
84
|
+
# With debug mode
|
|
85
|
+
compiler = ReportCompiler("input.docx", "output.pdf", keep_temp=True)
|
|
86
|
+
compiler.compile()
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Quick Start
|
|
90
|
+
|
|
91
|
+
### Installation
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install -r requirements.txt
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Basic Usage
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
report-compiler compile input_report.docx output_report.pdf
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Debug Mode (with temp files)
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
report-compiler compile input_report.docx output_report.pdf --keep-temp
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Placeholder Format
|
|
110
|
+
|
|
111
|
+
The Report Compiler supports two types of PDF insertion placeholders:
|
|
112
|
+
|
|
113
|
+
### Table-based Overlays (OVERLAY tags)
|
|
114
|
+
|
|
115
|
+
For inserting PDFs as overlays onto existing pages, preserving the main document's content and layout. Place these in **single-cell (1x1) tables**:
|
|
116
|
+
|
|
117
|
+
```text
|
|
118
|
+
[[OVERLAY: appendices/sketch.pdf]]
|
|
119
|
+
[[OVERLAY: calculations/diagram.pdf, page=2]]
|
|
120
|
+
[[OVERLAY: C:\Shared\drawing.pdf, page=1-3]]
|
|
121
|
+
[[OVERLAY: diagrams/full_page.pdf, crop=false]]
|
|
122
|
+
[[OVERLAY: sketches/detail.pdf, page=2, crop=false]]
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**OVERLAY Parameters:**
|
|
126
|
+
|
|
127
|
+
- `page=` - Page selection (same format as INSERT)
|
|
128
|
+
- `crop=` - Content cropping control:
|
|
129
|
+
- `crop=true` (default): Automatically crops to content bounding box, removing excess whitespace
|
|
130
|
+
- `crop=false`: Uses the full page dimensions without cropping
|
|
131
|
+
|
|
132
|
+
### Paragraph-based Merges (INSERT tags)
|
|
133
|
+
|
|
134
|
+
For inserting entire PDF pages after a marker position. The original paragraph content is preserved, and PDF pages are inserted immediately after it. Place these in **standalone paragraphs**:
|
|
135
|
+
|
|
136
|
+
```text
|
|
137
|
+
[[INSERT: appendices/structural_analysis.pdf]]
|
|
138
|
+
[[INSERT: calculations/load_analysis.pdf:1-5]]
|
|
139
|
+
[[INSERT: C:\Shared\external_report.pdf]]
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Page Selection
|
|
143
|
+
|
|
144
|
+
Both OVERLAY and INSERT tags support page selection:
|
|
145
|
+
|
|
146
|
+
**OVERLAY page selection (using `page=` parameter):**
|
|
147
|
+
|
|
148
|
+
```text
|
|
149
|
+
[[OVERLAY: appendices/report.pdf, page=5]] # Page 5 only
|
|
150
|
+
[[OVERLAY: appendices/report.pdf, page=1-3]] # Pages 1, 2, and 3
|
|
151
|
+
[[OVERLAY: appendices/report.pdf, page=1,3,5]] # Pages 1, 3, and 5
|
|
152
|
+
[[OVERLAY: appendices/report.pdf, page=2-]] # Pages 2 to end
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**INSERT page selection (using `:` separator):**
|
|
156
|
+
|
|
157
|
+
```text
|
|
158
|
+
[[INSERT: appendices/report.pdf:1-3]] # Pages 1, 2, and 3
|
|
159
|
+
[[INSERT: appendices/report.pdf:5]] # Page 5 only
|
|
160
|
+
[[INSERT: appendices/report.pdf:1,3,5]] # Pages 1, 3, and 5
|
|
161
|
+
[[INSERT: appendices/report.pdf:2-]] # Pages 2 to end
|
|
162
|
+
[[INSERT: appendices/report.pdf:1-3,7,9-]] # Mixed: pages 1-3, 7, and 9 to end
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
**Page Selection Formats:**
|
|
166
|
+
|
|
167
|
+
- `5` - Single page (page 5)
|
|
168
|
+
- `1-3` - Range of pages (pages 1, 2, 3)
|
|
169
|
+
- `2-` - Open-ended range (pages 2 to end of document)
|
|
170
|
+
- `1,3,5` - Specific pages (pages 1, 3, and 5)
|
|
171
|
+
- `1-3,7,9-12` - Combined specifications
|
|
172
|
+
|
|
173
|
+
**Note:** Page numbers are 1-indexed (first page = 1). Invalid page numbers are automatically filtered out.
|
|
174
|
+
|
|
175
|
+
**Multi-page PDFs**: Automatically handled via cell replication (table-based overlays) or sequential page insertion (paragraph-based merges)
|
|
176
|
+
|
|
177
|
+
**Note**: Relative paths are resolved relative to the Word document's location.
|
|
178
|
+
|
|
179
|
+
## How It Works
|
|
180
|
+
|
|
181
|
+
### 1. Placeholder Detection
|
|
182
|
+
|
|
183
|
+
- **Table scanning** - Identifies `[[OVERLAY: ...]]` tags in single-cell tables
|
|
184
|
+
- **Paragraph scanning** - Identifies `[[INSERT: ...]]` tags in standalone paragraphs
|
|
185
|
+
- **Path resolution** - Resolves relative paths relative to Word document location
|
|
186
|
+
- **Page parsing** - Parses page selection syntax (e.g., `:1-3`, `,page=5`)
|
|
187
|
+
- **PDF validation** - Validates that referenced PDF files exist and are readable
|
|
188
|
+
- **Page counting** - Counts effective pages after applying page selection filters
|
|
189
|
+
- **Layout detection** - Identifies single-cell tables vs standalone paragraphs
|
|
190
|
+
|
|
191
|
+
### 2. Document Modification
|
|
192
|
+
|
|
193
|
+
- **Table placeholders** - Replaces with visible red markers (`%%OVERLAY_START_N%%`)
|
|
194
|
+
- **Cell replication** - Creates additional table cells for multi-page selections
|
|
195
|
+
- **Paragraph placeholders** - Replaces with merge markers and page breaks (`%%MERGE_START_N%%`)
|
|
196
|
+
- **Marker placement** - Places markers first, then page breaks for correct timing
|
|
197
|
+
- **Temporary document** - Saves modified document for PDF conversion
|
|
198
|
+
|
|
199
|
+
### 3. PDF Conversion
|
|
200
|
+
|
|
201
|
+
- Converts modified Word document to PDF using Word automation
|
|
202
|
+
- Preserves formatting and creates base PDF with markers
|
|
203
|
+
|
|
204
|
+
### 4. PDF Processing
|
|
205
|
+
|
|
206
|
+
#### Paragraph-based Merges (INSERT)
|
|
207
|
+
|
|
208
|
+
- **Marker location** - Finds merge markers in the base PDF
|
|
209
|
+
- **Marker removal** - Removes markers using redaction (white fill)
|
|
210
|
+
- **Page insertion** - Inserts PDF pages immediately after marker position
|
|
211
|
+
- **Content preservation** - Original document content remains intact
|
|
212
|
+
|
|
213
|
+
#### Table-based Overlays (OVERLAY)
|
|
214
|
+
|
|
215
|
+
- **Page selection** - Processes only the specified pages from source PDFs
|
|
216
|
+
- **Annotation preservation** - Automatically bakes PDF annotations into content using `Document.bake()`
|
|
217
|
+
- **Multi-page support** - Creates additional table cells for multi-page selections
|
|
218
|
+
- **Precise positioning** - Searches for overlay markers in the base PDF
|
|
219
|
+
- **Rectangle calculation** - Uses the marker position as the top-left corner of the overlay area
|
|
220
|
+
- **Marker removal** - Removes markers using redaction (white fill)
|
|
221
|
+
- **Sequential overlay** - Overlays each selected page onto calculated rectangles
|
|
222
|
+
- **Final assembly** - Saves completed PDF with all appendices integrated
|
|
223
|
+
|
|
224
|
+
## Table-Based Overlay System
|
|
225
|
+
|
|
226
|
+
The Report Compiler uses a precise approach for PDF overlay placement with full support for multi-page PDFs and annotation preservation:
|
|
227
|
+
|
|
228
|
+
### Single-Page PDF Overlay
|
|
229
|
+
|
|
230
|
+
1. **Table Detection** - Identifies single-cell tables containing `[[OVERLAY: path.pdf]]` placeholders
|
|
231
|
+
2. **Page Selection** - Parses page specifications like `,page=1-3` or `,page=5` if provided
|
|
232
|
+
3. **Dimension Extraction** - Extracts exact table dimensions from Word document metadata
|
|
233
|
+
4. **Marker Placement** - Places a red marker at the top-left of the table cell
|
|
234
|
+
5. **Rectangle Calculation** - Uses marker position + table dimensions = overlay area
|
|
235
|
+
6. **Annotation Preservation** - Bakes PDF annotations into content before overlay
|
|
236
|
+
7. **Precise Overlay** - Places selected PDF pages exactly within the calculated rectangle
|
|
237
|
+
|
|
238
|
+
### Multi-Page PDF Overlay
|
|
239
|
+
|
|
240
|
+
For multi-page PDFs or page selections, the system automatically replicates table cells:
|
|
241
|
+
|
|
242
|
+
1. **Page Detection** - Identifies PDFs with multiple pages or page selections
|
|
243
|
+
2. **Cell Replication** - Adds consecutive table rows for each selected page
|
|
244
|
+
3. **Marker Generation** - Creates unique markers for each cell (`%%OVERLAY_START_00_PAGE_02%%`)
|
|
245
|
+
4. **Sequential Overlay** - Overlays selected pages into consecutive table cells
|
|
246
|
+
5. **Unified Layout** - All selected PDF pages appear together in the same table area
|
|
247
|
+
|
|
248
|
+
### Page Selection Examples
|
|
249
|
+
|
|
250
|
+
```text
|
|
251
|
+
[[OVERLAY: report.pdf, page=1-3]] → 3 table cells with pages 1, 2, 3
|
|
252
|
+
[[OVERLAY: report.pdf, page=2,5,7]] → 3 table cells with pages 2, 5, 7
|
|
253
|
+
[[OVERLAY: report.pdf, page=3-]] → Multiple cells with pages 3 to end
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Example Output
|
|
257
|
+
|
|
258
|
+
```text
|
|
259
|
+
Single Table → Page Selection:
|
|
260
|
+
┌─────────────────┐
|
|
261
|
+
│ PDF Page 2 │ ← Only page 2 (from [[OVERLAY: doc.pdf, page=2]])
|
|
262
|
+
└─────────────────┘
|
|
263
|
+
|
|
264
|
+
Single Table → Multi-Page Selection:
|
|
265
|
+
┌─────────────────┐
|
|
266
|
+
│ PDF Page 1 │ ← From [[OVERLAY: doc.pdf, page=1,3,5]]
|
|
267
|
+
├─────────────────┤
|
|
268
|
+
│ PDF Page 3 │ ← Replicated cell
|
|
269
|
+
├─────────────────┤
|
|
270
|
+
│ PDF Page 5 │ ← Replicated cell
|
|
271
|
+
└─────────────────┘
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### Example Debug Output
|
|
275
|
+
|
|
276
|
+
```text
|
|
277
|
+
📋 Table found: 7.50 x 4.00 inches
|
|
278
|
+
📍 Marker at: (0.50, 1.59) inches
|
|
279
|
+
📐 Overlay: (0.50, 1.59) to (8.00, 5.59) inches
|
|
280
|
+
🔥 Baking annotations: 12 found
|
|
281
|
+
✅ PDF positioned perfectly
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### Key Benefits
|
|
285
|
+
|
|
286
|
+
- **Simple & Reliable** - Single marker approach with cell replication
|
|
287
|
+
- **Flexible Page Selection** - Extract exactly the pages you need from large PDFs
|
|
288
|
+
- **Multi-page Support** - Automatic handling of PDFs with any number of pages
|
|
289
|
+
- **Annotation Preservation** - PDF annotations automatically preserved during overlay
|
|
290
|
+
- **Accurate** - Uses Word's own measurements
|
|
291
|
+
- **Easy to Debug** - Clear inch measurements and detailed logging with page selection info
|
|
292
|
+
- **Consistent** - Predictable placement and unified layout
|
|
293
|
+
|
|
294
|
+
## Example Workflow
|
|
295
|
+
|
|
296
|
+
```text
|
|
297
|
+
Input: bridge_report.docx containing [[OVERLAY: appendices/analysis.pdf, page=2-4,7]] in a single-cell table
|
|
298
|
+
↓
|
|
299
|
+
Step 1: Find placeholder and validate analysis.pdf (10 pages)
|
|
300
|
+
Parse page spec "2-4,7" → pages 2, 3, 4, 7 (4 pages selected)
|
|
301
|
+
↓
|
|
302
|
+
Step 2: Replace placeholder with marker + replicate table cells for 4 pages
|
|
303
|
+
↓
|
|
304
|
+
Step 3: Convert modified DOCX to PDF (creates base PDF with 4 table cells)
|
|
305
|
+
↓
|
|
306
|
+
Step 4: Bake annotations, find markers, overlay pages 2,3,4,7 sequentially
|
|
307
|
+
↓
|
|
308
|
+
Output: bridge_report.pdf with selected pages integrated in consecutive cells
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## Requirements
|
|
312
|
+
|
|
313
|
+
- **Windows** (for Word automation via win32com)
|
|
314
|
+
- **Microsoft Word** installed and accessible
|
|
315
|
+
- **Python 3.7+**
|
|
316
|
+
- **Dependencies**: `comtypes`, `Pillow`, `python-docx`, `PyMuPDF`, `typer`, and `pywin32` (Windows only)
|
|
317
|
+
|
|
318
|
+
## VS Code Debugging
|
|
319
|
+
|
|
320
|
+
The project includes comprehensive VS Code launch configurations:
|
|
321
|
+
|
|
322
|
+
- **Debug Report Compiler - Example File** - Basic debugging with example file
|
|
323
|
+
- **Debug Report Compiler - Example File (Keep Temp)** - Debug with temp files retained
|
|
324
|
+
- **Debug Report Compiler - Custom Input** - Interactive file input debugging
|
|
325
|
+
- **Debug Report Compiler - Step Into All Code** - Detailed debugging with all code
|
|
326
|
+
- **Debug Report Compiler - Error Testing** - Test error handling scenarios
|
|
327
|
+
|
|
328
|
+
## License
|
|
329
|
+
|
|
330
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
# Report Compiler
|
|
2
|
+
|
|
3
|
+
A Python-based automated DOCX and PDF report compiler for engineering teams. This tool allows engineers to write reports in Word, use placeholders to insert external PDFs, and compile everything into a professional PDF with a single command.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The Report Compiler automates the creation of comprehensive PDF reports by:
|
|
8
|
+
|
|
9
|
+
1. **Finding PDF placeholders** in Word documents using two types of tags:
|
|
10
|
+
- `[[OVERLAY: path/to/file.pdf, page=5]]` for table-based overlays
|
|
11
|
+
- `[[INSERT: path/to/file.pdf]]` for paragraph-based insertions
|
|
12
|
+
2. **Modifying the Word document** to create markers and page breaks
|
|
13
|
+
3. **Converting to PDF** using Word automation (win32com)
|
|
14
|
+
4. **Processing PDF insertions** with overlays and merges using PyMuPDF
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- ✅ **Two insertion types** - Table-based overlays and paragraph-based merges
|
|
19
|
+
- ✅ **Relative path support** - PDF paths resolved relative to the input Word document
|
|
20
|
+
- ✅ **Page selection support** - Specify which pages to include from source PDFs using flexible syntax
|
|
21
|
+
- ✅ **Multi-page PDF support** - Automatic cell replication for multi-page table overlays
|
|
22
|
+
- ✅ **Annotation preservation** - PDF annotations automatically baked into content during processing
|
|
23
|
+
- ✅ **Marker removal** - Automatic removal of placement markers from final PDF
|
|
24
|
+
- ✅ **Robust page breaks** - Proper page breaks for paragraph-based insertions
|
|
25
|
+
- ✅ **Error handling** - Comprehensive error reporting and validation
|
|
26
|
+
- ✅ **Debug support** - `--keep-temp` flag to retain temporary files for debugging
|
|
27
|
+
- ✅ **Table-based overlay** - Precise PDF placement using table dimensions and marker positioning
|
|
28
|
+
- ✅ **Cell replication** - Multi-page PDFs create consecutive table cells automatically
|
|
29
|
+
- ✅ **Intelligent positioning** - Uses table properties for automatic overlay rectangle calculation
|
|
30
|
+
- ✅ **Modular architecture** - Clean separation of concerns with focused classes and modules
|
|
31
|
+
|
|
32
|
+
## Architecture
|
|
33
|
+
|
|
34
|
+
The Report Compiler uses a modular architecture with clear separation of responsibilities:
|
|
35
|
+
|
|
36
|
+
### Core Modules
|
|
37
|
+
|
|
38
|
+
- **`report_compiler.core`** - Main orchestration and configuration
|
|
39
|
+
- `ReportCompiler` - Main orchestrator class
|
|
40
|
+
- `Config` - Configuration management and constants
|
|
41
|
+
|
|
42
|
+
- **`report_compiler.document`** - Word document processing
|
|
43
|
+
- `PlaceholderParser` - Detects and parses PDF placeholders
|
|
44
|
+
- `DocxProcessor` - Modifies DOCX files (markers, page breaks, cell replication)
|
|
45
|
+
- `WordConverter` - Converts DOCX to PDF using Word automation
|
|
46
|
+
|
|
47
|
+
- **`report_compiler.pdf`** - PDF processing and manipulation
|
|
48
|
+
- `ContentAnalyzer` - Analyzes PDF content and structure
|
|
49
|
+
- `OverlayProcessor` - Handles table-based PDF overlays
|
|
50
|
+
- `MergeProcessor` - Handles paragraph-based PDF merges
|
|
51
|
+
- `MarkerRemover` - Removes placement markers from final PDF
|
|
52
|
+
|
|
53
|
+
- **`report_compiler.utils`** - Utility classes and helpers
|
|
54
|
+
- `FileManager` - Temporary file management and cleanup
|
|
55
|
+
- `Validators` - Input validation and PDF verification
|
|
56
|
+
- `PageSelector` - Page selection parsing and processing
|
|
57
|
+
|
|
58
|
+
### Usage as a Library
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from report_compiler.core.compiler import ReportCompiler
|
|
62
|
+
|
|
63
|
+
# Basic usage
|
|
64
|
+
compiler = ReportCompiler("input.docx", "output.pdf")
|
|
65
|
+
compiler.compile()
|
|
66
|
+
|
|
67
|
+
# With debug mode
|
|
68
|
+
compiler = ReportCompiler("input.docx", "output.pdf", keep_temp=True)
|
|
69
|
+
compiler.compile()
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Quick Start
|
|
73
|
+
|
|
74
|
+
### Installation
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install -r requirements.txt
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Basic Usage
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
report-compiler compile input_report.docx output_report.pdf
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Debug Mode (with temp files)
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
report-compiler compile input_report.docx output_report.pdf --keep-temp
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Placeholder Format
|
|
93
|
+
|
|
94
|
+
The Report Compiler supports two types of PDF insertion placeholders:
|
|
95
|
+
|
|
96
|
+
### Table-based Overlays (OVERLAY tags)
|
|
97
|
+
|
|
98
|
+
For inserting PDFs as overlays onto existing pages, preserving the main document's content and layout. Place these in **single-cell (1x1) tables**:
|
|
99
|
+
|
|
100
|
+
```text
|
|
101
|
+
[[OVERLAY: appendices/sketch.pdf]]
|
|
102
|
+
[[OVERLAY: calculations/diagram.pdf, page=2]]
|
|
103
|
+
[[OVERLAY: C:\Shared\drawing.pdf, page=1-3]]
|
|
104
|
+
[[OVERLAY: diagrams/full_page.pdf, crop=false]]
|
|
105
|
+
[[OVERLAY: sketches/detail.pdf, page=2, crop=false]]
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**OVERLAY Parameters:**
|
|
109
|
+
|
|
110
|
+
- `page=` - Page selection (same format as INSERT)
|
|
111
|
+
- `crop=` - Content cropping control:
|
|
112
|
+
- `crop=true` (default): Automatically crops to content bounding box, removing excess whitespace
|
|
113
|
+
- `crop=false`: Uses the full page dimensions without cropping
|
|
114
|
+
|
|
115
|
+
### Paragraph-based Merges (INSERT tags)
|
|
116
|
+
|
|
117
|
+
For inserting entire PDF pages after a marker position. The original paragraph content is preserved, and PDF pages are inserted immediately after it. Place these in **standalone paragraphs**:
|
|
118
|
+
|
|
119
|
+
```text
|
|
120
|
+
[[INSERT: appendices/structural_analysis.pdf]]
|
|
121
|
+
[[INSERT: calculations/load_analysis.pdf:1-5]]
|
|
122
|
+
[[INSERT: C:\Shared\external_report.pdf]]
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Page Selection
|
|
126
|
+
|
|
127
|
+
Both OVERLAY and INSERT tags support page selection:
|
|
128
|
+
|
|
129
|
+
**OVERLAY page selection (using `page=` parameter):**
|
|
130
|
+
|
|
131
|
+
```text
|
|
132
|
+
[[OVERLAY: appendices/report.pdf, page=5]] # Page 5 only
|
|
133
|
+
[[OVERLAY: appendices/report.pdf, page=1-3]] # Pages 1, 2, and 3
|
|
134
|
+
[[OVERLAY: appendices/report.pdf, page=1,3,5]] # Pages 1, 3, and 5
|
|
135
|
+
[[OVERLAY: appendices/report.pdf, page=2-]] # Pages 2 to end
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**INSERT page selection (using `:` separator):**
|
|
139
|
+
|
|
140
|
+
```text
|
|
141
|
+
[[INSERT: appendices/report.pdf:1-3]] # Pages 1, 2, and 3
|
|
142
|
+
[[INSERT: appendices/report.pdf:5]] # Page 5 only
|
|
143
|
+
[[INSERT: appendices/report.pdf:1,3,5]] # Pages 1, 3, and 5
|
|
144
|
+
[[INSERT: appendices/report.pdf:2-]] # Pages 2 to end
|
|
145
|
+
[[INSERT: appendices/report.pdf:1-3,7,9-]] # Mixed: pages 1-3, 7, and 9 to end
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
**Page Selection Formats:**
|
|
149
|
+
|
|
150
|
+
- `5` - Single page (page 5)
|
|
151
|
+
- `1-3` - Range of pages (pages 1, 2, 3)
|
|
152
|
+
- `2-` - Open-ended range (pages 2 to end of document)
|
|
153
|
+
- `1,3,5` - Specific pages (pages 1, 3, and 5)
|
|
154
|
+
- `1-3,7,9-12` - Combined specifications
|
|
155
|
+
|
|
156
|
+
**Note:** Page numbers are 1-indexed (first page = 1). Invalid page numbers are automatically filtered out.
|
|
157
|
+
|
|
158
|
+
**Multi-page PDFs**: Automatically handled via cell replication (table-based overlays) or sequential page insertion (paragraph-based merges)
|
|
159
|
+
|
|
160
|
+
**Note**: Relative paths are resolved relative to the Word document's location.
|
|
161
|
+
|
|
162
|
+
## How It Works
|
|
163
|
+
|
|
164
|
+
### 1. Placeholder Detection
|
|
165
|
+
|
|
166
|
+
- **Table scanning** - Identifies `[[OVERLAY: ...]]` tags in single-cell tables
|
|
167
|
+
- **Paragraph scanning** - Identifies `[[INSERT: ...]]` tags in standalone paragraphs
|
|
168
|
+
- **Path resolution** - Resolves relative paths relative to Word document location
|
|
169
|
+
- **Page parsing** - Parses page selection syntax (e.g., `:1-3`, `,page=5`)
|
|
170
|
+
- **PDF validation** - Validates that referenced PDF files exist and are readable
|
|
171
|
+
- **Page counting** - Counts effective pages after applying page selection filters
|
|
172
|
+
- **Layout detection** - Identifies single-cell tables vs standalone paragraphs
|
|
173
|
+
|
|
174
|
+
### 2. Document Modification
|
|
175
|
+
|
|
176
|
+
- **Table placeholders** - Replaces with visible red markers (`%%OVERLAY_START_N%%`)
|
|
177
|
+
- **Cell replication** - Creates additional table cells for multi-page selections
|
|
178
|
+
- **Paragraph placeholders** - Replaces with merge markers and page breaks (`%%MERGE_START_N%%`)
|
|
179
|
+
- **Marker placement** - Places markers first, then page breaks for correct timing
|
|
180
|
+
- **Temporary document** - Saves modified document for PDF conversion
|
|
181
|
+
|
|
182
|
+
### 3. PDF Conversion
|
|
183
|
+
|
|
184
|
+
- Converts modified Word document to PDF using Word automation
|
|
185
|
+
- Preserves formatting and creates base PDF with markers
|
|
186
|
+
|
|
187
|
+
### 4. PDF Processing
|
|
188
|
+
|
|
189
|
+
#### Paragraph-based Merges (INSERT)
|
|
190
|
+
|
|
191
|
+
- **Marker location** - Finds merge markers in the base PDF
|
|
192
|
+
- **Marker removal** - Removes markers using redaction (white fill)
|
|
193
|
+
- **Page insertion** - Inserts PDF pages immediately after marker position
|
|
194
|
+
- **Content preservation** - Original document content remains intact
|
|
195
|
+
|
|
196
|
+
#### Table-based Overlays (OVERLAY)
|
|
197
|
+
|
|
198
|
+
- **Page selection** - Processes only the specified pages from source PDFs
|
|
199
|
+
- **Annotation preservation** - Automatically bakes PDF annotations into content using `Document.bake()`
|
|
200
|
+
- **Multi-page support** - Creates additional table cells for multi-page selections
|
|
201
|
+
- **Precise positioning** - Searches for overlay markers in the base PDF
|
|
202
|
+
- **Rectangle calculation** - Uses the marker position as the top-left corner of the overlay area
|
|
203
|
+
- **Marker removal** - Removes markers using redaction (white fill)
|
|
204
|
+
- **Sequential overlay** - Overlays each selected page onto calculated rectangles
|
|
205
|
+
- **Final assembly** - Saves completed PDF with all appendices integrated
|
|
206
|
+
|
|
207
|
+
## Table-Based Overlay System
|
|
208
|
+
|
|
209
|
+
The Report Compiler uses a precise approach for PDF overlay placement with full support for multi-page PDFs and annotation preservation:
|
|
210
|
+
|
|
211
|
+
### Single-Page PDF Overlay
|
|
212
|
+
|
|
213
|
+
1. **Table Detection** - Identifies single-cell tables containing `[[OVERLAY: path.pdf]]` placeholders
|
|
214
|
+
2. **Page Selection** - Parses page specifications like `,page=1-3` or `,page=5` if provided
|
|
215
|
+
3. **Dimension Extraction** - Extracts exact table dimensions from Word document metadata
|
|
216
|
+
4. **Marker Placement** - Places a red marker at the top-left of the table cell
|
|
217
|
+
5. **Rectangle Calculation** - Uses marker position + table dimensions = overlay area
|
|
218
|
+
6. **Annotation Preservation** - Bakes PDF annotations into content before overlay
|
|
219
|
+
7. **Precise Overlay** - Places selected PDF pages exactly within the calculated rectangle
|
|
220
|
+
|
|
221
|
+
### Multi-Page PDF Overlay
|
|
222
|
+
|
|
223
|
+
For multi-page PDFs or page selections, the system automatically replicates table cells:
|
|
224
|
+
|
|
225
|
+
1. **Page Detection** - Identifies PDFs with multiple pages or page selections
|
|
226
|
+
2. **Cell Replication** - Adds consecutive table rows for each selected page
|
|
227
|
+
3. **Marker Generation** - Creates unique markers for each cell (`%%OVERLAY_START_00_PAGE_02%%`)
|
|
228
|
+
4. **Sequential Overlay** - Overlays selected pages into consecutive table cells
|
|
229
|
+
5. **Unified Layout** - All selected PDF pages appear together in the same table area
|
|
230
|
+
|
|
231
|
+
### Page Selection Examples
|
|
232
|
+
|
|
233
|
+
```text
|
|
234
|
+
[[OVERLAY: report.pdf, page=1-3]] → 3 table cells with pages 1, 2, 3
|
|
235
|
+
[[OVERLAY: report.pdf, page=2,5,7]] → 3 table cells with pages 2, 5, 7
|
|
236
|
+
[[OVERLAY: report.pdf, page=3-]] → Multiple cells with pages 3 to end
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Example Output
|
|
240
|
+
|
|
241
|
+
```text
|
|
242
|
+
Single Table → Page Selection:
|
|
243
|
+
┌─────────────────┐
|
|
244
|
+
│ PDF Page 2 │ ← Only page 2 (from [[OVERLAY: doc.pdf, page=2]])
|
|
245
|
+
└─────────────────┘
|
|
246
|
+
|
|
247
|
+
Single Table → Multi-Page Selection:
|
|
248
|
+
┌─────────────────┐
|
|
249
|
+
│ PDF Page 1 │ ← From [[OVERLAY: doc.pdf, page=1,3,5]]
|
|
250
|
+
├─────────────────┤
|
|
251
|
+
│ PDF Page 3 │ ← Replicated cell
|
|
252
|
+
├─────────────────┤
|
|
253
|
+
│ PDF Page 5 │ ← Replicated cell
|
|
254
|
+
└─────────────────┘
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Example Debug Output
|
|
258
|
+
|
|
259
|
+
```text
|
|
260
|
+
📋 Table found: 7.50 x 4.00 inches
|
|
261
|
+
📍 Marker at: (0.50, 1.59) inches
|
|
262
|
+
📐 Overlay: (0.50, 1.59) to (8.00, 5.59) inches
|
|
263
|
+
🔥 Baking annotations: 12 found
|
|
264
|
+
✅ PDF positioned perfectly
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### Key Benefits
|
|
268
|
+
|
|
269
|
+
- **Simple & Reliable** - Single marker approach with cell replication
|
|
270
|
+
- **Flexible Page Selection** - Extract exactly the pages you need from large PDFs
|
|
271
|
+
- **Multi-page Support** - Automatic handling of PDFs with any number of pages
|
|
272
|
+
- **Annotation Preservation** - PDF annotations automatically preserved during overlay
|
|
273
|
+
- **Accurate** - Uses Word's own measurements
|
|
274
|
+
- **Easy to Debug** - Clear inch measurements and detailed logging with page selection info
|
|
275
|
+
- **Consistent** - Predictable placement and unified layout
|
|
276
|
+
|
|
277
|
+
## Example Workflow
|
|
278
|
+
|
|
279
|
+
```text
|
|
280
|
+
Input: bridge_report.docx containing [[OVERLAY: appendices/analysis.pdf, page=2-4,7]] in a single-cell table
|
|
281
|
+
↓
|
|
282
|
+
Step 1: Find placeholder and validate analysis.pdf (10 pages)
|
|
283
|
+
Parse page spec "2-4,7" → pages 2, 3, 4, 7 (4 pages selected)
|
|
284
|
+
↓
|
|
285
|
+
Step 2: Replace placeholder with marker + replicate table cells for 4 pages
|
|
286
|
+
↓
|
|
287
|
+
Step 3: Convert modified DOCX to PDF (creates base PDF with 4 table cells)
|
|
288
|
+
↓
|
|
289
|
+
Step 4: Bake annotations, find markers, overlay pages 2,3,4,7 sequentially
|
|
290
|
+
↓
|
|
291
|
+
Output: bridge_report.pdf with selected pages integrated in consecutive cells
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
## Requirements
|
|
295
|
+
|
|
296
|
+
- **Windows** (for Word automation via win32com)
|
|
297
|
+
- **Microsoft Word** installed and accessible
|
|
298
|
+
- **Python 3.7+**
|
|
299
|
+
- **Dependencies**: `comtypes`, `Pillow`, `python-docx`, `PyMuPDF`, `typer`, and `pywin32` (Windows only)
|
|
300
|
+
|
|
301
|
+
## VS Code Debugging
|
|
302
|
+
|
|
303
|
+
The project includes comprehensive VS Code launch configurations:
|
|
304
|
+
|
|
305
|
+
- **Debug Report Compiler - Example File** - Basic debugging with example file
|
|
306
|
+
- **Debug Report Compiler - Example File (Keep Temp)** - Debug with temp files retained
|
|
307
|
+
- **Debug Report Compiler - Custom Input** - Interactive file input debugging
|
|
308
|
+
- **Debug Report Compiler - Step Into All Code** - Detailed debugging with all code
|
|
309
|
+
- **Debug Report Compiler - Error Testing** - Test error handling scenarios
|
|
310
|
+
|
|
311
|
+
## License
|
|
312
|
+
|
|
313
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "report_compiler"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="YOUR NAME", email="your@email.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A tool for compiling reports from various sources."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.7"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"comtypes>=1.2.1",
|
|
21
|
+
"Pillow>=10.2.0",
|
|
22
|
+
"python-docx>=1.1.0",
|
|
23
|
+
"PyMuPDF>=1.26.3",
|
|
24
|
+
"typer>=0.9.0",
|
|
25
|
+
"pywin32; sys_platform == 'win32'"
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
report-compiler = "report_compiler.cli:main"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Report Compiler - A Python-based DOCX+PDF report compiler for engineering teams.
|
|
3
|
+
|
|
4
|
+
This package provides functionality to compile Word documents with embedded PDF placeholders
|
|
5
|
+
into professional PDF reports with precise overlay positioning and merged appendices.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "2.0.0"
|
|
9
|
+
__author__ = "Report Compiler Team"
|
|
10
|
+
|
|
11
|
+
# from .core.compiler import ReportCompiler # Temporarily commented
|
|
12
|
+
from .core.config import Config
|
|
13
|
+
|
|
14
|
+
__all__ = ['Config'] # 'ReportCompiler'
|