@heylemon/lemonade 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,286 +0,0 @@
1
- """Remove unreferenced files from an unpacked PPTX directory.
2
-
3
- Usage: python clean.py <unpacked_dir>
4
-
5
- Example:
6
- python clean.py unpacked/
7
-
8
- This script removes:
9
- - Orphaned slides (not in sldIdLst) and their relationships
10
- - [trash] directory (unreferenced files)
11
- - Orphaned .rels files for deleted resources
12
- - Unreferenced media, embeddings, charts, diagrams, drawings, ink files
13
- - Unreferenced theme files
14
- - Unreferenced notes slides
15
- - Content-Type overrides for deleted files
16
- """
17
-
18
- import sys
19
- from pathlib import Path
20
-
21
- import defusedxml.minidom
22
-
23
-
24
- import re
25
-
26
-
27
def get_slides_in_sldidlst(unpacked_dir: Path) -> set[str]:
    """Return the file names of the slides listed in the presentation's slide list.

    Relationship ids found on ``p:sldId`` elements of ``ppt/presentation.xml``
    are mapped to slide file names through ``ppt/_rels/presentation.xml.rels``.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Slide file names (e.g. ``"slide1.xml"``) that are referenced. An empty
        set is returned when either XML part is missing.
    """
    pres_path = unpacked_dir / "ppt" / "presentation.xml"
    pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels"

    if not pres_path.exists() or not pres_rels_path.exists():
        return set()

    rels_dom = defusedxml.minidom.parse(str(pres_rels_path))
    rid_to_slide = {}
    for rel in rels_dom.getElementsByTagName("Relationship"):
        target = rel.getAttribute("Target")
        if "slide" in rel.getAttribute("Type") and target.startswith("slides/"):
            # Strip only the leading directory; str.replace would also remove
            # any later "slides/" occurrence inside the target.
            rid_to_slide[rel.getAttribute("Id")] = target.removeprefix("slides/")

    # Read the ids through the DOM rather than a regex over the raw text: a
    # regex misses single-quoted attributes, and a missed id makes a live
    # slide look orphaned to the callers that delete files.
    pres_dom = defusedxml.minidom.parse(str(pres_path))
    referenced_rids = {
        sld_id.getAttribute("r:id")
        for sld_id in pres_dom.getElementsByTagName("p:sldId")
    }

    return {rid_to_slide[rid] for rid in referenced_rids if rid in rid_to_slide}
47
-
48
-
49
def remove_orphaned_slides(unpacked_dir: Path) -> list[str]:
    """Delete slide parts that the presentation's slide list no longer references.

    For every ``ppt/slides/slide*.xml`` not returned by
    ``get_slides_in_sldidlst`` the slide file and its ``_rels`` companion are
    removed, and the matching ``Relationship`` entries are dropped from
    ``ppt/_rels/presentation.xml.rels``.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
    """
    slides_dir = unpacked_dir / "ppt" / "slides"
    slides_rels_dir = slides_dir / "_rels"
    pres_path = unpacked_dir / "ppt" / "presentation.xml"
    pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels"

    if not slides_dir.exists():
        return []

    # Without both presentation parts the referenced set would be empty and
    # every slide would be deleted; leave such a package untouched instead.
    if not pres_path.exists() or not pres_rels_path.exists():
        return []

    referenced_slides = get_slides_in_sldidlst(unpacked_dir)
    removed = []

    # Materialise the listing first so files are not unlinked while the
    # directory is still being iterated.
    for slide_file in sorted(slides_dir.glob("slide*.xml")):
        if slide_file.name in referenced_slides:
            continue

        rel_path = slide_file.relative_to(unpacked_dir)
        slide_file.unlink()
        removed.append(str(rel_path))

        rels_file = slides_rels_dir / f"{slide_file.name}.rels"
        if rels_file.exists():
            rels_file.unlink()
            removed.append(str(rels_file.relative_to(unpacked_dir)))

    if removed:
        rels_dom = defusedxml.minidom.parse(str(pres_rels_path))
        changed = False

        # Copy the node list: removing children while walking it is unsafe.
        for rel in list(rels_dom.getElementsByTagName("Relationship")):
            target = rel.getAttribute("Target")
            if not target.startswith("slides/"):
                continue
            if target.removeprefix("slides/") in referenced_slides:
                continue
            if rel.parentNode:
                rel.parentNode.removeChild(rel)
                changed = True

        if changed:
            pres_rels_path.write_bytes(rels_dom.toxml(encoding="utf-8"))

    return removed
89
-
90
-
91
def remove_trash_directory(unpacked_dir: Path) -> list[str]:
    """Delete the ``[trash]`` directory of an unpacked package, if present.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of every file that was inside
        ``[trash]``, including files in nested sub-directories. Empty when the
        directory does not exist.
    """
    import shutil  # local import: only this function needs it

    trash_dir = unpacked_dir / "[trash]"
    if not trash_dir.is_dir():
        return []

    # Walk recursively: a plain rmdir() fails as soon as the directory
    # contains a sub-directory, leaving the trash only partly removed.
    removed = [
        str(entry.relative_to(unpacked_dir))
        for entry in sorted(trash_dir.rglob("*"))
        if entry.is_file()
    ]
    shutil.rmtree(trash_dir)

    return removed
104
-
105
-
106
def get_slide_referenced_files(unpacked_dir: Path) -> set:
    """Collect the package parts that slide relationship files point at.

    Every ``Relationship`` in ``ppt/slides/_rels/*.rels`` is resolved to a
    path relative to ``unpacked_dir``.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        A set of ``Path`` objects relative to ``unpacked_dir``. Targets that
        are empty, marked ``TargetMode="External"``, or that resolve outside
        the package are left out.
    """
    referenced = set()
    slides_rels_dir = unpacked_dir / "ppt" / "slides" / "_rels"

    if not slides_rels_dir.exists():
        return referenced

    package_root = unpacked_dir.resolve()  # loop-invariant, resolve once

    for rels_file in slides_rels_dir.glob("*.rels"):
        # Relative targets are relative to the directory holding the part
        # that owns this .rels file, i.e. the parent of "_rels".
        source_dir = rels_file.parent.parent
        dom = defusedxml.minidom.parse(str(rels_file))
        for rel in dom.getElementsByTagName("Relationship"):
            target = rel.getAttribute("Target")
            if not target or rel.getAttribute("TargetMode") == "External":
                continue

            if target.startswith("/"):
                # A leading slash addresses the part from the package root;
                # joining it as-is would point at the filesystem root and the
                # reference would be lost.
                target_path = package_root / target.lstrip("/")
            else:
                target_path = source_dir / target

            try:
                referenced.add(target_path.resolve().relative_to(package_root))
            except ValueError:
                # Target escapes the package directory; not a package part.
                continue

    return referenced
126
-
127
-
128
def remove_orphaned_rels_files(unpacked_dir: Path) -> list[str]:
    """Delete ``.rels`` files of chart/diagram/drawing parts that are no longer used.

    A relationship file under ``ppt/{charts,diagrams,drawings}/_rels`` is
    removed when the part it belongs to is missing or is not referenced by
    any slide (as reported by ``get_slide_referenced_files``).

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the deleted ``.rels`` files.
    """
    resource_dirs = ["charts", "diagrams", "drawings"]
    removed = []
    slide_referenced = get_slide_referenced_files(unpacked_dir)
    package_root = unpacked_dir.resolve()  # loop-invariant, resolve once

    for dir_name in resource_dirs:
        rels_dir = unpacked_dir / "ppt" / dir_name / "_rels"
        if not rels_dir.exists():
            continue

        # Snapshot the listing so files are not unlinked mid-iteration.
        for rels_file in sorted(rels_dir.glob("*.rels")):
            # Drop only the trailing ".rels"; str.replace would also eat a
            # ".rels" that happens to appear earlier in the file name.
            resource_file = rels_dir.parent / rels_file.name.removesuffix(".rels")
            try:
                resource_rel_path = resource_file.resolve().relative_to(package_root)
            except ValueError:
                continue

            if resource_file.exists() and resource_rel_path in slide_referenced:
                continue

            rels_file.unlink()
            removed.append(str(rels_file.relative_to(unpacked_dir)))

    return removed
151
-
152
-
153
def get_referenced_files(unpacked_dir: Path) -> set:
    """Collect every package part referenced by any ``.rels`` file in the package.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        A set of ``Path`` objects relative to ``unpacked_dir``. Targets that
        are empty, marked ``TargetMode="External"``, or that resolve outside
        the package are left out.
    """
    referenced = set()
    package_root = unpacked_dir.resolve()  # loop-invariant, resolve once

    for rels_file in unpacked_dir.rglob("*.rels"):
        # Relative targets are relative to the directory holding the part
        # that owns this .rels file, i.e. the parent of "_rels".
        source_dir = rels_file.parent.parent
        dom = defusedxml.minidom.parse(str(rels_file))
        for rel in dom.getElementsByTagName("Relationship"):
            target = rel.getAttribute("Target")
            if not target or rel.getAttribute("TargetMode") == "External":
                continue

            if target.startswith("/"):
                # A leading slash addresses the part from the package root;
                # joining it as-is would point at the filesystem root and the
                # referenced file would later be treated as orphaned.
                target_path = package_root / target.lstrip("/")
            else:
                target_path = source_dir / target

            try:
                referenced.add(target_path.resolve().relative_to(package_root))
            except ValueError:
                # Target escapes the package directory; not a package part.
                continue

    return referenced
169
-
170
-
171
def _unlink_and_record(path: Path, unpacked_dir: Path, removed: list[str]) -> None:
    """Delete ``path`` and append its package-relative name to ``removed``."""
    rel_path = path.relative_to(unpacked_dir)
    path.unlink()
    removed.append(str(rel_path))


def remove_orphaned_files(unpacked_dir: Path, referenced: set) -> list[str]:
    """Delete resource, theme and notes files that no relationship refers to.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.
        referenced: Package-relative ``Path`` objects that must be kept.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
    """
    resource_dirs = ["media", "embeddings", "charts", "diagrams", "tags", "drawings", "ink"]
    removed = []

    # Directory listings are materialised with sorted() throughout so that
    # nothing is unlinked while the directory is still being iterated.
    for dir_name in resource_dirs:
        dir_path = unpacked_dir / "ppt" / dir_name
        if not dir_path.exists():
            continue

        for file_path in sorted(dir_path.glob("*")):
            if not file_path.is_file():
                continue
            if file_path.relative_to(unpacked_dir) not in referenced:
                _unlink_and_record(file_path, unpacked_dir, removed)

    theme_dir = unpacked_dir / "ppt" / "theme"
    if theme_dir.exists():
        for file_path in sorted(theme_dir.glob("theme*.xml")):
            if file_path.relative_to(unpacked_dir) in referenced:
                continue
            _unlink_and_record(file_path, unpacked_dir, removed)
            # A deleted theme takes its relationship file with it.
            theme_rels = theme_dir / "_rels" / f"{file_path.name}.rels"
            if theme_rels.exists():
                _unlink_and_record(theme_rels, unpacked_dir, removed)

    notes_dir = unpacked_dir / "ppt" / "notesSlides"
    if notes_dir.exists():
        for file_path in sorted(notes_dir.glob("*.xml")):
            if not file_path.is_file():
                continue
            if file_path.relative_to(unpacked_dir) not in referenced:
                _unlink_and_record(file_path, unpacked_dir, removed)

        notes_rels_dir = notes_dir / "_rels"
        if notes_rels_dir.exists():
            for file_path in sorted(notes_rels_dir.glob("*.rels")):
                # Strip only the trailing ".rels"; str.replace would mangle a
                # notes file whose own name contains ".rels".
                notes_file = notes_dir / file_path.name.removesuffix(".rels")
                if not notes_file.exists():
                    _unlink_and_record(file_path, unpacked_dir, removed)

    return removed
219
-
220
-
221
def update_content_types(unpacked_dir: Path, removed_files: list[str]) -> None:
    """Drop ``Override`` entries for deleted parts from ``[Content_Types].xml``.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.
        removed_files: Package-relative paths of the files that were deleted.

    The file is rewritten only when at least one entry was removed; nothing
    happens when it does not exist or ``removed_files`` is empty.
    """
    ct_path = unpacked_dir / "[Content_Types].xml"
    if not ct_path.exists():
        return

    # PartName always uses forward slashes, while the recorded paths use the
    # OS separator; normalise so entries also match on Windows. A set keeps
    # each lookup O(1) instead of scanning the list per Override.
    removed_parts = {Path(name).as_posix() for name in removed_files}
    if not removed_parts:
        return

    dom = defusedxml.minidom.parse(str(ct_path))
    changed = False

    # Copy the node list: removing children while walking it is unsafe.
    for override in list(dom.getElementsByTagName("Override")):
        part_name = override.getAttribute("PartName").lstrip("/")
        if part_name in removed_parts and override.parentNode:
            override.parentNode.removeChild(override)
            changed = True

    if changed:
        ct_path.write_bytes(dom.toxml(encoding="utf-8"))
239
-
240
-
241
def clean_unused_files(unpacked_dir: Path) -> list[str]:
    """Run every clean-up step on an unpacked package and report what was deleted.

    Orphaned slides and the ``[trash]`` directory are handled once. The
    relationship/resource sweep is then repeated until a pass deletes nothing,
    and finally the content-type overrides of all deleted files are dropped.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of every deleted file, in the
        order the steps removed them.
    """
    deleted = [
        *remove_orphaned_slides(unpacked_dir),
        *remove_trash_directory(unpacked_dir),
    ]

    # Keep sweeping until a pass removes nothing.
    while True:
        this_pass = list(remove_orphaned_rels_files(unpacked_dir))
        still_referenced = get_referenced_files(unpacked_dir)
        this_pass.extend(remove_orphaned_files(unpacked_dir, still_referenced))

        if not this_pass:
            break
        deleted.extend(this_pass)

    if deleted:
        update_content_types(unpacked_dir, deleted)

    return deleted
265
-
266
-
267
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the directory
    # holding the unpacked PPTX package.
    if len(sys.argv) != 2:
        print("Usage: python clean.py <unpacked_dir>", file=sys.stderr)
        print("Example: python clean.py unpacked/", file=sys.stderr)
        sys.exit(1)

    unpacked_dir = Path(sys.argv[1])

    if not unpacked_dir.exists():
        print(f"Error: {unpacked_dir} not found", file=sys.stderr)
        sys.exit(1)

    # A regular file would otherwise pass the check above and be reported as
    # "No unreferenced files found" without anything having been inspected.
    if not unpacked_dir.is_dir():
        print(f"Error: {unpacked_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    removed = clean_unused_files(unpacked_dir)

    if removed:
        print(f"Removed {len(removed)} unreferenced files:")
        for f in removed:
            print(f"  {f}")
    else:
        print("No unreferenced files found")
@@ -1,289 +0,0 @@
1
- """Create thumbnail grids from PowerPoint presentation slides.
2
-
3
- Creates a grid layout of slide thumbnails for quick visual analysis.
4
- Labels each thumbnail with its XML filename (e.g., slide1.xml).
5
- Hidden slides are shown with a placeholder pattern.
6
-
7
- Usage:
8
- python thumbnail.py input.pptx [output_prefix] [--cols N]
9
-
10
- Examples:
11
- python thumbnail.py presentation.pptx
12
- # Creates: thumbnails.jpg
13
-
14
- python thumbnail.py template.pptx grid --cols 4
15
- # Creates: grid.jpg (or grid-1.jpg, grid-2.jpg for large decks)
16
- """
17
-
18
- import argparse
19
- import subprocess
20
- import sys
21
- import tempfile
22
- import zipfile
23
- from pathlib import Path
24
-
25
- import defusedxml.minidom
26
- from office.soffice import get_soffice_env
27
- from PIL import Image, ImageDraw, ImageFont
28
-
29
# Width of each thumbnail cell; main() passes it to create_grids() as `width`.
THUMBNAIL_WIDTH = 300
# Resolution handed to pdftoppm via its -r option when rasterising the PDF.
CONVERSION_DPI = 100
# Upper bound applied to the --cols command-line option.
MAX_COLS = 6
# Value of --cols when the option is not given.
DEFAULT_COLS = 3
# `quality` argument used when saving each grid image.
JPEG_QUALITY = 95
# Gap between cells and around the grid edge, added to every x/y offset.
GRID_PADDING = 20
# Width of the gray outline drawn around each thumbnail; 0 disables it.
BORDER_WIDTH = 2
# Label font size as a fraction of the thumbnail width.
FONT_SIZE_RATIO = 0.10
# Padding above and below each label as a fraction of the font size.
LABEL_PADDING_RATIO = 0.4
38
-
39
-
40
def main():
    """Parse the command line and write thumbnail grid image(s) for a .pptx file.

    Exits with status 1 when the column count is not positive, the input is
    not an existing ``.pptx`` file, no slides can be rendered, or any step of
    the conversion raises.
    """
    parser = argparse.ArgumentParser(
        description="Create thumbnail grids from PowerPoint slides."
    )
    parser.add_argument("input", help="Input PowerPoint file (.pptx)")
    parser.add_argument(
        "output_prefix",
        nargs="?",
        default="thumbnails",
        help="Output prefix for image files (default: thumbnails)",
    )
    parser.add_argument(
        "--cols",
        type=int,
        default=DEFAULT_COLS,
        help=f"Number of columns (default: {DEFAULT_COLS}, max: {MAX_COLS})",
    )

    args = parser.parse_args()

    # Zero or negative columns would otherwise surface later as an opaque
    # range()/image-size error from the grid builder.
    if args.cols < 1:
        print("Error: --cols must be at least 1", file=sys.stderr)
        sys.exit(1)

    cols = min(args.cols, MAX_COLS)
    if args.cols > MAX_COLS:
        print(f"Warning: Columns limited to {MAX_COLS}")

    input_path = Path(args.input)
    if not input_path.exists() or input_path.suffix.lower() != ".pptx":
        print(f"Error: Invalid PowerPoint file: {args.input}", file=sys.stderr)
        sys.exit(1)

    output_path = Path(f"{args.output_prefix}.jpg")

    try:
        slide_info = get_slide_info(input_path)

        # Rendered images live in the temporary directory, so the grids must
        # be written before the context manager removes it.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            visible_images = convert_to_images(input_path, temp_path)

            if not visible_images and not any(s["hidden"] for s in slide_info):
                print("Error: No slides found", file=sys.stderr)
                sys.exit(1)

            slides = build_slide_list(slide_info, visible_images, temp_path)

            # Nothing could be paired with a slide name: report it instead of
            # announcing "Created 0 grid(s)".
            if not slides:
                print("Error: No slides found", file=sys.stderr)
                sys.exit(1)

            grid_files = create_grids(slides, cols, THUMBNAIL_WIDTH, output_path)

            print(f"Created {len(grid_files)} grid(s):")
            for grid_file in grid_files:
                print(f"  {grid_file}")

    except Exception as e:
        # Top-level boundary: report any failure as a one-line error.
        # sys.exit() above raises SystemExit, which is not caught here.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
93
-
94
-
95
def get_slide_info(pptx_path: Path) -> list[dict]:
    """List the slides of a .pptx in presentation order with their hidden state.

    Args:
        pptx_path: Path to the ``.pptx`` archive.

    Returns:
        One ``{"name": <slide file name>, "hidden": <bool>}`` dict per
        ``p:sldId`` entry whose relationship id maps to a ``slides/`` target.

    Raises:
        KeyError: If the archive lacks ``ppt/presentation.xml`` or its rels.
    """
    # xsd:boolean spellings of "false" for the show attribute.
    hidden_values = ("0", "false")

    with zipfile.ZipFile(pptx_path, "r") as zf:
        rels_content = zf.read("ppt/_rels/presentation.xml.rels").decode("utf-8")
        rels_dom = defusedxml.minidom.parseString(rels_content)

        rid_to_slide = {}
        for rel in rels_dom.getElementsByTagName("Relationship"):
            target = rel.getAttribute("Target")
            if "slide" in rel.getAttribute("Type") and target.startswith("slides/"):
                rid_to_slide[rel.getAttribute("Id")] = target.removeprefix("slides/")

        pres_content = zf.read("ppt/presentation.xml").decode("utf-8")
        pres_dom = defusedxml.minidom.parseString(pres_content)

        part_names = set(zf.namelist())
        slides = []
        for sld_id in pres_dom.getElementsByTagName("p:sldId"):
            name = rid_to_slide.get(sld_id.getAttribute("r:id"))
            if name is None:
                continue

            # Kept for compatibility with the previous behaviour, which only
            # looked at the slide-list entry.
            hidden = sld_id.getAttribute("show") in hidden_values

            # PresentationML records a hidden slide as show="0" on the root
            # <p:sld> of the slide part itself, so that part must be checked;
            # otherwise hidden slides go undetected and the labels drift
            # against the rendered images.
            part = f"ppt/slides/{name}"
            if not hidden and part in part_names:
                slide_dom = defusedxml.minidom.parseString(
                    zf.read(part).decode("utf-8")
                )
                root = slide_dom.documentElement
                hidden = root is not None and root.getAttribute("show") in hidden_values

            slides.append({"name": name, "hidden": hidden})

    return slides
119
-
120
-
121
def build_slide_list(
    slide_info: list[dict],
    visible_images: list[Path],
    temp_dir: Path,
) -> list[tuple[Path, str]]:
    """Pair every slide with the image that represents it in the grid.

    Args:
        slide_info: Slide dicts with ``"name"`` and ``"hidden"`` keys, in
            presentation order.
        visible_images: Rendered images, consumed in order by the slides
            that are not hidden.
        temp_dir: Directory in which placeholder images are written.

    Returns:
        ``(image_path, label)`` tuples. Hidden slides get a generated
        placeholder and a ``" (hidden)"`` label suffix; a visible slide is
        skipped once the rendered images run out.
    """
    # Placeholders take the size of the first rendered slide when available.
    placeholder_size = (1920, 1080)
    if visible_images:
        with Image.open(visible_images[0]) as first_image:
            placeholder_size = first_image.size

    rendered = iter(visible_images)
    paired = []

    for entry in slide_info:
        slide_name = entry["name"]

        if not entry["hidden"]:
            image_path = next(rendered, None)
            if image_path is not None:
                paired.append((image_path, slide_name))
            continue

        target = temp_dir / f"hidden-{slide_name}.jpg"
        create_hidden_placeholder(placeholder_size).save(target, "JPEG")
        paired.append((target, f"{slide_name} (hidden)"))

    return paired
147
-
148
-
149
def create_hidden_placeholder(size: tuple[int, int]) -> Image.Image:
    """Build the image shown in place of a hidden slide.

    Args:
        size: ``(width, height)`` of the placeholder.

    Returns:
        A light-gray RGB image with both diagonals drawn across it.
    """
    width, height = size
    canvas = Image.new("RGB", size, color="#F0F0F0")
    pen = ImageDraw.Draw(canvas)

    # Stroke scales with the shorter side but never drops below 5.
    stroke = max(5, min(size) // 100)

    diagonals = (
        [(0, 0), size],
        [(width, 0), (0, height)],
    )
    for diagonal in diagonals:
        pen.line(diagonal, fill="#CCCCCC", width=stroke)

    return canvas
156
-
157
-
158
def convert_to_images(pptx_path: Path, temp_dir: Path) -> list[Path]:
    """Render a presentation to JPEG images, going through a PDF.

    ``soffice`` converts the presentation to a PDF inside ``temp_dir`` and
    ``pdftoppm`` rasterises that PDF at ``CONVERSION_DPI``.

    Args:
        pptx_path: Presentation to render.
        temp_dir: Directory receiving the PDF and the ``slide-*.jpg`` images.

    Returns:
        The generated ``slide-*.jpg`` paths in sorted order.

    Raises:
        RuntimeError: If either tool fails or no PDF is produced. The tool's
            stderr is appended to the message when it printed anything.
    """

    def failure(message: str, completed: subprocess.CompletedProcess) -> RuntimeError:
        # Surface the tool's own diagnostics rather than discarding them.
        detail = (completed.stderr or "").strip()
        return RuntimeError(f"{message}: {detail}" if detail else message)

    pdf_path = temp_dir / f"{pptx_path.stem}.pdf"

    result = subprocess.run(
        [
            "soffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            str(temp_dir),
            str(pptx_path),
        ],
        capture_output=True,
        text=True,
        env=get_soffice_env(),
    )
    # The exit status alone is not trusted; the PDF must actually exist.
    if result.returncode != 0 or not pdf_path.exists():
        raise failure("PDF conversion failed", result)

    result = subprocess.run(
        [
            "pdftoppm",
            "-jpeg",
            "-r",
            str(CONVERSION_DPI),
            str(pdf_path),
            str(temp_dir / "slide"),
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        raise failure("Image conversion failed", result)

    return sorted(temp_dir.glob("slide-*.jpg"))
194
-
195
-
196
def create_grids(
    slides: list[tuple[Path, str]],
    cols: int,
    width: int,
    output_path: Path,
) -> list[str]:
    """Write the slides as one or more grid images and return the file names.

    Each grid holds at most ``cols * (cols + 1)`` slides. When everything fits
    in one grid it is saved to ``output_path``; otherwise the files are named
    ``<stem>-1<suffix>``, ``<stem>-2<suffix>``, ... next to ``output_path``.

    Args:
        slides: ``(image_path, label)`` tuples in display order.
        cols: Number of columns per grid; must be at least 1.
        width: Thumbnail width passed on to ``create_grid``.
        output_path: Destination file for the single-grid case.

    Returns:
        The paths of the written grid images, as strings.

    Raises:
        ValueError: If ``cols`` is smaller than 1.
    """
    if cols < 1:
        # Fail with a clear message instead of an opaque range() step error
        # (cols == 0) or a nonsensical chunk size (negative cols).
        raise ValueError(f"cols must be at least 1, got {cols}")

    max_per_grid = cols * (cols + 1)
    single_grid = len(slides) <= max_per_grid
    grid_files = []

    for chunk_idx, start_idx in enumerate(range(0, len(slides), max_per_grid)):
        chunk_slides = slides[start_idx:start_idx + max_per_grid]
        grid = create_grid(chunk_slides, cols, width)

        if single_grid:
            grid_filename = output_path
        else:
            grid_filename = output_path.with_name(
                f"{output_path.stem}-{chunk_idx + 1}{output_path.suffix}"
            )

        grid_filename.parent.mkdir(parents=True, exist_ok=True)
        grid.save(str(grid_filename), quality=JPEG_QUALITY)
        grid_files.append(str(grid_filename))

    return grid_files
223
-
224
-
225
def create_grid(
    slides: list[tuple[Path, str]],
    cols: int,
    width: int,
) -> Image.Image:
    """Compose one grid image from labelled slide thumbnails.

    Args:
        slides: ``(image_path, label)`` tuples; the first image sets the
            aspect ratio used for every cell.
        cols: Number of columns in the grid.
        width: Width of each thumbnail cell.

    Returns:
        A white RGB image with each label centred above its thumbnail.
    """
    font_size = int(width * FONT_SIZE_RATIO)
    label_padding = int(font_size * LABEL_PADDING_RATIO)

    first_path = slides[0][0]
    with Image.open(first_path) as first:
        aspect = first.height / first.width
    height = int(width * aspect)

    # One cell is a label strip (padding + text + padding) above a thumbnail.
    label_strip = label_padding + font_size + label_padding
    cell_h = height + font_size + label_padding * 2

    rows = (len(slides) + cols - 1) // cols
    canvas_size = (
        cols * width + (cols + 1) * GRID_PADDING,
        rows * cell_h + (rows + 1) * GRID_PADDING,
    )

    grid = Image.new("RGB", canvas_size, "white")
    draw = ImageDraw.Draw(grid)

    # Fall back to the size-less default font if a sized one cannot be loaded.
    try:
        font = ImageFont.load_default(size=font_size)
    except Exception:
        font = ImageFont.load_default()

    for index, (img_path, slide_name) in enumerate(slides):
        row, col = divmod(index, cols)
        x = col * width + (col + 1) * GRID_PADDING
        y_base = row * cell_h + (row + 1) * GRID_PADDING

        # Centre the label horizontally within the cell.
        left, _, right, _ = draw.textbbox((0, 0), slide_name, font=font)
        label_x = x + (width - (right - left)) // 2
        draw.text(
            (label_x, y_base + label_padding),
            slide_name,
            fill="black",
            font=font,
        )

        y_thumbnail = y_base + label_strip

        with Image.open(img_path) as img:
            img.thumbnail((width, height), Image.Resampling.LANCZOS)
            w, h = img.size
            # Centre the scaled image inside its cell.
            tx = x + (width - w) // 2
            ty = y_thumbnail + (height - h) // 2
            grid.paste(img, (tx, ty))

        if BORDER_WIDTH > 0:
            top_left = (tx - BORDER_WIDTH, ty - BORDER_WIDTH)
            bottom_right = (tx + w + BORDER_WIDTH - 1, ty + h + BORDER_WIDTH - 1)
            draw.rectangle(
                [top_left, bottom_right],
                outline="gray",
                width=BORDER_WIDTH,
            )

    return grid
286
-
287
-
288
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()