pdftree 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdftree/__init__.py ADDED
File without changes
pdftree/app.py ADDED
@@ -0,0 +1,599 @@
1
+ import os
2
+ import pathlib
3
+ import shlex
4
+ import subprocess
5
+ import sys
6
+ import tempfile
7
+
8
+ import pikepdf
9
+ from rich.text import Text
10
+ from textual.app import App, ComposeResult
11
+ from textual.binding import Binding
12
+ from textual.containers import Horizontal, Vertical
13
+ from textual.widgets import Input, Label, RichLog
14
+ from textual.widgets import Tree as TextualTree
15
+ from textual.widgets.tree import TreeNode
16
+
17
+ from .pdf_utils import JumpReference, build_tree, is_content_stream
18
+ from .screens import HelpScreen, PromptScreen, UnsavedChangesScreen
19
+ from .tree_utils import (
20
+ expand_to,
21
+ get_node_by_path,
22
+ get_node_name,
23
+ iter_nodes,
24
+ rebuild_stream_label,
25
+ )
26
+ from .widgets import PageInput, PDFTree, SearchInput
27
+
28
+
29
class PDFTreeApp(App):
    """A Textual app to interactively explore PDF structures and view stream contents."""

    # Key bindings. Each entry is (key, action name, description); the action
    # name is dispatched to the matching ``action_<name>`` method.
    # NOTE(review): "suspend_process" has no action_* method in this class --
    # presumably it is provided by the Textual ``App`` base class; confirm.
    # "q" and "ctrl+c" both trigger the quit action.
    BINDINGS = [
        Binding("f1", "show_help", "Help", show=True),
        Binding("H", "show_help", "Help", show=True),
        Binding("q", "quit", "Quit", show=True),
        Binding("g", "prompt_page", "Go to Page (g)", show=True),
        Binding("s", "export_stream", "Save stream (s)", show=True),
        Binding("e", "edit_stream", "Edit Stream (e)", show=True),
        Binding("f", "normalize_stream", "Format Stream (f)", show=True),
        Binding("w", "save_pdf", "Save PDF (w)", show=True),
        Binding("ctrl+c", "quit", "Quit", show=True),
        Binding("ctrl+z", "suspend_process", "Suspend", show=True),
        Binding("ctrl+l", "redraw_screen", "Redraw", show=False),
        Binding("/", "search_forward", "Search (/)", show=True),
        Binding("?", "search_backward", "Search (?)", show=True),
        Binding("n", "repeat_search_forward", "Next (n)", show=True),
        Binding("p", "repeat_search_backward", "Prev (p)", show=True),
    ]

    # Application title string.
    TITLE = "pdftree - Interactive Object Explorer"

    # Stylesheet file for the app.
    # NOTE(review): presumably resolved relative to this module -- confirm.
    CSS_PATH = "styles.tcss"
53
+
54
def __init__(self, pdf_path: str):
    """Create the app for the PDF located at *pdf_path*.

    The document is not opened here: ``self.pdf`` stays ``None`` until
    ``on_mount`` loads it.
    """
    super().__init__()
    self.pdf_path = pdf_path
    self.pdf = None
    self.last_search_query: str | None = None
    self._search_direction: str = "forward"
    # Armed just before the cursor is moved programmatically (search / jump)
    # so that on_tree_node_highlighted skips one breadcrumb update and the
    # status message stays visible. Stored on self, not on tree nodes.
    self._programmatic_move: bool = False
    # Maps a PDF object's (object number, generation) pair to its tree node.
    self.obj_to_node: dict[tuple[int, int], TreeNode] = {}
    # True while the in-memory document has edits that were not saved.
    self.is_dirty: bool = False
    # Stream node waiting for the export filename prompt; written by
    # action_export_stream and read by _save_stream_callback.
    self._pending_export_node: TreeNode | None = None
65
+
66
+ # -------------------------------------------------------------------------
67
+ # Normalize stream
68
+ # -------------------------------------------------------------------------
69
+
70
def action_normalize_stream(self) -> None:
    """Re-serialise the selected content stream through pikepdf's parser.

    The stream under the tree cursor is parsed and unparsed with pikepdf;
    if the resulting bytes differ, they are written back, the node label
    is rebuilt, the document is marked dirty and the node is re-selected.
    Every outcome is reported in the breadcrumb label.
    """

    def report(markup: str) -> None:
        # Looked up on every call, like the original per-message queries.
        self.query_one("#breadcrumb", Label).update(markup)

    tree = self.query_one("#tree-pane", TextualTree)
    node = tree.cursor_node

    if not (node is not None and isinstance(node.data, pikepdf.Stream)):
        report("[yellow]Please select a Stream node (Red) to format.[/yellow]")
        return

    node_name = get_node_name(node)
    parent_name = "" if node.parent is None else get_node_name(node.parent)

    if not is_content_stream(node.data, node_name, parent_name):
        report(
            f"[yellow]Not reformatting '{node_name}' (parent: '{parent_name}') as it is not a content stream.[/yellow]"
        )
        return

    try:
        instructions = pikepdf.parse_content_stream(node.data)
        normalized_bytes = pikepdf.unparse_content_stream(instructions)
        old_bytes = node.data.read_bytes()

        if normalized_bytes == old_bytes:
            report("[dim]Stream already formatted or unchanged.[/dim]")
        else:
            node.data.write(normalized_bytes)
            # Keep the length shown in the tree label in sync.
            rebuild_stream_label(node, len(normalized_bytes))
            self.is_dirty = True
            report(
                f"[green]Stream formatted! Length: {len(old_bytes)} -> {len(normalized_bytes)} bytes.[/green]"
            )
            # Re-select the node so the detail pane shows the new bytes.
            self.call_after_refresh(self.do_jump_factory(tree, node))
    except Exception as e:
        # Reached e.g. for image streams or anything else that is not a
        # valid PDF content stream.
        report(f"[red]Failed to format (might not be a content stream):[/red] {e}")
126
+
127
def do_jump_factory(self, tree, node):
    """Return a zero-argument callback that selects *node* in *tree*.

    The callback is meant to be scheduled with ``call_after_refresh``. It
    arms ``self._programmatic_move`` so the highlight handler keeps the
    current status message instead of replacing it with the breadcrumb.
    """

    def jump():
        # Arm the flag only when the cursor will really move.
        # NOTE(review): assumes Textual posts no NodeHighlighted when the
        # cursor stays on the same node -- confirm for the pinned version.
        # If so, an unconditional True would stay set and swallow the
        # user's next real highlight.
        self._programmatic_move = tree.cursor_node is not node
        tree.select_node(node)

    return jump
133
+
134
+ # -------------------------------------------------------------------------
135
+ # Prompt for save on quit
136
+ # -------------------------------------------------------------------------
137
+
138
def action_quit(self) -> None:
    """Quit the app, asking for confirmation first if there are unsaved changes."""
    has_unsaved_changes = getattr(self, "is_dirty", False)
    if not has_unsaved_changes:
        # Nothing to lose: leave through the regular exit path.
        self.exit()
        return
    # Let the user decide; the answer arrives in _quit_confirm_callback.
    self.push_screen(UnsavedChangesScreen(), self._quit_confirm_callback)
146
+
147
+ def _quit_confirm_callback(self, quit_anyway: bool) -> None:
148
+ """Callback fired when the UnsavedChangesScreen is dismissed."""
149
+ if quit_anyway:
150
+ self.exit()
151
+
152
+ # -------------------------------------------------------------------------
153
+ # Screen helpers
154
+ # -------------------------------------------------------------------------
155
+
156
def action_redraw_screen(self, *args, **kwargs) -> None:
    """Refresh the current screen, including its layout.

    Extra positional and keyword arguments are accepted and ignored; the
    method is also registered as a subscriber of ``app_resume_signal`` in
    ``on_mount``.
    """
    current_screen = self.screen
    current_screen.refresh(layout=True)
158
+
159
+ # -------------------------------------------------------------------------
160
+ # Page navigation
161
+ # -------------------------------------------------------------------------
162
+
163
def action_prompt_page(self) -> None:
    """Show the page-number input, emptied, and give it keyboard focus."""
    prompt = self.query_one("#page-input", PageInput)
    prompt.value = ""
    prompt.display = True
    prompt.focus()
169
+
170
def action_cancel_page_jump(self) -> None:
    """Hide the page-number input and return focus to the tree."""
    prompt = self.query_one("#page-input", PageInput)
    prompt.display = False
    tree_pane = self.query_one("#tree-pane")
    tree_pane.focus()
175
+
176
+ # -------------------------------------------------------------------------
177
+ # Export stream
178
+ # -------------------------------------------------------------------------
179
+
180
+ def _save_stream_callback(self, filename: str | None) -> None:
181
+ """Callback fired when the SavePromptScreen is dismissed."""
182
+ if not filename:
183
+ return # User canceled or entered an empty string
184
+
185
+ node = getattr(self, "_pending_export_node", None)
186
+
187
+ # Double check we are still on a stream just in case
188
+ if node is None or not isinstance(node.data, pikepdf.Stream):
189
+ return
190
+
191
+ try:
192
+ raw_bytes = node.data.read_bytes()
193
+ with open(filename, "wb") as f:
194
+ f.write(raw_bytes)
195
+
196
+ self.query_one("#breadcrumb", Label).update(
197
+ f"[green]Successfully saved {len(raw_bytes)} bytes to '{filename}'[/green]"
198
+ )
199
+ except Exception as e:
200
+ self.query_one("#breadcrumb", Label).update(f"[red]Failed to save file:[/red] {e}")
201
+
202
+ # -------------------------------------------------------------------------
203
+ # Edit stream
204
+ # -------------------------------------------------------------------------
205
+
206
def action_edit_stream(self) -> None:
    """Edit the selected stream in the user's external editor.

    The stream bytes are written to a temporary file, the TUI is suspended
    while ``$EDITOR`` (default: ``nano``, or ``notepad`` on Windows) runs
    on that file, and the file is read back afterwards. If the bytes
    changed they are written to the stream, the node label is rebuilt and
    the document is marked dirty. Every outcome is reported in the
    breadcrumb label. The temporary file is removed on every path.
    """

    def report(markup: str) -> None:
        self.query_one("#breadcrumb", Label).update(markup)

    tree = self.query_one("#tree-pane", PDFTree)
    node = tree.cursor_node

    if node is None or not isinstance(node.data, pikepdf.Stream):
        report("[yellow]Please select a Stream node (Red) to edit.[/yellow]")
        return

    # Resolve the editor command first. An unset, empty or whitespace-only
    # $EDITOR falls back to the platform default; otherwise the temp file
    # itself would end up as the program to execute.
    default_editor = "nano" if os.name != "nt" else "notepad"
    try:
        editor_cmd = shlex.split(os.environ.get("EDITOR", "")) or [default_editor]
    except ValueError as e:
        # shlex rejects malformed quoting, e.g. an unterminated quote.
        report(f"[red]Invalid $EDITOR value:[/red] {e}")
        return

    temp_path = None
    try:
        # 1. Setup temp file
        try:
            old_bytes = node.data.read_bytes()
            fd, temp_path = tempfile.mkstemp(suffix=".txt")
            with os.fdopen(fd, "wb") as f:
                f.write(old_bytes)
        except Exception as e:
            report(f"[red]Error reading stream:[/red] {e}")
            return

        # 2. Run the editor with the TUI suspended
        try:
            with self.suspend():
                subprocess.run(editor_cmd + [temp_path], check=True)
        except FileNotFoundError:
            report(
                f"[red]Editor not found:[/red] '{editor_cmd[0]}'. Check your $EDITOR variable."
            )
            return
        except subprocess.CalledProcessError as e:
            report(f"[red]Editor exited with an error code:[/red] {e.returncode}")
            return
        except OSError as e:
            # Other launch failures, e.g. PermissionError on a
            # non-executable path.
            report(f"[red]Could not launch editor:[/red] {e}")
            return

        # 3. Process the results
        try:
            with open(temp_path, "rb") as f:
                new_bytes = f.read()

            if new_bytes != old_bytes:
                node.data.write(new_bytes)
                rebuild_stream_label(node, len(new_bytes))

                # Set dirty flag *after* successful write
                self.is_dirty = True

                report(
                    f"[green]Stream updated! Length changed: {len(old_bytes)} -> {len(new_bytes)} bytes.[/green]"
                )
                self.call_after_refresh(self.do_jump_factory(tree, node))
            else:
                report("[dim]Stream unchanged. Editing canceled.[/dim]")
        except Exception as e:
            report(f"[red]Error saving stream data:[/red] {e}")
    finally:
        # Runs for every return above and for any exception that
        # propagates, so the temp file never outlives this call.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: the file may already be gone.
                pass
274
+
275
+ # -------------------------------------------------------------------------
276
+ # Save PDF
277
+ # -------------------------------------------------------------------------
278
+
279
def action_save_pdf(self) -> None:
    """Ask for an output filename for the whole document.

    The suggested name is the input file's stem with ``_modified``
    appended, keeping the original suffix. The entered name is passed to
    ``_save_pdf_callback``.
    """
    source = pathlib.Path(self.pdf_path)
    suggested_name = f"{source.stem}_modified{source.suffix}"
    prompt = PromptScreen("Save Entire PDF As:", suggested_name)
    self.push_screen(prompt, self._save_pdf_callback)
287
+
288
def action_export_stream(self) -> None:
    """Ask for a filename to export the stream under the tree cursor.

    When the cursor is not on a stream node, a hint is shown in the
    breadcrumb label instead.
    """
    node = self.query_one("#tree-pane", PDFTree).cursor_node

    if node is None or not isinstance(node.data, pikepdf.Stream):
        self.query_one("#breadcrumb", Label).update(
            "[yellow]Please select a Stream node (Red) to export.[/yellow]"
        )
        return

    # Remember which node the prompt refers to; the callback reads it back.
    self._pending_export_node = node
    prompt = PromptScreen("Export Stream As:", "stream.bin")
    self.push_screen(prompt, self._save_stream_callback)
303
+
304
+ def _save_pdf_callback(self, filename: str | None) -> None:
305
+ if not filename:
306
+ return
307
+
308
+ try:
309
+ # Dump the in-memory pikepdf object tree back out to disk
310
+ self.pdf.save(filename)
311
+ self.is_dirty = False
312
+
313
+ self.query_one("#breadcrumb", Label).update(
314
+ f"[green]Successfully saved modified PDF to '{filename}'[/green]"
315
+ )
316
+ except Exception as e:
317
+ self.query_one("#breadcrumb", Label).update(f"[red]Failed to save PDF:[/red] {e}")
318
+
319
+ # -------------------------------------------------------------------------
320
+ # Help
321
+ # -------------------------------------------------------------------------
322
+
323
def action_show_help(self) -> None:
    """Display the help screen (bound to the F1 and H keys)."""
    help_screen = HelpScreen()
    self.push_screen(help_screen)
326
+
327
+ # -------------------------------------------------------------------------
328
+ # Search actions
329
+ # -------------------------------------------------------------------------
330
+
331
def action_search_forward(self) -> None:
    """Switch to forward search and open the search bar."""
    setattr(self, "_search_direction", "forward")
    self._open_search_bar("forward (/)")
334
+
335
def action_search_backward(self) -> None:
    """Switch to backward search and open the search bar."""
    setattr(self, "_search_direction", "backward")
    self._open_search_bar("backward (?)")
338
+
339
def action_repeat_search_forward(self) -> None:
    """Repeat the last search query, moving forward from the cursor."""
    self._search_direction = "forward"
    previous_query = self.last_search_query
    self._perform_search(previous_query)
342
+
343
def action_repeat_search_backward(self) -> None:
    """Repeat the last search query, moving backward from the cursor."""
    self._search_direction = "backward"
    previous_query = self.last_search_query
    self._perform_search(previous_query)
346
+
347
def action_cancel_search(self) -> None:
    """Hide and clear the search bar, then return focus to the tree."""
    bar = self.query_one("#search-bar", SearchInput)
    bar.display = False
    bar.value = ""
    tree_pane = self.query_one("#tree-pane")
    tree_pane.focus()
352
+
353
def _open_search_bar(self, direction_text: str) -> None:
    """Show the search bar with a placeholder naming the search direction."""
    hint = f"Search {direction_text} — Enter to jump · Esc or ctrl+g to cancel"
    bar = self.query_one("#search-bar", SearchInput)
    bar.placeholder = hint
    bar.display = True
    bar.focus()
360
+
361
def on_input_submitted(self, event: Input.Submitted) -> None:
    """Handle Enter in the search bar or the page-number input.

    The submitting input is hidden and focus returns to the tree. The
    value is then routed by input id: the search bar starts a search with
    the stripped, lower-cased text; the page input starts a page jump.
    """
    submitted = event.input
    submitted.display = False
    self.query_one("#tree-pane").focus()

    input_id = submitted.id
    if input_id == "page-input":
        self._handle_page_jump(event.value.strip())
        return
    if input_id != "search-bar":
        return

    query = event.value.strip().lower()
    if not query:
        return
    # Remembered so the repeat-search actions can reuse it.
    self.last_search_query = query
    self._perform_search(query)
373
+
374
+ def _handle_page_jump(self, value: str) -> None:
375
+ if not value:
376
+ return
377
+
378
+ try:
379
+ page_num = int(value)
380
+ num_pages = len(self.pdf.pages)
381
+ except ValueError:
382
+ self.query_one("#breadcrumb", Label).update(f"[red]Invalid page number:[/red] {value}")
383
+ return
384
+
385
+ if not (1 <= page_num <= num_pages):
386
+ self.query_one("#breadcrumb", Label).update(
387
+ f"[red]Page {page_num} out of bounds (1-{num_pages})[/red]"
388
+ )
389
+ return
390
+
391
+ # 1. pikepdf gives us the canonical page dictionary via the flat .pages list
392
+ page_obj = self.pdf.pages[page_num - 1]
393
+
394
+ # 2. Extract its exact object/generation signature
395
+ target_node = self.obj_to_node.get(page_obj.objgen)
396
+
397
+ if target_node:
398
+ expand_to(target_node)
399
+
400
+ tree = self.query_one("#tree-pane", PDFTree)
401
+
402
+ self.call_after_refresh(self.do_jump_factory(tree, target_node))
403
+
404
+ self.query_one("#breadcrumb", Label).update(
405
+ f"[green]Jumped to Page {page_num} ({page_obj.objgen[0]}:{page_obj.objgen[1]})[/green]"
406
+ )
407
+ else:
408
+ self.query_one("#breadcrumb", Label).update(
409
+ f"[red]Could not find tree node for Page {page_num}[/red]"
410
+ )
411
+
412
+ def _perform_search(self, query: str | None) -> None:
413
+ if not query:
414
+ return
415
+
416
+ tree = self.query_one("#tree-pane", PDFTree)
417
+ tree.focus()
418
+
419
+ all_nodes = list(iter_nodes(tree.root))
420
+
421
+ start_node = tree.cursor_node
422
+ try:
423
+ start_idx = all_nodes.index(start_node)
424
+ except ValueError:
425
+ start_idx = -1
426
+
427
+ if self._search_direction == "forward":
428
+ if start_idx == -1:
429
+ search_sequence = all_nodes
430
+ else:
431
+ search_sequence = all_nodes[start_idx + 1 :] + all_nodes[: start_idx + 1]
432
+ else:
433
+ if start_idx == -1:
434
+ search_sequence = all_nodes[::-1]
435
+ else:
436
+ search_sequence = all_nodes[:start_idx][::-1] + all_nodes[start_idx:][::-1]
437
+
438
+ match = next((n for n in search_sequence if query in n.label.plain.lower()), None)
439
+
440
+ if match:
441
+ expand_to(match)
442
+ self.call_after_refresh(self.do_jump_factory(tree, match))
443
+ status = f"[green]Found:[/green] {query}"
444
+ else:
445
+ status = f"[red]Not found:[/red] {query}"
446
+
447
+ self.query_one("#breadcrumb", Label).update(status)
448
+
449
+ # -------------------------------------------------------------------------
450
+ # Layout
451
+ # -------------------------------------------------------------------------
452
+
453
def compose(self) -> ComposeResult:
    """Yield the app's widgets.

    A horizontal container holds the tree pane and a vertical right pane
    (breadcrumb label above the details log); the search bar and the page
    input follow it.
    """
    tree_title = f"[bold magenta]{self.pdf_path}[/bold magenta]"

    with Horizontal():
        yield PDFTree(tree_title, id="tree-pane")
        with Vertical(id="right-pane"):
            breadcrumb = Label("Trailer", id="breadcrumb")
            yield breadcrumb
            details = RichLog(id="details-pane", highlight=True, wrap=True, auto_scroll=False)
            yield details

    search_bar = SearchInput(
        placeholder="Search nodes (Enter to jump, Esc or ctrl+g to cancel)...",
        id="search-bar",
    )
    yield search_bar

    page_input = PageInput(
        placeholder="Go to page (Enter to jump, Esc or ctrl+g to cancel)...",
        id="page-input",
    )
    yield page_input
467
+
468
+ # -------------------------------------------------------------------------
469
+ # Lifecycle
470
+ # -------------------------------------------------------------------------
471
+
472
def on_mount(self) -> None:
    """Prepare the widgets, open the PDF and populate the tree.

    A failure while opening the file or building the tree is shown as a
    red leaf under the tree root instead of being raised.
    """
    tree = self.query_one("#tree-pane", PDFTree)
    details = self.query_one("#details-pane", RichLog)

    tree.auto_expand = False
    tree.root.expand()

    hint = Text.from_markup(
        "[dim italic]Select a Stream node (Red) to view contents, "
        "or click a ↪ Jump link to navigate.[/dim italic]"
    )
    details.write(hint)

    # Redraw whenever the app resumes (e.g. after the external editor).
    self.app_resume_signal.subscribe(self, self.action_redraw_screen)

    # Both input bars start hidden; the key bindings reveal them.
    for selector in ("#search-bar", "#page-input"):
        self.query_one(selector).display = False

    try:
        self.pdf = pikepdf.Pdf.open(self.pdf_path)
        with self.app.batch_update():
            build_tree(
                self.pdf.trailer,
                tree.root,
                node_registry=self.obj_to_node,
                name="Trailer",
            )
    except Exception as e:
        tree.root.add_leaf(f"[bold red]Fatal Error opening PDF: {e}[/bold red]")

    self._startup_selection(tree)
502
+
503
def on_unmount(self) -> None:
    """Close the PDF document, if one is held, when the app unmounts."""
    document = self.pdf
    if not document:
        return
    document.close()
506
+
507
+ # -------------------------------------------------------------------------
508
+ # Tree events
509
+ # -------------------------------------------------------------------------
510
+
511
def on_tree_node_highlighted(self, event: TextualTree.NodeHighlighted) -> None:
    """Show the highlighted node's path in the breadcrumb label.

    The path lists node names from just below the root down to the node,
    joined with " > ". When the highlight was caused by a programmatic
    move, the flag is cleared and the label is left untouched.
    """
    node = event.node
    if node is None:
        return

    # Don't overwrite a search status message with the breadcrumb
    if self._programmatic_move:
        self._programmatic_move = False
        return

    # Walk upwards, stopping before the root (the node with no parent).
    names = []
    while node is not None and node.parent is not None:
        names.append(get_node_name(node))
        node = node.parent

    breadcrumb = " > ".join(reversed(names))
    self.query_one("#breadcrumb", Label).update(breadcrumb)
526
+
527
def on_tree_node_selected(self, event: TextualTree.NodeSelected) -> None:
    """React to a node selection: follow a jump link or display a stream.

    Nodes whose data is neither a JumpReference nor a pikepdf Stream are
    ignored. For a jump, the cursor moves to the target node (if any). For
    a stream, its bytes are written to the details pane as UTF-8 text, or
    as a ``repr`` of the first 500 bytes when they do not decode.
    """
    log = self.query_one("#details-pane", RichLog)
    tree = self.query_one("#tree-pane", TextualTree)
    data = event.node.data

    is_jump = isinstance(data, JumpReference)
    if not (is_jump or isinstance(data, pikepdf.Stream)):
        return

    log.clear()
    log.scroll_home(animate=False)

    if is_jump:
        target = data.target_node
        if target:
            expand_to(target)
            self.call_after_refresh(self.do_jump_factory(tree, target))
            log.write(Text.from_markup("[bold yellow]--- Jumped to Object ---[/bold yellow]"))
            log.write(
                Text.from_markup(
                    "[dim]Moved cursor to the original location of this object.[/dim]"
                )
            )
        return

    # From here on the node data is a stream.
    objgen_str = ":".join(map(str, data.objgen))
    log.write(
        Text.from_markup(
            f"[bold magenta]--- Obj {objgen_str} Decompressed Stream Output ---[/bold magenta]\n"
        )
    )
    try:
        raw_bytes = data.read_bytes()
        try:
            decoded = raw_bytes.decode("utf-8")
        except UnicodeDecodeError:
            # Not text: show the size and a short preview instead.
            log.write(
                Text.from_markup(f"[bold red]<Binary Stream: {len(raw_bytes)} bytes>[/bold red]")
            )
            log.write(Text.from_markup("[dim]First 500 bytes as repr:[/dim]\n"))
            log.write(repr(raw_bytes[:500]))
        else:
            log.write(decoded)
    except Exception as e:
        log.write(Text.from_markup(f"[bold red]Error reading stream:[/bold red] {e}"))
572
+
573
+ # -------------------------------------------------------------------------
574
+ # Helpers
575
+ # -------------------------------------------------------------------------
576
+
577
def _startup_selection(self, tree: PDFTree) -> None:
    """Expand and select the Trailer > /Root > /Pages node, then focus the tree.

    When that node cannot be found, only the focus change happens.
    """
    start_path = ["Trailer", "/Root", "/Pages"]
    pages_node = get_node_by_path(tree, start_path)

    if pages_node:
        expand_to(pages_node)
        pages_node.expand()

        def select_pages():
            return tree.select_node(pages_node)

        # Deferred until after the next refresh.
        self.call_after_refresh(select_pages)

    tree.focus()
584
+
585
+
586
def main():
    """Command-line entry point: run the explorer on the PDF given in argv.

    ``-h`` / ``--help`` prints the usage line and exits with status 0.
    A missing file argument prints the usage line to stderr and exits with
    status 1.
    """
    args = sys.argv[1:]
    usage = "Usage: python -m pdftree.app <file.pdf>"

    if "-h" in args or "--help" in args:
        print(usage)
        sys.exit(0)
    if not args:
        print(usage, file=sys.stderr)
        sys.exit(1)

    app = PDFTreeApp(args[0])
    app.run()

    # Force the terminal prompt below the leftover TUI ghost. Skipped when
    # stdout is not a terminal, so redirected output gets no escape codes.
    if sys.stdout.isatty():
        print("\033[999;1H\n", end="")


if __name__ == "__main__":
    main()