pdflinkcheck 1.1.72__py3-none-any.whl → 1.1.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +2 -5
- pdflinkcheck/analyze_pymupdf.py +12 -6
- pdflinkcheck/analyze_pypdf.py +25 -7
- pdflinkcheck/analyze_pypdf_v2.py +5 -6
- pdflinkcheck/cli.py +82 -91
- pdflinkcheck/data/I Have Questions.md +51 -0
- pdflinkcheck/data/LICENSE +17 -654
- pdflinkcheck/data/README.md +49 -49
- pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
- pdflinkcheck/data/icons/Logo-150x150.png +0 -0
- pdflinkcheck/data/icons/Logo-300x300.png +0 -0
- pdflinkcheck/data/icons/Logo-71x71.png +0 -0
- pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
- pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
- pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
- pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
- pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
- pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
- pdflinkcheck/data/pyproject.toml +20 -23
- pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
- pdflinkcheck/datacopy.py +2 -0
- pdflinkcheck/dev.py +10 -23
- pdflinkcheck/environment.py +64 -0
- pdflinkcheck/gui.py +229 -103
- pdflinkcheck/io.py +4 -18
- pdflinkcheck/report.py +161 -89
- pdflinkcheck/stdlib_server.py +14 -6
- pdflinkcheck/update_msix_version.py +47 -0
- pdflinkcheck/validate.py +59 -80
- pdflinkcheck/version_info.py +5 -2
- {pdflinkcheck-1.1.72.dist-info → pdflinkcheck-1.1.94.dist-info}/METADATA +54 -52
- pdflinkcheck-1.1.94.dist-info/RECORD +176 -0
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +24 -0
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE-MIT +9 -0
- pdflinkcheck-1.1.72.dist-info/RECORD +0 -21
- {pdflinkcheck-1.1.72.dist-info → pdflinkcheck-1.1.94.dist-info}/WHEEL +0 -0
- {pdflinkcheck-1.1.72.dist-info → pdflinkcheck-1.1.94.dist-info}/entry_points.txt +0 -0
- /pdflinkcheck-1.1.72.dist-info/licenses/LICENSE → /pdflinkcheck-1.1.94.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/report.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# pdflinkcheck/report.py
|
|
2
4
|
|
|
3
5
|
import sys
|
|
@@ -6,9 +8,30 @@ from typing import Optional, Dict, Any
|
|
|
6
8
|
import pyhabitat
|
|
7
9
|
|
|
8
10
|
from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
|
|
11
|
+
from pdflinkcheck.environment import pymupdf_is_available
|
|
12
|
+
from pdflinkcheck.validate import run_validation
|
|
9
13
|
|
|
14
|
+
SEP_COUNT=28
|
|
15
|
+
|
|
16
|
+
def run_report_and_call_exports(pdf_path: str = None, max_links: int = 0, export_format: str = "JSON", pdf_library: str = "pypdf", print_bool:bool=True) -> Dict[str, Any]:
    """
    Run the link analysis and validation for a PDF, then write the requested exports.

    Args:
        pdf_path: Path of the PDF to analyze. It is forwarded to
            run_report_and_validtion as a string and passed unchanged to the
            export helpers.
        max_links: Forwarded unchanged to run_report_and_validtion.
        export_format: Case-insensitive selector. A JSON export is written when
            it contains "JSON" and a TXT export when it contains "TXT"; both
            may be requested at once. An empty value disables exporting.
        pdf_library: Name of the PDF engine, forwarded to the analysis and to
            the export helpers.
        print_bool: Forwarded to run_report_and_validtion so callers can
            silence console output.

    Returns:
        The results dict produced by run_report_and_validtion. Its "data" and
        "text" entries are what the JSON and TXT exports receive.
    """
    # The meat and potatoes
    # NOTE(review): str() turns a None path into the literal string "None".
    # Kept as-is because the callee's handling of a real None is outside this
    # block -- confirm whether None should be passed through instead.
    report_results = run_report_and_validtion(
        pdf_path=str(pdf_path),
        max_links=max_links,
        pdf_library=pdf_library,
        # Previously dropped, which made print_bool=False a no-op for callers.
        print_bool=print_bool,
    )

    if export_format:
        fmt_upper = export_format.upper()

        if "JSON" in fmt_upper:
            export_report_json(report_results["data"], pdf_path, pdf_library)

        if "TXT" in fmt_upper:
            export_report_txt(report_results["text"], pdf_path, pdf_library)

    return report_results
|
|
32
|
+
|
|
10
33
|
|
|
11
|
-
def
|
|
34
|
+
def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_library: str = "pypdf", print_bool:bool=True) -> Dict[str, Any]:
|
|
12
35
|
"""
|
|
13
36
|
Core high-level PDF link analysis logic.
|
|
14
37
|
|
|
@@ -34,8 +57,8 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
34
57
|
|
|
35
58
|
# Helper to handle conditional printing and mandatory buffering
|
|
36
59
|
def log(msg: str):
|
|
37
|
-
if print_bool:
|
|
38
|
-
print(msg)
|
|
60
|
+
if print_bool:
|
|
61
|
+
print(msg)
|
|
39
62
|
report_buffer.append(msg)
|
|
40
63
|
|
|
41
64
|
# Expected: "pypdf" or "PyMuPDF"
|
|
@@ -44,26 +67,38 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
44
67
|
if pdf_library in allowed_libraries and pdf_library == "pypdf":
|
|
45
68
|
from pdflinkcheck.analyze_pypdf import (extract_links_pypdf as extract_links, extract_toc_pypdf as extract_toc)
|
|
46
69
|
elif pdf_library in allowed_libraries and pdf_library == "pymupdf":
|
|
47
|
-
|
|
48
|
-
import fitz
|
|
49
|
-
except ImportError:
|
|
70
|
+
if not pymupdf_is_available():
|
|
50
71
|
print("PyMuPDF was explicitly requested as the PDF Engine")
|
|
51
|
-
print("
|
|
72
|
+
print("Switch the PDF library to 'pypdf' instead, or install PyMuPDF. ")
|
|
52
73
|
print("To install PyMuPDF locally, try: `uv sync --extra full` OR `pip install .[full]`")
|
|
53
74
|
if pyhabitat.on_termux():
|
|
54
75
|
print(f"pyhabitat.on_termux() = {pyhabitat.on_termux()}")
|
|
55
76
|
print("PyMuPDF is not expected to work on Termux. Use pypdf.")
|
|
56
77
|
print("\n")
|
|
57
|
-
return
|
|
78
|
+
#return
|
|
79
|
+
raise ImportError(f"The 'fitz' module is required for this functionality. Original error: {e}") from e
|
|
58
80
|
from pdflinkcheck.analyze_pymupdf import (extract_links_pymupdf as extract_links, extract_toc_pymupdf as extract_toc)
|
|
59
81
|
|
|
60
82
|
log("\n--- Starting Analysis ... ---\n")
|
|
61
|
-
if pdf_path is None:
|
|
62
|
-
pdf_path = get_first_pdf_in_cwd()
|
|
63
83
|
if pdf_path is None:
|
|
64
84
|
log("pdf_path is None")
|
|
65
85
|
log("Tip: Drop a PDF in the current folder or pass in a path arg.")
|
|
66
|
-
|
|
86
|
+
empty_report = {
|
|
87
|
+
"data": {
|
|
88
|
+
"external_links": [],
|
|
89
|
+
"internal_links": [],
|
|
90
|
+
"toc": []
|
|
91
|
+
},
|
|
92
|
+
"text": "\n".join(report_buffer),
|
|
93
|
+
"metadata": {
|
|
94
|
+
"pdf_name": Path(pdf_path).name,
|
|
95
|
+
"library_used": pdf_library,
|
|
96
|
+
"total_links": 0
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
return empty_report
|
|
101
|
+
|
|
67
102
|
try:
|
|
68
103
|
log(f"Target file: {get_friendly_path(pdf_path)}")
|
|
69
104
|
log(f"PDF Engine: {pdf_library}")
|
|
@@ -78,7 +113,21 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
78
113
|
if not extracted_links and not structural_toc:
|
|
79
114
|
log(f"\nNo hyperlinks or structural TOC found in {Path(pdf_path).name}.")
|
|
80
115
|
log("(This is common for scanned/image-only PDFs.)")
|
|
81
|
-
|
|
116
|
+
|
|
117
|
+
empty_result = {
|
|
118
|
+
"data": {
|
|
119
|
+
"external_links": [],
|
|
120
|
+
"internal_links": [],
|
|
121
|
+
"toc": []
|
|
122
|
+
},
|
|
123
|
+
"text": "\n".join(report_buffer),
|
|
124
|
+
"metadata": {
|
|
125
|
+
"pdf_name": Path(pdf_path).name,
|
|
126
|
+
"library_used": pdf_library,
|
|
127
|
+
"total_links": 0
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
return empty_result
|
|
82
131
|
|
|
83
132
|
# 3. Separate the lists based on the 'type' key
|
|
84
133
|
uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
|
|
@@ -89,24 +138,25 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
89
138
|
total_internal_links = len(goto_links) + len(resolved_action_links)
|
|
90
139
|
limit = max_links if max_links > 0 else None
|
|
91
140
|
uri_and_other = uri_links + other_links
|
|
141
|
+
|
|
142
|
+
str_structural_toc = get_structural_toc(structural_toc)
|
|
92
143
|
|
|
93
144
|
# --- ANALYSIS SUMMARY (Using your print logic) ---
|
|
94
|
-
log("\n" + "=" *
|
|
145
|
+
log("\n" + "=" * SEP_COUNT)
|
|
95
146
|
log(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
|
|
96
147
|
log(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
|
|
97
148
|
log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
|
|
98
|
-
log("=" *
|
|
149
|
+
log("=" * SEP_COUNT)
|
|
99
150
|
|
|
100
151
|
# --- Section 1: TOC ---
|
|
101
|
-
str_structural_toc = print_structural_toc(structural_toc)
|
|
102
152
|
log(str_structural_toc)
|
|
103
153
|
|
|
104
154
|
# --- Section 2: ACTIVE INTERNAL JUMPS ---
|
|
105
|
-
log("\n" + "=" *
|
|
155
|
+
log("\n" + "=" * SEP_COUNT)
|
|
106
156
|
log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
|
|
107
|
-
log("=" *
|
|
157
|
+
log("=" * SEP_COUNT)
|
|
108
158
|
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
|
|
109
|
-
log("-" *
|
|
159
|
+
log("-" * SEP_COUNT)
|
|
110
160
|
|
|
111
161
|
all_internal = goto_links + resolved_action_links
|
|
112
162
|
if total_internal_links > 0:
|
|
@@ -118,13 +168,13 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
118
168
|
log(f"... and {len(all_internal) - limit} more links (use --max-links 0 to show all).")
|
|
119
169
|
else:
|
|
120
170
|
log(" No internal GoTo or Resolved Action links found.")
|
|
121
|
-
log("-" *
|
|
171
|
+
log("-" * SEP_COUNT)
|
|
122
172
|
|
|
123
173
|
# --- Section 3: ACTIVE URI LINKS ---
|
|
124
|
-
log("\n" + "=" *
|
|
174
|
+
log("\n" + "=" * SEP_COUNT)
|
|
125
175
|
log(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
|
|
126
176
|
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
|
|
127
|
-
log("=" *
|
|
177
|
+
log("=" * SEP_COUNT)
|
|
128
178
|
|
|
129
179
|
if uri_and_other:
|
|
130
180
|
for i, link in enumerate(uri_and_other[:limit], 1):
|
|
@@ -136,45 +186,53 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
136
186
|
|
|
137
187
|
else:
|
|
138
188
|
log(" No external or 'Other' links found.")
|
|
139
|
-
log("-" *
|
|
140
|
-
|
|
141
|
-
log("\n--- Analysis Complete ---\n")
|
|
189
|
+
log("-" * SEP_COUNT)
|
|
142
190
|
|
|
143
|
-
# Final aggregation of the buffer into one string
|
|
144
|
-
report_buffer_str = "\n".join(report_buffer)
|
|
145
191
|
|
|
146
192
|
# Return the collected data for potential future JSON/other output
|
|
147
|
-
|
|
193
|
+
report_data_dict = {
|
|
148
194
|
"external_links": uri_links,
|
|
149
195
|
"internal_links": all_internal,
|
|
150
|
-
"toc": structural_toc
|
|
196
|
+
"toc": structural_toc,
|
|
197
|
+
"validation": {}
|
|
151
198
|
}
|
|
152
199
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
# export_report_data(final_report_data_dict, Path(pdf_path).name, export_format, pdf_library)
|
|
157
|
-
|
|
158
|
-
if export_format:
|
|
159
|
-
fmt_upper = export_format.upper()
|
|
160
|
-
|
|
161
|
-
if "JSON" in fmt_upper:
|
|
162
|
-
export_report_json(final_report_data_dict, pdf_path, pdf_library)
|
|
163
|
-
|
|
164
|
-
if "TXT" in fmt_upper:
|
|
165
|
-
export_report_txt(report_buffer_str, pdf_path, pdf_library)
|
|
166
|
-
|
|
167
|
-
report_results = {
|
|
168
|
-
"data": final_report_data_dict, # The structured JSON-ready dict
|
|
169
|
-
"text": report_buffer_str, # The human-readable string
|
|
200
|
+
intermediate_report_results = {
|
|
201
|
+
"data": report_data_dict, # The structured JSON-ready dict
|
|
202
|
+
"text": "",
|
|
170
203
|
"metadata": { # Helpful for the GUI/Logs
|
|
171
204
|
"pdf_name": Path(pdf_path).name,
|
|
172
205
|
"library_used": pdf_library,
|
|
173
206
|
"total_links": len(extracted_links)
|
|
174
207
|
}
|
|
175
208
|
}
|
|
209
|
+
|
|
210
|
+
log("\n--- Analysis Complete ---")
|
|
211
|
+
|
|
212
|
+
validation_results = run_validation(report_results=intermediate_report_results,
|
|
213
|
+
pdf_path=pdf_path,
|
|
214
|
+
pdf_library=pdf_library)
|
|
215
|
+
log(validation_results.get("summary-txt",""))
|
|
216
|
+
report_results = intermediate_report_results
|
|
217
|
+
|
|
218
|
+
# Final aggregation of the buffer into one string, after the last call to log()
|
|
219
|
+
report_buffer_str = "\n".join(report_buffer)
|
|
220
|
+
|
|
221
|
+
report_results["data"]["validation"].update(validation_results)
|
|
222
|
+
#report_results["text"].update(report_buffer_str) # The human-readable string
|
|
223
|
+
report_results["text"] = report_buffer_str
|
|
224
|
+
|
|
225
|
+
# 5. Export Report
|
|
226
|
+
#if export_format:
|
|
227
|
+
# # Assuming export_to will hold the output format string (e.g., "JSON")
|
|
228
|
+
# export_report_data(report_data_dict, Path(pdf_path).name, export_format, pdf_library)
|
|
229
|
+
|
|
230
|
+
if print_bool:
|
|
231
|
+
print(report_buffer_str)
|
|
232
|
+
|
|
176
233
|
# Return a clean results object
|
|
177
234
|
return report_results
|
|
235
|
+
|
|
178
236
|
except Exception as e:
|
|
179
237
|
# Specific handling for common read failures
|
|
180
238
|
if "invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
|
|
@@ -194,44 +252,36 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
194
252
|
}
|
|
195
253
|
}
|
|
196
254
|
|
|
255
|
+
#except Exception as e:
|
|
256
|
+
# # Log the critical failure
|
|
257
|
+
# error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
|
|
258
|
+
# log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
259
|
+
# raise # Allow the exception to propagate or handle gracefully
|
|
197
260
|
except Exception as e:
|
|
198
|
-
# Log the critical failure
|
|
199
261
|
error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
|
|
200
|
-
log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
# Iterate and format
|
|
224
|
-
for item in structural_toc:
|
|
225
|
-
# Use level for indentation (e.g., Level 1 = 0 spaces, Level 2 = 4 spaces, Level 3 = 8 spaces)
|
|
226
|
-
indent = " " * 4 * (item['level'] - 1)
|
|
227
|
-
# Format the title and target page number
|
|
228
|
-
page_str = str(item['target_page']).rjust(page_width)
|
|
229
|
-
print(f"{indent}{item['title']} . . . page {page_str}")
|
|
230
|
-
|
|
231
|
-
print("-" * 70)
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
262
|
+
log(f"FATAL: Analysis failed: {str(e)}. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
263
|
+
|
|
264
|
+
# Always return a safe empty result on error
|
|
265
|
+
return {
|
|
266
|
+
"data": {
|
|
267
|
+
"external_links": [],
|
|
268
|
+
"internal_links": [],
|
|
269
|
+
"toc": [],
|
|
270
|
+
"validation": {}
|
|
271
|
+
},
|
|
272
|
+
"text": "\n".join(report_buffer + [
|
|
273
|
+
"\n--- Analysis failed ---",
|
|
274
|
+
f"Error: {str(e)}",
|
|
275
|
+
"No links or TOC extracted."
|
|
276
|
+
]),
|
|
277
|
+
"metadata": {
|
|
278
|
+
"pdf_name": Path(pdf_path).name,
|
|
279
|
+
"library_used": pdf_library,
|
|
280
|
+
"total_links": 0
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
def get_structural_toc(structural_toc: list) -> str:
|
|
235
285
|
"""
|
|
236
286
|
Formats the structural TOC data into a hierarchical string and optionally prints it.
|
|
237
287
|
|
|
@@ -243,16 +293,14 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
|
243
293
|
A formatted string of the structural TOC.
|
|
244
294
|
"""
|
|
245
295
|
lines = []
|
|
246
|
-
lines.append("\n" + "=" *
|
|
296
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
247
297
|
lines.append("## Structural Table of Contents (PDF Bookmarks/Outline)")
|
|
248
|
-
lines.append("=" *
|
|
298
|
+
lines.append("=" * SEP_COUNT)
|
|
249
299
|
|
|
250
300
|
if not structural_toc:
|
|
251
301
|
msg = "No structural TOC (bookmarks/outline) found."
|
|
252
302
|
lines.append(msg)
|
|
253
303
|
output = "\n".join(lines)
|
|
254
|
-
if print_bool:
|
|
255
|
-
print(output)
|
|
256
304
|
return output
|
|
257
305
|
|
|
258
306
|
# Determine max page width for consistent alignment
|
|
@@ -269,12 +317,36 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
|
269
317
|
|
|
270
318
|
lines.append(f"{indent}{item['title']} . . . page {page_str}")
|
|
271
319
|
|
|
272
|
-
lines.append("-" *
|
|
320
|
+
lines.append("-" * SEP_COUNT)
|
|
273
321
|
|
|
274
322
|
# Final aggregation
|
|
275
323
|
str_structural_toc = "\n".join(lines)
|
|
276
|
-
|
|
277
|
-
if print_bool:
|
|
278
|
-
print(str_structural_toc)
|
|
279
324
|
|
|
280
325
|
return str_structural_toc
|
|
326
|
+
|
|
327
|
+
if __name__ == "__main__":
    # Manual smoke test: analyze the first PDF found in the current directory.
    from pdflinkcheck.io import get_first_pdf_in_cwd

    pdf_path = get_first_pdf_in_cwd()

    # Prefer PyMuPDF when it is installed; otherwise fall back to pypdf.
    if pymupdf_is_available():
        pdf_library = "pymupdf"
    else:
        pdf_library = "pypdf"

    # run_report no longer exists under that name in this module; the
    # export-aware entry point accepts the same keyword arguments.
    report = run_report_and_call_exports(
        pdf_path=pdf_path,
        max_links=0,
        export_format="",  # empty string: analyze only, write no export files
        pdf_library=pdf_library,
        print_bool=True,  # echo the report to the console
    )

    if not report or not report.get("data"):
        print("No data extracted — nothing to validate.")
        sys.exit(1)

    else:
        print("Success!")
        print(f"list(report['data']) = {list(report['data'])}")
|
|
352
|
+
|
pdflinkcheck/stdlib_server.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# src/pdflinkcheck/stdlib_server.py
|
|
2
4
|
import http.server
|
|
3
5
|
import socketserver
|
|
@@ -8,7 +10,7 @@ import os
|
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
import email # This replaces cgi for multipart parsing
|
|
10
12
|
|
|
11
|
-
from pdflinkcheck.report import
|
|
13
|
+
from pdflinkcheck.report import run_report_and_call_exports
|
|
12
14
|
|
|
13
15
|
PORT = 8000
|
|
14
16
|
|
|
@@ -17,8 +19,8 @@ HTML_FORM = """
|
|
|
17
19
|
<html>
|
|
18
20
|
<head><title>pdflinkcheck Stdlib Server</title></head>
|
|
19
21
|
<body style="font-family: sans-serif; max-width: 800px; margin: 40px auto;">
|
|
20
|
-
<h1>pdflinkcheck API (
|
|
21
|
-
<p>Upload a PDF for link/TOC analysis
|
|
22
|
+
<h1>pdflinkcheck API (pure stdlib)</h1>
|
|
23
|
+
<p>Upload a PDF for link/TOC analysis.</p>
|
|
22
24
|
<form action="/" method="post" enctype="multipart/form-data">
|
|
23
25
|
<p><input type="file" name="file" accept=".pdf" required></p>
|
|
24
26
|
<p>
|
|
@@ -33,9 +35,13 @@ HTML_FORM = """
|
|
|
33
35
|
<input type="number" name="max_links" value="0" min="0">
|
|
34
36
|
</p>
|
|
35
37
|
<p><button type="submit">Analyze PDF</button></p>
|
|
38
|
+
<!--p>
|
|
39
|
+
<button type="submit" name="action" value="analyze">Analyze PDF</button>
|
|
40
|
+
<button type="submit" name="action" value="validate">Validate PDF</button>
|
|
41
|
+
</p-->
|
|
36
42
|
</form>
|
|
37
43
|
<hr>
|
|
38
|
-
<p>Returns JSON
|
|
44
|
+
<p>Returns JSON.</p>
|
|
39
45
|
</body>
|
|
40
46
|
</html>
|
|
41
47
|
"""
|
|
@@ -130,18 +136,20 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
130
136
|
tmp_file.write(file_item)
|
|
131
137
|
tmp_path = tmp_file.name
|
|
132
138
|
|
|
133
|
-
result =
|
|
139
|
+
result = run_report_and_call_exports(
|
|
134
140
|
pdf_path=tmp_path,
|
|
135
141
|
max_links=max_links if max_links > 0 else 0,
|
|
136
142
|
export_format="",
|
|
137
143
|
pdf_library=pdf_library,
|
|
138
144
|
print_bool=False
|
|
139
145
|
)
|
|
146
|
+
metadata = result.get("metadata", {"total_links": 0, "pdf_name": file_filename})
|
|
147
|
+
total_links = metadata.get("total_links", 0)
|
|
140
148
|
|
|
141
149
|
response = {
|
|
142
150
|
"filename": file_filename,
|
|
143
151
|
"pdf_library_used": pdf_library,
|
|
144
|
-
"total_links":
|
|
152
|
+
"total_links": total_links,
|
|
145
153
|
"data": result["data"],
|
|
146
154
|
"text_report": result["text"]
|
|
147
155
|
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from pdflinkcheck.version_info import get_version_from_pyproject
|
|
3
|
+
|
|
4
|
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
|
5
|
+
|
|
6
|
+
UNVERSIONED_MANIFEST = PROJECT_ROOT / "msix" / "AppxManifest_unversioned.xml"
|
|
7
|
+
OUTPUT_MANIFEST = PROJECT_ROOT / "msix" / "AppxManifest.xml"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
PLACEHOLDER = "@@VERSION_PLACEHOLDER@@"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def generate_versioned_manifest(version):
    """
    Write the versioned AppxManifest.xml from the unversioned template.

    The dotted ``version`` string is padded with trailing zeros to four parts
    (``1`` -> ``1.0.0.0``, ``1.1`` -> ``1.1.0.0``, ``1.1.92`` -> ``1.1.92.0``)
    and substituted for the ``Version="@@VERSION_PLACEHOLDER@@"`` attribute
    read from UNVERSIONED_MANIFEST. The result is written to OUTPUT_MANIFEST.

    Args:
        version: Dotted version string with one to four integer parts.

    Raises:
        ValueError: If the version has more than four parts, if any part is
            empty or not a non-negative integer, or if the placeholder
            attribute is missing from the template.
        FileNotFoundError: If the unversioned manifest does not exist.
    """
    parts = version.split(".")

    if len(parts) > 4:
        raise ValueError(f"Version has too many parts: {version}")

    # Reject suffixes such as "rc1" or empty parts before anything is written;
    # the manifest version is expected to be a purely numeric quad.
    if not all(part.isascii() and part.isdigit() for part in parts):
        raise ValueError(f"Version parts must be non-negative integers: {version}")

    # Pad to four parts regardless of how many were supplied.
    parts += ["0"] * (4 - len(parts))
    msix_version = ".".join(parts)

    if not UNVERSIONED_MANIFEST.exists():
        raise FileNotFoundError(f"Unversioned manifest not found: {UNVERSIONED_MANIFEST}")

    text = UNVERSIONED_MANIFEST.read_text(encoding="utf-8")

    placeholder_full = f'Version="{PLACEHOLDER}"'

    if placeholder_full not in text:
        raise ValueError(f"Placeholder {placeholder_full} not found in the unversioned manifest!")

    updated_text = text.replace(placeholder_full, f'Version="{msix_version}"')

    # Ensure the directory exists and write the new manifest
    OUTPUT_MANIFEST.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_MANIFEST.write_text(updated_text, encoding="utf-8")

    print(f"Successfully generated AppxManifest.xml with version {msix_version}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
if __name__ == "__main__":
    # Script entry point: read the project version from pyproject and
    # regenerate the MSIX manifest with it.
    generate_versioned_manifest(get_version_from_pyproject())
|