pdflinkcheck 1.1.73__py3-none-any.whl → 1.1.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +2 -5
- pdflinkcheck/analyze_pymupdf.py +12 -6
- pdflinkcheck/analyze_pypdf.py +25 -7
- pdflinkcheck/analyze_pypdf_v2.py +5 -6
- pdflinkcheck/cli.py +82 -91
- pdflinkcheck/data/I Have Questions.md +51 -0
- pdflinkcheck/data/LICENSE +17 -654
- pdflinkcheck/data/README.md +49 -49
- pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
- pdflinkcheck/data/icons/Logo-150x150.png +0 -0
- pdflinkcheck/data/icons/Logo-300x300.png +0 -0
- pdflinkcheck/data/icons/Logo-71x71.png +0 -0
- pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
- pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
- pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
- pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
- pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
- pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
- pdflinkcheck/data/pyproject.toml +20 -23
- pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
- pdflinkcheck/datacopy.py +2 -0
- pdflinkcheck/dev.py +10 -23
- pdflinkcheck/environment.py +64 -0
- pdflinkcheck/gui.py +229 -103
- pdflinkcheck/io.py +4 -18
- pdflinkcheck/report.py +148 -78
- pdflinkcheck/stdlib_server.py +14 -6
- pdflinkcheck/update_msix_version.py +47 -0
- pdflinkcheck/validate.py +50 -73
- pdflinkcheck/version_info.py +5 -2
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.1.94.dist-info}/METADATA +54 -52
- pdflinkcheck-1.1.94.dist-info/RECORD +176 -0
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +24 -0
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE-MIT +9 -0
- pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.1.94.dist-info}/WHEEL +0 -0
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.1.94.dist-info}/entry_points.txt +0 -0
- /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.1.94.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/report.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# pdflinkcheck/report.py
|
|
2
4
|
|
|
3
5
|
import sys
|
|
@@ -6,11 +8,30 @@ from typing import Optional, Dict, Any
|
|
|
6
8
|
import pyhabitat
|
|
7
9
|
|
|
8
10
|
from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
|
|
9
|
-
|
|
11
|
+
from pdflinkcheck.environment import pymupdf_is_available
|
|
12
|
+
from pdflinkcheck.validate import run_validation
|
|
10
13
|
|
|
11
14
|
SEP_COUNT=28
|
|
15
|
+
|
|
16
|
+
def run_report_and_call_exports(pdf_path: str = None, max_links: int = 0, export_format: str = "JSON", pdf_library: str = "pypdf", print_bool:bool=True) -> Dict[str, Any]:
|
|
17
|
+
# The meat and potatoes
|
|
18
|
+
report_results = run_report_and_validtion(
|
|
19
|
+
pdf_path=str(pdf_path),
|
|
20
|
+
max_links=max_links,
|
|
21
|
+
pdf_library = pdf_library,
|
|
22
|
+
)
|
|
23
|
+
if export_format:
|
|
24
|
+
report_data_dict = report_results["data"]
|
|
25
|
+
report_buffer_str = report_results["text"]
|
|
26
|
+
if "JSON" in export_format.upper():
|
|
27
|
+
export_report_json(report_data_dict, pdf_path, pdf_library)
|
|
28
|
+
|
|
29
|
+
if "TXT" in export_format.upper():
|
|
30
|
+
export_report_txt(report_buffer_str, pdf_path, pdf_library)
|
|
31
|
+
return report_results
|
|
32
|
+
|
|
12
33
|
|
|
13
|
-
def
|
|
34
|
+
def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_library: str = "pypdf", print_bool:bool=True) -> Dict[str, Any]:
|
|
14
35
|
"""
|
|
15
36
|
Core high-level PDF link analysis logic.
|
|
16
37
|
|
|
@@ -36,8 +57,8 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
36
57
|
|
|
37
58
|
# Helper to handle conditional printing and mandatory buffering
|
|
38
59
|
def log(msg: str):
|
|
39
|
-
if print_bool:
|
|
40
|
-
print(msg)
|
|
60
|
+
if print_bool:
|
|
61
|
+
print(msg)
|
|
41
62
|
report_buffer.append(msg)
|
|
42
63
|
|
|
43
64
|
# Expected: "pypdf" or "PyMuPDF"
|
|
@@ -46,26 +67,38 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
46
67
|
if pdf_library in allowed_libraries and pdf_library == "pypdf":
|
|
47
68
|
from pdflinkcheck.analyze_pypdf import (extract_links_pypdf as extract_links, extract_toc_pypdf as extract_toc)
|
|
48
69
|
elif pdf_library in allowed_libraries and pdf_library == "pymupdf":
|
|
49
|
-
|
|
50
|
-
import fitz
|
|
51
|
-
except ImportError:
|
|
70
|
+
if not pymupdf_is_available():
|
|
52
71
|
print("PyMuPDF was explicitly requested as the PDF Engine")
|
|
53
|
-
print("
|
|
72
|
+
print("Switch the PDF library to 'pypdf' instead, or install PyMuPDF. ")
|
|
54
73
|
print("To install PyMuPDF locally, try: `uv sync --extra full` OR `pip install .[full]`")
|
|
55
74
|
if pyhabitat.on_termux():
|
|
56
75
|
print(f"pyhabitat.on_termux() = {pyhabitat.on_termux()}")
|
|
57
76
|
print("PyMuPDF is not expected to work on Termux. Use pypdf.")
|
|
58
77
|
print("\n")
|
|
59
|
-
return
|
|
78
|
+
#return
|
|
79
|
+
raise ImportError(f"The 'fitz' module is required for this functionality. Original error: {e}") from e
|
|
60
80
|
from pdflinkcheck.analyze_pymupdf import (extract_links_pymupdf as extract_links, extract_toc_pymupdf as extract_toc)
|
|
61
81
|
|
|
62
82
|
log("\n--- Starting Analysis ... ---\n")
|
|
63
|
-
if pdf_path is None:
|
|
64
|
-
pdf_path = get_first_pdf_in_cwd()
|
|
65
83
|
if pdf_path is None:
|
|
66
84
|
log("pdf_path is None")
|
|
67
85
|
log("Tip: Drop a PDF in the current folder or pass in a path arg.")
|
|
68
|
-
|
|
86
|
+
empty_report = {
|
|
87
|
+
"data": {
|
|
88
|
+
"external_links": [],
|
|
89
|
+
"internal_links": [],
|
|
90
|
+
"toc": []
|
|
91
|
+
},
|
|
92
|
+
"text": "\n".join(report_buffer),
|
|
93
|
+
"metadata": {
|
|
94
|
+
"pdf_name": Path(pdf_path).name,
|
|
95
|
+
"library_used": pdf_library,
|
|
96
|
+
"total_links": 0
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
return empty_report
|
|
101
|
+
|
|
69
102
|
try:
|
|
70
103
|
log(f"Target file: {get_friendly_path(pdf_path)}")
|
|
71
104
|
log(f"PDF Engine: {pdf_library}")
|
|
@@ -80,7 +113,21 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
80
113
|
if not extracted_links and not structural_toc:
|
|
81
114
|
log(f"\nNo hyperlinks or structural TOC found in {Path(pdf_path).name}.")
|
|
82
115
|
log("(This is common for scanned/image-only PDFs.)")
|
|
83
|
-
|
|
116
|
+
|
|
117
|
+
empty_result = {
|
|
118
|
+
"data": {
|
|
119
|
+
"external_links": [],
|
|
120
|
+
"internal_links": [],
|
|
121
|
+
"toc": []
|
|
122
|
+
},
|
|
123
|
+
"text": "\n".join(report_buffer),
|
|
124
|
+
"metadata": {
|
|
125
|
+
"pdf_name": Path(pdf_path).name,
|
|
126
|
+
"library_used": pdf_library,
|
|
127
|
+
"total_links": 0
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
return empty_result
|
|
84
131
|
|
|
85
132
|
# 3. Separate the lists based on the 'type' key
|
|
86
133
|
uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
|
|
@@ -91,6 +138,8 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
91
138
|
total_internal_links = len(goto_links) + len(resolved_action_links)
|
|
92
139
|
limit = max_links if max_links > 0 else None
|
|
93
140
|
uri_and_other = uri_links + other_links
|
|
141
|
+
|
|
142
|
+
str_structural_toc = get_structural_toc(structural_toc)
|
|
94
143
|
|
|
95
144
|
# --- ANALYSIS SUMMARY (Using your print logic) ---
|
|
96
145
|
log("\n" + "=" * SEP_COUNT)
|
|
@@ -100,7 +149,6 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
100
149
|
log("=" * SEP_COUNT)
|
|
101
150
|
|
|
102
151
|
# --- Section 1: TOC ---
|
|
103
|
-
str_structural_toc = print_structural_toc(structural_toc)
|
|
104
152
|
log(str_structural_toc)
|
|
105
153
|
|
|
106
154
|
# --- Section 2: ACTIVE INTERNAL JUMPS ---
|
|
@@ -140,43 +188,51 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
140
188
|
log(" No external or 'Other' links found.")
|
|
141
189
|
log("-" * SEP_COUNT)
|
|
142
190
|
|
|
143
|
-
log("\n--- Analysis Complete ---\n")
|
|
144
|
-
|
|
145
|
-
# Final aggregation of the buffer into one string
|
|
146
|
-
report_buffer_str = "\n".join(report_buffer)
|
|
147
191
|
|
|
148
192
|
# Return the collected data for potential future JSON/other output
|
|
149
|
-
|
|
193
|
+
report_data_dict = {
|
|
150
194
|
"external_links": uri_links,
|
|
151
195
|
"internal_links": all_internal,
|
|
152
|
-
"toc": structural_toc
|
|
196
|
+
"toc": structural_toc,
|
|
197
|
+
"validation": {}
|
|
153
198
|
}
|
|
154
199
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
# export_report_data(final_report_data_dict, Path(pdf_path).name, export_format, pdf_library)
|
|
159
|
-
|
|
160
|
-
if export_format:
|
|
161
|
-
fmt_upper = export_format.upper()
|
|
162
|
-
|
|
163
|
-
if "JSON" in fmt_upper:
|
|
164
|
-
export_report_json(final_report_data_dict, pdf_path, pdf_library)
|
|
165
|
-
|
|
166
|
-
if "TXT" in fmt_upper:
|
|
167
|
-
export_report_txt(report_buffer_str, pdf_path, pdf_library)
|
|
168
|
-
|
|
169
|
-
report_results = {
|
|
170
|
-
"data": final_report_data_dict, # The structured JSON-ready dict
|
|
171
|
-
"text": report_buffer_str, # The human-readable string
|
|
200
|
+
intermediate_report_results = {
|
|
201
|
+
"data": report_data_dict, # The structured JSON-ready dict
|
|
202
|
+
"text": "",
|
|
172
203
|
"metadata": { # Helpful for the GUI/Logs
|
|
173
204
|
"pdf_name": Path(pdf_path).name,
|
|
174
205
|
"library_used": pdf_library,
|
|
175
206
|
"total_links": len(extracted_links)
|
|
176
207
|
}
|
|
177
208
|
}
|
|
209
|
+
|
|
210
|
+
log("\n--- Analysis Complete ---")
|
|
211
|
+
|
|
212
|
+
validation_results = run_validation(report_results=intermediate_report_results,
|
|
213
|
+
pdf_path=pdf_path,
|
|
214
|
+
pdf_library=pdf_library)
|
|
215
|
+
log(validation_results.get("summary-txt",""))
|
|
216
|
+
report_results = intermediate_report_results
|
|
217
|
+
|
|
218
|
+
# Final aggregation of the buffer into one string, after the last call to log()
|
|
219
|
+
report_buffer_str = "\n".join(report_buffer)
|
|
220
|
+
|
|
221
|
+
report_results["data"]["validation"].update(validation_results)
|
|
222
|
+
#report_results["text"].update(report_buffer_str) # The human-readable string
|
|
223
|
+
report_results["text"] = report_buffer_str
|
|
224
|
+
|
|
225
|
+
# 5. Export Report
|
|
226
|
+
#if export_format:
|
|
227
|
+
# # Assuming export_to will hold the output format string (e.g., "JSON")
|
|
228
|
+
# export_report_data(report_data_dict, Path(pdf_path).name, export_format, pdf_library)
|
|
229
|
+
|
|
230
|
+
if print_bool:
|
|
231
|
+
print(report_buffer_str)
|
|
232
|
+
|
|
178
233
|
# Return a clean results object
|
|
179
234
|
return report_results
|
|
235
|
+
|
|
180
236
|
except Exception as e:
|
|
181
237
|
# Specific handling for common read failures
|
|
182
238
|
if "invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
|
|
@@ -196,44 +252,36 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
196
252
|
}
|
|
197
253
|
}
|
|
198
254
|
|
|
255
|
+
#except Exception as e:
|
|
256
|
+
# # Log the critical failure
|
|
257
|
+
# error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
|
|
258
|
+
# log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
259
|
+
# raise # Allow the exception to propagate or handle gracefully
|
|
199
260
|
except Exception as e:
|
|
200
|
-
# Log the critical failure
|
|
201
261
|
error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
|
|
202
|
-
log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
# Iterate and format
|
|
226
|
-
for item in structural_toc:
|
|
227
|
-
# Use level for indentation (e.g., Level 1 = 0 spaces, Level 2 = 4 spaces, Level 3 = 8 spaces)
|
|
228
|
-
indent = " " * 4 * (item['level'] - 1)
|
|
229
|
-
# Format the title and target page number
|
|
230
|
-
page_str = str(item['target_page']).rjust(page_width)
|
|
231
|
-
print(f"{indent}{item['title']} . . . page {page_str}")
|
|
232
|
-
|
|
233
|
-
print("-" * SEP_COUNT)
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
262
|
+
log(f"FATAL: Analysis failed: {str(e)}. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
263
|
+
|
|
264
|
+
# Always return a safe empty result on error
|
|
265
|
+
return {
|
|
266
|
+
"data": {
|
|
267
|
+
"external_links": [],
|
|
268
|
+
"internal_links": [],
|
|
269
|
+
"toc": [],
|
|
270
|
+
"validation": {}
|
|
271
|
+
},
|
|
272
|
+
"text": "\n".join(report_buffer + [
|
|
273
|
+
"\n--- Analysis failed ---",
|
|
274
|
+
f"Error: {str(e)}",
|
|
275
|
+
"No links or TOC extracted."
|
|
276
|
+
]),
|
|
277
|
+
"metadata": {
|
|
278
|
+
"pdf_name": Path(pdf_path).name,
|
|
279
|
+
"library_used": pdf_library,
|
|
280
|
+
"total_links": 0
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
def get_structural_toc(structural_toc: list) -> str:
|
|
237
285
|
"""
|
|
238
286
|
Formats the structural TOC data into a hierarchical string and optionally prints it.
|
|
239
287
|
|
|
@@ -253,8 +301,6 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
|
253
301
|
msg = "No structural TOC (bookmarks/outline) found."
|
|
254
302
|
lines.append(msg)
|
|
255
303
|
output = "\n".join(lines)
|
|
256
|
-
if print_bool:
|
|
257
|
-
print(output)
|
|
258
304
|
return output
|
|
259
305
|
|
|
260
306
|
# Determine max page width for consistent alignment
|
|
@@ -275,8 +321,32 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
|
275
321
|
|
|
276
322
|
# Final aggregation
|
|
277
323
|
str_structural_toc = "\n".join(lines)
|
|
278
|
-
|
|
279
|
-
if print_bool:
|
|
280
|
-
print(str_structural_toc)
|
|
281
324
|
|
|
282
325
|
return str_structural_toc
|
|
326
|
+
|
|
327
|
+
if __name__ == "__main__":
|
|
328
|
+
|
|
329
|
+
from pdflinkcheck.io import get_first_pdf_in_cwd
|
|
330
|
+
pdf_path = get_first_pdf_in_cwd()
|
|
331
|
+
# Run analysis first
|
|
332
|
+
|
|
333
|
+
if pymupdf_is_available():
|
|
334
|
+
pdf_library = "pymupdf"
|
|
335
|
+
else:
|
|
336
|
+
pdf_library = "pypdf"
|
|
337
|
+
report = run_report(
|
|
338
|
+
pdf_path=pdf_path,
|
|
339
|
+
max_links=0,
|
|
340
|
+
export_format="",
|
|
341
|
+
pdf_library=pdf_library,
|
|
342
|
+
print_bool=True # We handle printing in validation
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
if not report or not report.get("data"):
|
|
346
|
+
print("No data extracted — nothing to validate.")
|
|
347
|
+
sys.exit(1)
|
|
348
|
+
|
|
349
|
+
else:
|
|
350
|
+
print("Success!")
|
|
351
|
+
print(f"list(report['data']) = {list(report['data'])}")
|
|
352
|
+
|
pdflinkcheck/stdlib_server.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# src/pdflinkcheck/stdlib_server.py
|
|
2
4
|
import http.server
|
|
3
5
|
import socketserver
|
|
@@ -8,7 +10,7 @@ import os
|
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
import email # This replaces cgi for multipart parsing
|
|
10
12
|
|
|
11
|
-
from pdflinkcheck.report import
|
|
13
|
+
from pdflinkcheck.report import run_report_and_call_exports
|
|
12
14
|
|
|
13
15
|
PORT = 8000
|
|
14
16
|
|
|
@@ -17,8 +19,8 @@ HTML_FORM = """
|
|
|
17
19
|
<html>
|
|
18
20
|
<head><title>pdflinkcheck Stdlib Server</title></head>
|
|
19
21
|
<body style="font-family: sans-serif; max-width: 800px; margin: 40px auto;">
|
|
20
|
-
<h1>pdflinkcheck API (
|
|
21
|
-
<p>Upload a PDF for link/TOC analysis
|
|
22
|
+
<h1>pdflinkcheck API (pure stdlib)</h1>
|
|
23
|
+
<p>Upload a PDF for link/TOC analysis.</p>
|
|
22
24
|
<form action="/" method="post" enctype="multipart/form-data">
|
|
23
25
|
<p><input type="file" name="file" accept=".pdf" required></p>
|
|
24
26
|
<p>
|
|
@@ -33,9 +35,13 @@ HTML_FORM = """
|
|
|
33
35
|
<input type="number" name="max_links" value="0" min="0">
|
|
34
36
|
</p>
|
|
35
37
|
<p><button type="submit">Analyze PDF</button></p>
|
|
38
|
+
<!--p>
|
|
39
|
+
<button type="submit" name="action" value="analyze">Analyze PDF</button>
|
|
40
|
+
<button type="submit" name="action" value="validate">Validate PDF</button>
|
|
41
|
+
</p-->
|
|
36
42
|
</form>
|
|
37
43
|
<hr>
|
|
38
|
-
<p>Returns JSON
|
|
44
|
+
<p>Returns JSON.</p>
|
|
39
45
|
</body>
|
|
40
46
|
</html>
|
|
41
47
|
"""
|
|
@@ -130,18 +136,20 @@ class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
130
136
|
tmp_file.write(file_item)
|
|
131
137
|
tmp_path = tmp_file.name
|
|
132
138
|
|
|
133
|
-
result =
|
|
139
|
+
result = run_report_and_call_exports(
|
|
134
140
|
pdf_path=tmp_path,
|
|
135
141
|
max_links=max_links if max_links > 0 else 0,
|
|
136
142
|
export_format="",
|
|
137
143
|
pdf_library=pdf_library,
|
|
138
144
|
print_bool=False
|
|
139
145
|
)
|
|
146
|
+
metadata = result.get("metadata", {"total_links": 0, "pdf_name": file_filename})
|
|
147
|
+
total_links = metadata.get("total_links", 0)
|
|
140
148
|
|
|
141
149
|
response = {
|
|
142
150
|
"filename": file_filename,
|
|
143
151
|
"pdf_library_used": pdf_library,
|
|
144
|
-
"total_links":
|
|
152
|
+
"total_links": total_links,
|
|
145
153
|
"data": result["data"],
|
|
146
154
|
"text_report": result["text"]
|
|
147
155
|
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from pdflinkcheck.version_info import get_version_from_pyproject
|
|
3
|
+
|
|
4
|
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
|
5
|
+
|
|
6
|
+
UNVERSIONED_MANIFEST = PROJECT_ROOT / "msix" / "AppxManifest_unversioned.xml"
|
|
7
|
+
OUTPUT_MANIFEST = PROJECT_ROOT / "msix" / "AppxManifest.xml"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
PLACEHOLDER = "@@VERSION_PLACEHOLDER@@"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def generate_versioned_manifest(version):
|
|
14
|
+
|
|
15
|
+
# Pad to four parts: 1.1 -> 1.1.0.0, 1.1.92 -> 1.1.92.0
|
|
16
|
+
parts = version.split(".")
|
|
17
|
+
if len(parts) == 2:
|
|
18
|
+
parts += ["0", "0"]
|
|
19
|
+
elif len(parts) == 3:
|
|
20
|
+
parts.append("0")
|
|
21
|
+
elif len(parts) > 4:
|
|
22
|
+
raise ValueError(f"Version has too many parts: {version}")
|
|
23
|
+
|
|
24
|
+
msix_version = ".".join(parts[:4])
|
|
25
|
+
|
|
26
|
+
if not UNVERSIONED_MANIFEST.exists():
|
|
27
|
+
raise FileNotFoundError(f"Unversioned manifest not found: {UNVERSIONED_MANIFEST}")
|
|
28
|
+
|
|
29
|
+
text = UNVERSIONED_MANIFEST.read_text(encoding="utf-8")
|
|
30
|
+
|
|
31
|
+
placeholder_full = f'Version="{PLACEHOLDER}"'
|
|
32
|
+
|
|
33
|
+
if placeholder_full not in text:
|
|
34
|
+
raise ValueError(f"Placeholder {placeholder_full} not found in the unversioned manifest!")
|
|
35
|
+
|
|
36
|
+
updated_text = text.replace(placeholder_full, f'Version="{msix_version}"')
|
|
37
|
+
|
|
38
|
+
# Ensure the directory exists and write the new manifest
|
|
39
|
+
OUTPUT_MANIFEST.parent.mkdir(parents=True, exist_ok=True)
|
|
40
|
+
OUTPUT_MANIFEST.write_text(updated_text, encoding="utf-8")
|
|
41
|
+
|
|
42
|
+
print(f"Successfully generated AppxManifest.xml with version {msix_version}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
if __name__ == "__main__":
|
|
46
|
+
version = get_version_from_pyproject()
|
|
47
|
+
generate_versioned_manifest(version)
|