pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -18
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
- pdflinkcheck/cli.py +52 -48
- pdflinkcheck/data/LICENSE +18 -15
- pdflinkcheck/data/README.md +23 -25
- pdflinkcheck/data/pyproject.toml +17 -26
- pdflinkcheck/datacopy.py +16 -1
- pdflinkcheck/dev.py +2 -2
- pdflinkcheck/environment.py +14 -2
- pdflinkcheck/gui.py +346 -563
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +24 -6
- pdflinkcheck/report.py +598 -97
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +7 -21
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +2 -0
- pdflinkcheck/validate.py +104 -170
- pdflinkcheck/version_info.py +2 -2
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -217
- pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/report.py
CHANGED
|
@@ -1,37 +1,195 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# SPDX-License-Identifier: MIT
|
|
3
3
|
# pdflinkcheck/report.py
|
|
4
|
-
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
import sys
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Optional, Dict, Any
|
|
8
8
|
import pyhabitat
|
|
9
|
+
import copy
|
|
9
10
|
|
|
10
11
|
from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
|
|
11
|
-
from pdflinkcheck.environment import pymupdf_is_available
|
|
12
|
+
from pdflinkcheck.environment import pymupdf_is_available, pdfium_is_available
|
|
12
13
|
from pdflinkcheck.validate import run_validation
|
|
14
|
+
from pdflinkcheck.security import compute_risk
|
|
15
|
+
from pdflinkcheck.helpers import debug_head, PageRef
|
|
16
|
+
|
|
13
17
|
|
|
14
18
|
SEP_COUNT=28
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
+
# Define a safe "empty" validation state
|
|
20
|
+
EMPTY_VALIDATION = {
|
|
21
|
+
"summary-stats": {
|
|
22
|
+
"total_checked": 0,
|
|
23
|
+
"valid": 0,
|
|
24
|
+
"file-found": 0,
|
|
25
|
+
"broken-page": 0,
|
|
26
|
+
"broken-file": 0,
|
|
27
|
+
"no_destination_page_count": 0,
|
|
28
|
+
"unknown-web": 0,
|
|
29
|
+
"unknown-reasonableness": 0,
|
|
30
|
+
"unknown-link": 0
|
|
31
|
+
},
|
|
32
|
+
"issues": [],
|
|
33
|
+
"summary-txt": "Analysis failed: No validation performed.",
|
|
34
|
+
"total_pages": 0
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_report_and_call_exports(
|
|
39
|
+
pdf_path: str = None,
|
|
40
|
+
export_format: str = "JSON",
|
|
41
|
+
pdf_library: str = "auto",
|
|
42
|
+
print_bool:bool=True,
|
|
43
|
+
) -> Dict[str, Any]:
|
|
44
|
+
"""
|
|
45
|
+
Public entry point. Orchestrates extraction, validation, and file exports.
|
|
46
|
+
"""
|
|
47
|
+
# The meat and potatoes
|
|
48
|
+
report_results = run_report_extraction_and_assessment_and_recording(
|
|
19
49
|
pdf_path=str(pdf_path),
|
|
20
|
-
max_links=max_links,
|
|
21
50
|
pdf_library = pdf_library,
|
|
51
|
+
print_bool=print_bool,
|
|
22
52
|
)
|
|
53
|
+
# 2. Initialize file path tracking
|
|
54
|
+
output_path_json = None
|
|
55
|
+
output_path_txt = None
|
|
56
|
+
|
|
23
57
|
if export_format:
|
|
24
58
|
report_data_dict = report_results["data"]
|
|
25
59
|
report_buffer_str = report_results["text"]
|
|
26
60
|
if "JSON" in export_format.upper():
|
|
27
|
-
export_report_json(report_data_dict, pdf_path, pdf_library)
|
|
28
|
-
|
|
61
|
+
output_path_json = export_report_json(report_data_dict, pdf_path, pdf_library)
|
|
29
62
|
if "TXT" in export_format.upper():
|
|
30
|
-
export_report_txt(report_buffer_str, pdf_path, pdf_library)
|
|
63
|
+
output_path_txt = export_report_txt(report_buffer_str, pdf_path, pdf_library)
|
|
64
|
+
|
|
65
|
+
# 4. Inject the file info into the results dictionary
|
|
66
|
+
report_results["files"] = {
|
|
67
|
+
"export_path_json": output_path_json,
|
|
68
|
+
"export_path_txt": output_path_txt
|
|
69
|
+
}
|
|
31
70
|
return report_results
|
|
71
|
+
|
|
72
|
+
def _get_engine_data(pdf_path: str, pdf_library: str) -> tuple[Dict, str]:
|
|
73
|
+
"""Handles the dirty work of switching engines and importing them."""
|
|
74
|
+
# Resolve 'auto' mode
|
|
75
|
+
if pdf_library == "auto":
|
|
76
|
+
if pdfium_is_available(): pdf_library = "pdfium"
|
|
77
|
+
elif pymupdf_is_available(): pdf_library = "pymupdf"
|
|
78
|
+
else: pdf_library = "pypdf"
|
|
79
|
+
|
|
80
|
+
# Map engine names to their respective modules
|
|
81
|
+
engines = {
|
|
82
|
+
"pdfium": "pdflinkcheck.analysis_pdfium",
|
|
83
|
+
"pypdf": "pdflinkcheck.analysis_pypdf", # Assuming this exists
|
|
84
|
+
"pymupdf": "pdflinkcheck.analysis_pymupdf"
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
if pdf_library not in engines:
|
|
88
|
+
raise ValueError(f"Unsupported library: {pdf_library}")
|
|
89
|
+
|
|
90
|
+
# Dynamic import to keep __init__ lean
|
|
91
|
+
import importlib
|
|
92
|
+
module = importlib.import_module(engines[pdf_library])
|
|
93
|
+
data = module.analyze_pdf(pdf_path) or {"links": [], "toc": [], "file_ov": {}}
|
|
32
94
|
|
|
95
|
+
return data, pdf_library
|
|
96
|
+
|
|
97
|
+
# ----- Refactored version, failing ----
|
|
98
|
+
def run_report_extraction_and_assessment_and_recording_(
|
|
99
|
+
pdf_path: str = None,
|
|
100
|
+
pdf_library: str = "auto",
|
|
101
|
+
print_bool: bool = True
|
|
102
|
+
) -> Dict[str, Any]:
|
|
103
|
+
"""
|
|
104
|
+
Orchestrates extraction, categorization, and validation.
|
|
105
|
+
FULLY RECONCILED with legacy logic to ensure no features are lost.
|
|
106
|
+
"""
|
|
107
|
+
if pdf_path is None:
|
|
108
|
+
return _return_empty_report(["pdf_path is None"], pdf_library)
|
|
109
|
+
|
|
110
|
+
try:
|
|
111
|
+
# 1. Extraction
|
|
112
|
+
raw_data, resolved_library = _get_engine_data(pdf_path, pdf_library)
|
|
113
|
+
|
|
114
|
+
extracted_links = raw_data.get("links", [])
|
|
115
|
+
structural_toc = raw_data.get("toc", [])
|
|
116
|
+
file_ov = raw_data.get("file_ov", {})
|
|
117
|
+
total_pages = file_ov.get("total_pages", 0)
|
|
118
|
+
pdf_name = Path(pdf_path).name
|
|
119
|
+
|
|
120
|
+
# 2. Categorization (Restored exactly from original logic)
|
|
121
|
+
external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
|
|
122
|
+
goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
|
|
123
|
+
resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
|
|
124
|
+
other_links = [link for link in extracted_links if link['type'] not in
|
|
125
|
+
['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
|
|
126
|
+
|
|
127
|
+
all_internal = goto_links + resolved_action_links
|
|
33
128
|
|
|
34
|
-
|
|
129
|
+
# 3. Generate the Text Report (Using get_friendly_path as required)
|
|
130
|
+
# We pass the separate lists to maintain Section 2, 3, and 4 formatting
|
|
131
|
+
report_text_base = _generate_text_report(
|
|
132
|
+
pdf_path=pdf_path,
|
|
133
|
+
library=resolved_library,
|
|
134
|
+
ext_links=external_uri_links,
|
|
135
|
+
goto_links=goto_links,
|
|
136
|
+
resolve_links=resolved_action_links,
|
|
137
|
+
other_links=other_links,
|
|
138
|
+
toc=structural_toc
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# 4. Initial Result Assembly
|
|
142
|
+
report_results = {
|
|
143
|
+
"data": {
|
|
144
|
+
"external_links": external_uri_links,
|
|
145
|
+
"internal_links": goto_links + resolved_action_links,
|
|
146
|
+
"toc": structural_toc,
|
|
147
|
+
"validation": EMPTY_VALIDATION.copy()
|
|
148
|
+
},
|
|
149
|
+
"text": report_text_base,
|
|
150
|
+
"metadata": _build_metadata(
|
|
151
|
+
pdf_name=pdf_name,
|
|
152
|
+
total_pages=total_pages,
|
|
153
|
+
library_used=resolved_library,
|
|
154
|
+
toc_entry_count=len(structural_toc),
|
|
155
|
+
internal_goto_links_count=len(goto_links),
|
|
156
|
+
interal_resolve_action_links_count=len(resolved_action_links),
|
|
157
|
+
external_uri_links_count=len(external_uri_links),
|
|
158
|
+
other_links_count=len(other_links)
|
|
159
|
+
)
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
# 5. Validation & Risk Analysis
|
|
163
|
+
validation_results = run_validation(report_results=report_results, pdf_path=pdf_path)
|
|
164
|
+
report_results["data"]["validation"].update(validation_results)
|
|
165
|
+
report_results["data"]["risk"] = compute_risk(report_results)
|
|
166
|
+
|
|
167
|
+
# --- Inside run_report_extraction_and_assessment_and_recording ---
|
|
168
|
+
# 6. Finalizing Text Buffer
|
|
169
|
+
val_summary = validation_results.get("summary-txt", "")
|
|
170
|
+
raw_text = report_text_base + f"\n{val_summary}\n--- Analysis Complete ---"
|
|
171
|
+
cleaned_text = sanitize_glyphs_for_compatibility(raw_text)
|
|
172
|
+
# Apply sanitization before returning
|
|
173
|
+
report_results["text"] = cleaned_text
|
|
174
|
+
#report_results["text"] = raw_text
|
|
175
|
+
|
|
176
|
+
if print_bool:
|
|
177
|
+
# Matches your original logic: print the overview/validation summary to console
|
|
178
|
+
print(val_summary)
|
|
179
|
+
|
|
180
|
+
return report_results
|
|
181
|
+
|
|
182
|
+
except Exception as e:
|
|
183
|
+
error_logger.error(f"Critical failure: {e}", exc_info=True)
|
|
184
|
+
return _return_empty_report([f"FATAL: {str(e)}"], pdf_library)
|
|
185
|
+
|
|
186
|
+
# ----- Revert to stable version ----
|
|
187
|
+
def run_report_extraction_and_assessment_and_recording(
|
|
188
|
+
pdf_path: str = None,
|
|
189
|
+
pdf_library: str = "auto",
|
|
190
|
+
print_bool:bool=True,
|
|
191
|
+
concise_print: bool=False,
|
|
192
|
+
) -> Dict[str, Any]:
|
|
35
193
|
"""
|
|
36
194
|
Core high-level PDF link analysis logic.
|
|
37
195
|
|
|
@@ -39,10 +197,8 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
|
|
|
39
197
|
using pdflinkcheck analysis, and
|
|
40
198
|
prints a comprehensive, user-friendly report to the console.
|
|
41
199
|
|
|
42
|
-
Args:
|
|
200
|
+
Args:
|
|
43
201
|
pdf_path: The file system path (str) to the target PDF document.
|
|
44
|
-
max_links: Maximum number of links to display in each console
|
|
45
|
-
section. If <= 0, all links will be displayed.
|
|
46
202
|
|
|
47
203
|
Returns:
|
|
48
204
|
A dictionary containing the structured results of the analysis:
|
|
@@ -54,18 +210,58 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
|
|
|
54
210
|
"""
|
|
55
211
|
|
|
56
212
|
report_buffer = []
|
|
213
|
+
report_buffer_overview = []
|
|
57
214
|
|
|
58
215
|
# Helper to handle conditional printing and mandatory buffering
|
|
59
|
-
def log(msg: str):
|
|
60
|
-
if print_bool:
|
|
61
|
-
print(msg)
|
|
216
|
+
def log(msg: str, overview: bool = False):
|
|
62
217
|
report_buffer.append(msg)
|
|
218
|
+
if overview:
|
|
219
|
+
report_buffer_overview.append(msg)
|
|
220
|
+
|
|
63
221
|
|
|
64
|
-
|
|
65
|
-
|
|
222
|
+
|
|
223
|
+
# Expected: "pypdf" or "PyMuPDF" pr "rust"
|
|
224
|
+
allowed_libraries = ("pypdf", "pymupdf", "pdfium", "auto")
|
|
66
225
|
pdf_library = pdf_library.lower()
|
|
67
|
-
|
|
68
|
-
|
|
226
|
+
|
|
227
|
+
log("\n--- Starting Analysis ... ---\n")
|
|
228
|
+
if pdf_path is None:
|
|
229
|
+
log("pdf_path is None", overview=True)
|
|
230
|
+
log("Tip: Drop a PDF in the current folder or pass in a path arg.")
|
|
231
|
+
_return_empty_report(report_buffer)
|
|
232
|
+
else:
|
|
233
|
+
pdf_name = Path(pdf_path).name
|
|
234
|
+
|
|
235
|
+
# AUTO MODE
|
|
236
|
+
if pdf_library == "auto":
|
|
237
|
+
if pdfium_is_available():
|
|
238
|
+
pdf_library = "pdfium"
|
|
239
|
+
elif pymupdf_is_available():
|
|
240
|
+
pdf_library = "pymupdf"
|
|
241
|
+
else:
|
|
242
|
+
pdf_library = "pypdf"
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# PDFium ENGINE
|
|
247
|
+
if pdf_library in allowed_libraries and pdf_library == "pdfium":
|
|
248
|
+
from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pdfium
|
|
249
|
+
data = analyze_pdf_pdfium(pdf_path) or {"links": [], "toc": [], "file_ov": []}
|
|
250
|
+
extracted_links = data.get("links", [])
|
|
251
|
+
structural_toc = data.get("toc", [])
|
|
252
|
+
file_ov = data.get("file_ov", [])
|
|
253
|
+
|
|
254
|
+
# pypdf ENGINE
|
|
255
|
+
elif pdf_library in allowed_libraries and pdf_library == "pypdf":
|
|
256
|
+
from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pypdf
|
|
257
|
+
#extracted_links = extract_links(pdf_path)
|
|
258
|
+
#structural_toc = extract_toc(pdf_path)
|
|
259
|
+
data = analyze_pdf_pypdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
|
|
260
|
+
extracted_links = data.get("links", [])
|
|
261
|
+
structural_toc = data.get("toc", [])
|
|
262
|
+
file_ov = data.get("file_ov", [])
|
|
263
|
+
|
|
264
|
+
# PyMuPDF Engine
|
|
69
265
|
elif pdf_library in allowed_libraries and pdf_library == "pymupdf":
|
|
70
266
|
if not pymupdf_is_available():
|
|
71
267
|
print("PyMuPDF was explicitly requested as the PDF Engine")
|
|
@@ -76,43 +272,36 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
|
|
|
76
272
|
print("PyMuPDF is not expected to work on Termux. Use pypdf.")
|
|
77
273
|
print("\n")
|
|
78
274
|
#return
|
|
79
|
-
raise ImportError(
|
|
80
|
-
|
|
275
|
+
raise ImportError("The 'fitz' module (PyMuPDF) is required but not installed.")
|
|
276
|
+
|
|
277
|
+
from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pymupdf
|
|
278
|
+
data = analyze_pdf_pymupdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
|
|
279
|
+
extracted_links = data.get("links", [])
|
|
280
|
+
structural_toc = data.get("toc", [])
|
|
281
|
+
file_ov = data.get("file_ov", [])
|
|
282
|
+
|
|
283
|
+
total_pages = file_ov.get("total_pages",0)
|
|
81
284
|
|
|
82
|
-
log("\n--- Starting Analysis ... ---\n")
|
|
83
|
-
if pdf_path is None:
|
|
84
|
-
log("pdf_path is None")
|
|
85
|
-
log("Tip: Drop a PDF in the current folder or pass in a path arg.")
|
|
86
|
-
empty_report = {
|
|
87
|
-
"data": {
|
|
88
|
-
"external_links": [],
|
|
89
|
-
"internal_links": [],
|
|
90
|
-
"toc": []
|
|
91
|
-
},
|
|
92
|
-
"text": "\n".join(report_buffer),
|
|
93
|
-
"metadata": {
|
|
94
|
-
"pdf_name": Path(pdf_path).name,
|
|
95
|
-
"library_used": pdf_library,
|
|
96
|
-
"total_links": 0
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
285
|
|
|
100
|
-
return empty_report
|
|
101
286
|
|
|
102
287
|
try:
|
|
103
|
-
log(f"Target file: {get_friendly_path(pdf_path)}")
|
|
104
|
-
log(f"PDF Engine: {pdf_library}")
|
|
288
|
+
log(f"Target file: {get_friendly_path(pdf_path)}", overview=True)
|
|
289
|
+
log(f"PDF Engine: {pdf_library}", overview=True)
|
|
105
290
|
|
|
106
|
-
# 1. Extract all active links and TOC
|
|
107
|
-
extracted_links = extract_links(pdf_path)
|
|
108
|
-
structural_toc = extract_toc(pdf_path)
|
|
109
|
-
#structural_toc = extract_toc_pypdf(pdf_path)
|
|
110
291
|
toc_entry_count = len(structural_toc)
|
|
292
|
+
str_structural_toc = get_structural_toc(structural_toc)
|
|
293
|
+
|
|
294
|
+
# check the structure, that it matches
|
|
295
|
+
if False:
|
|
296
|
+
print(f"pdf_library={pdf_library}")
|
|
297
|
+
debug_head("TOC", structural_toc, n=3)
|
|
298
|
+
debug_head("Links", list(extracted_links), n=3)
|
|
111
299
|
|
|
300
|
+
# THIS HITS
|
|
112
301
|
|
|
113
302
|
if not extracted_links and not structural_toc:
|
|
114
|
-
log(f"\nNo hyperlinks or structural TOC found in {
|
|
115
|
-
log("(This is common for scanned/image-only PDFs.)")
|
|
303
|
+
log(f"\nNo hyperlinks or structural TOC found in {pdf_name}.", overview=True)
|
|
304
|
+
log("(This is common for scanned/image-only PDFs.)", overview=True)
|
|
116
305
|
|
|
117
306
|
empty_result = {
|
|
118
307
|
"data": {
|
|
@@ -122,101 +311,156 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
|
|
|
122
311
|
},
|
|
123
312
|
"text": "\n".join(report_buffer),
|
|
124
313
|
"metadata": {
|
|
125
|
-
"
|
|
314
|
+
"file_overview": {
|
|
315
|
+
"pdf_name": pdf_name,
|
|
316
|
+
"total_pages": total_pages,
|
|
317
|
+
},
|
|
126
318
|
"library_used": pdf_library,
|
|
127
|
-
"
|
|
319
|
+
"link_counts": {
|
|
320
|
+
"toc_entry_count": 0,
|
|
321
|
+
"internal_goto_links_count": 0,
|
|
322
|
+
"interal_resolve_action_links_count": 0,
|
|
323
|
+
"total_internal_links_count": 0,
|
|
324
|
+
"external_uri_links_count": 0,
|
|
325
|
+
"other_links_count": 0,
|
|
326
|
+
"total_links_count": 0
|
|
327
|
+
}
|
|
128
328
|
}
|
|
129
329
|
}
|
|
130
330
|
return empty_result
|
|
131
331
|
|
|
132
332
|
# 3. Separate the lists based on the 'type' key
|
|
133
|
-
|
|
333
|
+
external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
|
|
134
334
|
goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
|
|
135
335
|
resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
|
|
136
336
|
other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
|
|
137
337
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
338
|
+
interal_resolve_action_links_count = len(resolved_action_links)
|
|
339
|
+
internal_goto_links_count = len(goto_links)
|
|
340
|
+
total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
|
|
341
|
+
|
|
342
|
+
external_uri_links_count = len(external_uri_links)
|
|
343
|
+
other_links_count = len(other_links)
|
|
344
|
+
|
|
345
|
+
total_links_count = len(extracted_links)
|
|
141
346
|
|
|
142
|
-
str_structural_toc = get_structural_toc(structural_toc)
|
|
143
|
-
|
|
144
347
|
# --- ANALYSIS SUMMARY (Using your print logic) ---
|
|
145
|
-
log("\n" + "=" * SEP_COUNT)
|
|
146
|
-
log(f"--- Link Analysis Results for {
|
|
147
|
-
log(f"Total active links: {
|
|
148
|
-
log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
|
|
149
|
-
log("=" * SEP_COUNT)
|
|
348
|
+
log("\n" + "=" * SEP_COUNT, overview = True)
|
|
349
|
+
log(f"--- Link Analysis Results for {pdf_name} ---", overview = True)
|
|
350
|
+
log(f"Total active links: {total_links_count} (External: {external_uri_links_count}, Internal Jumps: {total_internal_links_count}, Other: {other_links_count})",overview = True)
|
|
351
|
+
log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}",overview = True)
|
|
352
|
+
log("=" * SEP_COUNT,overview = True)
|
|
150
353
|
|
|
151
354
|
# --- Section 1: TOC ---
|
|
152
355
|
log(str_structural_toc)
|
|
153
356
|
|
|
154
357
|
# --- Section 2: ACTIVE INTERNAL JUMPS ---
|
|
155
358
|
log("\n" + "=" * SEP_COUNT)
|
|
156
|
-
log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {
|
|
359
|
+
log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links_count} found")
|
|
157
360
|
log("=" * SEP_COUNT)
|
|
158
361
|
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
|
|
159
362
|
log("-" * SEP_COUNT)
|
|
160
363
|
|
|
161
364
|
all_internal = goto_links + resolved_action_links
|
|
162
|
-
|
|
163
|
-
|
|
365
|
+
#If links were found: all_internal is a list with dictionaries. It evaluates to True.
|
|
366
|
+
# If NO links were found: all_internal is an empty list []. It evaluates to False.
|
|
367
|
+
if all_internal:
|
|
368
|
+
for i, link in enumerate(all_internal, 1):
|
|
164
369
|
link_text = link.get('link_text', 'N/A')
|
|
165
|
-
log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
|
|
166
370
|
|
|
167
|
-
|
|
168
|
-
|
|
371
|
+
# Convert source and destination indices to human strings
|
|
372
|
+
src_page = PageRef.from_index(link['page']).human
|
|
373
|
+
dest_page = PageRef.from_index(link['destination_page']).human
|
|
374
|
+
|
|
375
|
+
log("{:<5} | {:<5} | {:<40} | {}".format(
|
|
376
|
+
i,
|
|
377
|
+
src_page,
|
|
378
|
+
link_text[:40],
|
|
379
|
+
dest_page
|
|
380
|
+
))
|
|
381
|
+
|
|
382
|
+
|
|
169
383
|
else:
|
|
170
384
|
log(" No internal GoTo or Resolved Action links found.")
|
|
171
385
|
log("-" * SEP_COUNT)
|
|
172
386
|
|
|
173
387
|
# --- Section 3: ACTIVE URI LINKS ---
|
|
174
388
|
log("\n" + "=" * SEP_COUNT)
|
|
175
|
-
log(f"## Active URI Links (External
|
|
389
|
+
log(f"## Active URI Links (External) - {len(external_uri_links)} found")
|
|
176
390
|
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
|
|
177
391
|
log("=" * SEP_COUNT)
|
|
178
392
|
|
|
179
|
-
if
|
|
180
|
-
for i, link in enumerate(
|
|
393
|
+
if external_uri_links:
|
|
394
|
+
for i, link in enumerate(external_uri_links, 1):
|
|
181
395
|
target = link.get('url') or link.get('remote_file') or link.get('target')
|
|
182
396
|
link_text = link.get('link_text', 'N/A')
|
|
183
397
|
log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
|
|
184
|
-
if limit is not None and len(uri_and_other) > limit:
|
|
185
|
-
log(f"... and {len(uri_and_other) - limit} more links (use --max-links 0 to show all).")
|
|
186
398
|
|
|
187
399
|
else:
|
|
188
|
-
log(" No external
|
|
400
|
+
log(" No external links found.")
|
|
189
401
|
log("-" * SEP_COUNT)
|
|
190
402
|
|
|
403
|
+
# --- Section 4: OTHER LINKS ---
|
|
404
|
+
log("\n" + "=" * SEP_COUNT)
|
|
405
|
+
log(f"## Other Links - {len(other_links)} found")
|
|
406
|
+
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target Action"))
|
|
407
|
+
log("=" * SEP_COUNT)
|
|
408
|
+
|
|
409
|
+
if other_links:
|
|
410
|
+
for i, link in enumerate(other_links, 1):
|
|
411
|
+
target = link.get('url') or link.get('remote_file') or link.get('target')
|
|
412
|
+
link_text = link.get('link_text', 'N/A')
|
|
413
|
+
log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
|
|
414
|
+
|
|
415
|
+
else:
|
|
416
|
+
log(" No 'Other' links found.")
|
|
417
|
+
log("-" * SEP_COUNT)
|
|
191
418
|
|
|
192
419
|
# Return the collected data for potential future JSON/other output
|
|
193
420
|
report_data_dict = {
|
|
194
|
-
"external_links":
|
|
421
|
+
"external_links": external_uri_links,
|
|
195
422
|
"internal_links": all_internal,
|
|
196
423
|
"toc": structural_toc,
|
|
197
|
-
"validation":
|
|
424
|
+
"validation": EMPTY_VALIDATION.copy()
|
|
198
425
|
}
|
|
199
426
|
|
|
200
427
|
intermediate_report_results = {
|
|
201
428
|
"data": report_data_dict, # The structured JSON-ready dict
|
|
202
429
|
"text": "",
|
|
203
430
|
"metadata": { # Helpful for the GUI/Logs
|
|
204
|
-
"
|
|
431
|
+
"file_overview": {
|
|
432
|
+
"pdf_name": pdf_name,
|
|
433
|
+
"total_pages": total_pages,
|
|
434
|
+
},
|
|
205
435
|
"library_used": pdf_library,
|
|
206
|
-
"
|
|
436
|
+
"link_counts": {
|
|
437
|
+
"toc_entry_count": toc_entry_count,
|
|
438
|
+
"internal_goto_links_count": internal_goto_links_count,
|
|
439
|
+
"interal_resolve_action_links_count": interal_resolve_action_links_count,
|
|
440
|
+
"total_internal_links_count": total_internal_links_count,
|
|
441
|
+
"external_uri_links_count": external_uri_links_count,
|
|
442
|
+
"other_links_count": other_links_count,
|
|
443
|
+
"total_links_count": total_links_count
|
|
444
|
+
}
|
|
207
445
|
}
|
|
208
446
|
}
|
|
209
447
|
|
|
210
448
|
log("\n--- Analysis Complete ---")
|
|
211
449
|
|
|
212
450
|
validation_results = run_validation(report_results=intermediate_report_results,
|
|
213
|
-
pdf_path=pdf_path
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
report_results
|
|
451
|
+
pdf_path=pdf_path)
|
|
452
|
+
log(validation_results.get("summary-txt",""), overview = True)
|
|
453
|
+
|
|
454
|
+
# CRITICAL: Re-assign to report_results so it's available for the final return
|
|
455
|
+
report_results = copy.deepcopy(intermediate_report_results)
|
|
217
456
|
|
|
457
|
+
# --- Offline Risk Analysis (Security Layer) ---
|
|
458
|
+
risk_results = compute_risk(report_results)
|
|
459
|
+
report_results["data"]["risk"] = risk_results
|
|
460
|
+
|
|
218
461
|
# Final aggregation of the buffer into one string, after the last call to log()
|
|
219
462
|
report_buffer_str = "\n".join(report_buffer)
|
|
463
|
+
report_buffer_overview_str = "\n".join(report_buffer_overview)
|
|
220
464
|
|
|
221
465
|
report_results["data"]["validation"].update(validation_results)
|
|
222
466
|
#report_results["text"].update(report_buffer_str) # The human-readable string
|
|
@@ -225,30 +469,43 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
|
|
|
225
469
|
# 5. Export Report
|
|
226
470
|
#if export_format:
|
|
227
471
|
# # Assuming export_to will hold the output format string (e.g., "JSON")
|
|
228
|
-
# export_report_data(report_data_dict,
|
|
472
|
+
# export_report_data(report_data_dict, pdf_name, export_format, pdf_library)
|
|
229
473
|
|
|
230
474
|
if print_bool:
|
|
231
|
-
|
|
475
|
+
if concise_print:
|
|
476
|
+
print(report_buffer_overview_str)
|
|
477
|
+
else:
|
|
478
|
+
print(report_buffer_str)
|
|
232
479
|
|
|
233
|
-
# Return a clean results object
|
|
234
480
|
return report_results
|
|
235
|
-
|
|
481
|
+
|
|
236
482
|
except Exception as e:
|
|
237
483
|
# Specific handling for common read failures
|
|
238
|
-
if "invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
|
|
484
|
+
if True:#"invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
|
|
239
485
|
log(f"\nWarning: Could not parse PDF structure — likely an image-only or malformed PDF.")
|
|
240
486
|
log("No hyperlinks or TOC can exist in this file.")
|
|
241
487
|
log("Result: No links found.")
|
|
242
488
|
return {
|
|
243
|
-
"data": {"external_links": [], "internal_links": [], "toc": []},
|
|
489
|
+
"data": {"external_links": [], "internal_links": [], "toc": [], "validation": EMPTY_VALIDATION.copy()},
|
|
244
490
|
"text": "\n".join(report_buffer + [
|
|
245
491
|
"\nWarning: PDF appears to be image-only or malformed.",
|
|
246
492
|
"No hyperlinks or structural TOC found."
|
|
247
493
|
]),
|
|
248
494
|
"metadata": {
|
|
249
|
-
"
|
|
495
|
+
"file_overview": {
|
|
496
|
+
"pdf_name": pdf_name,
|
|
497
|
+
"total_pages": total_pages,
|
|
498
|
+
},
|
|
250
499
|
"library_used": pdf_library,
|
|
251
|
-
"
|
|
500
|
+
"link_counts": {
|
|
501
|
+
"toc_entry_count": 0,
|
|
502
|
+
"internal_goto_links_count": 0,
|
|
503
|
+
"interal_resolve_action_links_count": 0,
|
|
504
|
+
"total_internal_links_count": 0,
|
|
505
|
+
"external_uri_links_count": 0,
|
|
506
|
+
"other_links_count": 0,
|
|
507
|
+
"total_links_count": 0
|
|
508
|
+
}
|
|
252
509
|
}
|
|
253
510
|
}
|
|
254
511
|
|
|
@@ -267,7 +524,7 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
|
|
|
267
524
|
"external_links": [],
|
|
268
525
|
"internal_links": [],
|
|
269
526
|
"toc": [],
|
|
270
|
-
"validation":
|
|
527
|
+
"validation": EMPTY_VALIDATION.copy()
|
|
271
528
|
},
|
|
272
529
|
"text": "\n".join(report_buffer + [
|
|
273
530
|
"\n--- Analysis failed ---",
|
|
@@ -275,11 +532,229 @@ def run_report_and_validtion(pdf_path: str = None, max_links: int = 0, pdf_libr
|
|
|
275
532
|
"No links or TOC extracted."
|
|
276
533
|
]),
|
|
277
534
|
"metadata": {
|
|
278
|
-
"
|
|
535
|
+
"file_overview": {
|
|
536
|
+
"pdf_name": pdf_name,
|
|
537
|
+
"total_pages": total_pages,
|
|
538
|
+
},
|
|
279
539
|
"library_used": pdf_library,
|
|
280
|
-
"
|
|
540
|
+
"link_counts": {
|
|
541
|
+
"toc_entry_count": 0,
|
|
542
|
+
"internal_goto_links_count": 0,
|
|
543
|
+
"interal_resolve_action_links_count": 0,
|
|
544
|
+
"total_internal_links_count": 0,
|
|
545
|
+
"external_uri_links_count": 0,
|
|
546
|
+
"other_links_count": 0,
|
|
547
|
+
"total_links_count": 0
|
|
548
|
+
}
|
|
281
549
|
}
|
|
282
550
|
}
|
|
551
|
+
|
|
552
|
+
def _return_empty_report(report_buffer: str, pdf_library: str)-> dict:
|
|
553
|
+
|
|
554
|
+
empty_report = {
|
|
555
|
+
"data": {
|
|
556
|
+
"external_links": [],
|
|
557
|
+
"internal_links": [],
|
|
558
|
+
"toc": [],
|
|
559
|
+
"validation": EMPTY_VALIDATION.copy()
|
|
560
|
+
},
|
|
561
|
+
"text": "\n".join(report_buffer),
|
|
562
|
+
"metadata": {
|
|
563
|
+
"file_overview": {
|
|
564
|
+
"pdf_name": "null",
|
|
565
|
+
"total_pages": 0,
|
|
566
|
+
},
|
|
567
|
+
"library_used": pdf_library,
|
|
568
|
+
"link_counts": {
|
|
569
|
+
"toc_entry_count": 0,
|
|
570
|
+
"internal_goto_links_count": 0,
|
|
571
|
+
"interal_resolve_action_links_count": 0,
|
|
572
|
+
"total_internal_links_count": 0,
|
|
573
|
+
"external_uri_links_count": 0,
|
|
574
|
+
"other_links_count": 0,
|
|
575
|
+
"total_links_count": 0
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
return empty_report
|
|
581
|
+
|
|
582
|
+
def _generate_text_report(
|
|
583
|
+
pdf_path: str,
|
|
584
|
+
library: str,
|
|
585
|
+
ext_links: list,
|
|
586
|
+
goto_links: list,
|
|
587
|
+
resolve_links: list,
|
|
588
|
+
other_links: list,
|
|
589
|
+
toc: list
|
|
590
|
+
) -> str:
|
|
591
|
+
"""Pure helper to build the human-readable string for console/TXT export."""
|
|
592
|
+
lines = []
|
|
593
|
+
lines.append("\n--- Starting Analysis ... ---\n")
|
|
594
|
+
lines.append(f"Target file: {get_friendly_path(pdf_path)}")
|
|
595
|
+
lines.append(f"PDF Engine: {library}")
|
|
596
|
+
|
|
597
|
+
total_int = len(goto_links) + len(resolve_links)
|
|
598
|
+
total_links = len(ext_links) + total_int + len(other_links)
|
|
599
|
+
|
|
600
|
+
# 1. Summary Header
|
|
601
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
602
|
+
lines.append(f"--- Link Analysis Results for {get_friendly_path(pdf_path)} ---")
|
|
603
|
+
lines.append(f"Total active links: {total_links} (External: {len(ext_links)}, Internal Jumps: {total_int}, Other: {len(other_links)})")
|
|
604
|
+
lines.append(f"Total **structural TOC entries (bookmarks)** found: {len(toc)}")
|
|
605
|
+
lines.append("=" * SEP_COUNT)
|
|
606
|
+
|
|
607
|
+
# 2. Table of Contents
|
|
608
|
+
lines.append(get_structural_toc(toc))
|
|
609
|
+
|
|
610
|
+
# 3. Internal Jumps
|
|
611
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
612
|
+
lines.append(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_int} found")
|
|
613
|
+
lines.append("=" * SEP_COUNT)
|
|
614
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
|
|
615
|
+
lines.append("-" * SEP_COUNT)
|
|
616
|
+
|
|
617
|
+
all_internal = goto_links + resolve_links
|
|
618
|
+
if all_internal:
|
|
619
|
+
for i, link in enumerate(all_internal, 1):
|
|
620
|
+
src = PageRef.from_index(link.get('page', 0)).human
|
|
621
|
+
dest = PageRef.from_index(link.get('destination_page', 0)).human
|
|
622
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(
|
|
623
|
+
i, src, link.get('link_text', 'N/A')[:40], dest
|
|
624
|
+
))
|
|
625
|
+
else:
|
|
626
|
+
lines.append(" No internal GoTo or Resolved Action links found.")
|
|
627
|
+
lines.append("-" * SEP_COUNT)
|
|
628
|
+
|
|
629
|
+
# 4. External URI Links
|
|
630
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
631
|
+
lines.append(f"## Active URI Links (External) - {len(ext_links)} found")
|
|
632
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
|
|
633
|
+
lines.append("=" * SEP_COUNT)
|
|
634
|
+
|
|
635
|
+
if ext_links:
|
|
636
|
+
for i, link in enumerate(ext_links, 1):
|
|
637
|
+
target = link.get('url') or link.get('remote_file') or link.get('target', 'N/A')
|
|
638
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(
|
|
639
|
+
i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target
|
|
640
|
+
))
|
|
641
|
+
else:
|
|
642
|
+
lines.append(" No external links found.")
|
|
643
|
+
lines.append("-" * SEP_COUNT)
|
|
644
|
+
|
|
645
|
+
# 5. Other Links
|
|
646
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
647
|
+
lines.append(f"## Other Links - {len(other_links)} found")
|
|
648
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target Action"))
|
|
649
|
+
lines.append("=" * SEP_COUNT)
|
|
650
|
+
|
|
651
|
+
if other_links:
|
|
652
|
+
for i, link in enumerate(other_links, 1):
|
|
653
|
+
target = link.get('url') or link.get('remote_file') or link.get('target', 'N/A')
|
|
654
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(
|
|
655
|
+
i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target
|
|
656
|
+
))
|
|
657
|
+
else:
|
|
658
|
+
lines.append(" No 'Other' links found.")
|
|
659
|
+
lines.append("-" * SEP_COUNT)
|
|
660
|
+
|
|
661
|
+
return "\n".join(lines)
|
|
662
|
+
|
|
663
|
+
def _generate_text_report__(pdf_path, library, ext_links, int_links, other_links, toc) -> str:
    """Build the legacy plain-text link report.

    Sections: analysis banner, summary header, structural TOC, internal
    jumps (GoTo & resolved actions), and external URI links.
    """
    row_fmt = "{:<5} | {:<5} | {:<40} | {}".format
    bar = "=" * SEP_COUNT

    out = [
        "\n--- Starting Analysis ... ---\n",
        f"Target file: {get_friendly_path(pdf_path)}",
        f"PDF Engine: {library}",
    ]

    # 1. Summary header
    out.append("\n" + bar)
    out.append(f"--- Link Analysis Results for {get_friendly_path(pdf_path)} ---")
    out.append(f"Total active links: {len(ext_links) + len(int_links) + len(other_links)}")
    out.append(f"Total bookmarks: {len(toc)}")
    out.append(bar)

    # 2. Table of contents
    out.append(get_structural_toc(toc))

    # 3. Internal jumps (GoTo & resolved actions)
    out.append("\n" + bar)
    out.append(f"## Active Internal Jumps - {len(int_links)} found")
    out.append(bar)
    out.append(row_fmt("Idx", "Page", "Anchor Text", "Jumps To"))

    for idx, link in enumerate(int_links, 1):
        source = PageRef.from_index(link.get('page', 0)).human
        destination = PageRef.from_index(link.get('destination_page', 0)).human
        out.append(row_fmt(idx, source, link.get('link_text', 'N/A')[:40], destination))

    # 4. External URI links
    out.append("\n" + bar)
    out.append(f"## External URI Links - {len(ext_links)} found")
    out.append(bar)
    for idx, link in enumerate(ext_links, 1):
        uri = link.get('url') or link.get('target', 'N/A')
        out.append(row_fmt(idx, link.get('page', 0), link.get('link_text', 'N/A')[:40], uri))

    return "\n".join(out)
|
|
699
|
+
|
|
700
|
+
def _build_metadata(
|
|
701
|
+
pdf_name: str,
|
|
702
|
+
total_pages: int,
|
|
703
|
+
library_used: str,
|
|
704
|
+
toc_entry_count: int,
|
|
705
|
+
internal_goto_links_count: int,
|
|
706
|
+
interal_resolve_action_links_count: int,
|
|
707
|
+
external_uri_links_count: int,
|
|
708
|
+
other_links_count: int
|
|
709
|
+
) -> Dict[str, Any]:
|
|
710
|
+
"""
|
|
711
|
+
Standardizes the metadata dictionary using the EXACT legacy variable names.
|
|
712
|
+
"""
|
|
713
|
+
total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
|
|
714
|
+
total_links_count = total_internal_links_count + external_uri_links_count + other_links_count
|
|
715
|
+
|
|
716
|
+
return {
|
|
717
|
+
"file_overview": {
|
|
718
|
+
"pdf_name": pdf_name,
|
|
719
|
+
"total_pages": total_pages,
|
|
720
|
+
},
|
|
721
|
+
"library_used": library_used,
|
|
722
|
+
"link_counts": {
|
|
723
|
+
"toc_entry_count": toc_entry_count,
|
|
724
|
+
"internal_goto_links_count": internal_goto_links_count,
|
|
725
|
+
"interal_resolve_action_links_count": interal_resolve_action_links_count,
|
|
726
|
+
"total_internal_links_count": total_internal_links_count,
|
|
727
|
+
"external_uri_links_count": external_uri_links_count,
|
|
728
|
+
"other_links_count": other_links_count,
|
|
729
|
+
"total_links_count": total_links_count
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
def _build_metadata_(
|
|
734
|
+
pdf_name: str,
|
|
735
|
+
total_pages: int,
|
|
736
|
+
library_used: str,
|
|
737
|
+
toc_count: int,
|
|
738
|
+
goto_count: int,
|
|
739
|
+
resolve_count: int,
|
|
740
|
+
ext_count: int,
|
|
741
|
+
other_count: int
|
|
742
|
+
) -> Dict[str, Any]:
|
|
743
|
+
"""Standardizes the metadata dictionary for all report types."""
|
|
744
|
+
return {
|
|
745
|
+
"file_overview": {
|
|
746
|
+
"pdf_name": pdf_name,
|
|
747
|
+
"total_pages": total_pages,
|
|
748
|
+
},
|
|
749
|
+
"library_used": library_used,
|
|
750
|
+
"link_counts": {
|
|
751
|
+
"toc_entry_count": toc_count,
|
|
752
|
+
"internal_links_count": goto_count,
|
|
753
|
+
"external_uri_links_count": ext_count,
|
|
754
|
+
"other_links_count": other_count,
|
|
755
|
+
"total_links_count": goto_count + ext_count + other_count
|
|
756
|
+
}
|
|
757
|
+
}
|
|
283
758
|
|
|
284
759
|
def get_structural_toc(structural_toc: list) -> str:
|
|
285
760
|
"""
|
|
@@ -287,7 +762,6 @@ def get_structural_toc(structural_toc: list) -> str:
|
|
|
287
762
|
|
|
288
763
|
Args:
|
|
289
764
|
structural_toc: A list of TOC dictionaries.
|
|
290
|
-
print_bool: Whether to print the output to the console.
|
|
291
765
|
|
|
292
766
|
Returns:
|
|
293
767
|
A formatted string of the structural TOC.
|
|
@@ -313,8 +787,16 @@ def get_structural_toc(structural_toc: list) -> str:
|
|
|
313
787
|
indent = " " * 4 * (item['level'] - 1)
|
|
314
788
|
# Handle cases where page might be N/A or None
|
|
315
789
|
target_page = item.get('target_page', "N/A")
|
|
316
|
-
page_str = str(target_page).rjust(page_width)
|
|
317
790
|
|
|
791
|
+
# Determine the human-facing string
|
|
792
|
+
if isinstance(target_page, int):
|
|
793
|
+
# Convert 0-index back to human (1-index) for the report
|
|
794
|
+
display_val = PageRef.from_index(target_page).human
|
|
795
|
+
else:
|
|
796
|
+
display_val = str(target_page)
|
|
797
|
+
|
|
798
|
+
page_str = str(display_val).rjust(page_width)
|
|
799
|
+
|
|
318
800
|
lines.append(f"{indent}{item['title']} . . . page {page_str}")
|
|
319
801
|
|
|
320
802
|
lines.append("-" * SEP_COUNT)
|
|
@@ -324,6 +806,26 @@ def get_structural_toc(structural_toc: list) -> str:
|
|
|
324
806
|
|
|
325
807
|
return str_structural_toc
|
|
326
808
|
|
|
809
|
+
import unicodedata

def sanitize_glyphs_for_compatibility(text: str) -> str:
    """Replaces emojis with ASCII tags to prevent rendering bugs in gedit/WSL2.

    Mapped glyphs become bracketed tags (e.g. '✅' -> '[PASS]'); any other
    non-ASCII character is dropped by the NFKD-normalize + ascii-encode
    round-trip, and a double space left behind by a stripped glyph is
    collapsed to a single space.
    """
    glyph_mapping = {
        # Variation-selector (emoji-presentation) forms first, so the bare
        # forms below never leave a stray U+FE0F behind in mapped text.
        '⚠️': '[WARN]',
        'ℹ️': '[INFO]',
        '✅': '[PASS]',
        '🌐': '[WEB]',
        '❌': '[FAIL]',
        # Bare (text-presentation) variants: previously '⚠' was silently
        # stripped and 'ℹ' NFKD-decomposed to a plain 'i' instead of a tag.
        '⚠': '[WARN]',
        'ℹ': '[INFO]',
    }
    for glyph, replacement in glyph_mapping.items():
        text = text.replace(glyph, replacement)

    # Standard library only - no unidecode dependency.
    normalized = unicodedata.normalize('NFKD', text)
    # Bytes after encode('ascii', ...) are pure ASCII, so decode as 'ascii'
    # (the original decoded as 'utf-8', which happens to work but is misleading).
    ascii_text = normalized.encode('ascii', 'ignore').decode('ascii')
    # Collapse the double space left where an unmapped glyph was removed.
    # NOTE(review): the final replace in the original was garbled in transit;
    # double-space collapse is the reading consistent with glyph stripping — confirm.
    return ascii_text.replace('  ', ' ')
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
|
|
327
829
|
if __name__ == "__main__":
|
|
328
830
|
|
|
329
831
|
from pdflinkcheck.io import get_first_pdf_in_cwd
|
|
@@ -334,9 +836,8 @@ if __name__ == "__main__":
|
|
|
334
836
|
pdf_library = "pymupdf"
|
|
335
837
|
else:
|
|
336
838
|
pdf_library = "pypdf"
|
|
337
|
-
report =
|
|
839
|
+
report = run_report_and_call_exports(
|
|
338
840
|
pdf_path=pdf_path,
|
|
339
|
-
max_links=0,
|
|
340
841
|
export_format="",
|
|
341
842
|
pdf_library=pdf_library,
|
|
342
843
|
print_bool=True # We handle printing in validation
|