pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -21
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +109 -145
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +67 -37
- pdflinkcheck/cli.py +111 -116
- pdflinkcheck/data/I Have Questions.md +51 -0
- pdflinkcheck/data/LICENSE +20 -654
- pdflinkcheck/data/README.md +65 -67
- pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
- pdflinkcheck/data/icons/Logo-150x150.png +0 -0
- pdflinkcheck/data/icons/Logo-300x300.png +0 -0
- pdflinkcheck/data/icons/Logo-71x71.png +0 -0
- pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
- pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
- pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
- pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
- pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
- pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
- pdflinkcheck/data/pyproject.toml +25 -37
- pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
- pdflinkcheck/datacopy.py +18 -1
- pdflinkcheck/dev.py +12 -25
- pdflinkcheck/environment.py +76 -0
- pdflinkcheck/gui.py +366 -457
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +27 -23
- pdflinkcheck/report.py +692 -121
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +14 -20
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +49 -0
- pdflinkcheck/validate.py +129 -218
- pdflinkcheck/version_info.py +6 -3
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +84 -81
- pdflinkcheck-1.2.29.dist-info/RECORD +183 -0
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-MIT +9 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -218
- pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
- pdflinkcheck-1.1.73.dist-info/WHEEL +0 -4
- /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-AGPL3 +0 -0
pdflinkcheck/report.py
CHANGED
|
@@ -1,16 +1,195 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
1
3
|
# pdflinkcheck/report.py
|
|
2
|
-
|
|
4
|
+
from __future__ import annotations
|
|
3
5
|
import sys
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
from typing import Optional, Dict, Any
|
|
6
8
|
import pyhabitat
|
|
9
|
+
import copy
|
|
7
10
|
|
|
8
11
|
from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
|
|
12
|
+
from pdflinkcheck.environment import pymupdf_is_available, pdfium_is_available
|
|
13
|
+
from pdflinkcheck.validate import run_validation
|
|
14
|
+
from pdflinkcheck.security import compute_risk
|
|
15
|
+
from pdflinkcheck.helpers import debug_head, PageRef
|
|
9
16
|
|
|
10
17
|
|
|
11
18
|
SEP_COUNT=28
|
|
19
|
+
# Placeholder validation payload: every counter is zero and no issues are
# listed. Used as the "validation" entry of a report before real validation
# results are merged in.
EMPTY_VALIDATION = {
    # One zeroed counter per outcome bucket; key order is preserved.
    "summary-stats": {
        counter_name: 0
        for counter_name in (
            "total_checked",
            "valid",
            "file-found",
            "broken-page",
            "broken-file",
            "no_destination_page_count",
            "unknown-web",
            "unknown-reasonableness",
            "unknown-link",
        )
    },
    "issues": [],
    "summary-txt": "Analysis failed: No validation performed.",
    "total_pages": 0,
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_report_and_call_exports(
    pdf_path: str = None,
    export_format: str = "JSON",
    pdf_library: str = "auto",
    print_bool: bool = True,
) -> Dict[str, Any]:
    """
    Public entry point: build the report, then write the requested export files.

    Args:
        pdf_path: Path to the PDF to analyze.
        export_format: Export selector. Matching is a case-insensitive
            substring check, so a value containing both "JSON" and "TXT"
            triggers both exports. A falsy value skips exporting.
        pdf_library: Engine name forwarded to the analysis and export calls.
        print_bool: Forwarded to the analysis step.

    Returns:
        The report dictionary produced by the analysis step, with an added
        "files" entry holding "export_path_json" and "export_path_txt"
        (each None when that export was not requested).
    """
    # NOTE(review): str(pdf_path) turns None into the literal string "None",
    # so the analysis step never sees a real None -- confirm this is intended.
    report_results = run_report_extraction_and_assessment_and_recording(
        pdf_path=str(pdf_path),
        pdf_library=pdf_library,
        print_bool=print_bool,
    )

    # Both export slots start empty and are filled only on request.
    written_files = {
        "export_path_json": None,
        "export_path_txt": None,
    }

    if export_format:
        # Both payloads are read up front, before any format check.
        structured_payload = report_results["data"]
        text_payload = report_results["text"]
        requested_formats = export_format.upper()

        if "JSON" in requested_formats:
            written_files["export_path_json"] = export_report_json(
                structured_payload, pdf_path, pdf_library
            )
        if "TXT" in requested_formats:
            written_files["export_path_txt"] = export_report_txt(
                text_payload, pdf_path, pdf_library
            )

    # Attach the export locations to the report handed back to the caller.
    report_results["files"] = written_files
    return report_results
|
|
71
|
+
|
|
72
|
+
def _get_engine_data(pdf_path: str, pdf_library: str) -> tuple[Dict, str]:
|
|
73
|
+
"""Handles the dirty work of switching engines and importing them."""
|
|
74
|
+
# Resolve 'auto' mode
|
|
75
|
+
if pdf_library == "auto":
|
|
76
|
+
if pdfium_is_available(): pdf_library = "pdfium"
|
|
77
|
+
elif pymupdf_is_available(): pdf_library = "pymupdf"
|
|
78
|
+
else: pdf_library = "pypdf"
|
|
79
|
+
|
|
80
|
+
# Map engine names to their respective modules
|
|
81
|
+
engines = {
|
|
82
|
+
"pdfium": "pdflinkcheck.analysis_pdfium",
|
|
83
|
+
"pypdf": "pdflinkcheck.analysis_pypdf", # Assuming this exists
|
|
84
|
+
"pymupdf": "pdflinkcheck.analysis_pymupdf"
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
if pdf_library not in engines:
|
|
88
|
+
raise ValueError(f"Unsupported library: {pdf_library}")
|
|
89
|
+
|
|
90
|
+
# Dynamic import to keep __init__ lean
|
|
91
|
+
import importlib
|
|
92
|
+
module = importlib.import_module(engines[pdf_library])
|
|
93
|
+
data = module.analyze_pdf(pdf_path) or {"links": [], "toc": [], "file_ov": {}}
|
|
94
|
+
|
|
95
|
+
return data, pdf_library
|
|
96
|
+
|
|
97
|
+
# ----- Refactored version, failing ----
|
|
98
|
+
def run_report_extraction_and_assessment_and_recording_(
    pdf_path: str = None,
    pdf_library: str = "auto",
    print_bool: bool = True
) -> Dict[str, Any]:
    """
    Refactored orchestration of extraction, categorization, and validation.

    The file marks this variant as "Refactored version, failing"; the public
    entry point calls the function of the same name without the trailing
    underscore instead.

    Args:
        pdf_path: Path to the PDF. ``None`` yields an empty report.
        pdf_library: Engine name passed to ``_get_engine_data``.
        print_bool: When true, the validation summary text is printed.

    Returns:
        A dict with "data" (external links, internal links, TOC, validation,
        risk), "text" (the sanitized text report), and "metadata". On any
        exception the error is logged and an empty report is returned.
    """
    if pdf_path is None:
        return _return_empty_report(["pdf_path is None"], pdf_library)

    try:
        # 1. Extraction
        raw_data, resolved_library = _get_engine_data(pdf_path, pdf_library)

        extracted_links = raw_data.get("links", [])
        structural_toc = raw_data.get("toc", [])
        file_ov = raw_data.get("file_ov", {})
        total_pages = file_ov.get("total_pages", 0)
        pdf_name = Path(pdf_path).name

        # 2. Categorization by the link's 'type' field
        external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
        goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
        resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
        other_links = [link for link in extracted_links if link['type'] not in
                       ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]

        all_internal = goto_links + resolved_action_links

        # 3. Text report; the categories are passed separately so each
        #    report section can be rendered on its own.
        report_text_base = _generate_text_report(
            pdf_path=pdf_path,
            library=resolved_library,
            ext_links=external_uri_links,
            goto_links=goto_links,
            resolve_links=resolved_action_links,
            other_links=other_links,
            toc=structural_toc
        )

        # 4. Initial result assembly
        report_results = {
            "data": {
                "external_links": external_uri_links,
                "internal_links": all_internal,
                "toc": structural_toc,
                # Deep copy: EMPTY_VALIDATION holds a nested dict and a list,
                # which a shallow .copy() would share with the module-level
                # constant across every report.
                "validation": copy.deepcopy(EMPTY_VALIDATION)
            },
            "text": report_text_base,
            "metadata": _build_metadata(
                pdf_name=pdf_name,
                total_pages=total_pages,
                library_used=resolved_library,
                toc_entry_count=len(structural_toc),
                internal_goto_links_count=len(goto_links),
                # Keyword spelling ("interal") is kept as-is; it must match
                # the parameter name of _build_metadata, defined elsewhere.
                interal_resolve_action_links_count=len(resolved_action_links),
                external_uri_links_count=len(external_uri_links),
                other_links_count=len(other_links)
            )
        }

        # 5. Validation and risk analysis
        validation_results = run_validation(report_results=report_results, pdf_path=pdf_path)
        report_results["data"]["validation"].update(validation_results)
        report_results["data"]["risk"] = compute_risk(report_results)

        # 6. Final text buffer: append the validation summary, then sanitize
        #    glyphs before storing the text.
        val_summary = validation_results.get("summary-txt", "")
        raw_text = report_text_base + f"\n{val_summary}\n--- Analysis Complete ---"
        report_results["text"] = sanitize_glyphs_for_compatibility(raw_text)

        if print_bool:
            # Only the validation summary is echoed to the console.
            print(val_summary)

        return report_results

    except Exception as e:
        # Top-level boundary: log with traceback and degrade to an empty report.
        error_logger.error(f"Critical failure: {e}", exc_info=True)
        return _return_empty_report([f"FATAL: {str(e)}"], pdf_library)
|
|
185
|
+
|
|
186
|
+
# ----- Revert to stable version ----
|
|
187
|
+
def run_report_extraction_and_assessment_and_recording(
|
|
188
|
+
pdf_path: str = None,
|
|
189
|
+
pdf_library: str = "auto",
|
|
190
|
+
print_bool:bool=True,
|
|
191
|
+
concise_print: bool=False,
|
|
192
|
+
) -> Dict[str, Any]:
|
|
14
193
|
"""
|
|
15
194
|
Core high-level PDF link analysis logic.
|
|
16
195
|
|
|
@@ -18,10 +197,8 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
18
197
|
using pdflinkcheck analysis, and
|
|
19
198
|
prints a comprehensive, user-friendly report to the console.
|
|
20
199
|
|
|
21
|
-
Args:
|
|
200
|
+
Args:
|
|
22
201
|
pdf_path: The file system path (str) to the target PDF document.
|
|
23
|
-
max_links: Maximum number of links to display in each console
|
|
24
|
-
section. If <= 0, all links will be displayed.
|
|
25
202
|
|
|
26
203
|
Returns:
|
|
27
204
|
A dictionary containing the structured results of the analysis:
|
|
@@ -33,213 +210,558 @@ def run_report(pdf_path: str = None, max_links: int = 0, export_format: str = "
|
|
|
33
210
|
"""
|
|
34
211
|
|
|
35
212
|
report_buffer = []
|
|
213
|
+
report_buffer_overview = []
|
|
36
214
|
|
|
37
215
|
# Helper to handle conditional printing and mandatory buffering
|
|
38
|
-
def log(msg: str):
|
|
39
|
-
if print_bool: # this should not be here
|
|
40
|
-
print(msg) # this should not be here. esure elsewhere then remove
|
|
216
|
+
def log(msg: str, overview: bool = False):
|
|
41
217
|
report_buffer.append(msg)
|
|
218
|
+
if overview:
|
|
219
|
+
report_buffer_overview.append(msg)
|
|
220
|
+
|
|
221
|
+
|
|
42
222
|
|
|
43
|
-
# Expected: "pypdf" or "PyMuPDF"
|
|
44
|
-
allowed_libraries = ("pypdf","pymupdf")
|
|
223
|
+
# Expected: "pypdf", "pymupdf", "pdfium", or "auto" (case-insensitive)
|
|
224
|
+
allowed_libraries = ("pypdf", "pymupdf", "pdfium", "auto")
|
|
45
225
|
pdf_library = pdf_library.lower()
|
|
46
|
-
|
|
47
|
-
|
|
226
|
+
|
|
227
|
+
log("\n--- Starting Analysis ... ---\n")
|
|
228
|
+
if pdf_path is None:
|
|
229
|
+
log("pdf_path is None", overview=True)
|
|
230
|
+
log("Tip: Drop a PDF in the current folder or pass in a path arg.")
|
|
231
|
+
_return_empty_report(report_buffer)
|
|
232
|
+
else:
|
|
233
|
+
pdf_name = Path(pdf_path).name
|
|
234
|
+
|
|
235
|
+
# AUTO MODE
|
|
236
|
+
if pdf_library == "auto":
|
|
237
|
+
if pdfium_is_available():
|
|
238
|
+
pdf_library = "pdfium"
|
|
239
|
+
elif pymupdf_is_available():
|
|
240
|
+
pdf_library = "pymupdf"
|
|
241
|
+
else:
|
|
242
|
+
pdf_library = "pypdf"
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# PDFium ENGINE
|
|
247
|
+
if pdf_library in allowed_libraries and pdf_library == "pdfium":
|
|
248
|
+
from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pdfium
|
|
249
|
+
data = analyze_pdf_pdfium(pdf_path) or {"links": [], "toc": [], "file_ov": []}
|
|
250
|
+
extracted_links = data.get("links", [])
|
|
251
|
+
structural_toc = data.get("toc", [])
|
|
252
|
+
file_ov = data.get("file_ov", [])
|
|
253
|
+
|
|
254
|
+
# pypdf ENGINE
|
|
255
|
+
elif pdf_library in allowed_libraries and pdf_library == "pypdf":
|
|
256
|
+
from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pypdf
|
|
257
|
+
#extracted_links = extract_links(pdf_path)
|
|
258
|
+
#structural_toc = extract_toc(pdf_path)
|
|
259
|
+
data = analyze_pdf_pypdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
|
|
260
|
+
extracted_links = data.get("links", [])
|
|
261
|
+
structural_toc = data.get("toc", [])
|
|
262
|
+
file_ov = data.get("file_ov", [])
|
|
263
|
+
|
|
264
|
+
# PyMuPDF Engine
|
|
48
265
|
elif pdf_library in allowed_libraries and pdf_library == "pymupdf":
|
|
49
|
-
|
|
50
|
-
import fitz
|
|
51
|
-
except ImportError:
|
|
266
|
+
if not pymupdf_is_available():
|
|
52
267
|
print("PyMuPDF was explicitly requested as the PDF Engine")
|
|
53
|
-
print("
|
|
268
|
+
print("Switch the PDF library to 'pypdf' instead, or install PyMuPDF. ")
|
|
54
269
|
print("To install PyMuPDF locally, try: `uv sync --extra full` OR `pip install .[full]`")
|
|
55
270
|
if pyhabitat.on_termux():
|
|
56
271
|
print(f"pyhabitat.on_termux() = {pyhabitat.on_termux()}")
|
|
57
272
|
print("PyMuPDF is not expected to work on Termux. Use pypdf.")
|
|
58
273
|
print("\n")
|
|
59
|
-
return
|
|
60
|
-
|
|
274
|
+
#return
|
|
275
|
+
raise ImportError("The 'fitz' module (PyMuPDF) is required but not installed.")
|
|
276
|
+
|
|
277
|
+
from pdflinkcheck.analysis_pdfium import analyze_pdf as analyze_pdf_pymupdf
|
|
278
|
+
data = analyze_pdf_pymupdf(pdf_path) or {"links": [], "toc": [], "file_ov": []}
|
|
279
|
+
extracted_links = data.get("links", [])
|
|
280
|
+
structural_toc = data.get("toc", [])
|
|
281
|
+
file_ov = data.get("file_ov", [])
|
|
61
282
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
log("pdf_path is None")
|
|
67
|
-
log("Tip: Drop a PDF in the current folder or pass in a path arg.")
|
|
68
|
-
return
|
|
283
|
+
total_pages = file_ov.get("total_pages",0)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
|
|
69
287
|
try:
|
|
70
|
-
log(f"Target file: {get_friendly_path(pdf_path)}")
|
|
71
|
-
log(f"PDF Engine: {pdf_library}")
|
|
288
|
+
log(f"Target file: {get_friendly_path(pdf_path)}", overview=True)
|
|
289
|
+
log(f"PDF Engine: {pdf_library}", overview=True)
|
|
72
290
|
|
|
73
|
-
# 1. Extract all active links and TOC
|
|
74
|
-
extracted_links = extract_links(pdf_path)
|
|
75
|
-
structural_toc = extract_toc(pdf_path)
|
|
76
|
-
#structural_toc = extract_toc_pypdf(pdf_path)
|
|
77
291
|
toc_entry_count = len(structural_toc)
|
|
292
|
+
str_structural_toc = get_structural_toc(structural_toc)
|
|
78
293
|
|
|
294
|
+
# check the structure, that it matches
|
|
295
|
+
if False:
|
|
296
|
+
print(f"pdf_library={pdf_library}")
|
|
297
|
+
debug_head("TOC", structural_toc, n=3)
|
|
298
|
+
debug_head("Links", list(extracted_links), n=3)
|
|
299
|
+
|
|
300
|
+
# THIS HITS
|
|
79
301
|
|
|
80
302
|
if not extracted_links and not structural_toc:
|
|
81
|
-
log(f"\nNo hyperlinks or structural TOC found in {
|
|
82
|
-
log("(This is common for scanned/image-only PDFs.)")
|
|
83
|
-
|
|
303
|
+
log(f"\nNo hyperlinks or structural TOC found in {pdf_name}.", overview=True)
|
|
304
|
+
log("(This is common for scanned/image-only PDFs.)", overview=True)
|
|
305
|
+
|
|
306
|
+
empty_result = {
|
|
307
|
+
"data": {
|
|
308
|
+
"external_links": [],
|
|
309
|
+
"internal_links": [],
|
|
310
|
+
"toc": []
|
|
311
|
+
},
|
|
312
|
+
"text": "\n".join(report_buffer),
|
|
313
|
+
"metadata": {
|
|
314
|
+
"file_overview": {
|
|
315
|
+
"pdf_name": pdf_name,
|
|
316
|
+
"total_pages": total_pages,
|
|
317
|
+
},
|
|
318
|
+
"library_used": pdf_library,
|
|
319
|
+
"link_counts": {
|
|
320
|
+
"toc_entry_count": 0,
|
|
321
|
+
"internal_goto_links_count": 0,
|
|
322
|
+
"interal_resolve_action_links_count": 0,
|
|
323
|
+
"total_internal_links_count": 0,
|
|
324
|
+
"external_uri_links_count": 0,
|
|
325
|
+
"other_links_count": 0,
|
|
326
|
+
"total_links_count": 0
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
return empty_result
|
|
84
331
|
|
|
85
332
|
# 3. Separate the lists based on the 'type' key
|
|
86
|
-
|
|
333
|
+
external_uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
|
|
87
334
|
goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
|
|
88
335
|
resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
|
|
89
336
|
other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]
|
|
90
337
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
338
|
+
interal_resolve_action_links_count = len(resolved_action_links)
|
|
339
|
+
internal_goto_links_count = len(goto_links)
|
|
340
|
+
total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
|
|
341
|
+
|
|
342
|
+
external_uri_links_count = len(external_uri_links)
|
|
343
|
+
other_links_count = len(other_links)
|
|
344
|
+
|
|
345
|
+
total_links_count = len(extracted_links)
|
|
346
|
+
|
|
95
347
|
# --- ANALYSIS SUMMARY (Using your print logic) ---
|
|
96
|
-
log("\n" + "=" * SEP_COUNT)
|
|
97
|
-
log(f"--- Link Analysis Results for {
|
|
98
|
-
log(f"Total active links: {
|
|
99
|
-
log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
|
|
100
|
-
log("=" * SEP_COUNT)
|
|
348
|
+
log("\n" + "=" * SEP_COUNT, overview = True)
|
|
349
|
+
log(f"--- Link Analysis Results for {pdf_name} ---", overview = True)
|
|
350
|
+
log(f"Total active links: {total_links_count} (External: {external_uri_links_count}, Internal Jumps: {total_internal_links_count}, Other: {other_links_count})",overview = True)
|
|
351
|
+
log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}",overview = True)
|
|
352
|
+
log("=" * SEP_COUNT,overview = True)
|
|
101
353
|
|
|
102
354
|
# --- Section 1: TOC ---
|
|
103
|
-
str_structural_toc = print_structural_toc(structural_toc)
|
|
104
355
|
log(str_structural_toc)
|
|
105
356
|
|
|
106
357
|
# --- Section 2: ACTIVE INTERNAL JUMPS ---
|
|
107
358
|
log("\n" + "=" * SEP_COUNT)
|
|
108
|
-
log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {
|
|
359
|
+
log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links_count} found")
|
|
109
360
|
log("=" * SEP_COUNT)
|
|
110
361
|
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
|
|
111
362
|
log("-" * SEP_COUNT)
|
|
112
363
|
|
|
113
364
|
all_internal = goto_links + resolved_action_links
|
|
114
|
-
|
|
115
|
-
|
|
365
|
+
#If links were found: all_internal is a list with dictionaries. It evaluates to True.
|
|
366
|
+
# If NO links were found: all_internal is an empty list []. It evaluates to False.
|
|
367
|
+
if all_internal:
|
|
368
|
+
for i, link in enumerate(all_internal, 1):
|
|
116
369
|
link_text = link.get('link_text', 'N/A')
|
|
117
|
-
log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))
|
|
118
370
|
|
|
119
|
-
|
|
120
|
-
|
|
371
|
+
# Convert source and destination indices to human strings
|
|
372
|
+
src_page = PageRef.from_index(link['page']).human
|
|
373
|
+
dest_page = PageRef.from_index(link['destination_page']).human
|
|
374
|
+
|
|
375
|
+
log("{:<5} | {:<5} | {:<40} | {}".format(
|
|
376
|
+
i,
|
|
377
|
+
src_page,
|
|
378
|
+
link_text[:40],
|
|
379
|
+
dest_page
|
|
380
|
+
))
|
|
381
|
+
|
|
382
|
+
|
|
121
383
|
else:
|
|
122
384
|
log(" No internal GoTo or Resolved Action links found.")
|
|
123
385
|
log("-" * SEP_COUNT)
|
|
124
386
|
|
|
125
387
|
# --- Section 3: ACTIVE URI LINKS ---
|
|
126
388
|
log("\n" + "=" * SEP_COUNT)
|
|
127
|
-
log(f"## Active URI Links (External
|
|
389
|
+
log(f"## Active URI Links (External) - {len(external_uri_links)} found")
|
|
128
390
|
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
|
|
129
391
|
log("=" * SEP_COUNT)
|
|
130
392
|
|
|
131
|
-
if
|
|
132
|
-
for i, link in enumerate(
|
|
393
|
+
if external_uri_links:
|
|
394
|
+
for i, link in enumerate(external_uri_links, 1):
|
|
133
395
|
target = link.get('url') or link.get('remote_file') or link.get('target')
|
|
134
396
|
link_text = link.get('link_text', 'N/A')
|
|
135
397
|
log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
|
|
136
|
-
if limit is not None and len(uri_and_other) > limit:
|
|
137
|
-
log(f"... and {len(uri_and_other) - limit} more links (use --max-links 0 to show all).")
|
|
138
398
|
|
|
139
399
|
else:
|
|
140
|
-
log(" No external
|
|
400
|
+
log(" No external links found.")
|
|
141
401
|
log("-" * SEP_COUNT)
|
|
142
402
|
|
|
143
|
-
|
|
403
|
+
# --- Section 4: OTHER LINKS ---
|
|
404
|
+
log("\n" + "=" * SEP_COUNT)
|
|
405
|
+
log(f"## Other Links - {len(other_links)} found")
|
|
406
|
+
log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target Action"))
|
|
407
|
+
log("=" * SEP_COUNT)
|
|
408
|
+
|
|
409
|
+
if other_links:
|
|
410
|
+
for i, link in enumerate(other_links, 1):
|
|
411
|
+
target = link.get('url') or link.get('remote_file') or link.get('target')
|
|
412
|
+
link_text = link.get('link_text', 'N/A')
|
|
413
|
+
log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
|
|
144
414
|
|
|
145
|
-
|
|
146
|
-
|
|
415
|
+
else:
|
|
416
|
+
log(" No 'Other' links found.")
|
|
417
|
+
log("-" * SEP_COUNT)
|
|
147
418
|
|
|
148
419
|
# Return the collected data for potential future JSON/other output
|
|
149
|
-
|
|
150
|
-
"external_links":
|
|
420
|
+
report_data_dict = {
|
|
421
|
+
"external_links": external_uri_links,
|
|
151
422
|
"internal_links": all_internal,
|
|
152
|
-
"toc": structural_toc
|
|
423
|
+
"toc": structural_toc,
|
|
424
|
+
"validation": EMPTY_VALIDATION.copy()
|
|
153
425
|
}
|
|
154
426
|
|
|
427
|
+
intermediate_report_results = {
|
|
428
|
+
"data": report_data_dict, # The structured JSON-ready dict
|
|
429
|
+
"text": "",
|
|
430
|
+
"metadata": { # Helpful for the GUI/Logs
|
|
431
|
+
"file_overview": {
|
|
432
|
+
"pdf_name": pdf_name,
|
|
433
|
+
"total_pages": total_pages,
|
|
434
|
+
},
|
|
435
|
+
"library_used": pdf_library,
|
|
436
|
+
"link_counts": {
|
|
437
|
+
"toc_entry_count": toc_entry_count,
|
|
438
|
+
"internal_goto_links_count": internal_goto_links_count,
|
|
439
|
+
"interal_resolve_action_links_count": interal_resolve_action_links_count,
|
|
440
|
+
"total_internal_links_count": total_internal_links_count,
|
|
441
|
+
"external_uri_links_count": external_uri_links_count,
|
|
442
|
+
"other_links_count": other_links_count,
|
|
443
|
+
"total_links_count": total_links_count
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
log("\n--- Analysis Complete ---")
|
|
449
|
+
|
|
450
|
+
validation_results = run_validation(report_results=intermediate_report_results,
|
|
451
|
+
pdf_path=pdf_path)
|
|
452
|
+
log(validation_results.get("summary-txt",""), overview = True)
|
|
453
|
+
|
|
454
|
+
# CRITICAL: Re-assign to report_results so it's available for the final return
|
|
455
|
+
report_results = copy.deepcopy(intermediate_report_results)
|
|
456
|
+
|
|
457
|
+
# --- Offline Risk Analysis (Security Layer) ---
|
|
458
|
+
risk_results = compute_risk(report_results)
|
|
459
|
+
report_results["data"]["risk"] = risk_results
|
|
460
|
+
|
|
461
|
+
# Final aggregation of the buffer into one string, after the last call to log()
|
|
462
|
+
report_buffer_str = "\n".join(report_buffer)
|
|
463
|
+
report_buffer_overview_str = "\n".join(report_buffer_overview)
|
|
464
|
+
|
|
465
|
+
report_results["data"]["validation"].update(validation_results)
|
|
466
|
+
#report_results["text"].update(report_buffer_str) # The human-readable string
|
|
467
|
+
report_results["text"] = report_buffer_str
|
|
468
|
+
|
|
155
469
|
# 5. Export Report
|
|
156
470
|
#if export_format:
|
|
157
471
|
# # Assuming export_to will hold the output format string (e.g., "JSON")
|
|
158
|
-
# export_report_data(
|
|
472
|
+
# export_report_data(report_data_dict, pdf_name, export_format, pdf_library)
|
|
159
473
|
|
|
160
|
-
if
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
474
|
+
if print_bool:
|
|
475
|
+
if concise_print:
|
|
476
|
+
print(report_buffer_overview_str)
|
|
477
|
+
else:
|
|
478
|
+
print(report_buffer_str)
|
|
165
479
|
|
|
166
|
-
if "TXT" in fmt_upper:
|
|
167
|
-
export_report_txt(report_buffer_str, pdf_path, pdf_library)
|
|
168
|
-
|
|
169
|
-
report_results = {
|
|
170
|
-
"data": final_report_data_dict, # The structured JSON-ready dict
|
|
171
|
-
"text": report_buffer_str, # The human-readable string
|
|
172
|
-
"metadata": { # Helpful for the GUI/Logs
|
|
173
|
-
"pdf_name": Path(pdf_path).name,
|
|
174
|
-
"library_used": pdf_library,
|
|
175
|
-
"total_links": len(extracted_links)
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
# Return a clean results object
|
|
179
480
|
return report_results
|
|
481
|
+
|
|
180
482
|
except Exception as e:
|
|
181
483
|
# Specific handling for common read failures
|
|
182
|
-
if "invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
|
|
484
|
+
if True:#"invalid pdf header" in str(e).lower() or "EOF marker not found" in str(e) or "stream has ended unexpectedly" in str(e):
|
|
183
485
|
log(f"\nWarning: Could not parse PDF structure — likely an image-only or malformed PDF.")
|
|
184
486
|
log("No hyperlinks or TOC can exist in this file.")
|
|
185
487
|
log("Result: No links found.")
|
|
186
488
|
return {
|
|
187
|
-
"data": {"external_links": [], "internal_links": [], "toc": []},
|
|
489
|
+
"data": {"external_links": [], "internal_links": [], "toc": [], "validation": EMPTY_VALIDATION.copy()},
|
|
188
490
|
"text": "\n".join(report_buffer + [
|
|
189
491
|
"\nWarning: PDF appears to be image-only or malformed.",
|
|
190
492
|
"No hyperlinks or structural TOC found."
|
|
191
493
|
]),
|
|
192
494
|
"metadata": {
|
|
193
|
-
"
|
|
495
|
+
"file_overview": {
|
|
496
|
+
"pdf_name": pdf_name,
|
|
497
|
+
"total_pages": total_pages,
|
|
498
|
+
},
|
|
194
499
|
"library_used": pdf_library,
|
|
195
|
-
"
|
|
500
|
+
"link_counts": {
|
|
501
|
+
"toc_entry_count": 0,
|
|
502
|
+
"internal_goto_links_count": 0,
|
|
503
|
+
"interal_resolve_action_links_count": 0,
|
|
504
|
+
"total_internal_links_count": 0,
|
|
505
|
+
"external_uri_links_count": 0,
|
|
506
|
+
"other_links_count": 0,
|
|
507
|
+
"total_links_count": 0
|
|
508
|
+
}
|
|
196
509
|
}
|
|
197
510
|
}
|
|
198
511
|
|
|
512
|
+
#except Exception as e:
|
|
513
|
+
# # Log the critical failure
|
|
514
|
+
# error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
|
|
515
|
+
# log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
516
|
+
# raise # Allow the exception to propagate or handle gracefully
|
|
199
517
|
except Exception as e:
|
|
200
|
-
# Log the critical failure
|
|
201
518
|
error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
|
|
202
|
-
log(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
203
|
-
|
|
519
|
+
log(f"FATAL: Analysis failed: {str(e)}. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
|
|
520
|
+
|
|
521
|
+
# Always return a safe empty result on error
|
|
522
|
+
return {
|
|
523
|
+
"data": {
|
|
524
|
+
"external_links": [],
|
|
525
|
+
"internal_links": [],
|
|
526
|
+
"toc": [],
|
|
527
|
+
"validation": EMPTY_VALIDATION.copy()
|
|
528
|
+
},
|
|
529
|
+
"text": "\n".join(report_buffer + [
|
|
530
|
+
"\n--- Analysis failed ---",
|
|
531
|
+
f"Error: {str(e)}",
|
|
532
|
+
"No links or TOC extracted."
|
|
533
|
+
]),
|
|
534
|
+
"metadata": {
|
|
535
|
+
"file_overview": {
|
|
536
|
+
"pdf_name": pdf_name,
|
|
537
|
+
"total_pages": total_pages,
|
|
538
|
+
},
|
|
539
|
+
"library_used": pdf_library,
|
|
540
|
+
"link_counts": {
|
|
541
|
+
"toc_entry_count": 0,
|
|
542
|
+
"internal_goto_links_count": 0,
|
|
543
|
+
"interal_resolve_action_links_count": 0,
|
|
544
|
+
"total_internal_links_count": 0,
|
|
545
|
+
"external_uri_links_count": 0,
|
|
546
|
+
"other_links_count": 0,
|
|
547
|
+
"total_links_count": 0
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
def _return_empty_report(report_buffer: str, pdf_library: str)-> dict:
|
|
553
|
+
|
|
554
|
+
empty_report = {
|
|
555
|
+
"data": {
|
|
556
|
+
"external_links": [],
|
|
557
|
+
"internal_links": [],
|
|
558
|
+
"toc": [],
|
|
559
|
+
"validation": EMPTY_VALIDATION.copy()
|
|
560
|
+
},
|
|
561
|
+
"text": "\n".join(report_buffer),
|
|
562
|
+
"metadata": {
|
|
563
|
+
"file_overview": {
|
|
564
|
+
"pdf_name": "null",
|
|
565
|
+
"total_pages": 0,
|
|
566
|
+
},
|
|
567
|
+
"library_used": pdf_library,
|
|
568
|
+
"link_counts": {
|
|
569
|
+
"toc_entry_count": 0,
|
|
570
|
+
"internal_goto_links_count": 0,
|
|
571
|
+
"interal_resolve_action_links_count": 0,
|
|
572
|
+
"total_internal_links_count": 0,
|
|
573
|
+
"external_uri_links_count": 0,
|
|
574
|
+
"other_links_count": 0,
|
|
575
|
+
"total_links_count": 0
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
}
|
|
204
579
|
|
|
580
|
+
return empty_report
|
|
581
|
+
|
|
582
|
+
def _generate_text_report(
|
|
583
|
+
pdf_path: str,
|
|
584
|
+
library: str,
|
|
585
|
+
ext_links: list,
|
|
586
|
+
goto_links: list,
|
|
587
|
+
resolve_links: list,
|
|
588
|
+
other_links: list,
|
|
589
|
+
toc: list
|
|
590
|
+
) -> str:
|
|
591
|
+
"""Pure helper to build the human-readable string for console/TXT export."""
|
|
592
|
+
lines = []
|
|
593
|
+
lines.append("\n--- Starting Analysis ... ---\n")
|
|
594
|
+
lines.append(f"Target file: {get_friendly_path(pdf_path)}")
|
|
595
|
+
lines.append(f"PDF Engine: {library}")
|
|
596
|
+
|
|
597
|
+
total_int = len(goto_links) + len(resolve_links)
|
|
598
|
+
total_links = len(ext_links) + total_int + len(other_links)
|
|
205
599
|
|
|
206
|
-
|
|
207
|
-
"""
|
|
208
|
-
|
|
209
|
-
|
|
600
|
+
# 1. Summary Header
|
|
601
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
602
|
+
lines.append(f"--- Link Analysis Results for {get_friendly_path(pdf_path)} ---")
|
|
603
|
+
lines.append(f"Total active links: {total_links} (External: {len(ext_links)}, Internal Jumps: {total_int}, Other: {len(other_links)})")
|
|
604
|
+
lines.append(f"Total **structural TOC entries (bookmarks)** found: {len(toc)}")
|
|
605
|
+
lines.append("=" * SEP_COUNT)
|
|
210
606
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
"""
|
|
214
|
-
print("\n" + "=" * SEP_COUNT)
|
|
215
|
-
print("## Structural Table of Contents (PDF Bookmarks/Outline)")
|
|
216
|
-
print("=" * SEP_COUNT)
|
|
217
|
-
if not structural_toc:
|
|
218
|
-
print("No structural TOC (bookmarks/outline) found.")
|
|
219
|
-
return
|
|
607
|
+
# 2. Table of Contents
|
|
608
|
+
lines.append(get_structural_toc(toc))
|
|
220
609
|
|
|
221
|
-
#
|
|
222
|
-
|
|
223
|
-
|
|
610
|
+
# 3. Internal Jumps
|
|
611
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
612
|
+
lines.append(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_int} found")
|
|
613
|
+
lines.append("=" * SEP_COUNT)
|
|
614
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
|
|
615
|
+
lines.append("-" * SEP_COUNT)
|
|
224
616
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
617
|
+
all_internal = goto_links + resolve_links
|
|
618
|
+
if all_internal:
|
|
619
|
+
for i, link in enumerate(all_internal, 1):
|
|
620
|
+
src = PageRef.from_index(link.get('page', 0)).human
|
|
621
|
+
dest = PageRef.from_index(link.get('destination_page', 0)).human
|
|
622
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(
|
|
623
|
+
i, src, link.get('link_text', 'N/A')[:40], dest
|
|
624
|
+
))
|
|
625
|
+
else:
|
|
626
|
+
lines.append(" No internal GoTo or Resolved Action links found.")
|
|
627
|
+
lines.append("-" * SEP_COUNT)
|
|
628
|
+
|
|
629
|
+
# 4. External URI Links
|
|
630
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
631
|
+
lines.append(f"## Active URI Links (External) - {len(ext_links)} found")
|
|
632
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
|
|
633
|
+
lines.append("=" * SEP_COUNT)
|
|
634
|
+
|
|
635
|
+
if ext_links:
|
|
636
|
+
for i, link in enumerate(ext_links, 1):
|
|
637
|
+
target = link.get('url') or link.get('remote_file') or link.get('target', 'N/A')
|
|
638
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(
|
|
639
|
+
i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target
|
|
640
|
+
))
|
|
641
|
+
else:
|
|
642
|
+
lines.append(" No external links found.")
|
|
643
|
+
lines.append("-" * SEP_COUNT)
|
|
644
|
+
|
|
645
|
+
# 5. Other Links
|
|
646
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
647
|
+
lines.append(f"## Other Links - {len(other_links)} found")
|
|
648
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target Action"))
|
|
649
|
+
lines.append("=" * SEP_COUNT)
|
|
650
|
+
|
|
651
|
+
if other_links:
|
|
652
|
+
for i, link in enumerate(other_links, 1):
|
|
653
|
+
target = link.get('url') or link.get('remote_file') or link.get('target', 'N/A')
|
|
654
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(
|
|
655
|
+
i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target
|
|
656
|
+
))
|
|
657
|
+
else:
|
|
658
|
+
lines.append(" No 'Other' links found.")
|
|
659
|
+
lines.append("-" * SEP_COUNT)
|
|
660
|
+
|
|
661
|
+
return "\n".join(lines)
|
|
662
|
+
|
|
663
|
+
def _generate_text_report__(pdf_path, library, ext_links, int_links, other_links, toc) -> str:
|
|
664
|
+
lines = []
|
|
665
|
+
lines.append("\n--- Starting Analysis ... ---\n")
|
|
666
|
+
lines.append(f"Target file: {get_friendly_path(pdf_path)}")
|
|
667
|
+
lines.append(f"PDF Engine: {library}")
|
|
668
|
+
|
|
669
|
+
# 1. Summary Header
|
|
670
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
671
|
+
lines.append(f"--- Link Analysis Results for {get_friendly_path(pdf_path)} ---")
|
|
672
|
+
lines.append(f"Total active links: {len(ext_links) + len(int_links) + len(other_links)}")
|
|
673
|
+
lines.append(f"Total bookmarks: {len(toc)}")
|
|
674
|
+
lines.append("=" * SEP_COUNT)
|
|
232
675
|
|
|
233
|
-
|
|
676
|
+
# 2. Table of Contents
|
|
677
|
+
lines.append(get_structural_toc(toc))
|
|
234
678
|
|
|
679
|
+
# 3. Internal Jumps (GoTo & Resolved)
|
|
680
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
681
|
+
lines.append(f"## Active Internal Jumps - {len(int_links)} found")
|
|
682
|
+
lines.append("=" * SEP_COUNT)
|
|
683
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To"))
|
|
684
|
+
|
|
685
|
+
for i, link in enumerate(int_links, 1):
|
|
686
|
+
src = PageRef.from_index(link.get('page', 0)).human
|
|
687
|
+
dest = PageRef.from_index(link.get('destination_page', 0)).human
|
|
688
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(i, src, link.get('link_text', 'N/A')[:40], dest))
|
|
235
689
|
|
|
236
|
-
|
|
690
|
+
# 4. External URI Links
|
|
691
|
+
lines.append("\n" + "=" * SEP_COUNT)
|
|
692
|
+
lines.append(f"## External URI Links - {len(ext_links)} found")
|
|
693
|
+
lines.append("=" * SEP_COUNT)
|
|
694
|
+
for i, link in enumerate(ext_links, 1):
|
|
695
|
+
target = link.get('url') or link.get('target', 'N/A')
|
|
696
|
+
lines.append("{:<5} | {:<5} | {:<40} | {}".format(i, link.get('page', 0), link.get('link_text', 'N/A')[:40], target))
|
|
697
|
+
|
|
698
|
+
return "\n".join(lines)
|
|
699
|
+
|
|
700
|
+
def _build_metadata(
|
|
701
|
+
pdf_name: str,
|
|
702
|
+
total_pages: int,
|
|
703
|
+
library_used: str,
|
|
704
|
+
toc_entry_count: int,
|
|
705
|
+
internal_goto_links_count: int,
|
|
706
|
+
interal_resolve_action_links_count: int,
|
|
707
|
+
external_uri_links_count: int,
|
|
708
|
+
other_links_count: int
|
|
709
|
+
) -> Dict[str, Any]:
|
|
710
|
+
"""
|
|
711
|
+
Standardizes the metadata dictionary using the EXACT legacy variable names.
|
|
712
|
+
"""
|
|
713
|
+
total_internal_links_count = internal_goto_links_count + interal_resolve_action_links_count
|
|
714
|
+
total_links_count = total_internal_links_count + external_uri_links_count + other_links_count
|
|
715
|
+
|
|
716
|
+
return {
|
|
717
|
+
"file_overview": {
|
|
718
|
+
"pdf_name": pdf_name,
|
|
719
|
+
"total_pages": total_pages,
|
|
720
|
+
},
|
|
721
|
+
"library_used": library_used,
|
|
722
|
+
"link_counts": {
|
|
723
|
+
"toc_entry_count": toc_entry_count,
|
|
724
|
+
"internal_goto_links_count": internal_goto_links_count,
|
|
725
|
+
"interal_resolve_action_links_count": interal_resolve_action_links_count,
|
|
726
|
+
"total_internal_links_count": total_internal_links_count,
|
|
727
|
+
"external_uri_links_count": external_uri_links_count,
|
|
728
|
+
"other_links_count": other_links_count,
|
|
729
|
+
"total_links_count": total_links_count
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
def _build_metadata_(
|
|
734
|
+
pdf_name: str,
|
|
735
|
+
total_pages: int,
|
|
736
|
+
library_used: str,
|
|
737
|
+
toc_count: int,
|
|
738
|
+
goto_count: int,
|
|
739
|
+
resolve_count: int,
|
|
740
|
+
ext_count: int,
|
|
741
|
+
other_count: int
|
|
742
|
+
) -> Dict[str, Any]:
|
|
743
|
+
"""Standardizes the metadata dictionary for all report types."""
|
|
744
|
+
return {
|
|
745
|
+
"file_overview": {
|
|
746
|
+
"pdf_name": pdf_name,
|
|
747
|
+
"total_pages": total_pages,
|
|
748
|
+
},
|
|
749
|
+
"library_used": library_used,
|
|
750
|
+
"link_counts": {
|
|
751
|
+
"toc_entry_count": toc_count,
|
|
752
|
+
"internal_links_count": goto_count,
|
|
753
|
+
"external_uri_links_count": ext_count,
|
|
754
|
+
"other_links_count": other_count,
|
|
755
|
+
"total_links_count": goto_count + ext_count + other_count
|
|
756
|
+
}
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
def get_structural_toc(structural_toc: list) -> str:
|
|
237
760
|
"""
|
|
238
761
|
Formats the structural TOC data into a hierarchical string and optionally prints it.
|
|
239
762
|
|
|
240
763
|
Args:
|
|
241
764
|
structural_toc: A list of TOC dictionaries.
|
|
242
|
-
print_bool: Whether to print the output to the console.
|
|
243
765
|
|
|
244
766
|
Returns:
|
|
245
767
|
A formatted string of the structural TOC.
|
|
@@ -253,8 +775,6 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
|
253
775
|
msg = "No structural TOC (bookmarks/outline) found."
|
|
254
776
|
lines.append(msg)
|
|
255
777
|
output = "\n".join(lines)
|
|
256
|
-
if print_bool:
|
|
257
|
-
print(output)
|
|
258
778
|
return output
|
|
259
779
|
|
|
260
780
|
# Determine max page width for consistent alignment
|
|
@@ -267,16 +787,67 @@ def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
|
|
|
267
787
|
indent = " " * 4 * (item['level'] - 1)
|
|
268
788
|
# Handle cases where page might be N/A or None
|
|
269
789
|
target_page = item.get('target_page', "N/A")
|
|
270
|
-
page_str = str(target_page).rjust(page_width)
|
|
271
790
|
|
|
791
|
+
# Determine the human-facing string
|
|
792
|
+
if isinstance(target_page, int):
|
|
793
|
+
# Convert 0-index back to human (1-index) for the report
|
|
794
|
+
display_val = PageRef.from_index(target_page).human
|
|
795
|
+
else:
|
|
796
|
+
display_val = str(target_page)
|
|
797
|
+
|
|
798
|
+
page_str = str(display_val).rjust(page_width)
|
|
799
|
+
|
|
272
800
|
lines.append(f"{indent}{item['title']} . . . page {page_str}")
|
|
273
801
|
|
|
274
802
|
lines.append("-" * SEP_COUNT)
|
|
275
803
|
|
|
276
804
|
# Final aggregation
|
|
277
805
|
str_structural_toc = "\n".join(lines)
|
|
278
|
-
|
|
279
|
-
if print_bool:
|
|
280
|
-
print(str_structural_toc)
|
|
281
806
|
|
|
282
807
|
return str_structural_toc
|
|
808
|
+
|
|
809
|
+
import unicodedata
|
|
810
|
+
|
|
811
|
+
def sanitize_glyphs_for_compatibility(text: str) -> str:
|
|
812
|
+
"""Replaces emojis with ASCII tags to prevent rendering bugs in gedit/WSL2."""
|
|
813
|
+
glyph_mapping = {
|
|
814
|
+
'✅': '[PASS]',
|
|
815
|
+
'🌐': '[WEB]',
|
|
816
|
+
'⚠️': '[WARN]',
|
|
817
|
+
'❌': '[FAIL]',
|
|
818
|
+
'ℹ️': '[INFO]'
|
|
819
|
+
}
|
|
820
|
+
for glyph, replacement in glyph_mapping.items():
|
|
821
|
+
text = text.replace(glyph, replacement)
|
|
822
|
+
|
|
823
|
+
# Standard library only - no unidecode dependency
|
|
824
|
+
normalized = unicodedata.normalize('NFKD', text)
|
|
825
|
+
return normalized.encode('ascii', 'ignore').decode('utf-8').replace(' ', ' ')
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
if __name__ == "__main__":
|
|
830
|
+
|
|
831
|
+
from pdflinkcheck.io import get_first_pdf_in_cwd
|
|
832
|
+
pdf_path = get_first_pdf_in_cwd()
|
|
833
|
+
# Run analysis first
|
|
834
|
+
|
|
835
|
+
if pymupdf_is_available():
|
|
836
|
+
pdf_library = "pymupdf"
|
|
837
|
+
else:
|
|
838
|
+
pdf_library = "pypdf"
|
|
839
|
+
report = run_report_and_call_exports(
|
|
840
|
+
pdf_path=pdf_path,
|
|
841
|
+
export_format="",
|
|
842
|
+
pdf_library=pdf_library,
|
|
843
|
+
print_bool=True # We handle printing in validation
|
|
844
|
+
)
|
|
845
|
+
|
|
846
|
+
if not report or not report.get("data"):
|
|
847
|
+
print("No data extracted — nothing to validate.")
|
|
848
|
+
sys.exit(1)
|
|
849
|
+
|
|
850
|
+
else:
|
|
851
|
+
print("Success!")
|
|
852
|
+
print(f"list(report['data']) = {list(report['data'])}")
|
|
853
|
+
|