pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.73__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +51 -13
- pdflinkcheck/{analyze.py → analyze_pymupdf.py} +54 -224
- pdflinkcheck/analyze_pypdf.py +184 -0
- pdflinkcheck/analyze_pypdf_v2.py +218 -0
- pdflinkcheck/cli.py +238 -39
- pdflinkcheck/data/LICENSE +5 -24
- pdflinkcheck/data/README.md +278 -0
- pdflinkcheck/data/pyproject.toml +98 -0
- pdflinkcheck/datacopy.py +60 -0
- pdflinkcheck/dev.py +109 -0
- pdflinkcheck/gui.py +371 -74
- pdflinkcheck/io.py +118 -11
- pdflinkcheck/report.py +282 -0
- pdflinkcheck/stdlib_server.py +176 -0
- pdflinkcheck/validate.py +382 -0
- pdflinkcheck/version_info.py +83 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.73.dist-info}/METADATA +127 -71
- pdflinkcheck-1.1.73.dist-info/RECORD +21 -0
- pdflinkcheck-1.1.73.dist-info/WHEEL +4 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.73.dist-info}/entry_points.txt +1 -0
- {pdflinkcheck-1.1.47.dist-info → pdflinkcheck-1.1.73.dist-info}/licenses/LICENSE +5 -24
- pdflinkcheck/remnants.py +0 -142
- pdflinkcheck-1.1.47.dist-info/RECORD +0 -13
- pdflinkcheck-1.1.47.dist-info/WHEEL +0 -5
- pdflinkcheck-1.1.47.dist-info/top_level.txt +0 -1
pdflinkcheck/io.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# src/pdflinkcheck/io.py
|
|
2
2
|
import logging
|
|
3
3
|
import json
|
|
4
|
+
import sys
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Dict, Any, Union, List
|
|
6
|
+
from typing import Dict, Any, Union, List, Optional
|
|
6
7
|
|
|
7
8
|
# --- Configuration ---
|
|
8
9
|
|
|
@@ -27,6 +28,14 @@ def setup_error_logger():
|
|
|
27
28
|
"""
|
|
28
29
|
Configures a basic logger that writes errors and warnings to a file
|
|
29
30
|
in the PDFLINKCHECK_HOME directory.
|
|
31
|
+
|
|
32
|
+
# Example of how an external module can log an error:
|
|
33
|
+
# from pdflinkcheck.io import error_logger
|
|
34
|
+
# try:
|
|
35
|
+
# ...
|
|
36
|
+
# except Exception as e:
|
|
37
|
+
# error_logger.exception("An exception occurred during link extraction.")
|
|
38
|
+
|
|
30
39
|
"""
|
|
31
40
|
# Create the logger instance
|
|
32
41
|
logger = logging.getLogger('pdflinkcheck_logger')
|
|
@@ -57,14 +66,15 @@ error_logger = setup_error_logger()
|
|
|
57
66
|
def export_report_data(
|
|
58
67
|
report_data: Dict[str, Any],
|
|
59
68
|
pdf_filename: str,
|
|
60
|
-
export_format: str = "JSON"
|
|
69
|
+
export_format: str = "JSON",
|
|
70
|
+
pdf_library: str = "", # expected to be specificed every time.
|
|
61
71
|
) -> Path:
|
|
62
72
|
"""
|
|
63
73
|
Exports the structured analysis report data to a file in the
|
|
64
74
|
PDFLINKCHECK_HOME directory.
|
|
65
75
|
|
|
66
76
|
Args:
|
|
67
|
-
report_data: The dictionary containing the results from
|
|
77
|
+
report_data: The dictionary containing the results from run_report.
|
|
68
78
|
pdf_filename: The base filename of the PDF being analyzed (used for the output file name).
|
|
69
79
|
export_format: The desired output format ('json' currently supported).
|
|
70
80
|
|
|
@@ -80,7 +90,7 @@ def export_report_data(
|
|
|
80
90
|
|
|
81
91
|
# Create an output file name based on the PDF name and a timestamp
|
|
82
92
|
base_name = Path(pdf_filename).stem
|
|
83
|
-
output_filename = f"{base_name}_report.json"
|
|
93
|
+
output_filename = f"{base_name}_{pdf_library}_report.json"
|
|
84
94
|
output_path = PDFLINKCHECK_HOME / output_filename
|
|
85
95
|
|
|
86
96
|
try:
|
|
@@ -88,7 +98,7 @@ def export_report_data(
|
|
|
88
98
|
# Use indent for readability
|
|
89
99
|
json.dump(report_data, f, indent=4)
|
|
90
100
|
|
|
91
|
-
print(f"\nReport successfully exported to: {output_path}")
|
|
101
|
+
print(f"\nReport successfully exported to: {get_friendly_path(output_path)}")
|
|
92
102
|
return output_path
|
|
93
103
|
|
|
94
104
|
except Exception as e:
|
|
@@ -96,11 +106,108 @@ def export_report_data(
|
|
|
96
106
|
# Re-raise the exception after logging for caller to handle
|
|
97
107
|
raise RuntimeError(f"Report export failed due to an I/O error: {e}")
|
|
98
108
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
109
|
+
def export_report_json(
    report_data: Dict[str, Any],
    pdf_filename: str,
    pdf_library: str
) -> Path:
    """
    Write the analysis results as indented JSON into PDFLINKCHECK_HOME.

    The output file is named ``<pdf stem>_<pdf_library>_report.json``.

    Args:
        report_data: Dictionary of analysis results to serialize.
        pdf_filename: Name or path of the analyzed PDF; only its stem is used.
        pdf_library: Label of the PDF engine, embedded in the file name.

    Returns:
        The path of the file that was written.

    Raises:
        RuntimeError: If writing or serializing fails. The failure is
            recorded (with traceback) through error_logger first.
    """
    stem = Path(pdf_filename).stem
    target = PDFLINKCHECK_HOME / f"{stem}_{pdf_library}_report.json"

    try:
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(report_data, handle, indent=4)
        friendly = get_friendly_path(target)
        print(f"\nJSON report exported: {friendly}")
    except Exception as exc:
        error_logger.error(f"JSON export failed: {exc}", exc_info=True)
        raise RuntimeError(f"JSON export failed: {exc}")
    return target
|
|
126
|
+
|
|
127
|
+
def export_report_txt(
    report_text: str,
    pdf_filename: str,
    pdf_library: str
) -> Path:
    """
    Write the human-readable report text into PDFLINKCHECK_HOME.

    The output file is named ``<pdf stem>_<pdf_library>_report.txt`` and is
    written as UTF-8.

    Args:
        report_text: The already formatted report string.
        pdf_filename: Name or path of the analyzed PDF; only its stem is used.
        pdf_library: Label of the PDF engine, embedded in the file name.

    Returns:
        The path of the file that was written.

    Raises:
        RuntimeError: If writing fails. The failure is recorded (with
            traceback) through error_logger first.
    """
    stem = Path(pdf_filename).stem
    target = PDFLINKCHECK_HOME / f"{stem}_{pdf_library}_report.txt"

    try:
        target.write_text(report_text, encoding='utf-8')
        friendly = get_friendly_path(target)
        print(f"\nTXT report exported: {friendly}")
    except Exception as exc:
        error_logger.error(f"TXT export failed: {exc}", exc_info=True)
        raise RuntimeError(f"TXT export failed: {exc}")
    return target
|
|
143
|
+
|
|
144
|
+
def export_validation_json(
    report_data: Dict[str, Any],
    pdf_filename: str,
    pdf_library: str
) -> Path:
    """
    Write validation results as indented JSON into PDFLINKCHECK_HOME.

    The output file is named ``<pdf stem>_<pdf_library>_validation.json``.

    Args:
        report_data: Dictionary of validation results to serialize.
        pdf_filename: Name or path of the analyzed PDF; only its stem is used.
        pdf_library: Label of the PDF engine, embedded in the file name.

    Returns:
        The path of the file that was written.

    Raises:
        RuntimeError: If writing or serializing fails. The failure is
            recorded (with traceback) through error_logger first.
    """
    stem = Path(pdf_filename).stem
    target = PDFLINKCHECK_HOME / f"{stem}_{pdf_library}_validation.json"

    try:
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(report_data, handle, indent=4)
        friendly = get_friendly_path(target)
        print(f"\nJSON validation exported: {friendly}")
    except Exception as exc:
        error_logger.error(f"JSON validation export failed: {exc}", exc_info=True)
        raise RuntimeError(f"JSON validation export failed: {exc}")
    return target
|
|
106
161
|
|
|
162
|
+
# --- helpers ---
|
|
163
|
+
def get_friendly_path(full_path: Union[str, Path]) -> str:
    """
    Return a display-friendly path with the user's home directory shown as ``~``.

    The path is resolved first. Only a path that is actually located inside
    the home directory is abbreviated; any other path is returned in its
    resolved form unchanged.

    Args:
        full_path: The path to format, as a string or Path.

    Returns:
        ``~`` joined with the home-relative part when the path lies inside
        the home directory, otherwise the resolved path as a string.
    """
    p = Path(full_path).resolve()
    try:
        # Compare path components rather than raw substrings, so that e.g.
        # "/home/user_other/x" is not mangled into "~_other/x".
        relative = p.relative_to(Path.home().resolve())
    except (ValueError, RuntimeError):
        # ValueError: the path is not under home.
        # RuntimeError: the home directory cannot be determined.
        return str(p)
    return str(Path("~") / relative)
|
|
170
|
+
|
|
171
|
+
def get_first_pdf_in_cwd() -> Optional[str]:
    """
    Return the first PDF file located directly in the current working directory.

    Files are matched on a case-insensitive ``.pdf`` suffix; subdirectories
    are not searched. "First" means first by file name (case-insensitive),
    so the result is the same on every run regardless of the order in which
    the file system lists the directory.

    This is a convenience fallback for running the tool without explicitly
    specifying a path. It announces the fallback, and the file it picked,
    on stdout.

    Returns:
        The absolute path (as a string) of the selected PDF, or None if the
        directory holds no PDF or could not be read.
    """
    print("No PDF argument was provided. Falling back to using the first PDF available at the current path.")
    try:
        # Checking the suffix by hand keeps the match case-insensitive on
        # every platform (glob patterns are case-sensitive on Unix).
        candidates = (
            entry for entry in Path.cwd().iterdir()
            if entry.is_file() and entry.suffix.lower() == '.pdf'
        )
        chosen = min(
            candidates,
            key=lambda entry: (entry.name.lower(), entry.name),
            default=None,
        )
        if chosen is None:
            return None
        first_pdf_path = chosen.resolve()
    except (OSError, RuntimeError) as e:
        # Permission problems, a vanished working directory, symlink loops.
        print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
        return None

    print(f"Fallback PDF found: {first_pdf_path.name}")
    return str(first_pdf_path)
|
pdflinkcheck/report.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# pdflinkcheck/report.py
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Dict, Any
|
|
6
|
+
import pyhabitat
|
|
7
|
+
|
|
8
|
+
from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Width, in characters, of the "=" and "-" rule lines drawn in the text report.
SEP_COUNT=28
|
|
12
|
+
|
|
13
|
+
def run_report(pdf_path: Optional[str] = None, max_links: int = 0, export_format: str = "JSON", pdf_library: str = "pypdf", print_bool:bool=True) -> Optional[Dict[str, Any]]:
    """
    Analyze a PDF's links and bookmarks and build a console/text report.

    Extracts the active links and the structural TOC with the selected
    engine, formats a human-readable report, optionally exports it, and
    returns both the structured data and the report text.

    Args:
        pdf_path: Path to the target PDF. When None, the first PDF found in
            the current working directory is used.
        max_links: Maximum number of links to list in each report section.
            If <= 0, all links are listed.
        export_format: Export selector. A JSON file is written when it
            contains "JSON" and a TXT file when it contains "TXT"
            (case-insensitive). An empty string disables export.
        pdf_library: PDF engine, "pypdf" or "pymupdf" (case-insensitive).
        print_bool: When True, echo each report line to stdout as it is
            produced. Lines are always collected for the returned text.

    Returns:
        A dictionary with the keys "data" (external_links, internal_links,
        toc), "text" (the full report string) and "metadata" (pdf_name,
        library_used, total_links). An empty dict is returned when the PDF
        has neither links nor a TOC. None is returned when PyMuPDF was
        requested but cannot be imported, or when no PDF path could be
        determined.

    Raises:
        ValueError: If pdf_library names an unsupported engine.
        Exception: Any unexpected analysis or export failure is written to
            the error log and then re-raised.
    """
    report_buffer = []

    def log(msg: str):
        # Every line is buffered for the text report; echoing is optional.
        if print_bool:
            print(msg)
        report_buffer.append(msg)

    # Resolve the extraction functions for the requested engine up front so
    # that an unsupported name fails with a clear message.
    pdf_library = pdf_library.lower()
    if pdf_library == "pypdf":
        from pdflinkcheck.analyze_pypdf import (extract_links_pypdf as extract_links, extract_toc_pypdf as extract_toc)
    elif pdf_library == "pymupdf":
        try:
            import fitz  # noqa: F401 -- imported only to probe availability
        except ImportError:
            print("PyMuPDF was explicitly requested as the PDF Engine")
            print("Use pypdf instead, or install PyMuPDF. ")
            print("To install PyMuPDF locally, try: `uv sync --extra full` OR `pip install .[full]`")
            if pyhabitat.on_termux():
                print(f"pyhabitat.on_termux() = {pyhabitat.on_termux()}")
                print("PyMuPDF is not expected to work on Termux. Use pypdf.")
            print("\n")
            return None
        from pdflinkcheck.analyze_pymupdf import (extract_links_pymupdf as extract_links, extract_toc_pymupdf as extract_toc)
    else:
        raise ValueError(
            f"Unsupported pdf_library {pdf_library!r}; expected 'pypdf' or 'pymupdf'."
        )

    log("\n--- Starting Analysis ... ---\n")
    if pdf_path is None:
        pdf_path = get_first_pdf_in_cwd()
    if pdf_path is None:
        log("pdf_path is None")
        log("Tip: Drop a PDF in the current folder or pass in a path arg.")
        return None

    try:
        log(f"Target file: {get_friendly_path(pdf_path)}")
        log(f"PDF Engine: {pdf_library}")

        # 1. Extract all active links and the TOC
        extracted_links = extract_links(pdf_path)
        structural_toc = extract_toc(pdf_path)
        toc_entry_count = len(structural_toc)

        if not extracted_links and not structural_toc:
            log(f"\nNo hyperlinks or structural TOC found in {Path(pdf_path).name}.")
            log("(This is common for scanned/image-only PDFs.)")
            return {}

        # 2. Separate the links based on their 'type' key
        external_type = 'External (URI)'
        goto_type = 'Internal (GoTo/Dest)'
        resolved_type = 'Internal (Resolved Action)'
        known_types = (external_type, goto_type, resolved_type)

        uri_links = [link for link in extracted_links if link['type'] == external_type]
        goto_links = [link for link in extracted_links if link['type'] == goto_type]
        resolved_action_links = [link for link in extracted_links if link['type'] == resolved_type]
        other_links = [link for link in extracted_links if link['type'] not in known_types]

        all_internal = goto_links + resolved_action_links
        total_internal_links = len(all_internal)
        uri_and_other = uri_links + other_links
        # A slice bound of None means "no limit".
        limit = max_links if max_links > 0 else None
        row_format = "{:<5} | {:<5} | {:<40} | {}"

        # --- Analysis summary ---
        log("\n" + "=" * SEP_COUNT)
        log(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
        log(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
        log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
        log("=" * SEP_COUNT)

        # --- Section 1: TOC ---
        log(print_structural_toc(structural_toc))

        # --- Section 2: active internal jumps ---
        log("\n" + "=" * SEP_COUNT)
        log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
        log("=" * SEP_COUNT)
        log(row_format.format("Idx", "Page", "Anchor Text", "Jumps To Page"))
        log("-" * SEP_COUNT)

        if total_internal_links > 0:
            for i, link in enumerate(all_internal[:limit], 1):
                link_text = link.get('link_text', 'N/A')
                log(row_format.format(i, link['page'], link_text[:40], link['destination_page']))

            if limit is not None and len(all_internal) > limit:
                log(f"... and {len(all_internal) - limit} more links (use --max-links 0 to show all).")
        else:
            log(" No internal GoTo or Resolved Action links found.")
        log("-" * SEP_COUNT)

        # --- Section 3: active URI links ---
        log("\n" + "=" * SEP_COUNT)
        log(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
        log(row_format.format("Idx", "Page", "Anchor Text", "Target URI/Action"))
        log("=" * SEP_COUNT)

        if uri_and_other:
            for i, link in enumerate(uri_and_other[:limit], 1):
                target = link.get('url') or link.get('remote_file') or link.get('target')
                link_text = link.get('link_text', 'N/A')
                log(row_format.format(i, link['page'], link_text[:40], target))
            if limit is not None and len(uri_and_other) > limit:
                log(f"... and {len(uri_and_other) - limit} more links (use --max-links 0 to show all).")
        else:
            log(" No external or 'Other' links found.")
        log("-" * SEP_COUNT)

        log("\n--- Analysis Complete ---\n")

        # Final aggregation of the buffer into one string
        report_buffer_str = "\n".join(report_buffer)

        final_report_data_dict = {
            "external_links": uri_links,
            "internal_links": all_internal,
            "toc": structural_toc
        }

        # 3. Export the report in every requested format
        if export_format:
            fmt_upper = export_format.upper()

            if "JSON" in fmt_upper:
                export_report_json(final_report_data_dict, pdf_path, pdf_library)

            if "TXT" in fmt_upper:
                export_report_txt(report_buffer_str, pdf_path, pdf_library)

        return {
            "data": final_report_data_dict,  # The structured JSON-ready dict
            "text": report_buffer_str,       # The human-readable string
            "metadata": {                    # Helpful for the GUI/Logs
                "pdf_name": Path(pdf_path).name,
                "library_used": pdf_library,
                "total_links": len(extracted_links)
            }
        }
    except Exception as e:
        message = str(e)
        unreadable_pdf = (
            "invalid pdf header" in message.lower()
            or "EOF marker not found" in message
            or "stream has ended unexpectedly" in message
        )
        if unreadable_pdf:
            # Common read failures are reported as an empty result, not an error.
            log("\nWarning: Could not parse PDF structure — likely an image-only or malformed PDF.")
            log("No hyperlinks or TOC can exist in this file.")
            log("Result: No links found.")
            return {
                "data": {"external_links": [], "internal_links": [], "toc": []},
                "text": "\n".join(report_buffer + [
                    "\nWarning: PDF appears to be image-only or malformed.",
                    "No hyperlinks or structural TOC found."
                ]),
                "metadata": {
                    "pdf_name": Path(pdf_path).name,
                    "library_used": pdf_library,
                    "total_links": 0
                }
            }

        # Anything else is a real failure: record it and let it propagate
        # instead of silently returning None.
        error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
        print(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
        raise
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def print_structural_toc_print(structural_toc: list) -> None:
    """
    Print the structural TOC (bookmarks/outline) as an indented console listing.

    Each entry is indented by four spaces per nesting level below the first
    and followed by its right-aligned target page. Entries whose target page
    is missing or not an integer are still printed; they are simply ignored
    when the page column width is computed.

    Args:
        structural_toc: A list of TOC dictionaries providing 'level',
            'title' and 'target_page'.
    """
    print("\n" + "=" * SEP_COUNT)
    print("## Structural Table of Contents (PDF Bookmarks/Outline)")
    print("=" * SEP_COUNT)
    if not structural_toc:
        print("No structural TOC (bookmarks/outline) found.")
        return

    # Width of the page column: digits of the largest integer page number.
    pages = (item.get('target_page') for item in structural_toc)
    valid_pages = [page for page in pages if isinstance(page, int)]
    page_width = len(str(max(valid_pages))) if valid_pages else 1

    for item in structural_toc:
        # Level 1 = 0 spaces, Level 2 = 4 spaces, Level 3 = 8 spaces, ...
        indent = " " * 4 * (item['level'] - 1)
        page_str = str(item.get('target_page', "N/A")).rjust(page_width)
        print(f"{indent}{item['title']} . . . page {page_str}")

    print("-" * SEP_COUNT)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
    """
    Format the structural TOC as a hierarchical string and optionally print it.

    Each entry is indented by four spaces per nesting level below the first
    and followed by its right-aligned target page. An entry without a
    'target_page' key is rendered with "N/A" as its page.

    Args:
        structural_toc: A list of TOC dictionaries providing 'level',
            'title' and, usually, 'target_page'.
        print_bool: Whether to also print the formatted text to the console.

    Returns:
        The formatted TOC section, or the section header followed by a
        "not found" notice when the list is empty.
    """
    lines = [
        "\n" + "=" * SEP_COUNT,
        "## Structural Table of Contents (PDF Bookmarks/Outline)",
        "=" * SEP_COUNT,
    ]

    if not structural_toc:
        lines.append("No structural TOC (bookmarks/outline) found.")
    else:
        # Width of the page column: digits of the largest integer page
        # number. .get() keeps this scan consistent with the formatting
        # below, which tolerates a missing 'target_page'.
        pages = (item.get('target_page') for item in structural_toc)
        valid_pages = [page for page in pages if isinstance(page, int)]
        page_width = len(str(max(valid_pages))) if valid_pages else 1

        for item in structural_toc:
            indent = " " * 4 * (item['level'] - 1)
            page_str = str(item.get('target_page', "N/A")).rjust(page_width)
            lines.append(f"{indent}{item['title']} . . . page {page_str}")

        lines.append("-" * SEP_COUNT)

    str_structural_toc = "\n".join(lines)

    if print_bool:
        print(str_structural_toc)

    return str_structural_toc
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# src/pdflinkcheck/stdlib_server.py
|
|
2
|
+
import http.server
|
|
3
|
+
import socketserver
|
|
4
|
+
import json
|
|
5
|
+
import tempfile
|
|
6
|
+
import shutil
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import email # This replaces cgi for multipart parsing
|
|
10
|
+
|
|
11
|
+
from pdflinkcheck.report import run_report
|
|
12
|
+
|
|
13
|
+
# TCP port the server listens on when this module is run as a script.
PORT = 8000
|
|
14
|
+
|
|
15
|
+
HTML_FORM = """
|
|
16
|
+
<!doctype html>
|
|
17
|
+
<html>
|
|
18
|
+
<head><title>pdflinkcheck Stdlib Server</title></head>
|
|
19
|
+
<body style="font-family: sans-serif; max-width: 800px; margin: 40px auto;">
|
|
20
|
+
<h1>pdflinkcheck API (Pure Stdlib, without cgi)</h1>
|
|
21
|
+
<p>Upload a PDF for link/TOC analysis. Zero third-party deps, future-proof.</p>
|
|
22
|
+
<form action="/" method="post" enctype="multipart/form-data">
|
|
23
|
+
<p><input type="file" name="file" accept=".pdf" required></p>
|
|
24
|
+
<p>
|
|
25
|
+
<label>Engine:</label>
|
|
26
|
+
<select name="pdf_library">
|
|
27
|
+
<option value="pypdf" selected>pypdf (pure Python, Termux-friendly)</option>
|
|
28
|
+
<option value="pymupdf">pymupdf (faster, if installed)</option>
|
|
29
|
+
</select>
|
|
30
|
+
</p>
|
|
31
|
+
<p>
|
|
32
|
+
<label>Max links to show (0 = all):</label>
|
|
33
|
+
<input type="number" name="max_links" value="0" min="0">
|
|
34
|
+
</p>
|
|
35
|
+
<p><button type="submit">Analyze PDF</button></p>
|
|
36
|
+
</form>
|
|
37
|
+
<hr>
|
|
38
|
+
<p>Returns JSON. Works on Termux & Python 3.13+.</p>
|
|
39
|
+
</body>
|
|
40
|
+
</html>
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
class ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCP server that, through ThreadingMixIn, handles each request in its own thread."""
    # Allow re-binding the listening address right after a restart instead
    # of waiting for the previous socket to time out.
    allow_reuse_address = True
|
|
45
|
+
|
|
46
|
+
class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
    """
    HTTP handler exposing pdflinkcheck through a small upload form.

    GET "/" serves the HTML upload form. POST "/" accepts a
    multipart/form-data upload (fields "file", "pdf_library", "max_links"),
    runs run_report on the uploaded PDF and answers with JSON. Every other
    path yields 404, except "/favicon.ico", which gets an empty 204.
    """

    def do_GET(self):
        """Serve the upload form, an empty favicon response, or 404."""
        if self.path == "/":
            body = HTML_FORM.encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)
            return

        if self.path == "/favicon.ico":
            # Silent no-content response (most browsers cache this)
            self.send_response(204)
            self.end_headers()
            return

        self.send_error(404, "Not Found")

    def do_POST(self):
        """Analyze an uploaded PDF and reply with the report as JSON."""
        if self.path != "/":
            self.send_error(404, "Not Found")
            return

        content_type = self.headers.get("Content-Type")
        if not content_type or "multipart/form-data" not in content_type:
            self._send_json_error("Expected multipart/form-data", 400)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            self._send_json_error("Invalid Content-Length", 400)
            return
        # A negative length would make rfile.read() block until EOF.
        if content_length <= 0:
            self._send_json_error("No body sent", 400)
            return

        # Read the entire body
        body = self.rfile.read(content_length)

        # Parse using email.message (pure stdlib, no cgi): prefix the body
        # with its Content-Type header so it forms a complete MIME message.
        msg = email.message_from_bytes(b"Content-Type: " + content_type.encode() + b"\r\n\r\n" + body)

        if not msg.is_multipart():
            self._send_json_error("Invalid multipart message", 400)
            return

        # Extract parts
        file_item = None
        file_filename = None
        pdf_library = "pypdf"
        max_links = 0

        for part in msg.get_payload():
            disposition = part.get("Content-Disposition", "")
            if not disposition.startswith("form-data"):
                continue

            name = part.get_param("name", header="Content-Disposition")
            filename = part.get_param("filename", header="Content-Disposition")

            if name == "file" and filename:
                if not filename.lower().endswith(".pdf"):
                    self._send_json_error("Only .pdf files allowed", 400)
                    return
                file_item = part.get_payload(decode=True)  # bytes
                file_filename = filename

            elif name == "pdf_library":
                pdf_library = self._part_text(part).lower()
                if pdf_library not in {"pypdf", "pymupdf"}:
                    self._send_json_error("Invalid pdf_library", 400)
                    return

            elif name == "max_links":
                try:
                    max_links = int(self._part_text(part))
                except ValueError:
                    max_links = 0

        if not file_item:
            self._send_json_error("No PDF file uploaded", 400)
            return

        # Save uploaded file to temp
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                # Record the name first so cleanup still runs if the write fails.
                tmp_path = tmp_file.name
                tmp_file.write(file_item)

            result = run_report(
                pdf_path=tmp_path,
                max_links=max(max_links, 0),
                export_format="",
                pdf_library=pdf_library,
                print_bool=False
            )

            if result is None:
                # run_report gives no result when the requested engine
                # cannot be imported.
                self._send_json_error(
                    f"Analysis produced no result; the '{pdf_library}' engine may not be installed",
                    500,
                )
                return

            # run_report returns an empty dict for PDFs without links or a
            # TOC; report that as an empty, successful analysis.
            response = {
                "filename": file_filename,
                "pdf_library_used": pdf_library,
                "total_links": result.get("metadata", {}).get("total_links", 0),
                "data": result.get("data", {"external_links": [], "internal_links": [], "toc": []}),
                "text_report": result.get("text", "")
            }

            self._send_json(response)

        except Exception as e:
            self._send_json_error(f"Analysis failed: {str(e)}", 500)
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    @staticmethod
    def _part_text(part):
        """Return a form field's payload as stripped text ('' when it has none)."""
        raw = part.get_payload(decode=True) or b""
        return raw.decode("utf-8", errors="replace").strip()

    def _send_json(self, data, status=200):
        """Send `data` as a UTF-8 JSON response with the given status code."""
        # Serialize before any header is written, so a serialization error
        # cannot leave a half-sent response behind.
        json_bytes = json.dumps(data, indent=2, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(json_bytes)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(json_bytes)

    def _send_json_error(self, message, status=400):
        """Send a JSON body of the form {"error": message} with the given status."""
        self._send_json({"error": message}, status)
|
|
167
|
+
|
|
168
|
+
# Script entry point: serve the upload form and API until interrupted.
if __name__ == "__main__":
    # An empty host string is passed as the bind address; the with
    # statement closes the server when the block is left.
    with ThreadedTCPServer(("", PORT), PDFLinkCheckHandler) as httpd:
        print(f"pdflinkcheck pure-stdlib server (no cgi) running at http://localhost:{PORT}")
        print("Future-proof for Python 3.13+ • Handles concurrent uploads")
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            # Ctrl+C: announce the shutdown and release the listening socket.
            print("\nShutting down...")
            httpd.server_close()
|