pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdflinkcheck/io.py CHANGED
@@ -1,8 +1,9 @@
1
1
  # src/pdflinkcheck/io.py
2
2
  import logging
3
3
  import json
4
+ import sys
4
5
  from pathlib import Path
5
- from typing import Dict, Any, Union, List
6
+ from typing import Dict, Any, Union, List, Optional
6
7
 
7
8
  # --- Configuration ---
8
9
 
@@ -27,6 +28,14 @@ def setup_error_logger():
27
28
  """
28
29
  Configures a basic logger that writes errors and warnings to a file
29
30
  in the PDFLINKCHECK_HOME directory.
31
+
32
+ # Example of how an external module can log an error:
33
+ # from pdflinkcheck.io import error_logger
34
+ # try:
35
+ # ...
36
+ # except Exception as e:
37
+ # error_logger.exception("An exception occurred during link extraction.")
38
+
30
39
  """
31
40
  # Create the logger instance
32
41
  logger = logging.getLogger('pdflinkcheck_logger')
@@ -57,14 +66,15 @@ error_logger = setup_error_logger()
57
66
  def export_report_data(
58
67
  report_data: Dict[str, Any],
59
68
  pdf_filename: str,
60
- export_format: str = "JSON"
69
+ export_format: str = "JSON",
70
+ pdf_library: str = "", # expected to be specificed every time.
61
71
  ) -> Path:
62
72
  """
63
73
  Exports the structured analysis report data to a file in the
64
74
  PDFLINKCHECK_HOME directory.
65
75
 
66
76
  Args:
67
- report_data: The dictionary containing the results from run_analysis.
77
+ report_data: The dictionary containing the results from run_report.
68
78
  pdf_filename: The base filename of the PDF being analyzed (used for the output file name).
69
79
  export_format: The desired output format ('json' currently supported).
70
80
 
@@ -80,7 +90,7 @@ def export_report_data(
80
90
 
81
91
  # Create an output file name based on the PDF name and a timestamp
82
92
  base_name = Path(pdf_filename).stem
83
- output_filename = f"{base_name}_report.json"
93
+ output_filename = f"{base_name}_{pdf_library}_report.json"
84
94
  output_path = PDFLINKCHECK_HOME / output_filename
85
95
 
86
96
  try:
@@ -88,7 +98,7 @@ def export_report_data(
88
98
  # Use indent for readability
89
99
  json.dump(report_data, f, indent=4)
90
100
 
91
- print(f"\nReport successfully exported to: {output_path}")
101
+ print(f"\nReport successfully exported to: {get_friendly_path(output_path)}")
92
102
  return output_path
93
103
 
94
104
  except Exception as e:
@@ -96,11 +106,108 @@ def export_report_data(
96
106
  # Re-raise the exception after logging for caller to handle
97
107
  raise RuntimeError(f"Report export failed due to an I/O error: {e}")
98
108
 
99
- # Example of how an external module can log an error:
100
- # from pdflinkcheck.io import error_logger
101
- # try:
102
- # ...
103
- # except Exception as e:
104
- # error_logger.exception("An exception occurred during link extraction.")
109
def export_report_json(
    report_data: Dict[str, Any],
    pdf_filename: str,
    pdf_library: str
) -> Path:
    """
    Write the structured report dictionary to a JSON file in PDFLINKCHECK_HOME.

    The output file is named ``<pdf stem>_<pdf_library>_report.json``.

    Args:
        report_data: JSON-serializable dictionary of report results.
        pdf_filename: Name or path of the analyzed PDF; only its stem is used.
        pdf_library: Name of the PDF engine, embedded in the output file name.

    Returns:
        The path of the file that was written.

    Raises:
        RuntimeError: If opening, serializing, or reporting the export fails.
            The failure is also recorded through ``error_logger``.
    """
    stem = Path(pdf_filename).stem
    target = PDFLINKCHECK_HOME / f"{stem}_{pdf_library}_report.json"

    try:
        with open(target, 'w', encoding='utf-8') as handle:
            # Indented output keeps the file readable for humans.
            json.dump(report_data, handle, indent=4)
        friendly = get_friendly_path(target)
        print(f"\nJSON report exported: {friendly}")
    except Exception as e:
        error_logger.error(f"JSON export failed: {e}", exc_info=True)
        raise RuntimeError(f"JSON export failed: {e}")
    return target
126
+
127
def export_report_txt(
    report_text: str,
    pdf_filename: str,
    pdf_library: str
) -> Path:
    """
    Write the human-readable report text to a .txt file in PDFLINKCHECK_HOME.

    The output file is named ``<pdf stem>_<pdf_library>_report.txt``.

    Args:
        report_text: The already formatted report as a single string.
        pdf_filename: Name or path of the analyzed PDF; only its stem is used.
        pdf_library: Name of the PDF engine, embedded in the output file name.

    Returns:
        The path of the file that was written.

    Raises:
        RuntimeError: If writing or reporting the export fails. The failure
            is also recorded through ``error_logger``.
    """
    stem = Path(pdf_filename).stem
    target = PDFLINKCHECK_HOME / f"{stem}_{pdf_library}_report.txt"

    try:
        target.write_text(report_text, encoding='utf-8')
        friendly = get_friendly_path(target)
        print(f"\nTXT report exported: {friendly}")
    except Exception as e:
        error_logger.error(f"TXT export failed: {e}", exc_info=True)
        raise RuntimeError(f"TXT export failed: {e}")
    return target
143
+
144
def export_validation_json(
    report_data: Dict[str, Any],
    pdf_filename: str,
    pdf_library: str
) -> Path:
    """
    Write validation results to a JSON file in PDFLINKCHECK_HOME.

    The output file is named ``<pdf stem>_<pdf_library>_validation.json``.

    Args:
        report_data: JSON-serializable dictionary of validation results.
        pdf_filename: Name or path of the analyzed PDF; only its stem is used.
        pdf_library: Name of the PDF engine, embedded in the output file name.

    Returns:
        The path of the file that was written.

    Raises:
        RuntimeError: If opening, serializing, or reporting the export fails.
            The failure is also recorded through ``error_logger``.
    """
    stem = Path(pdf_filename).stem
    target = PDFLINKCHECK_HOME / f"{stem}_{pdf_library}_validation.json"

    try:
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(report_data, handle, indent=4)
        friendly = get_friendly_path(target)
        print(f"\nJSON validation exported: {friendly}")
    except Exception as e:
        error_logger.error(f"JSON validation export failed: {e}", exc_info=True)
        raise RuntimeError(f"JSON validation export failed: {e}")
    return target
106
161
 
162
+ # --- helpers ---
163
def get_friendly_path(full_path: Union[str, Path]) -> str:
    """
    Return a display form of a path with the user's home directory shown as ``~``.

    Args:
        full_path: A path given either as a string or as a ``Path``.

    Returns:
        ``~`` followed by the remainder of the path when the resolved path is
        the home directory or lies inside it; otherwise the resolved absolute
        path as a string.
    """
    p = Path(full_path).resolve()
    home = Path.home()

    # Compare path components rather than raw text: a plain string replace
    # would also rewrite sibling directories that merely share the prefix
    # (e.g. /home/oolong2) and any later occurrence of the home string.
    # The resolved home is tried as well because ``p`` is resolved and the
    # home directory itself may be reached through a symlink.
    for base in (home, home.resolve()):
        try:
            return str(Path("~") / p.relative_to(base))
        except ValueError:
            # Not located under this base; try the next candidate.
            continue
    return str(p)
170
+
171
def get_first_pdf_in_cwd() -> Optional[str]:
    """
    Find a fallback PDF in the current working directory.

    Only regular files directly inside the CWD are considered (no recursion).
    The extension check is case-insensitive. When several PDFs are present the
    one whose name sorts first (case-insensitively) is chosen, so the result
    does not depend on the order in which the filesystem lists entries.

    This is a convenience for running the tool without an explicit path; it
    prints a notice about the fallback to stdout.

    Returns:
        The absolute path (as a string) of the selected PDF, or None if the
        CWD holds no PDF or could not be read.
    """
    print("No PDF argument was provided. Falling back to using the first PDF available at the current path.")
    try:
        # glob() is case-sensitive on Unix, so compare the lowered suffix instead.
        candidates = [
            entry for entry in Path.cwd().iterdir()
            if entry.is_file() and entry.suffix.lower() == '.pdf'
        ]
        if not candidates:
            return None
        # Secondary key keeps the choice stable when names differ only by case.
        chosen = min(candidates, key=lambda entry: (entry.name.lower(), entry.name))
        first_pdf_path = chosen.resolve()
    except OSError as e:
        # Unreadable or vanished directory, permission problems, etc.
        print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
        return None

    print(f"Fallback PDF found: {first_pdf_path.name}")
    return str(first_pdf_path)
pdflinkcheck/report.py ADDED
@@ -0,0 +1,280 @@
1
+ # pdflinkcheck/report.py
2
+
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Optional, Dict, Any
6
+ import pyhabitat
7
+
8
+ from pdflinkcheck.io import error_logger, export_report_json, export_report_txt, get_first_pdf_in_cwd, get_friendly_path, LOG_FILE_PATH
9
+
10
+
11
def run_report(pdf_path: Optional[str] = None, max_links: int = 0, export_format: str = "JSON", pdf_library: str = "pypdf", print_bool: bool = True) -> Optional[Dict[str, Any]]:
    """
    Core high-level PDF link analysis logic.

    Extracts the active links and the structural TOC of a PDF with the chosen
    engine, builds a human-readable report (printed as it is built when
    ``print_bool`` is true) and optionally exports it.

    Args:
        pdf_path: File system path to the target PDF. When None, the first
            PDF found in the current working directory is used.
        max_links: Maximum number of links listed per report section.
            If <= 0, all links are listed.
        export_format: Export selector. A JSON file is written when the value
            contains "JSON" and a text file when it contains "TXT"
            (case-insensitive). An empty value disables exporting.
        pdf_library: PDF engine to use, "pypdf" or "pymupdf" (case-insensitive).
        print_bool: Whether report lines are also printed to the console.

    Returns:
        * A dict with the keys ``"data"`` (``external_links``,
          ``internal_links``, ``toc``), ``"text"`` (the report as one string)
          and ``"metadata"`` (``pdf_name``, ``library_used``, ``total_links``)
          after a completed analysis, or when the PDF could not be parsed
          because it looks image-only/malformed (then with empty data).
        * An empty dict when the PDF has neither links nor a structural TOC.
        * None when the analysis could not start: no PDF available, PyMuPDF
          requested but not installed, or an unsupported ``pdf_library``.

    Raises:
        Exception: Any failure other than the recognised "unparseable PDF"
            errors is logged through ``error_logger`` and re-raised.
    """
    report_buffer = []

    # Every report line is buffered for the text report; printing is optional.
    # NOTE(original author): console printing is meant to move out of this helper.
    def log(msg: str):
        if print_bool:
            print(msg)
        report_buffer.append(msg)

    # Select the extraction backend. Imports are local so that the optional
    # PyMuPDF dependency is only required when it is explicitly requested.
    pdf_library = pdf_library.lower()
    if pdf_library == "pypdf":
        from pdflinkcheck.analyze_pypdf import (extract_links_pypdf as extract_links, extract_toc_pypdf as extract_toc)
    elif pdf_library == "pymupdf":
        try:
            import fitz  # noqa: F401 -- imported only to probe availability
        except ImportError:
            print("PyMuPDF was explicitly requested as the PDF Engine")
            print("Use pypdf instead, or install PyMuPDF. ")
            print("To install PyMuPDF locally, try: `uv sync --extra full` OR `pip install .[full]`")
            if pyhabitat.on_termux():
                print(f"pyhabitat.on_termux() = {pyhabitat.on_termux()}")
                print("PyMuPDF is not expected to work on Termux. Use pypdf.")
            print("\n")
            return None
        from pdflinkcheck.analyze_pymupdf import (extract_links_pymupdf as extract_links, extract_toc_pymupdf as extract_toc)
    else:
        # Without this branch the extractors would be unbound further down.
        print(f"Unsupported pdf_library {pdf_library!r}. Use 'pypdf' or 'pymupdf'.", file=sys.stderr)
        return None

    log("\n--- Starting Analysis ... ---\n")
    if pdf_path is None:
        pdf_path = get_first_pdf_in_cwd()
    if pdf_path is None:
        log("pdf_path is None")
        log("Tip: Drop a PDF in the current folder or pass in a path arg.")
        return None

    try:
        log(f"Target file: {get_friendly_path(pdf_path)}")
        log(f"PDF Engine: {pdf_library}")

        # 1. Extract all active links and the TOC
        extracted_links = extract_links(pdf_path)
        structural_toc = extract_toc(pdf_path)
        toc_entry_count = len(structural_toc)

        if not extracted_links and not structural_toc:
            log(f"\nNo hyperlinks or structural TOC found in {Path(pdf_path).name}.")
            log("(This is common for scanned/image-only PDFs.)")
            return {}

        # 2. Separate the links based on their 'type' key
        uri_links = [link for link in extracted_links if link['type'] == 'External (URI)']
        goto_links = [link for link in extracted_links if link['type'] == 'Internal (GoTo/Dest)']
        resolved_action_links = [link for link in extracted_links if link['type'] == 'Internal (Resolved Action)']
        other_links = [link for link in extracted_links if link['type'] not in ['External (URI)', 'Internal (GoTo/Dest)', 'Internal (Resolved Action)']]

        all_internal = goto_links + resolved_action_links
        total_internal_links = len(all_internal)
        uri_and_other = uri_links + other_links
        # A slice bound of None means "no limit".
        limit = max_links if max_links > 0 else None

        # --- ANALYSIS SUMMARY ---
        log("\n" + "=" * 70)
        log(f"--- Link Analysis Results for {Path(pdf_path).name} ---")
        log(f"Total active links: {len(extracted_links)} (External: {len(uri_links)}, Internal Jumps: {total_internal_links}, Other: {len(other_links)})")
        log(f"Total **structural TOC entries (bookmarks)** found: {toc_entry_count}")
        log("=" * 70)

        # --- Section 1: TOC ---
        str_structural_toc = print_structural_toc(structural_toc)
        log(str_structural_toc)

        # --- Section 2: ACTIVE INTERNAL JUMPS ---
        log("\n" + "=" * 70)
        log(f"## Active Internal Jumps (GoTo & Resolved Actions) - {total_internal_links} found")
        log("=" * 70)
        log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Jumps To Page"))
        log("-" * 70)

        if total_internal_links > 0:
            for i, link in enumerate(all_internal[:limit], 1):
                link_text = link.get('link_text', 'N/A')
                log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], link['destination_page']))

            if limit is not None and len(all_internal) > limit:
                log(f"... and {len(all_internal) - limit} more links (use --max-links 0 to show all).")
        else:
            log(" No internal GoTo or Resolved Action links found.")
        log("-" * 70)

        # --- Section 3: ACTIVE URI LINKS ---
        log("\n" + "=" * 70)
        log(f"## Active URI Links (External & Other) - {len(uri_and_other)} found")
        log("{:<5} | {:<5} | {:<40} | {}".format("Idx", "Page", "Anchor Text", "Target URI/Action"))
        log("=" * 70)

        if uri_and_other:
            for i, link in enumerate(uri_and_other[:limit], 1):
                target = link.get('url') or link.get('remote_file') or link.get('target')
                link_text = link.get('link_text', 'N/A')
                log("{:<5} | {:<5} | {:<40} | {}".format(i, link['page'], link_text[:40], target))
            if limit is not None and len(uri_and_other) > limit:
                log(f"... and {len(uri_and_other) - limit} more links (use --max-links 0 to show all).")
        else:
            log(" No external or 'Other' links found.")
        log("-" * 70)

        log("\n--- Analysis Complete ---\n")

        # Final aggregation of the buffer into one string
        report_buffer_str = "\n".join(report_buffer)

        final_report_data_dict = {
            "external_links": uri_links,
            "internal_links": all_internal,
            "toc": structural_toc
        }

        # 3. Export the report in every requested format
        if export_format:
            fmt_upper = export_format.upper()

            if "JSON" in fmt_upper:
                export_report_json(final_report_data_dict, pdf_path, pdf_library)

            if "TXT" in fmt_upper:
                export_report_txt(report_buffer_str, pdf_path, pdf_library)

        return {
            "data": final_report_data_dict,  # The structured JSON-ready dict
            "text": report_buffer_str,       # The human-readable string
            "metadata": {                    # Helpful for the GUI/Logs
                "pdf_name": Path(pdf_path).name,
                "library_used": pdf_library,
                "total_links": len(extracted_links)
            }
        }
    except Exception as e:
        message = str(e)
        # Recognised read failures are reported as "no links" instead of
        # being treated as a crash.
        unparseable = (
            "invalid pdf header" in message.lower()
            or "EOF marker not found" in message
            or "stream has ended unexpectedly" in message
        )
        if unparseable:
            log("\nWarning: Could not parse PDF structure — likely an image-only or malformed PDF.")
            log("No hyperlinks or TOC can exist in this file.")
            log("Result: No links found.")
            return {
                "data": {"external_links": [], "internal_links": [], "toc": []},
                "text": "\n".join(report_buffer + [
                    "\nWarning: PDF appears to be image-only or malformed.",
                    "No hyperlinks or structural TOC found."
                ]),
                "metadata": {
                    "pdf_name": Path(pdf_path).name,
                    "library_used": pdf_library,
                    "total_links": 0
                }
            }

        # Anything else is a genuine failure: record it and let the caller decide.
        error_logger.error(f"Critical failure during run_report for {pdf_path}: {e}", exc_info=True)
        print(f"FATAL: Analysis failed. Check logs at {LOG_FILE_PATH}", file=sys.stderr)
        raise
202
+
203
+
204
def print_structural_toc_print(structural_toc: list) -> None:
    """
    Print the structural TOC (bookmarks/outline) as an indented console listing.

    Args:
        structural_toc: A list of TOC dictionaries. Each entry must provide
            'level' and 'title'; 'target_page' may be missing or non-numeric,
            in which case it is printed as-is ("N/A" when missing).
    """
    print("\n" + "=" * 70)
    print("## Structural Table of Contents (PDF Bookmarks/Outline)")
    print("=" * 70)
    if not structural_toc:
        print("No structural TOC (bookmarks/outline) found.")
        return

    # Only integer pages take part in the width calculation; comparing
    # unresolved targets (None, strings) with integers would raise TypeError.
    numeric_pages = [
        item.get('target_page') for item in structural_toc
        if isinstance(item.get('target_page'), int)
    ]
    page_width = len(str(max(numeric_pages))) if numeric_pages else 1

    for item in structural_toc:
        # Level 1 = 0 spaces, Level 2 = 4 spaces, Level 3 = 8 spaces, ...
        indent = " " * 4 * (item['level'] - 1)
        page_str = str(item.get('target_page', "N/A")).rjust(page_width)
        print(f"{indent}{item['title']} . . . page {page_str}")

    print("-" * 70)
232
+
233
+
234
def print_structural_toc(structural_toc: list, print_bool: bool = False) -> str:
    """
    Format the structural TOC as a hierarchical listing and optionally print it.

    Args:
        structural_toc: A list of TOC dictionaries. Each entry must provide
            'level' and 'title'; 'target_page' may be missing (rendered as
            "N/A") or non-numeric (rendered as-is).
        print_bool: Whether to also print the result to the console.

    Returns:
        The formatted TOC section as a single string.
    """
    lines = [
        "\n" + "=" * 70,
        "## Structural Table of Contents (PDF Bookmarks/Outline)",
        "=" * 70,
    ]

    if not structural_toc:
        lines.append("No structural TOC (bookmarks/outline) found.")
    else:
        # Look the page up once per entry, with the same default for both the
        # width calculation and the rendering, so a missing key cannot raise.
        pages = [item.get('target_page', "N/A") for item in structural_toc]

        # Width of the largest integer page number, for right-aligned columns.
        valid_pages = [page for page in pages if isinstance(page, int)]
        page_width = len(str(max(valid_pages))) if valid_pages else 1

        for item, page in zip(structural_toc, pages):
            # Four spaces of indentation per nesting level below the first.
            indent = " " * 4 * (item['level'] - 1)
            page_str = str(page).rjust(page_width)
            lines.append(f"{indent}{item['title']} . . . page {page_str}")

        lines.append("-" * 70)

    str_structural_toc = "\n".join(lines)
    if print_bool:
        print(str_structural_toc)
    return str_structural_toc
@@ -0,0 +1,176 @@
1
+ # src/pdflinkcheck/stdlib_server.py
2
+ import http.server
3
+ import socketserver
4
+ import json
5
+ import tempfile
6
+ import shutil
7
+ import os
8
+ from pathlib import Path
9
+ import email # This replaces cgi for multipart parsing
10
+
11
+ from pdflinkcheck.report import run_report
12
+
13
+ PORT = 8000
14
+
15
+ HTML_FORM = """
16
+ <!doctype html>
17
+ <html>
18
+ <head><title>pdflinkcheck Stdlib Server</title></head>
19
+ <body style="font-family: sans-serif; max-width: 800px; margin: 40px auto;">
20
+ <h1>pdflinkcheck API (Pure Stdlib, without cgi)</h1>
21
+ <p>Upload a PDF for link/TOC analysis. Zero third-party deps, future-proof.</p>
22
+ <form action="/" method="post" enctype="multipart/form-data">
23
+ <p><input type="file" name="file" accept=".pdf" required></p>
24
+ <p>
25
+ <label>Engine:</label>
26
+ <select name="pdf_library">
27
+ <option value="pypdf" selected>pypdf (pure Python, Termux-friendly)</option>
28
+ <option value="pymupdf">pymupdf (faster, if installed)</option>
29
+ </select>
30
+ </p>
31
+ <p>
32
+ <label>Max links to show (0 = all):</label>
33
+ <input type="number" name="max_links" value="0" min="0">
34
+ </p>
35
+ <p><button type="submit">Analyze PDF</button></p>
36
+ </form>
37
+ <hr>
38
+ <p>Returns JSON. Works on Termux & Python 3.13+.</p>
39
+ </body>
40
+ </html>
41
+ """
42
+
43
class ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCP server that dispatches every incoming request to its own thread."""

    # Lets the server rebind its port right after a restart.
    allow_reuse_address = True
45
+
46
class PDFLinkCheckHandler(http.server.SimpleHTTPRequestHandler):
    """
    HTTP handler for the pdflinkcheck upload form and analysis endpoint.

    GET /   serves the HTML upload form.
    POST /  accepts a multipart/form-data upload (fields: file, pdf_library,
            max_links), runs the analysis and answers with JSON.
    """

    def do_GET(self):
        """Serve the upload form, answer favicon probes, 404 everything else."""
        if self.path == "/":
            page = HTML_FORM.encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(page)))
            self.end_headers()
            self.wfile.write(page)
            return

        if self.path == "/favicon.ico":
            # Silent no-content response (most browsers cache this)
            self.send_response(204)
            self.end_headers()
            return

        self.send_error(404, "Not Found")

    def do_POST(self):
        """Validate the upload, run the analysis and send the JSON result."""
        if self.path != "/":
            self.send_error(404, "Not Found")
            return

        content_type = self.headers.get("Content-Type")
        if not content_type or "multipart/form-data" not in content_type:
            self._send_json_error("Expected multipart/form-data", 400)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            self._send_json_error("Invalid Content-Length header", 400)
            return
        # A negative length would make rfile.read() wait for end of stream.
        if content_length <= 0:
            self._send_json_error("No body sent", 400)
            return

        # NOTE(review): the whole upload is held in memory and no size cap is
        # enforced here -- confirm that is acceptable for the deployment.
        body = self.rfile.read(content_length)

        # Parse with the email package (pure stdlib, no cgi): prefix the body
        # with the request's Content-Type so the boundary is known.
        msg = email.message_from_bytes(b"Content-Type: " + content_type.encode() + b"\r\n\r\n" + body)

        if not msg.is_multipart():
            self._send_json_error("Invalid multipart message", 400)
            return

        # Extract the form fields
        file_item = None
        file_filename = None
        pdf_library = "pypdf"
        max_links = 0

        for part in msg.get_payload():
            disposition = part.get("Content-Disposition", "")
            if not disposition.startswith("form-data"):
                continue

            name = part.get_param("name", header="Content-Disposition")
            filename = part.get_param("filename", header="Content-Disposition")

            if name == "file" and filename:
                if not filename.lower().endswith(".pdf"):
                    self._send_json_error("Only .pdf files allowed", 400)
                    return
                file_item = part.get_payload(decode=True)  # bytes
                file_filename = filename

            elif name == "pdf_library":
                pdf_library = self._field_text(part).lower()
                if pdf_library not in {"pypdf", "pymupdf"}:
                    self._send_json_error("Invalid pdf_library", 400)
                    return

            elif name == "max_links":
                try:
                    max_links = int(self._field_text(part))
                except ValueError:
                    max_links = 0

        if not file_item:
            self._send_json_error("No PDF file uploaded", 400)
            return

        # The analysis works on a path, so the upload goes to a temp file
        # that is removed again in the finally clause.
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                tmp_file.write(file_item)
                tmp_path = tmp_file.name

            result = run_report(
                pdf_path=tmp_path,
                max_links=max(max_links, 0),
                export_format="",
                pdf_library=pdf_library,
                print_bool=False
            )

            # run_report returns None when the analysis could not start
            # (e.g. the requested engine is not installed).
            if result is None:
                self._send_json_error("Analysis could not be run with the requested PDF engine", 500)
                return

            # run_report returns an empty dict for PDFs without links or TOC,
            # so every key is read with a fallback.
            metadata = result.get("metadata", {})
            response = {
                "filename": file_filename,
                "pdf_library_used": pdf_library,
                "total_links": metadata.get("total_links", 0),
                "data": result.get("data", {"external_links": [], "internal_links": [], "toc": []}),
                "text_report": result.get("text", "")
            }

            self._send_json(response)

        except Exception as e:
            self._send_json_error(f"Analysis failed: {str(e)}", 500)
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    @staticmethod
    def _field_text(part):
        """Return a text form field's value as a stripped string."""
        raw = part.get_payload(decode=True) or b""
        return raw.decode("utf-8", errors="replace").strip()

    def _send_json(self, data, status=200):
        """Send ``data`` as a JSON response with the given HTTP status."""
        # Serialize before any header is written so that a serialization
        # error cannot leave a half-sent response behind.
        json_bytes = json.dumps(data, indent=2, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(json_bytes)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(json_bytes)

    def _send_json_error(self, message, status=400):
        """Send ``{"error": message}`` as JSON with the given HTTP status."""
        self._send_json({"error": message}, status)
167
+
168
if __name__ == "__main__":
    # An empty host string binds the server on all interfaces.
    server_address = ("", PORT)
    with ThreadedTCPServer(server_address, PDFLinkCheckHandler) as httpd:
        print(f"pdflinkcheck pure-stdlib server (no cgi) running at http://localhost:{PORT}")
        print("Future-proof for Python 3.13+ • Handles concurrent uploads")
        try:
            # Blocks until interrupted (Ctrl+C).
            httpd.serve_forever()
        except KeyboardInterrupt:
            print("\nShutting down...")
            httpd.server_close()