pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. pdflinkcheck/__init__.py +88 -18
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
  6. pdflinkcheck/cli.py +52 -48
  7. pdflinkcheck/data/LICENSE +18 -15
  8. pdflinkcheck/data/README.md +23 -25
  9. pdflinkcheck/data/pyproject.toml +17 -26
  10. pdflinkcheck/datacopy.py +16 -1
  11. pdflinkcheck/dev.py +2 -2
  12. pdflinkcheck/environment.py +14 -2
  13. pdflinkcheck/gui.py +346 -563
  14. pdflinkcheck/helpers.py +88 -0
  15. pdflinkcheck/io.py +24 -6
  16. pdflinkcheck/report.py +598 -97
  17. pdflinkcheck/security.py +189 -0
  18. pdflinkcheck/splash.py +38 -0
  19. pdflinkcheck/stdlib_server.py +7 -21
  20. pdflinkcheck/stdlib_server_alt.py +571 -0
  21. pdflinkcheck/tk_utils.py +188 -0
  22. pdflinkcheck/update_msix_version.py +2 -0
  23. pdflinkcheck/validate.py +104 -170
  24. pdflinkcheck/version_info.py +2 -2
  25. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
  26. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
  27. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  28. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  29. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  30. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  31. pdflinkcheck/analyze_pypdf_v2.py +0 -217
  32. pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
  33. pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
  34. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
  35. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
@@ -0,0 +1,571 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
3
+ """
4
+ pdflinkcheck stdlib HTTP service
5
+ ===============================
6
+
7
+ This module implements a small, single-purpose HTTP service intended to be:
8
+
9
+ - Packaged inside a PYZ
10
+ - Run locally, on LAN, or behind a reverse proxy
11
+ - Used as a backend for CLI, GUI, or web clients
12
+
13
+ IMPORTANT:
14
+ ----------
15
+ This server is NOT intended to be exposed directly to the public internet.
16
+
17
+ When running in public-facing deployments, it MUST be placed behind a
18
+ reverse proxy (e.g. Caddy, nginx, cloudflared) which provides:
19
+
20
+ - TLS termination
21
+ - Request size limits
22
+ - Connection timeouts
23
+ - Rate limiting
24
+ - Protection against slowloris-style attacks
25
+
26
+ This module intentionally does NOT:
27
+ - Manage TLS certificates
28
+ - Implement authentication
29
+ - Perform rate limiting
30
+ - Handle HTTP/2 or proxy protocols
31
+
32
+ Those concerns belong to infrastructure, not application code.
33
+
34
+ PUBLIC MODE:
35
+ ------------
36
+ When --public is enabled, this server assumes:
37
+ - A reverse proxy is present
38
+ - TLS is terminated upstream
39
+ - The service may be reachable by untrusted clients
40
+
41
+ In public mode, the server:
42
+ - Enables stricter limits
43
+ - Refuses new work during shutdown
44
+ """
45
+
46
+ from __future__ import annotations
47
+
48
+ import http.server
49
+ import socketserver
50
+ import json
51
+ import tempfile
52
+ import os
53
+ import email
54
+ import signal
55
+ import threading
56
+ from dataclasses import dataclass
57
+ from typing import Optional
58
+
59
try:
    from pdflinkcheck.report import run_report_and_call_exports
except Exception:
    # Best-effort import: the module must stay importable even when the
    # reporting backend cannot be loaded. Catch Exception rather than using a
    # bare ``except`` so SystemExit / KeyboardInterrupt are not swallowed, and
    # bind the name explicitly so later code fails on a defined value instead
    # of an undefined global.
    run_report_and_call_exports = None
63
+
64
+ # =========================
65
+ # Configuration
66
+ # =========================
67
+
68
+ HOST = "127.0.0.1"
69
+ PORT = 8000
70
+
71
+ MAX_UPLOAD_BYTES = 25 * 1024 * 1024 # 25 MB
72
+ ALLOWED_LIBRARIES = {"pypdf", "pymupdf", "pdfium"}
73
+
74
+ # Concurrency control
75
+ MAX_CONCURRENT_JOBS = 2
76
+ REQUEST_SEMAPHORE = threading.Semaphore(MAX_CONCURRENT_JOBS)
77
+
78
+ # Shutdown coordination
79
+ SHUTDOWN_EVENT = threading.Event()
80
+
81
+ # Set via CLI in real usage
82
+ PUBLIC_MODE = False
83
+
84
+
85
+
86
+ # =========================
87
+ # HTML UI
88
+ # =========================
89
+
90
+ HTML_FORM = """<!doctype html>
91
+ <html>
92
+ <head>
93
+ <title>pdflinkcheck API</title>
94
+ <meta charset="utf-8">
95
+ <!--style>
96
+ body {
97
+ font-family: system-ui, sans-serif;
98
+ max-width: 800px;
99
+ margin: 40px auto;
100
+ }
101
+ button { padding: 6px 12px; }
102
+ </style-->
103
+ <style>
104
+ body {
105
+ font-family: system-ui, sans-serif;
106
+ max-width: 800px;
107
+ margin: 40px auto;
108
+ line-height: 1.6;
109
+ background: #f8f9fa;
110
+ color: #212529;
111
+ }
112
+ h1 { text-align: center; }
113
+ form {
114
+ background: white;
115
+ padding: 20px;
116
+ border-radius: 12px;
117
+ box-shadow: 0 0 12px rgba(0,0,0,0.1);
118
+ }
119
+ input, select, button {
120
+ padding: 8px 12px;
121
+ margin-top: 6px;
122
+ margin-bottom: 12px;
123
+ border-radius: 6px;
124
+ border: 1px solid #ccc;
125
+ width: 100%;
126
+ box-sizing: border-box;
127
+ }
128
+ button {
129
+ background-color: #0d6efd;
130
+ color: white;
131
+ border: none;
132
+ cursor: pointer;
133
+ }
134
+ button:hover { background-color: #0b5ed7; }
135
+ </style>
136
+ </head>
137
+ <body>
138
+ <h1>pdflinkcheck (stdlib server)</h1>
139
+ <p>Upload a PDF for link and TOC analysis.</p>
140
+
141
+ <form action="/" method="post" enctype="multipart/form-data">
142
+ <p>
143
+ <input type="file" name="file" accept=".pdf" required>
144
+ </p>
145
+ <p>
146
+ <label>Engine:</label>
147
+ <select name="pdf_library">
148
+ <option value="pypdf" selected>pypdf (pure Python)</option>
149
+ <option value="pymupdf">PyMuPDF (fast, AGPL)</option>
150
+ <option value="pdfium">PDFium (fast, permissive)</option>
151
+ </select>
152
+ </p>
153
+ <button type="submit">Analyze</button>
154
+ </form>
155
+
156
+ <p>Returns JSON.</p>
157
+ </body>
158
+ </html>
159
+ """
160
+
161
+ # =========================
162
+ # Documentation
163
+ # =========================
164
+ OPENAPI_SPEC = {
165
+ "openapi": "3.0.3",
166
+ "info": {
167
+ "title": "pdflinkcheck API",
168
+ "description": (
169
+ "Single-purpose API for analyzing PDF links and tables of contents.\n\n"
170
+ "This service is designed to run behind a reverse proxy and accepts "
171
+ "multipart/form-data uploads containing a PDF file."
172
+ ),
173
+ "version": "1.1.0",
174
+ "license": {
175
+ "name": "MIT"
176
+ }
177
+ },
178
+ "servers": [
179
+ {"url": "/"}
180
+ ],
181
+ "paths": {
182
+ "/": {
183
+ "post": {
184
+ "summary": "Analyze a PDF",
185
+ "description": "Uploads a PDF file and returns link analysis results.",
186
+ "requestBody": {
187
+ "required": True,
188
+ "content": {
189
+ "multipart/form-data": {
190
+ "schema": {
191
+ "type": "object",
192
+ "required": ["file"],
193
+ "properties": {
194
+ "file": {
195
+ "type": "string",
196
+ "format": "binary",
197
+ "description": "PDF file to analyze"
198
+ },
199
+ "pdf_library": {
200
+ "type": "string",
201
+ "enum": ["pypdf", "pymupdf", "pdfium"],
202
+ "default": "pypdf"
203
+ }
204
+ }
205
+ }
206
+ }
207
+ }
208
+ },
209
+ "responses": {
210
+ "200": {
211
+ "description": "Analysis result",
212
+ "content": {
213
+ "application/json": {
214
+ "schema": {
215
+ "$ref": "#/components/schemas/AnalysisResponse"
216
+ }
217
+ }
218
+ }
219
+ },
220
+ "400": {
221
+ "description": "Validation error"
222
+ },
223
+ "503": {
224
+ "description": "Server shutting down"
225
+ }
226
+ }
227
+ }
228
+ },
229
+ "/ready": {
230
+ "get": {
231
+ "summary": "Readiness probe",
232
+ "description": "Indicates whether the server is ready to accept new work.",
233
+ "responses": {
234
+ "200": {
235
+ "description": "Server ready"
236
+ },
237
+ "503": {
238
+ "description": "Server shutting down"
239
+ }
240
+ }
241
+ }
242
+ },
243
+ "/openapi.json": {
244
+ "get": {
245
+ "summary": "OpenAPI specification",
246
+ "description": "Returns the OpenAPI 3.0 specification for this service.",
247
+ "responses": {
248
+ "200": {
249
+ "description": "OpenAPI JSON document"
250
+ }
251
+ }
252
+ }
253
+ }
254
+ },
255
+ "components": {
256
+ "schemas": {
257
+ "AnalysisResponse": {
258
+ "type": "object",
259
+ "properties": {
260
+ "filename": {
261
+ "type": "string"
262
+ },
263
+ "pdf_library_used": {
264
+ "type": "string"
265
+ },
266
+ "total_links_count": {
267
+ "type": "integer"
268
+ },
269
+ "data": {
270
+ "type": "object",
271
+ "description": "Structured analysis data"
272
+ },
273
+ "text_report": {
274
+ "type": "string",
275
+ "description": "Human-readable text report"
276
+ }
277
+ },
278
+ "required": [
279
+ "filename",
280
+ "pdf_library_used",
281
+ "total_links_count",
282
+ "data",
283
+ "text_report"
284
+ ]
285
+ }
286
+ }
287
+ }
288
+ }
289
+
290
+ # =========================
291
+ # Validation Models
292
+ # =========================
293
+
294
@dataclass(frozen=True)
class UploadRequest:
    """Immutable record describing one PDF upload.

    Instances are produced by ``RequestValidator.validate_upload`` and consumed
    by the request handler's processing step.
    """

    # Name of the uploaded file as supplied in the multipart part.
    filename: str
    # Raw bytes of the uploaded document.
    pdf_bytes: bytes
    # Identifier of the PDF engine requested for the analysis.
    pdf_library: str
299
+
300
+
301
class ValidationError(Exception):
    """Signals a problem with the client's request; answered with HTTP 400."""
303
+
304
+
305
+ # =========================
306
+ # Validation Layer
307
+ # =========================
308
+
309
class RequestValidator:
    """Pure validation: no I/O, no side effects."""

    @staticmethod
    def validate_upload(
        *,
        filename: str,
        pdf_bytes: bytes,
        pdf_library: str,
    ) -> UploadRequest:
        """Check the parsed upload fields and wrap them in an ``UploadRequest``.

        Checks run in this order: filename present, filename ends with
        ``.pdf`` (case-insensitive), payload non-empty, payload no larger than
        ``MAX_UPLOAD_BYTES``, and ``pdf_library`` listed in
        ``ALLOWED_LIBRARIES``.

        Raises:
            ValidationError: on the first check that fails.
        """
        if not filename:
            raise ValidationError("Missing filename")

        # A non-string filename (e.g. an RFC 2231 parameter tuple coming out of
        # the email parser) cannot carry a valid extension; reject it instead
        # of crashing on ``.lower()``.
        if not isinstance(filename, str) or not filename.lower().endswith(".pdf"):
            raise ValidationError("Only .pdf files are allowed")

        if not pdf_bytes:
            raise ValidationError("Empty file upload")

        if len(pdf_bytes) > MAX_UPLOAD_BYTES:
            raise ValidationError("File exceeds size limit")

        # The multipart parser stores parts that carry a filename as dicts.
        # A dict is unhashable, so testing it against the set would raise
        # TypeError (reported as HTTP 500) rather than a validation error.
        if not isinstance(pdf_library, str) or pdf_library not in ALLOWED_LIBRARIES:
            raise ValidationError("Invalid pdf_library")

        return UploadRequest(
            filename=filename,
            pdf_bytes=pdf_bytes,
            pdf_library=pdf_library,
        )
340
+
341
+
342
+ # =========================
343
+ # Multipart Parsing
344
+ # =========================
345
+
346
class MultipartParser:
    """Extracts fields from multipart/form-data using stdlib email parser."""

    @staticmethod
    def parse(headers, body: bytes) -> dict:
        """Parse a multipart/form-data request body into a field mapping.

        Args:
            headers: Mapping-like object exposing ``get("Content-Type")``.
            body: Raw request body bytes.

        Returns:
            A dict keyed by form field name. A part that carries a filename
            maps to ``{"filename": str, "data": bytes | None}``; any other part
            maps to its payload decoded as text and stripped of surrounding
            whitespace.

        Raises:
            ValidationError: if the content type is not multipart/form-data,
                the payload is not multipart, or a text field cannot be
                decoded.
        """
        from email.utils import collapse_rfc2231_value

        content_type = headers.get("Content-Type")
        # Media types are case-insensitive, so compare in lower case.
        if not content_type or "multipart/form-data" not in content_type.lower():
            raise ValidationError("Expected multipart/form-data")

        # The email parser needs the Content-Type header (with its boundary)
        # in front of the body to recognise the multipart structure.
        msg = email.message_from_bytes(
            b"Content-Type: " + content_type.encode() + b"\r\n\r\n" + body
        )

        if not msg.is_multipart():
            raise ValidationError("Invalid multipart payload")

        fields = {}

        for part in msg.get_payload():
            # str() guards against header objects that are not plain strings.
            disposition = str(part.get("Content-Disposition", ""))
            if not disposition.startswith("form-data"):
                continue

            name = part.get_param("name", header="Content-Disposition")
            filename = part.get_param("filename", header="Content-Disposition")

            # RFC 2231 encoded parameters are returned as 3-tuples.
            if isinstance(name, tuple):
                name = collapse_rfc2231_value(name)
            if isinstance(filename, tuple):
                filename = collapse_rfc2231_value(filename)

            # A part without a field name cannot be addressed by callers;
            # skip it rather than storing it under the key None.
            if not name:
                continue

            payload = part.get_payload(decode=True)

            if filename:
                fields[name] = {
                    "filename": filename,
                    "data": payload,
                }
                continue

            # get_payload(decode=True) yields None for a nested multipart part.
            if payload is None:
                raise ValidationError("Invalid multipart payload")
            try:
                text = payload.decode()
            except UnicodeDecodeError:
                raise ValidationError("Form field is not valid UTF-8") from None
            fields[name] = text.strip()

        return fields
381
+
382
+
383
+ # =========================
384
+ # HTTP Server
385
+ # =========================
386
+
387
class ThreadedHTTPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCP server that dispatches each incoming request to its own thread."""

    # Worker threads are created as daemon threads.
    daemon_threads = True
    # Permit rebinding the listening address right after a restart.
    allow_reuse_address = True
390
+
391
class APIHandler(http.server.BaseHTTPRequestHandler):
    """Request handler for the pdflinkcheck stdlib HTTP service.

    Routes:
        GET  /              -> HTML upload form
        GET  /openapi.json  -> OpenAPI document
        GET  /ready         -> readiness probe (503 once shutdown has begun)
        GET  /favicon.ico   -> 204, no body
        POST /              -> analyse an uploaded PDF, respond with JSON
    """

    server_version = "pdflinkcheck-stdlib/1.1"

    def log_message(self, format, *args):
        """Discard the base class's per-request log output."""
        return

    # -------- Utilities --------

    def _send_json(self, payload: dict, status: int = 200) -> None:
        """Serialise ``payload`` as UTF-8 JSON and send it with ``status``."""
        body = json.dumps(payload, indent=2, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(body)

    def _send_error_json(self, message: str, status: int) -> None:
        """Send ``{"error": message}`` as a JSON response with ``status``."""
        self._send_json({"error": message}, status)

    # -------- Handlers --------

    def do_GET(self):
        """Serve the static GET endpoints; any other path yields 404."""
        if self.path == "/":
            body = HTML_FORM.encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)
            return

        if self.path == "/openapi.json":
            self._send_json(OPENAPI_SPEC)
            return

        if self.path == "/ready":
            if SHUTDOWN_EVENT.is_set():
                self._send_error_json("Server shutting down", 503)
            else:
                self._send_json({"status": "ready"})
            return

        if self.path == "/favicon.ico":
            self.send_response(204)
            self.end_headers()
            return

        self.send_error(404)

    def do_POST(self):
        """Accept a multipart PDF upload on ``/`` and return the analysis.

        Responds 404 for other paths, 503 once shutdown has begun, 400 for
        validation problems and 500 for any other failure.
        """
        if self.path != "/":
            self.send_error(404)
            return

        if SHUTDOWN_EVENT.is_set():
            self._send_error_json("Server shutting down", 503)
            return

        try:
            self.connection.settimeout(30)

            # A malformed Content-Length is a client error, not a server one.
            try:
                content_length = int(self.headers.get("Content-Length", "0"))
            except (TypeError, ValueError):
                raise ValidationError("Invalid Content-Length") from None

            if content_length <= 0:
                raise ValidationError("Empty request body")

            # The multipart envelope adds overhead on top of the file itself,
            # so the raw body is allowed up to twice the per-file limit.
            if content_length > MAX_UPLOAD_BYTES * 2:
                raise ValidationError("Request too large")

            body = self.rfile.read(content_length)
            # A short read means the client closed the connection early.
            if len(body) != content_length:
                raise ValidationError("Incomplete request body")

            fields = MultipartParser.parse(self.headers, body)

            file_field = fields.get("file")
            if not isinstance(file_field, dict):
                raise ValidationError("Missing file upload")

            upload = RequestValidator.validate_upload(
                filename=file_field["filename"],
                pdf_bytes=file_field["data"],
                pdf_library=fields.get("pdf_library", "pypdf"),
            )

            # Bound the number of analyses running at the same time.
            with REQUEST_SEMAPHORE:
                response = self._process_pdf(upload)

            self._send_json(response)

        except ValidationError as e:
            self._send_error_json(str(e), 400)

        except Exception:
            # No exception detail is included in the response.
            self._send_error_json("Internal server error", 500)

    # -------- Business Logic --------

    def _process_pdf(self, upload: UploadRequest) -> dict:
        """Write the upload to a temporary file, analyse it, build the response.

        The temporary file is removed afterwards whether or not the analysis
        succeeded.
        """
        tmp_path: Optional[str] = None

        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(upload.pdf_bytes)
                tmp_path = tmp.name

            result = run_report_and_call_exports(
                pdf_path=tmp_path,
                export_format="",
                pdf_library=upload.pdf_library,
                print_bool=False,
            )

            link_count = (
                result.get("metadata", {})
                .get("link_counts", {})
                .get("total_links_count", 0)
            )

            return {
                "filename": upload.filename,
                "pdf_library_used": upload.pdf_library,
                "total_links_count": link_count,
                "data": result["data"],
                "text_report": result["text"],
            }

        finally:
            if tmp_path:
                # Attempt the removal directly instead of checking existence
                # first, which would leave a window between check and unlink.
                try:
                    os.unlink(tmp_path)
                except FileNotFoundError:
                    pass
517
+
518
+
519
+ # =========================
520
+ # Entrypoint
521
+ # =========================
522
+
523
def main():
    """Run the server on HOST:PORT until SIGINT or SIGTERM arrives.

    The signal handler hands the actual shutdown to a helper thread, so
    ``httpd.shutdown()`` is not called from the thread that is running
    ``serve_forever()``.
    """
    with ThreadedHTTPServer((HOST, PORT), APIHandler) as httpd:

        def _stop_serving():
            # Flag the shutdown first so handlers start refusing new work.
            SHUTDOWN_EVENT.set()
            httpd.shutdown()

        def _on_signal(signum, frame):
            print("\nShutdown signal received")
            stopper = threading.Thread(target=_stop_serving, daemon=True)
            stopper.start()

        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, _on_signal)

        print(f"pdflinkcheck stdlib server running at http://{HOST}:{PORT}")
        print("Pure stdlib • Explicit validation • Graceful shutdown • Termux-safe")

        try:
            httpd.serve_forever()
        finally:
            httpd.server_close()

    print("Server shut down cleanly")
549
+
550
def main_():
    """Alternate entrypoint: run the server until SIGINT or SIGTERM arrives.

    This function is not invoked by the ``__main__`` guard visible in this
    file. The signal handler must not call ``httpd.shutdown()`` directly:
    it runs on the thread that is executing ``serve_forever()``, and
    ``shutdown()`` blocks until that loop exits, so a direct call would
    deadlock. The shutdown is therefore delegated to a helper thread.
    """
    with ThreadedHTTPServer((HOST, PORT), APIHandler) as httpd:

        def shutdown_server():
            SHUTDOWN_EVENT.set()
            httpd.shutdown()

        def handle_shutdown(signum, frame):
            print("\nShutdown signal received")
            threading.Thread(target=shutdown_server, daemon=True).start()

        signal.signal(signal.SIGINT, handle_shutdown)
        signal.signal(signal.SIGTERM, handle_shutdown)

        print(f"pdflinkcheck stdlib server running at http://{HOST}:{PORT}")
        print("Pure stdlib • Explicit validation • Graceful shutdown • Termux-safe")

        httpd.serve_forever()

    print("Server shut down cleanly")
567
+
568
+
569
+ if __name__ == "__main__":
570
+ main()
571
+