pdflinkcheck 1.1.73__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -21
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +109 -145
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +67 -37
- pdflinkcheck/cli.py +111 -116
- pdflinkcheck/data/I Have Questions.md +51 -0
- pdflinkcheck/data/LICENSE +20 -654
- pdflinkcheck/data/README.md +65 -67
- pdflinkcheck/data/icons/BoxArt-1080x1080.png +0 -0
- pdflinkcheck/data/icons/Logo-150x150.png +0 -0
- pdflinkcheck/data/icons/Logo-300x300.png +0 -0
- pdflinkcheck/data/icons/Logo-71x71.png +0 -0
- pdflinkcheck/data/icons/PosterArt-720x1080.png +0 -0
- pdflinkcheck/data/icons/SmallLogo-44x44.png +0 -0
- pdflinkcheck/data/icons/SplashScreen-620x300.png +0 -0
- pdflinkcheck/data/icons/StoreLogo-50x50.png +0 -0
- pdflinkcheck/data/icons/WideLogo-310x150.png +0 -0
- pdflinkcheck/data/icons/red_pdf_512px.ico +0 -0
- pdflinkcheck/data/pyproject.toml +25 -37
- pdflinkcheck/data/themes/forest/forest-dark/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-dark.tcl +536 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/border-invalid.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/card.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/check-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/combo-button-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/down.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/empty.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/notebook.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/off-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/on-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-tri-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/radio-unsel-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/rect-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/right.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-hor.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/scale-vert.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/separator.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/sizegrip.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-down-focus.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/spin-button-up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tab-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-hor-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/thumb-vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/tree-pressed.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/up.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-accent.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-basic.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light/vert-hover.png +0 -0
- pdflinkcheck/data/themes/forest/forest-light.tcl +544 -0
- pdflinkcheck/datacopy.py +18 -1
- pdflinkcheck/dev.py +12 -25
- pdflinkcheck/environment.py +76 -0
- pdflinkcheck/gui.py +366 -457
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +27 -23
- pdflinkcheck/report.py +692 -121
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +14 -20
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +49 -0
- pdflinkcheck/validate.py +129 -218
- pdflinkcheck/version_info.py +6 -3
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +84 -81
- pdflinkcheck-1.2.29.dist-info/RECORD +183 -0
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.73.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-MIT +9 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -218
- pdflinkcheck-1.1.73.dist-info/RECORD +0 -21
- pdflinkcheck-1.1.73.dist-info/WHEEL +0 -4
- /pdflinkcheck-1.1.73.dist-info/licenses/LICENSE → /pdflinkcheck-1.2.29.dist-info/licenses/LICENSE-AGPL3 +0 -0
|
@@ -0,0 +1,571 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
"""
|
|
4
|
+
pdflinkcheck stdlib HTTP service
|
|
5
|
+
===============================
|
|
6
|
+
|
|
7
|
+
This module implements a small, single-purpose HTTP service intended to be:
|
|
8
|
+
|
|
9
|
+
- Packaged inside a PYZ
|
|
10
|
+
- Run locally, on LAN, or behind a reverse proxy
|
|
11
|
+
- Used as a backend for CLI, GUI, or web clients
|
|
12
|
+
|
|
13
|
+
IMPORTANT:
|
|
14
|
+
----------
|
|
15
|
+
This server is NOT intended to be exposed directly to the public internet.
|
|
16
|
+
|
|
17
|
+
When running in public-facing deployments, it MUST be placed behind a
|
|
18
|
+
reverse proxy (e.g. Caddy, nginx, cloudflared) which provides:
|
|
19
|
+
|
|
20
|
+
- TLS termination
|
|
21
|
+
- Request size limits
|
|
22
|
+
- Connection timeouts
|
|
23
|
+
- Rate limiting
|
|
24
|
+
- Protection against slowloris-style attacks
|
|
25
|
+
|
|
26
|
+
This module intentionally does NOT:
|
|
27
|
+
- Manage TLS certificates
|
|
28
|
+
- Implement authentication
|
|
29
|
+
- Perform rate limiting
|
|
30
|
+
- Handle HTTP/2 or proxy protocols
|
|
31
|
+
|
|
32
|
+
Those concerns belong to infrastructure, not application code.
|
|
33
|
+
|
|
34
|
+
PUBLIC MODE:
|
|
35
|
+
------------
|
|
36
|
+
When --public is enabled, this server assumes:
|
|
37
|
+
- A reverse proxy is present
|
|
38
|
+
- TLS is terminated upstream
|
|
39
|
+
- The service may be reachable by untrusted clients
|
|
40
|
+
|
|
41
|
+
In public mode, the server:
|
|
42
|
+
- Enables stricter limits
|
|
43
|
+
- Refuses new work during shutdown
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
import http.server
|
|
49
|
+
import socketserver
|
|
50
|
+
import json
|
|
51
|
+
import tempfile
|
|
52
|
+
import os
|
|
53
|
+
import email
|
|
54
|
+
import signal
|
|
55
|
+
import threading
|
|
56
|
+
from dataclasses import dataclass
|
|
57
|
+
from typing import Optional
|
|
58
|
+
|
|
59
|
+
try:
    from pdflinkcheck.report import run_report_and_call_exports
except ImportError:
    # The analysis backend may be missing (e.g. when this module is inspected
    # in isolation). Only a failed import is tolerated: a bare ``except:``
    # would also hide KeyboardInterrupt/SystemExit and leave the name
    # undefined, deferring the failure to an opaque NameError at request time.
    def run_report_and_call_exports(*args, **kwargs):
        """Placeholder used when ``pdflinkcheck.report`` cannot be imported.

        Raises:
            RuntimeError: always, so callers get an explicit explanation.
        """
        raise RuntimeError(
            "pdflinkcheck.report is not available; PDF analysis is disabled"
        )
|
|
63
|
+
|
|
64
|
+
# =========================
|
|
65
|
+
# Configuration
|
|
66
|
+
# =========================
|
|
67
|
+
|
|
68
|
+
# Address and port that main() binds the server to and prints at startup.
HOST = "127.0.0.1"
PORT = 8000

# Largest accepted PDF payload. RequestValidator enforces it on the file bytes;
# do_POST rejects whole request bodies larger than twice this value.
MAX_UPLOAD_BYTES = 25 * 1024 * 1024  # 25 MB
# Values of the ``pdf_library`` form field that RequestValidator accepts.
ALLOWED_LIBRARIES = {"pypdf", "pymupdf", "pdfium"}

# Concurrency control
# At most MAX_CONCURRENT_JOBS analyses run at once; further POST handlers
# block on the semaphore until a slot frees up.
MAX_CONCURRENT_JOBS = 2
REQUEST_SEMAPHORE = threading.Semaphore(MAX_CONCURRENT_JOBS)

# Shutdown coordination
# Set once a shutdown signal arrives; /ready and POST / then answer 503.
SHUTDOWN_EVENT = threading.Event()

# Set via CLI in real usage
# NOTE(review): nothing in the visible part of this module reads this flag yet.
PUBLIC_MODE = False
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# =========================
|
|
87
|
+
# HTML UI
|
|
88
|
+
# =========================
|
|
89
|
+
|
|
90
|
+
HTML_FORM = """<!doctype html>
|
|
91
|
+
<html>
|
|
92
|
+
<head>
|
|
93
|
+
<title>pdflinkcheck API</title>
|
|
94
|
+
<meta charset="utf-8">
|
|
95
|
+
<!--style>
|
|
96
|
+
body {
|
|
97
|
+
font-family: system-ui, sans-serif;
|
|
98
|
+
max-width: 800px;
|
|
99
|
+
margin: 40px auto;
|
|
100
|
+
}
|
|
101
|
+
button { padding: 6px 12px; }
|
|
102
|
+
</style-->
|
|
103
|
+
<style>
|
|
104
|
+
body {
|
|
105
|
+
font-family: system-ui, sans-serif;
|
|
106
|
+
max-width: 800px;
|
|
107
|
+
margin: 40px auto;
|
|
108
|
+
line-height: 1.6;
|
|
109
|
+
background: #f8f9fa;
|
|
110
|
+
color: #212529;
|
|
111
|
+
}
|
|
112
|
+
h1 { text-align: center; }
|
|
113
|
+
form {
|
|
114
|
+
background: white;
|
|
115
|
+
padding: 20px;
|
|
116
|
+
border-radius: 12px;
|
|
117
|
+
box-shadow: 0 0 12px rgba(0,0,0,0.1);
|
|
118
|
+
}
|
|
119
|
+
input, select, button {
|
|
120
|
+
padding: 8px 12px;
|
|
121
|
+
margin-top: 6px;
|
|
122
|
+
margin-bottom: 12px;
|
|
123
|
+
border-radius: 6px;
|
|
124
|
+
border: 1px solid #ccc;
|
|
125
|
+
width: 100%;
|
|
126
|
+
box-sizing: border-box;
|
|
127
|
+
}
|
|
128
|
+
button {
|
|
129
|
+
background-color: #0d6efd;
|
|
130
|
+
color: white;
|
|
131
|
+
border: none;
|
|
132
|
+
cursor: pointer;
|
|
133
|
+
}
|
|
134
|
+
button:hover { background-color: #0b5ed7; }
|
|
135
|
+
</style>
|
|
136
|
+
</head>
|
|
137
|
+
<body>
|
|
138
|
+
<h1>pdflinkcheck (stdlib server)</h1>
|
|
139
|
+
<p>Upload a PDF for link and TOC analysis.</p>
|
|
140
|
+
|
|
141
|
+
<form action="/" method="post" enctype="multipart/form-data">
|
|
142
|
+
<p>
|
|
143
|
+
<input type="file" name="file" accept=".pdf" required>
|
|
144
|
+
</p>
|
|
145
|
+
<p>
|
|
146
|
+
<label>Engine:</label>
|
|
147
|
+
<select name="pdf_library">
|
|
148
|
+
<option value="pypdf" selected>pypdf (pure Python)</option>
|
|
149
|
+
<option value="pymupdf">PyMuPDF (fast, AGPL)</option>
|
|
150
|
+
<option value="pdfium">PDFium (fast, permissive)</option>
|
|
151
|
+
</select>
|
|
152
|
+
</p>
|
|
153
|
+
<button type="submit">Analyze</button>
|
|
154
|
+
</form>
|
|
155
|
+
|
|
156
|
+
<p>Returns JSON.</p>
|
|
157
|
+
</body>
|
|
158
|
+
</html>
|
|
159
|
+
"""
|
|
160
|
+
|
|
161
|
+
# =========================
|
|
162
|
+
# Documentation
|
|
163
|
+
# =========================
|
|
164
|
+
# OpenAPI 3.0 description of the service, served verbatim at /openapi.json.
# Keep the response lists in sync with what APIHandler actually returns:
# POST / can answer 200, 400 (validation), 500 (unexpected failure) and
# 503 (shutdown in progress).
OPENAPI_SPEC = {
    "openapi": "3.0.3",
    "info": {
        "title": "pdflinkcheck API",
        "description": (
            "Single-purpose API for analyzing PDF links and tables of contents.\n\n"
            "This service is designed to run behind a reverse proxy and accepts "
            "multipart/form-data uploads containing a PDF file."
        ),
        "version": "1.1.0",
        "license": {
            "name": "MIT"
        }
    },
    "servers": [
        {"url": "/"}
    ],
    "paths": {
        "/": {
            "post": {
                "summary": "Analyze a PDF",
                "description": "Uploads a PDF file and returns link analysis results.",
                "requestBody": {
                    "required": True,
                    "content": {
                        "multipart/form-data": {
                            "schema": {
                                "type": "object",
                                "required": ["file"],
                                "properties": {
                                    "file": {
                                        "type": "string",
                                        "format": "binary",
                                        "description": "PDF file to analyze"
                                    },
                                    "pdf_library": {
                                        "type": "string",
                                        "enum": ["pypdf", "pymupdf", "pdfium"],
                                        "default": "pypdf"
                                    }
                                }
                            }
                        }
                    }
                },
                "responses": {
                    "200": {
                        "description": "Analysis result",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/AnalysisResponse"
                                }
                            }
                        }
                    },
                    "400": {
                        "description": "Validation error"
                    },
                    # Returned by do_POST for any unexpected exception.
                    "500": {
                        "description": "Internal server error"
                    },
                    "503": {
                        "description": "Server shutting down"
                    }
                }
            }
        },
        "/ready": {
            "get": {
                "summary": "Readiness probe",
                "description": "Indicates whether the server is ready to accept new work.",
                "responses": {
                    "200": {
                        "description": "Server ready"
                    },
                    "503": {
                        "description": "Server shutting down"
                    }
                }
            }
        },
        "/openapi.json": {
            "get": {
                "summary": "OpenAPI specification",
                "description": "Returns the OpenAPI 3.0 specification for this service.",
                "responses": {
                    "200": {
                        "description": "OpenAPI JSON document"
                    }
                }
            }
        }
    },
    "components": {
        "schemas": {
            "AnalysisResponse": {
                "type": "object",
                "properties": {
                    "filename": {
                        "type": "string"
                    },
                    "pdf_library_used": {
                        "type": "string"
                    },
                    "total_links_count": {
                        "type": "integer"
                    },
                    "data": {
                        "type": "object",
                        "description": "Structured analysis data"
                    },
                    "text_report": {
                        "type": "string",
                        "description": "Human-readable text report"
                    }
                },
                "required": [
                    "filename",
                    "pdf_library_used",
                    "total_links_count",
                    "data",
                    "text_report"
                ]
            }
        }
    }
}
|
|
289
|
+
|
|
290
|
+
# =========================
|
|
291
|
+
# Validation Models
|
|
292
|
+
# =========================
|
|
293
|
+
|
|
294
|
+
@dataclass(frozen=True)
class UploadRequest:
    """Immutable record of one PDF upload, as built by RequestValidator."""

    # Client-supplied file name taken from the multipart part.
    filename: str
    # Raw bytes of the uploaded file.
    pdf_bytes: bytes
    # Name of the analysis engine requested for this upload.
    pdf_library: str
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
class ValidationError(Exception):
    """Client-side validation error (HTTP 400).

    Raised by the parsing and validation helpers; do_POST turns it into a
    JSON error response with status 400, using the exception text as message.
    """
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# =========================
|
|
306
|
+
# Validation Layer
|
|
307
|
+
# =========================
|
|
308
|
+
|
|
309
|
+
class RequestValidator:
    """Stateless checks for an incoming upload; performs no I/O."""

    @staticmethod
    def validate_upload(
        *,
        filename: str,
        pdf_bytes: bytes,
        pdf_library: str,
    ) -> UploadRequest:
        """Check the upload fields and wrap them in an UploadRequest.

        The checks run in a fixed order and the first failing one decides the
        error message.

        Args:
            filename: Client-supplied file name; must be non-empty and end in
                ".pdf" (case-insensitive).
            pdf_bytes: File content; must be non-empty and no larger than
                MAX_UPLOAD_BYTES.
            pdf_library: Requested engine; must be in ALLOWED_LIBRARIES.

        Returns:
            An UploadRequest carrying the three values unchanged.

        Raises:
            ValidationError: if any check fails.
        """
        # The elif chain keeps evaluation lazy: later conditions are only
        # evaluated once the earlier ones have passed.
        if not filename:
            problem = "Missing filename"
        elif not filename.lower().endswith(".pdf"):
            problem = "Only .pdf files are allowed"
        elif not pdf_bytes:
            problem = "Empty file upload"
        elif len(pdf_bytes) > MAX_UPLOAD_BYTES:
            problem = "File exceeds size limit"
        elif pdf_library not in ALLOWED_LIBRARIES:
            problem = "Invalid pdf_library"
        else:
            problem = None

        if problem is not None:
            raise ValidationError(problem)

        return UploadRequest(
            filename=filename,
            pdf_bytes=pdf_bytes,
            pdf_library=pdf_library,
        )
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# =========================
|
|
343
|
+
# Multipart Parsing
|
|
344
|
+
# =========================
|
|
345
|
+
|
|
346
|
+
class MultipartParser:
    """Extracts fields from multipart/form-data using stdlib email parser."""

    @staticmethod
    def parse(headers, body: bytes) -> dict:
        """Split a multipart/form-data request body into its form fields.

        Args:
            headers: Mapping-like request headers; only "Content-Type" is read.
            body: Raw request body.

        Returns:
            A dict keyed by field name. File parts (those with a filename)
            map to ``{"filename": str, "data": bytes}``; all other parts map
            to their UTF-8 decoded, whitespace-stripped text.

        Raises:
            ValidationError: if the request is not multipart/form-data, the
                payload is not valid multipart, a part carries a
                Content-Disposition header with non-ASCII bytes, or a text
                field has no decodable UTF-8 content.
        """
        content_type = headers.get("Content-Type")
        if not content_type or "multipart/form-data" not in content_type:
            raise ValidationError("Expected multipart/form-data")

        # The email parser needs the Content-Type header (it carries the
        # boundary), so rebuild a minimal message around the body.
        msg = email.message_from_bytes(
            b"Content-Type: " + content_type.encode() + b"\r\n\r\n" + body
        )

        if not msg.is_multipart():
            raise ValidationError("Invalid multipart payload")

        fields = {}

        for part in msg.get_payload():
            disposition = part.get("Content-Disposition", "")
            if not isinstance(disposition, str):
                # Under the default compat32 policy a header containing
                # non-ASCII bytes comes back as a Header object, which the
                # string-based parsing below cannot handle. Report it as a
                # client error instead of crashing with an internal error.
                raise ValidationError("Content-Disposition header must be ASCII")
            if not disposition.startswith("form-data"):
                continue

            name = part.get_param("name", header="Content-Disposition")
            filename = part.get_param("filename", header="Content-Disposition")

            if filename:
                fields[name] = {
                    "filename": filename,
                    "data": part.get_payload(decode=True),
                }
                continue

            raw_value = part.get_payload(decode=True)
            if raw_value is None:
                # get_payload(decode=True) yields None for a nested multipart
                # part, which is not a usable form value.
                raise ValidationError("Invalid form field")
            try:
                fields[name] = raw_value.decode().strip()
            except UnicodeDecodeError:
                raise ValidationError("Form field is not valid UTF-8") from None

        return fields
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
# =========================
|
|
384
|
+
# HTTP Server
|
|
385
|
+
# =========================
|
|
386
|
+
|
|
387
|
+
class ThreadedHTTPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCP server that handles each connection in its own thread."""

    # Worker threads are daemonic, so they do not keep the process alive.
    daemon_threads = True
    # Lets the listening address be re-bound right after a restart.
    allow_reuse_address = True
|
|
390
|
+
|
|
391
|
+
class APIHandler(http.server.BaseHTTPRequestHandler):
|
|
392
|
+
|
|
393
|
+
server_version = "pdflinkcheck-stdlib/1.1"
|
|
394
|
+
|
|
395
|
+
def log_message(self, format, *args):
|
|
396
|
+
return
|
|
397
|
+
|
|
398
|
+
# -------- Utilities --------
|
|
399
|
+
|
|
400
|
+
def _send_json(self, payload: dict, status: int = 200) -> None:
|
|
401
|
+
body = json.dumps(payload, indent=2, ensure_ascii=False).encode("utf-8")
|
|
402
|
+
self.send_response(status)
|
|
403
|
+
self.send_header("Content-Type", "application/json; charset=utf-8")
|
|
404
|
+
self.send_header("Content-Length", str(len(body)))
|
|
405
|
+
self.send_header("Access-Control-Allow-Origin", "*")
|
|
406
|
+
self.end_headers()
|
|
407
|
+
self.wfile.write(body)
|
|
408
|
+
|
|
409
|
+
def _send_error_json(self, message: str, status: int) -> None:
|
|
410
|
+
self._send_json({"error": message}, status)
|
|
411
|
+
|
|
412
|
+
# -------- Handlers --------
|
|
413
|
+
|
|
414
|
+
def do_GET(self):
|
|
415
|
+
if self.path == "/":
|
|
416
|
+
body = HTML_FORM.encode("utf-8")
|
|
417
|
+
self.send_response(200)
|
|
418
|
+
self.send_header("Content-Type", "text/html; charset=utf-8")
|
|
419
|
+
self.send_header("Content-Length", str(len(body)))
|
|
420
|
+
self.end_headers()
|
|
421
|
+
self.wfile.write(body)
|
|
422
|
+
return
|
|
423
|
+
if self.path == "/openapi.json":
|
|
424
|
+
self._send_json(OPENAPI_SPEC)
|
|
425
|
+
return
|
|
426
|
+
if self.path == "/ready":
|
|
427
|
+
if SHUTDOWN_EVENT.is_set():
|
|
428
|
+
self._send_error_json("Server shutting down", 503)
|
|
429
|
+
else:
|
|
430
|
+
self._send_json({"status": "ready"})
|
|
431
|
+
return
|
|
432
|
+
|
|
433
|
+
if self.path == "/favicon.ico":
|
|
434
|
+
self.send_response(204)
|
|
435
|
+
self.end_headers()
|
|
436
|
+
return
|
|
437
|
+
|
|
438
|
+
self.send_error(404)
|
|
439
|
+
|
|
440
|
+
def do_POST(self):
|
|
441
|
+
if self.path != "/":
|
|
442
|
+
self.send_error(404)
|
|
443
|
+
return
|
|
444
|
+
|
|
445
|
+
if SHUTDOWN_EVENT.is_set():
|
|
446
|
+
self._send_error_json("Server shutting down", 503)
|
|
447
|
+
return
|
|
448
|
+
|
|
449
|
+
try:
|
|
450
|
+
self.connection.settimeout(30)
|
|
451
|
+
|
|
452
|
+
content_length = int(self.headers.get("Content-Length", "0"))
|
|
453
|
+
if content_length <= 0:
|
|
454
|
+
raise ValidationError("Empty request body")
|
|
455
|
+
|
|
456
|
+
if content_length > MAX_UPLOAD_BYTES * 2:
|
|
457
|
+
raise ValidationError("Request too large")
|
|
458
|
+
|
|
459
|
+
body = self.rfile.read(min(content_length, MAX_UPLOAD_BYTES * 2))
|
|
460
|
+
fields = MultipartParser.parse(self.headers, body)
|
|
461
|
+
|
|
462
|
+
file_field = fields.get("file")
|
|
463
|
+
if not isinstance(file_field, dict):
|
|
464
|
+
raise ValidationError("Missing file upload")
|
|
465
|
+
|
|
466
|
+
upload = RequestValidator.validate_upload(
|
|
467
|
+
filename=file_field["filename"],
|
|
468
|
+
pdf_bytes=file_field["data"],
|
|
469
|
+
pdf_library=fields.get("pdf_library", "pypdf"),
|
|
470
|
+
)
|
|
471
|
+
|
|
472
|
+
with REQUEST_SEMAPHORE:
|
|
473
|
+
response = self._process_pdf(upload)
|
|
474
|
+
|
|
475
|
+
self._send_json(response)
|
|
476
|
+
|
|
477
|
+
except ValidationError as e:
|
|
478
|
+
self._send_error_json(str(e), 400)
|
|
479
|
+
|
|
480
|
+
except Exception:
|
|
481
|
+
self._send_error_json("Internal server error", 500)
|
|
482
|
+
|
|
483
|
+
# -------- Business Logic --------
|
|
484
|
+
|
|
485
|
+
def _process_pdf(self, upload: UploadRequest) -> dict:
|
|
486
|
+
tmp_path: Optional[str] = None
|
|
487
|
+
|
|
488
|
+
try:
|
|
489
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
|
490
|
+
tmp.write(upload.pdf_bytes)
|
|
491
|
+
tmp_path = tmp.name
|
|
492
|
+
|
|
493
|
+
result = run_report_and_call_exports(
|
|
494
|
+
pdf_path=tmp_path,
|
|
495
|
+
export_format="",
|
|
496
|
+
pdf_library=upload.pdf_library,
|
|
497
|
+
print_bool=False,
|
|
498
|
+
)
|
|
499
|
+
|
|
500
|
+
link_count = (
|
|
501
|
+
result.get("metadata", {})
|
|
502
|
+
.get("link_counts", {})
|
|
503
|
+
.get("total_links_count", 0)
|
|
504
|
+
)
|
|
505
|
+
|
|
506
|
+
return {
|
|
507
|
+
"filename": upload.filename,
|
|
508
|
+
"pdf_library_used": upload.pdf_library,
|
|
509
|
+
"total_links_count": link_count,
|
|
510
|
+
"data": result["data"],
|
|
511
|
+
"text_report": result["text"],
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
finally:
|
|
515
|
+
if tmp_path and os.path.exists(tmp_path):
|
|
516
|
+
os.unlink(tmp_path)
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
# =========================
|
|
520
|
+
# Entrypoint
|
|
521
|
+
# =========================
|
|
522
|
+
|
|
523
|
+
def main():
    """Serve on (HOST, PORT) until SIGINT or SIGTERM, then stop gracefully."""
    with ThreadedHTTPServer((HOST, PORT), APIHandler) as server:

        def _stop_serving():
            # Raise the flag first so handlers start refusing new work, then
            # ask serve_forever() to return.
            SHUTDOWN_EVENT.set()
            server.shutdown()

        def _on_signal(signum, frame):
            print("\nShutdown signal received")
            # shutdown() waits for serve_forever() to finish, so it runs in
            # a helper thread instead of the thread that is serving.
            stopper = threading.Thread(target=_stop_serving, daemon=True)
            stopper.start()

        for signal_number in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signal_number, _on_signal)

        print(f"pdflinkcheck stdlib server running at http://{HOST}:{PORT}")
        print("Pure stdlib • Explicit validation • Graceful shutdown • Termux-safe")

        try:
            server.serve_forever()
        finally:
            server.server_close()

    print("Server shut down cleanly")
|
|
549
|
+
|
|
550
|
+
def main_():
    """Alternative entry point: serve until SIGINT or SIGTERM is received.

    The signal handler runs in the same thread that is blocked inside
    serve_forever(). Calling httpd.shutdown() there directly would wait
    forever for a loop that cannot make progress, so the shutdown is handed
    to a helper thread instead.
    """
    with ThreadedHTTPServer((HOST, PORT), APIHandler) as httpd:

        def request_shutdown():
            # Flag first so handlers refuse new work, then stop the loop.
            SHUTDOWN_EVENT.set()
            httpd.shutdown()

        def handle_shutdown(signum, frame):
            print("\nShutdown signal received")
            threading.Thread(target=request_shutdown, daemon=True).start()

        signal.signal(signal.SIGINT, handle_shutdown)
        signal.signal(signal.SIGTERM, handle_shutdown)

        print(f"pdflinkcheck stdlib server running at http://{HOST}:{PORT}")
        print("Pure stdlib • Explicit validation • Graceful shutdown • Termux-safe")

        httpd.serve_forever()

    print("Server shut down cleanly")
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
571
|
+
|