pdflinkcheck 1.1.47__py3-none-any.whl → 1.1.73__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,382 @@
+ # src/pdflinkcheck/validate.py
+
+ import sys
+ from pathlib import Path
+ from typing import Dict, Any
+
+ from pdflinkcheck.report import run_report
+ from pdflinkcheck.io import get_friendly_path, export_validation_json, get_first_pdf_in_cwd
+
+ SEP_COUNT = 28
+
+ def run_validation(
+     report_results: Dict[str, Any],
+     pdf_path: str,
+     pdf_library: str = "pypdf",
+     check_external: bool = False,
+     export_json: bool = True,
+     print_bool: bool = True
+ ) -> Dict[str, Any]:
+     """
+     Validates links using the output from run_report().
+
+     Args:
+         report_results: The dict returned by run_report()
+         pdf_path: Path to the original PDF (needed for relative file checks and page count)
+         pdf_library: Engine used ("pypdf" or "pymupdf")
+         check_external: Whether to validate HTTP URLs (requires network + requests)
+         export_json: Whether to export the validation results to JSON
+         print_bool: Whether to print results to console
+
+     Returns:
+         Validation summary stats with valid/broken counts and detailed issues
+     """
+     data = report_results.get("data", {})
+     metadata = report_results.get("metadata", {})
+
+     all_links = data.get("external_links", []) + data.get("internal_links", [])
+     toc = data.get("toc", [])
+
+     if not all_links and not toc:
+         if print_bool:
+             print("No links or TOC to validate.")
+         return {"summary-stats": {"valid": 0, "broken": 0}, "issues": []}
+
+     # Get total page count (critical for internal validation)
+     try:
+         if pdf_library == "pymupdf":
+             import fitz
+             doc = fitz.open(pdf_path)
+             total_pages = doc.page_count
+             doc.close()
+         else:
+             from pypdf import PdfReader
+             reader = PdfReader(pdf_path)
+             total_pages = len(reader.pages)
+     except Exception as e:
+         if print_bool:
+             print(f"Could not determine page count: {e}")
+         total_pages = None
+
+     pdf_dir = Path(pdf_path).parent
+
+     issues = []
+     valid_count = 0
+     broken_file_count = 0
+     broken_page_count = 0
+     file_found_count = 0
+     unknown_web_count = 0
+     unknown_reasonableness_count = 0
+     unknown_link_count = 0
+
+     # Validate active links
+     for i, link in enumerate(all_links):
+         link_type = link.get("type")
+         status = "valid"
+         reason = None
+         if link_type in ("Internal (GoTo/Dest)", "Internal (Resolved Action)"):
+             try:
+                 target_page = int(link.get("destination_page"))
+             except (TypeError, ValueError):
+                 target_page = None
+             if not isinstance(target_page, int):
+                 status = "broken-page"
+                 reason = f"Target page not a number: {link.get('destination_page')}"
+             elif (1 <= target_page) and total_pages is None:
+                 status = "unknown-reasonableness"
+                 reason = "Total page count unavailable, but the page number is reasonable"
+             elif 1 <= target_page <= total_pages:
+                 status = "valid"
+                 reason = f"Page {target_page} within range (1–{total_pages})"
+             elif target_page < 1:
+                 status = "broken-page"
+                 reason = f"Link targets a non-positive page: {target_page}."
+             else:
+                 status = "broken-page"
+                 reason = f"Page {target_page} out of range (1–{total_pages})"
+
+         elif link_type == "Remote (GoToR)":
+             remote_file = link.get("remote_file")
+             if not remote_file:
+                 status = "broken-file"
+                 reason = "Missing remote file name"
+             else:
+                 target_path = (pdf_dir / remote_file).resolve()
+                 if target_path.exists() and target_path.is_file():
+                     status = "file-found"
+                     reason = f"Found: {target_path.name}"
+                 else:
+                     status = "broken-file"
+                     reason = f"File not found: {remote_file}"
+
+         elif link_type == "External (URI)":
+             url = link.get("url")
+             if url and url.startswith(("http://", "https://")) and check_external:
+                 # Optional: add requests-based check later
+                 status = "unknown-web"
+                 reason = "External URL validation not enabled"
+             else:
+                 status = "unknown-web"
+                 reason = "External link (no network check)"
+
+         else:
+             status = "unknown-link"
+             reason = "Other/unsupported link type"
+
+         link_with_val = link.copy()
+         link_with_val["validation"] = {"status": status, "reason": reason}
+
+         if status == "valid":
+             valid_count += 1
+         elif status == "file-found":
+             file_found_count += 1
+         elif status == "unknown-web":
+             unknown_web_count += 1
+         elif status == "unknown-reasonableness":
+             unknown_reasonableness_count += 1
+         elif status == "unknown-link":
+             unknown_link_count += 1
+         elif status == "broken-page":
+             broken_page_count += 1
+             issues.append(link_with_val)
+         elif status == "broken-file":
+             broken_file_count += 1
+             issues.append(link_with_val)
+
+     # Validate TOC entries
+     for entry in toc:
+         try:
+             target_page = int(entry.get("target_page"))
+         except (TypeError, ValueError):
+             target_page = None
+         if isinstance(target_page, int):
+             if (1 <= target_page) and total_pages is None:
+                 reason = "Page count unknown"
+                 status = "unknown-reasonableness"
+                 unknown_reasonableness_count += 1
+             elif target_page < 1:
+                 status = "broken-page"
+                 broken_page_count += 1
+                 reason = f"TOC targets a non-positive page: {target_page}."
+             elif 1 <= target_page <= total_pages:
+                 valid_count += 1
+                 continue
+             else:
+                 status = "broken-page"
+                 reason = f"TOC targets page {target_page} (out of 1–{total_pages})"
+                 broken_page_count += 1
+         else:
+             status = "broken-page"
+             reason = f"Invalid page: {entry.get('target_page')}"
+             broken_page_count += 1
+
+         issues.append({
+             "type": "TOC Entry",
+             "title": entry["title"],
+             "level": entry["level"],
+             "target_page": target_page,
+             "validation": {"status": status, "reason": reason}
+         })
+
+     summary_stats = {
+         "total_checked": len(all_links) + len(toc),
+         "valid": valid_count,
+         "file-found": file_found_count,
+         "broken-page": broken_page_count,
+         "broken-file": broken_file_count,
+         "unknown-web": unknown_web_count,
+         "unknown-reasonableness": unknown_reasonableness_count,
+         "unknown-link": unknown_link_count,
+         # A single catch-all "unknown" bucket was dropped: it is not granular enough
+     }
+
+
+     def generate_validation_summary_txt_buffer(summary_stats, issues, pdf_path):
+         """
+         Prepare the validation overview for modular reuse
+         """
+         validation_buffer = []
+
+         # Helper that buffers messages; printing happens later, conditionally
+         def log(msg: str):
+             validation_buffer.append(msg)
+
+         log("\n" + "=" * SEP_COUNT)
+         log("## Validation Results")
+         log("=" * SEP_COUNT)
+         log(f"PDF Path = {get_friendly_path(pdf_path)}")
+         log(f"Total items checked: {summary_stats['total_checked']}")
+         log(f"✅ Valid: {summary_stats['valid']}")
+         log(f"🌐 Web Addresses (Not Checked): {summary_stats['unknown-web']}")
+         log(f"⚠️ Unknown Page Reasonableness (Due to Missing Total Page Count): {summary_stats['unknown-reasonableness']}")
+         log(f"⚠️ Unsupported PDF Links: {summary_stats['unknown-link']}")
+         log(f"❌ Broken Page Reference: {summary_stats['broken-page']}")
+         log(f"❌ Broken File Reference: {summary_stats['broken-file']}")
+         log("=" * SEP_COUNT)
+
+         if issues:
+             log("\n## Issues Found")
+             log("{:<5} | {:<12} | {:<30} | {}".format("Idx", "Type", "Text", "Problem"))
+             log("-" * SEP_COUNT)
+             for i, issue in enumerate(issues[:25], 1):
+                 link_type = issue.get("type", "Link")
+                 text = issue.get("link_text", "") or issue.get("title", "") or "N/A"
+                 text = text[:30]
+                 reason = issue["validation"]["reason"]
+                 log("{:<5} | {:<12} | {:<30} | {}".format(i, link_type, text, reason))
+             if len(issues) > 25:
+                 log(f"... and {len(issues) - 25} more issues")
+         else:
+             log("No issues found — all links and TOC entries are valid!")
+
+         # Final aggregation of the buffer into one string
+         validation_buffer_str = "\n".join(validation_buffer)
+
+         return validation_buffer_str
+
+     summary_txt = generate_validation_summary_txt_buffer(summary_stats, issues, pdf_path)
+     if print_bool:
+         print(summary_txt)
+
+     validation_results = {
+         "pdf_path": pdf_path,
+         "summary-stats": summary_stats,
+         "issues": issues,
+         "summary-txt": summary_txt,
+         "total_pages": total_pages
+     }
+
+     # Have export run internally so that the logic need not happen in an interface
+     if export_json:
+         export_validation_json(validation_results, pdf_path, pdf_library)
+     return validation_results
+
+
+ def run_validation_more_readable_slop(pdf_path: str = None, pdf_library: str = "pypdf", check_external_links: bool = False) -> Dict[str, Any]:
+     """
+     Experimental. Ignore for now.
+
+     Extends the report logic by programmatically testing every extracted link.
+     Validates Internal Jumps (page bounds), External URIs (HTTP status),
+     and Launch actions (file existence).
+     """
+     if check_external_links:
+         import requests
+
+     # 1. Setup Library Engine (Reuse your logic)
+     pdf_library = pdf_library.lower()
+     if pdf_library == "pypdf":
+         from pdflinkcheck.analyze_pypdf import extract_links_pypdf as extract_links
+     else:
+         from pdflinkcheck.analyze_pymupdf import extract_links_pymupdf as extract_links
+
+     if pdf_path is None:
+         pdf_path = get_first_pdf_in_cwd()
+
+     if not pdf_path:
+         print("Error: No PDF found for validation.")
+         return {}
+
+     print(f"\nValidating links in {Path(pdf_path).name}...")
+
+     # 2. Extract links and initialize validation counters
+     links = extract_links(pdf_path)
+     total_links = len(links)
+     results = {"valid": [], "broken": [], "error": []}
+
+     # 3. Validation Loop
+     for i, link in enumerate(links, 1):
+         # Progress indicator for long manuals
+         sys.stdout.write(f"\rChecking link {i}/{total_links}...")
+         sys.stdout.flush()
+
+         link_type = link.get('type')
+         status = {"is_valid": False, "reason": "Unknown Type"}
+
+         # --- A. Validate Internal Jumps ---
+         if "Internal" in link_type:
+             target_page = link.get('destination_page')
+             if isinstance(target_page, int) and target_page > 0:
+                 # In a real run, you'd compare against reader.pages_count
+                 status = {"is_valid": True, "reason": "Resolves"}
+             else:
+                 status = {"is_valid": False, "reason": f"Invalid Page: {target_page}"}
+
+         # --- B. Validate Web URIs ---
+         elif link_type == 'External (URI)':
+
+             url = link.get('url')
+             if url and url.startswith("http") and check_external_links:
+                 try:
+                     # Use a short timeout and HEAD request to be polite/fast
+                     resp = requests.head(url, timeout=5, allow_redirects=True)
+                     if resp.status_code < 400:
+                         status = {"is_valid": True, "reason": f"HTTP {resp.status_code}"}
+                     else:
+                         status = {"is_valid": False, "reason": f"HTTP {resp.status_code}"}
+                 except Exception as e:
+                     status = {"is_valid": False, "reason": "Connection Failed"}
+             else:
+                 status = {"is_valid": False, "reason": "Malformed URL"}
+
+         # --- C. Validate Local File/Launch Links ---
+         elif link_type == 'Launch' or 'remote_file' in link:
+             file_path = link.get('remote_file') or link.get('url')
+             if file_path:
+                 # Clean URI formatting
+                 clean_path = file_path.replace("file://", "").replace("%20", " ")
+                 # Check relative to the PDF's location
+                 abs_path = Path(pdf_path).parent / clean_path
+                 if abs_path.exists():
+                     status = {"is_valid": True, "reason": "File Exists"}
+                 else:
+                     status = {"is_valid": False, "reason": "File Missing"}
+
+         # Append result
+         link['validation'] = status
+         if status['is_valid']:
+             results['valid'].append(link)
+         else:
+             results['broken'].append(link)
+
+     print("\n" + "=" * SEP_COUNT)
+     print(f"--- Validation Summary Stats for {Path(pdf_path).name} ---")
+     print(f"Total Checked: {total_links}")
+     print(f"✅ Valid: {len(results['valid'])}")
+     print(f"❌ Broken: {len(results['broken'])}")
+     print("=" * SEP_COUNT)
+
+     # 4. Print Detail Report for Broken Links
+     if results['broken']:
+         print("\n## ❌ Broken Links Found:")
+         print("{:<5} | {:<5} | {:<30} | {}".format("Idx", "Page", "Reason", "Target"))
+         print("-" * SEP_COUNT)
+         for i, link in enumerate(results['broken'], 1):
+             target = link.get('url') or link.get('destination_page') or link.get('remote_file')
+             print("{:<5} | {:<5} | {:<30} | {}".format(
+                 i, link['page'], link['validation']['reason'], str(target)[:30]
+             ))
+
+     return results
+
+
+ if __name__ == "__main__":
+
+     from pdflinkcheck.io import get_first_pdf_in_cwd
+     pdf_path = get_first_pdf_in_cwd()
+     # Run analysis first
+     report = run_report(
+         pdf_path=pdf_path,
+         max_links=0,
+         export_format="",
+         pdf_library="pypdf",
+         print_bool=False  # We handle printing in validation
+     )
+
+     if not report or not report.get("data"):
+         print("No data extracted — nothing to validate.")
+         sys.exit(1)
+
+     # Then validate
+     validation_results = run_validation(
+         report_results=report,
+         pdf_path=pdf_path,
+         pdf_library="pypdf",
+         export_json=True,
+         print_bool=True
+     )
+     # JSON export already happens inside run_validation, so no extra export call is needed here.
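
For orientation, here is a minimal usage sketch of the validate.py module added above. It relies only on what this diff shows (run_report in pdflinkcheck.report, run_validation in pdflinkcheck.validate, and the keyword arguments used in the __main__ block); the "example.pdf" path is a hypothetical placeholder, not a file shipped with the package.

    from pdflinkcheck.report import run_report
    from pdflinkcheck.validate import run_validation

    # "example.pdf" is a placeholder; substitute any local PDF path.
    report = run_report(
        pdf_path="example.pdf",
        max_links=0,
        export_format="",
        pdf_library="pypdf",
        print_bool=False,
    )
    results = run_validation(
        report_results=report,
        pdf_path="example.pdf",
        pdf_library="pypdf",
        export_json=False,  # skip the JSON export while experimenting
        print_bool=True,
    )
    # summary-stats holds the counters built above: valid, file-found,
    # broken-page, broken-file, unknown-web, unknown-reasonableness, unknown-link
    print(results["summary-stats"])
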
@@ -0,0 +1,83 @@
+ # src/pdflinkcheck/version_info.py
+ import re
+ from pathlib import Path
+ import sys
+
+ """
+
+ This portion of the codebase is MIT licensed. It does not rely on any AGPL-licensed code.
+
+ ---
+
+ MIT License
+
+ Copyright (c) 2025 George Clayton Bennett <george.bennett@memphistn.gov>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ """
+
+ # --- TOML Parsing Helper ---
+ def find_pyproject(start: Path) -> Path | None:
+     # 1. Handle PyInstaller / Frozen state
+     if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'):
+         # In PyInstaller, force-include maps to: sys._MEIPASS / package_name / data / file
+         candidate = Path(sys._MEIPASS) / "pdflinkcheck" / "data" / "pyproject.toml"
+         if candidate.exists():
+             return candidate
+         # Fallback for simple --add-data "pyproject.toml:."
+         candidate = Path(sys._MEIPASS) / "pyproject.toml"
+         if candidate.exists():
+             return candidate
+
+     # 2. Handle Development state (walking up the tree)
+     for p in start.resolve().parents:
+         candidate = p / "pyproject.toml"
+         if candidate.exists():
+             return candidate
+
+     # 3. Handle Installed / Wheel / Shiv state (using the force-include data path)
+     internal_path = Path(__file__).parent / "data" / "pyproject.toml"
+     if internal_path.exists():
+         return internal_path
+
+     return None
+
+
+ def get_version_from_pyproject() -> str:
+     pyproject = find_pyproject(Path(__file__))
+     if not pyproject or not pyproject.exists():
+         print("ERROR: pyproject.toml missing.", file=sys.stderr)
+         return "0.0.0"
+
+     text = pyproject.read_text(encoding="utf-8")
+
+     # Match PEP 621 style: [project]
+     project_section = re.search(r"\[project\](.*?)(?:\n\[|$)", text, re.DOTALL | re.IGNORECASE)
+     if project_section:
+         match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', project_section.group(1))
+         if match:
+             return match.group(1)
+
+     # Match Poetry style: [tool.poetry]
+     poetry_section = re.search(r"\[tool\.poetry\](.*?)(?:\n\[|$)", text, re.DOTALL | re.IGNORECASE)
+     if poetry_section:
+         match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', poetry_section.group(1))
+         if match:
+             return match.group(1)
+
+     return "0.0.0"