pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdflinkcheck/__init__.py +88 -18
- pdflinkcheck/__main__.py +6 -0
- pdflinkcheck/analysis_pdfium.py +131 -0
- pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
- pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
- pdflinkcheck/cli.py +52 -48
- pdflinkcheck/data/LICENSE +18 -15
- pdflinkcheck/data/README.md +23 -25
- pdflinkcheck/data/pyproject.toml +17 -26
- pdflinkcheck/datacopy.py +16 -1
- pdflinkcheck/dev.py +2 -2
- pdflinkcheck/environment.py +14 -2
- pdflinkcheck/gui.py +346 -563
- pdflinkcheck/helpers.py +88 -0
- pdflinkcheck/io.py +24 -6
- pdflinkcheck/report.py +598 -97
- pdflinkcheck/security.py +189 -0
- pdflinkcheck/splash.py +38 -0
- pdflinkcheck/stdlib_server.py +7 -21
- pdflinkcheck/stdlib_server_alt.py +571 -0
- pdflinkcheck/tk_utils.py +188 -0
- pdflinkcheck/update_msix_version.py +2 -0
- pdflinkcheck/validate.py +104 -170
- pdflinkcheck/version_info.py +2 -2
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
- pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
- pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
- pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
- pdflinkcheck/analyze_pypdf_v2.py +0 -217
- pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
- pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
- {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
pdflinkcheck/helpers.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# src/pdflinkcheck/helpers.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from pprint import pprint
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
Helper functions
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
def debug_head(label: str, data: Any, n: int = 3) -> None:
    """Print a labelled preview of the first ``n`` items of ``data``.

    Args:
        label: Text shown inside the ``--- [DEBUG: ...] ---`` banner.
        data: Object to preview. A list is sliced to its first ``n``
            elements, a dict is reduced to its first ``n`` entries (in
            iteration order), and anything else is printed unchanged.
        n: Maximum number of items to show. Negative values are treated
            as zero so that "first N" never turns into "all but the last N".
    """
    # Local import keeps this helper self-contained without touching the
    # module-level import block.
    from itertools import islice

    limit = max(n, 0)
    print(f"\n--- [DEBUG: {label}] ---")
    if isinstance(data, list):
        pprint(data[:limit], indent=2, compact=True, width=100)
    elif isinstance(data, dict):
        # islice stops after `limit` entries instead of first copying every
        # key of a potentially large dict into a temporary list.
        head_dict = dict(islice(data.items(), limit))
        pprint(head_dict, indent=2, compact=True, width=100)
    else:
        print(data)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PageRef:
    """
    A simple translator to handle the 0-to-1 index conversion
    without the 'Double Bump' risk.

    The instance stores exactly one number, the 0-based index; the 1-based
    page number is always derived from it. Two PageRefs that point at the
    same index compare equal and hash alike, so they can be compared or
    collected in sets/dict keys by value.
    """

    def __init__(self, index: int) -> None:
        """Store ``index`` as the 0-based physical page index."""
        self.index = index  # The 0-based physical index

    @property
    def human(self) -> int:
        """The 1-based page number for humans."""
        return self.index + 1

    @property
    def machine(self) -> int:
        """Alias for index. The 0-based page number for machines."""
        return self.index

    @classmethod
    def corrected_down(cls, human_num: int) -> "PageRef":
        """Explicitly compensates for 1-based data (e.g., PyMuPDF TOC)."""
        return cls.from_human(human_num)

    @classmethod
    def from_pymupdf_total_page_count(cls, total_pages: int) -> "PageRef":
        """
        Converts PyMuPDF's doc.page_count into a PageRef
        representing the final valid machine-facing index.

        The result has ``index == total_pages - 1``.
        """
        return cls.from_human(total_pages)

    @classmethod
    def from_human(cls, human_num: int) -> "PageRef":
        """Creates a PageRef from a 1-based human page number (e.g., from TOC)."""
        return cls(human_num - 1)

    @classmethod
    def from_index(cls, physical_index: int) -> "PageRef":
        """Creates a PageRef from a 0-based physical index (e.g., from links)."""
        return cls(physical_index)

    def __int__(self) -> int:
        """Return the 0-based index, so ``int(ref)`` is machine-facing."""
        return self.index

    def __str__(self) -> str:
        """Return the 1-based page number as text, for display."""
        return str(self.human)

    def __repr__(self) -> str:
        return f"PageRef(index={self.index}, human={self.human})"

    def __eq__(self, other: object) -> bool:
        """Compare by index; anything that is not a PageRef is never equal."""
        if not isinstance(other, PageRef):
            # Deliberately not equal to a bare int: an int carries no
            # information about whether it is 0- or 1-based.
            return NotImplemented
        return self.index == other.index

    def __hash__(self) -> int:
        # Kept consistent with __eq__. Avoid reassigning .index while the
        # instance is stored in a set or used as a dict key.
        return hash((PageRef, self.index))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
"""
|
|
76
|
+
### Indexing Map: Physical (0) vs. Logical (1)
|
|
77
|
+
|
|
78
|
+
| **File** | **Context** | **Index Rule** | **Reasoning** |
|
|
79
|
+
| --------------------- | ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------ |
|
|
80
|
+
| `ffi.py` (Rust bridge)| Data Extraction | **0-indexing only** | Rust's `pdf-extract` and `lopdf` crates are 0-indexed. Data should stay raw. |
|
|
81
|
+
| `analysis_pypdf.py` | Data Extraction | **0-indexing only** | `pypdf` is 0-indexed. The earlier `+ 1` adjustments have been removed. |
|
|
82
|
+
| `analysis_pymupdf.py` | Data Extraction | **Mixed** | **Internal:** 0-indexed. **TOC:** `get_toc()` is natively 1-indexed. Needs normalization. |
|
|
83
|
+
| `validate.py` | Logic/Validation | **Mixed** | **Logic:** Uses `START_INDEX=0` for boundary checks. **Strings:** Formats error messages as 1-indexed. |
|
|
84
|
+
| `report.py` | Output/Reporting | **Mixed** | **Data:** Keeps dictionary values at 0. **Display:** Formats CLI tables as 1-indexed. |
|
|
85
|
+
| `helpers.py` | Translation | **Mixed** | The `PageRef` class acts as the "Border Control" between 0 and 1. |
|
|
86
|
+
| `__init__.py` | API Surface | **0-indexing only** | If exposing a library, users expect 0-indexed lists of pages/links. |
|
|
87
|
+
|
|
88
|
+
"""
|
pdflinkcheck/io.py
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# SPDX-License-Identifier: MIT
|
|
3
3
|
# src/pdflinkcheck/io.py
|
|
4
|
+
from __future__ import annotations
|
|
4
5
|
import logging
|
|
5
6
|
import json
|
|
6
7
|
import sys
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Dict, Any, Union, List, Optional
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
import time
|
|
9
12
|
|
|
10
13
|
# --- Configuration ---
|
|
11
14
|
|
|
@@ -116,12 +119,13 @@ def export_report_json(
|
|
|
116
119
|
"""Exports structured dictionary results to a .json file."""
|
|
117
120
|
|
|
118
121
|
base_name = Path(pdf_filename).stem
|
|
119
|
-
output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_report.json"
|
|
122
|
+
output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_{get_unique_unix_time()}_report.json"
|
|
120
123
|
|
|
124
|
+
print("For more details, explore the exported file(s).")
|
|
121
125
|
try:
|
|
122
126
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
123
127
|
json.dump(report_data, f, indent=4)
|
|
124
|
-
print(f"
|
|
128
|
+
print(f"JSON report exported: {get_friendly_path(output_path)}")
|
|
125
129
|
return output_path
|
|
126
130
|
except Exception as e:
|
|
127
131
|
error_logger.error(f"JSON export failed: {e}", exc_info=True)
|
|
@@ -133,13 +137,13 @@ def export_report_txt(
|
|
|
133
137
|
pdf_library: str
|
|
134
138
|
) -> Path:
|
|
135
139
|
"""Exports the formatted string buffer to a .txt file."""
|
|
136
|
-
#pdf_filename = implement_non_redundant_naming(pdf_filename)
|
|
140
|
+
#pdf_filename = implement_non_redundant_naming(pdf_filename)
|
|
137
141
|
base_name = Path(pdf_filename).stem
|
|
138
|
-
output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_report.txt"
|
|
142
|
+
output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_{get_unique_unix_time()}_report.txt"
|
|
139
143
|
|
|
140
144
|
try:
|
|
141
145
|
output_path.write_text(report_text, encoding='utf-8')
|
|
142
|
-
print(f"
|
|
146
|
+
print(f"TXT report exported: {get_friendly_path(output_path)}")
|
|
143
147
|
return output_path
|
|
144
148
|
except Exception as e:
|
|
145
149
|
error_logger.error(f"TXT export failed: {e}", exc_info=True)
|
|
@@ -154,6 +158,20 @@ def get_friendly_path(full_path: str) -> str:
|
|
|
154
158
|
except ValueError:
|
|
155
159
|
return str(p)
|
|
156
160
|
|
|
161
|
+
def get_unique_unix_time() -> int:
    """
    Get the unix time for right now, as a whole number of seconds.

    Purpose: When added to a filename, this keeps successive exports of
    otherwise identical filenames from overwriting one another.

    Pros:
    - cheap, easy

    Cons:
    - Longer than YYYYMMDDalpha
    - not human readable
    - one-second resolution: two calls within the same second return the
      same value, so the result is only unique across different seconds
    """
    # Read the epoch clock directly. Going through a naive local datetime and
    # time.mktime() forces mktime to guess the DST flag, which can be an hour
    # off during the repeated hour when daylight saving time ends.
    return int(time.time())
|
|
173
|
+
|
|
174
|
+
|
|
157
175
|
def get_first_pdf_in_cwd() -> Optional[str]:
|
|
158
176
|
"""
|
|
159
177
|
Scans the current working directory (CWD) for the first file ending
|
|
@@ -196,4 +214,4 @@ def get_first_pdf_in_cwd() -> Optional[str]:
|
|
|
196
214
|
except Exception as e:
|
|
197
215
|
# Handle potential permissions errors or other issues
|
|
198
216
|
print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
|
|
199
|
-
return None
|
|
217
|
+
return None
|