pdflinkcheck 1.1.94__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. pdflinkcheck/__init__.py +88 -18
  2. pdflinkcheck/__main__.py +6 -0
  3. pdflinkcheck/analysis_pdfium.py +131 -0
  4. pdflinkcheck/{analyze_pymupdf.py → analysis_pymupdf.py} +99 -141
  5. pdflinkcheck/{analyze_pypdf.py → analysis_pypdf.py} +51 -39
  6. pdflinkcheck/cli.py +52 -48
  7. pdflinkcheck/data/LICENSE +18 -15
  8. pdflinkcheck/data/README.md +23 -25
  9. pdflinkcheck/data/pyproject.toml +17 -26
  10. pdflinkcheck/datacopy.py +16 -1
  11. pdflinkcheck/dev.py +2 -2
  12. pdflinkcheck/environment.py +14 -2
  13. pdflinkcheck/gui.py +346 -563
  14. pdflinkcheck/helpers.py +88 -0
  15. pdflinkcheck/io.py +24 -6
  16. pdflinkcheck/report.py +598 -97
  17. pdflinkcheck/security.py +189 -0
  18. pdflinkcheck/splash.py +38 -0
  19. pdflinkcheck/stdlib_server.py +7 -21
  20. pdflinkcheck/stdlib_server_alt.py +571 -0
  21. pdflinkcheck/tk_utils.py +188 -0
  22. pdflinkcheck/update_msix_version.py +2 -0
  23. pdflinkcheck/validate.py +104 -170
  24. pdflinkcheck/version_info.py +2 -2
  25. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/METADATA +41 -40
  26. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/RECORD +34 -27
  27. pdflinkcheck-1.2.29.dist-info/WHEEL +5 -0
  28. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/entry_points.txt +0 -1
  29. pdflinkcheck-1.2.29.dist-info/licenses/LICENSE +27 -0
  30. pdflinkcheck-1.2.29.dist-info/top_level.txt +1 -0
  31. pdflinkcheck/analyze_pypdf_v2.py +0 -217
  32. pdflinkcheck-1.1.94.dist-info/WHEEL +0 -4
  33. pdflinkcheck-1.1.94.dist-info/licenses/LICENSE +0 -24
  34. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-AGPL3 +0 -0
  35. {pdflinkcheck-1.1.94.dist-info → pdflinkcheck-1.2.29.dist-info}/licenses/LICENSE-MIT +0 -0
@@ -0,0 +1,88 @@
1
+ # src/pdflinkcheck/helpers.py
2
+ from __future__ import annotations
3
+ from pprint import pprint
4
+ from typing import Any
5
+
6
+ """
7
+ Helper functions
8
+ """
9
+
10
def debug_head(label: str, data: Any, n: int = 3) -> None:
    """Print a labelled preview of the first ``n`` items of ``data``.

    Lists are sliced to their first ``n`` elements and dicts are reduced to
    their first ``n`` entries (chosen in iteration order) before being
    pretty-printed. Any other value is printed unchanged with ``print``.

    Args:
        label: Text shown inside the ``--- [DEBUG: ...] ---`` banner.
        data: Object to preview.
        n: Number of leading items to show. A negative value keeps slice
            semantics, i.e. everything except the last ``abs(n)`` items.
    """
    from itertools import islice

    print(f"\n--- [DEBUG: {label}] ---")
    if isinstance(data, list):
        pprint(data[:n], indent=2, compact=True, width=100)
    elif isinstance(data, dict):
        if n >= 0:
            # Stop after n entries instead of copying every key first.
            head_dict = dict(islice(data.items(), n))
        else:
            # islice() rejects negative bounds; fall back to list slicing so
            # a negative n behaves the same as it does for lists.
            head_dict = {k: data[k] for k in list(data)[:n]}
        pprint(head_dict, indent=2, compact=True, width=100)
    else:
        print(data)
21
+
22
+
23
class PageRef:
    """
    A simple translator to handle the 0-to-1 index conversion
    without the 'Double Bump' risk.

    The page is stored once, as a 0-based index, and every other view is
    derived from it:

    * ``index`` / ``machine`` / ``int(ref)`` give the 0-based value.
    * ``human`` / ``str(ref)`` give the 1-based value.

    Two ``PageRef`` objects compare equal (and hash alike) when they hold the
    same index, regardless of which constructor produced them. Comparison
    with a plain ``int`` is deliberately not supported, because a bare number
    does not say whether it is 0-based or 1-based.
    """

    def __init__(self, index: int):
        # NOTE: __hash__ is derived from this attribute, so do not reassign
        # it while the object is stored in a set or used as a dict key.
        self.index = index  # The 0-based physical index

    @property
    def human(self) -> int:
        """The 1-based page number for humans."""
        return self.index + 1

    @property
    def machine(self) -> int:
        """Alias for index. The 0-based page number for machines."""
        return self.index

    @classmethod
    def from_human(cls, human_num: int) -> "PageRef":
        """Creates a PageRef from a 1-based human page number (e.g., from TOC)."""
        return cls(human_num - 1)

    @classmethod
    def from_index(cls, physical_index: int) -> "PageRef":
        """Creates a PageRef from a 0-based physical index (e.g., from links)."""
        return cls(physical_index)

    @classmethod
    def corrected_down(cls, human_num: int) -> "PageRef":
        """Explicitly compensates for 1-based data (e.g., PyMuPDF TOC)."""
        return cls.from_human(human_num)

    @classmethod
    def from_pymupdf_total_page_count(cls, total_pages: int) -> "PageRef":
        """
        Converts PyMuPDF's doc.page_count into a PageRef
        representing the final valid machine-facing index.

        A count of N pages yields index N - 1, so a count of 0 yields -1.
        """
        return cls.from_human(total_pages)

    def __eq__(self, other: object) -> bool:
        """Return True when ``other`` is a PageRef holding the same index."""
        if not isinstance(other, PageRef):
            # Let Python try the reflected comparison / fall back to identity.
            return NotImplemented
        return self.index == other.index

    def __hash__(self) -> int:
        """Hash on the stored index so equal refs collapse in sets and dicts."""
        return hash(self.index)

    def __int__(self) -> int:
        """Return the 0-based index."""
        return self.index

    def __str__(self) -> str:
        """Return the 1-based page number as text."""
        return str(self.human)

    def __repr__(self) -> str:
        return f"PageRef(index={self.index}, human={self.human})"
73
+
74
+
75
+ """
76
+ ### Indexing Map: Physical (0) vs. Logical (1)
77
+
78
+ | **File** | **Context** | **Index Rule** | **Reasoning** |
79
+ | --------------------- | ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------ |
80
+ | `ffi.py` (Rust bridge)| Data Extraction | **0-indexing only** | Rust's `pdf-extract` and `lopdf` crates are 0-indexed. Data should stay raw. |
81
+ | `analysis_pypdf.py` | Data Extraction | **0-indexing only** | `pypdf` is 0-indexed. The earlier `+ 1` adjustments have been removed. |
82
+ | `analysis_pymupdf.py` | Data Extraction | **Mixed** | **Internal:** 0-indexed. **TOC:** `get_toc()` is natively 1-indexed. Needs normalization. |
83
+ | `validate.py` | Logic/Validation | **Mixed** | **Logic:** Uses `START_INDEX=0` for boundary checks. **Strings:** Formats error messages as 1-indexed. |
84
+ | `report.py` | Output/Reporting | **Mixed** | **Data:** Keeps dictionary values at 0. **Display:** Formats CLI tables as 1-indexed. |
85
+ | `helpers.py` | Translation | **Mixed** | The `PageRef` class acts as the "Border Control" between 0 and 1. |
86
+ | `__init__.py` | API Surface | **0-indexing only** | If exposing a library, users expect 0-indexed lists of pages/links. |
87
+
88
+ """
pdflinkcheck/io.py CHANGED
@@ -1,11 +1,14 @@
1
1
  #!/usr/bin/env python3
2
2
  # SPDX-License-Identifier: MIT
3
3
  # src/pdflinkcheck/io.py
4
+ from __future__ import annotations
4
5
  import logging
5
6
  import json
6
7
  import sys
7
8
  from pathlib import Path
8
9
  from typing import Dict, Any, Union, List, Optional
10
+ from datetime import datetime
11
+ import time
9
12
 
10
13
  # --- Configuration ---
11
14
 
@@ -116,12 +119,13 @@ def export_report_json(
116
119
  """Exports structured dictionary results to a .json file."""
117
120
 
118
121
  base_name = Path(pdf_filename).stem
119
- output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_report.json"
122
+ output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_{get_unique_unix_time()}_report.json"
120
123
 
124
+ print("For more details, explore the exported file(s).")
121
125
  try:
122
126
  with open(output_path, 'w', encoding='utf-8') as f:
123
127
  json.dump(report_data, f, indent=4)
124
- print(f"\nJSON report exported: {get_friendly_path(output_path)}")
128
+ print(f"JSON report exported: {get_friendly_path(output_path)}")
125
129
  return output_path
126
130
  except Exception as e:
127
131
  error_logger.error(f"JSON export failed: {e}", exc_info=True)
@@ -133,13 +137,13 @@ def export_report_txt(
133
137
  pdf_library: str
134
138
  ) -> Path:
135
139
  """Exports the formatted string buffer to a .txt file."""
136
- #pdf_filename = implement_non_redundant_naming(pdf_filename)
140
+ #pdf_filename = implement_non_redundant_naming(pdf_filename)
137
141
  base_name = Path(pdf_filename).stem
138
- output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_report.txt"
142
+ output_path = PDFLINKCHECK_HOME / f"{base_name}_{pdf_library}_{get_unique_unix_time()}_report.txt"
139
143
 
140
144
  try:
141
145
  output_path.write_text(report_text, encoding='utf-8')
142
- print(f"\nTXT report exported: {get_friendly_path(output_path)}")
146
+ print(f"TXT report exported: {get_friendly_path(output_path)}")
143
147
  return output_path
144
148
  except Exception as e:
145
149
  error_logger.error(f"TXT export failed: {e}", exc_info=True)
@@ -154,6 +158,20 @@ def get_friendly_path(full_path: str) -> str:
154
158
  except ValueError:
155
159
  return str(p)
156
160
 
161
def get_unique_unix_time() -> int:
    """
    Get the unix time for right now, truncated to whole seconds.

    Purpose: When added to a filename, this keeps repeated exports from
    overwriting each other for otherwise identical filenames.

    Pros:
    - cheap, easy

    Cons:
    - One-second resolution: two calls within the same second return the
      same value, so the result is only unique across calls that are at
      least a second apart.
    - Longer than YYYYMMDDalpha
    - not human readable

    Returns:
        Seconds since the Unix epoch as an int.
    """
    # time.time() is already epoch-based. Converting a naive local datetime
    # back through time.mktime() has to guess the DST flag and can be an hour
    # off during the repeated hour when daylight saving time ends.
    return int(time.time())
173
+
174
+
157
175
  def get_first_pdf_in_cwd() -> Optional[str]:
158
176
  """
159
177
  Scans the current working directory (CWD) for the first file ending
@@ -196,4 +214,4 @@ def get_first_pdf_in_cwd() -> Optional[str]:
196
214
  except Exception as e:
197
215
  # Handle potential permissions errors or other issues
198
216
  print(f"Error while searching for PDF in CWD: {e}", file=sys.stderr)
199
- return None
217
+ return None