novel_downloader-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +132 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +153 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +173 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +22 -0
  15. novel_downloader/core/downloaders/base_async_downloader.py +157 -0
  16. novel_downloader/core/downloaders/base_downloader.py +187 -0
  17. novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
  18. novel_downloader/core/downloaders/common_downloader.py +191 -0
  19. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  20. novel_downloader/core/factory/__init__.py +33 -0
  21. novel_downloader/core/factory/downloader_factory.py +149 -0
  22. novel_downloader/core/factory/parser_factory.py +62 -0
  23. novel_downloader/core/factory/requester_factory.py +106 -0
  24. novel_downloader/core/factory/saver_factory.py +49 -0
  25. novel_downloader/core/interfaces/__init__.py +32 -0
  26. novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
  27. novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
  28. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  29. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  30. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  31. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  32. novel_downloader/core/parsers/__init__.py +28 -0
  33. novel_downloader/core/parsers/base_parser.py +96 -0
  34. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  35. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  36. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  37. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  39. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  40. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  41. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  42. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  43. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  44. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  45. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  46. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  47. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  48. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  49. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  50. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  51. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  52. novel_downloader/core/requesters/__init__.py +31 -0
  53. novel_downloader/core/requesters/base_async_session.py +297 -0
  54. novel_downloader/core/requesters/base_browser.py +210 -0
  55. novel_downloader/core/requesters/base_session.py +243 -0
  56. novel_downloader/core/requesters/common_requester/__init__.py +18 -0
  57. novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
  58. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  59. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  60. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  61. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  62. novel_downloader/core/savers/__init__.py +20 -0
  63. novel_downloader/core/savers/base_saver.py +169 -0
  64. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  65. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  66. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  67. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  68. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  69. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  70. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  71. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  72. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  73. novel_downloader/core/savers/qidian_saver.py +22 -0
  74. novel_downloader/locales/en.json +91 -0
  75. novel_downloader/locales/zh.json +91 -0
  76. novel_downloader/resources/config/rules.toml +196 -0
  77. novel_downloader/resources/config/settings.yaml +73 -0
  78. novel_downloader/resources/css_styles/main.css +104 -0
  79. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  80. novel_downloader/resources/images/volume_border.png +0 -0
  81. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  82. novel_downloader/resources/json/replace_word_map.json +4 -0
  83. novel_downloader/resources/text/blacklist.txt +22 -0
  84. novel_downloader/utils/__init__.py +0 -0
  85. novel_downloader/utils/cache.py +24 -0
  86. novel_downloader/utils/constants.py +158 -0
  87. novel_downloader/utils/crypto_utils.py +144 -0
  88. novel_downloader/utils/file_utils/__init__.py +43 -0
  89. novel_downloader/utils/file_utils/io.py +252 -0
  90. novel_downloader/utils/file_utils/normalize.py +68 -0
  91. novel_downloader/utils/file_utils/sanitize.py +77 -0
  92. novel_downloader/utils/fontocr/__init__.py +23 -0
  93. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  94. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  95. novel_downloader/utils/hash_store.py +288 -0
  96. novel_downloader/utils/hash_utils.py +103 -0
  97. novel_downloader/utils/i18n.py +41 -0
  98. novel_downloader/utils/logger.py +104 -0
  99. novel_downloader/utils/model_loader.py +72 -0
  100. novel_downloader/utils/network.py +287 -0
  101. novel_downloader/utils/state.py +156 -0
  102. novel_downloader/utils/text_utils/__init__.py +27 -0
  103. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  104. novel_downloader/utils/text_utils/diff_display.py +75 -0
  105. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  106. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  107. novel_downloader/utils/time_utils/__init__.py +22 -0
  108. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  109. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  110. novel_downloader-1.1.0.dist-info/METADATA +157 -0
  111. novel_downloader-1.1.0.dist-info/RECORD +115 -0
  112. novel_downloader-1.1.0.dist-info/WHEEL +5 -0
  113. novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
  114. novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
  115. novel_downloader-1.1.0.dist-info/top_level.txt +1 -0
novel_downloader/utils/fontocr/ocr_v2.py
@@ -0,0 +1,658 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.fontocr.ocr_v2
+ -------------------------------------
+
+ This module provides utility methods for optical character recognition (OCR)
+ and font mapping, primarily used for decrypting custom font encryption
+ on web pages (e.g., the Qidian website).
+ """
+
+ import json
+ import logging
+ import math
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, Generator, List, Optional, Set, Tuple, TypeVar, Union
+
+ import cv2
+ import numpy as np
+ import paddle
+ from fontTools.ttLib import TTFont
+ from paddle.inference import Config
+ from paddle.inference import create_predictor as _create_predictor
+ from paddleocr.ppocr.postprocess.rec_postprocess import CTCLabelDecode
+ from PIL import Image, ImageDraw, ImageFont
+
+ try:
+     # pip install cupy-cuda11x
+     import cupy as array_backend  # GPU acceleration
+ except ImportError:
+     import numpy as array_backend  # CPU only
+
+ from novel_downloader.utils.constants import (
+     REC_CHAR_MODEL_FILES,
+     REC_IMAGE_SHAPE_MAP,
+ )
+ from novel_downloader.utils.hash_store import img_hash_store
+ from novel_downloader.utils.model_loader import (
+     get_rec_char_vector_dir,
+     get_rec_chinese_char_model_dir,
+ )
+
+ T = TypeVar("T")
+ logger = logging.getLogger(__name__)
+
+
+ class TextRecognizer(object):
+     def __init__(
+         self,
+         rec_model_dir: str,
+         rec_image_shape: str,
+         rec_batch_num: int,
+         rec_char_dict_path: str,
+         use_space_char: bool = False,
+         use_gpu: bool = False,
+         gpu_mem: int = 500,
+         gpu_id: Optional[int] = None,
+     ):
+         self.rec_batch_num = int(rec_batch_num)
+         self.rec_image_shape = tuple(map(int, rec_image_shape.split(",")))  # (C, H, W)
+         self.postprocess_op = CTCLabelDecode(
+             character_dict_path=rec_char_dict_path,
+             use_space_char=use_space_char,
+         )
+
+         self._create_predictor(
+             model_dir=rec_model_dir,
+             use_gpu=use_gpu,
+             gpu_mem=gpu_mem,
+             gpu_id=gpu_id,
+         )
+
+     def _get_infer_gpu_id(self) -> int:
+         """
+         Look at CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES,
+         pick the first entry and return as integer. Fallback to 0.
+         """
+         if not paddle.device.is_compiled_with_rocm():
+             gpu_env = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
+         else:
+             gpu_env = os.environ.get("HIP_VISIBLE_DEVICES", "0")
+
+         first = gpu_env.split(",")[0]
+         try:
+             return int(first)
+         except ValueError:
+             return 0
+
+     def _create_predictor(
+         self,
+         model_dir: str,
+         use_gpu: bool,
+         gpu_mem: int,
+         gpu_id: Optional[int] = None,
+     ) -> None:
+         """
+         Internal helper to build the Paddle predictor + I/O handles
+         """
+         model_file = f"{model_dir}/inference.pdmodel"
+         params_file = f"{model_dir}/inference.pdiparams"
+
+         cfg = Config(model_file, params_file)
+         if use_gpu:
+             chosen = gpu_id if gpu_id is not None else self._get_infer_gpu_id()
+             cfg.enable_use_gpu(gpu_mem, chosen)
+         else:
+             cfg.disable_gpu()
+
+         # enable memory optim
+         cfg.enable_memory_optim()
+         cfg.disable_glog_info()
+         # Use zero-copy feed/fetch for speed
+         cfg.switch_use_feed_fetch_ops(False)
+         # Enable IR optimizations
+         cfg.switch_ir_optim(True)
+
+         self.config = cfg
+         self.predictor = _create_predictor(cfg)
+
+         in_name = self.predictor.get_input_names()[0]
+         self.input_tensor = self.predictor.get_input_handle(in_name)
+
+         out_names = self.predictor.get_output_names()
+         preferred = "softmax_0.tmp_0"
+         selected = [preferred] if preferred in out_names else out_names
+         self.output_tensors = [self.predictor.get_output_handle(n) for n in selected]
+
+     def __call__(self, img_list: List[np.ndarray]) -> List[Tuple[str, float]]:
+         """
+         Perform batch OCR on a list of images and return (text, confidence) tuples.
+         """
+         img_num = len(img_list)
+         results: List[Tuple[str, float]] = []
+
+         C, H, W0 = self.rec_image_shape
+
+         # Process images in batches
+         for start in range(0, img_num, self.rec_batch_num):
+             batch = img_list[start : start + self.rec_batch_num]
+             # Compute width-to-height ratios for all images in the batch
+             wh_ratios = [img.shape[1] / float(img.shape[0]) for img in batch]
+             max_wh = max(W0 / H, *wh_ratios)
+
+             B = len(batch)
+             # Pre-allocate a numpy array for the batch
+             batch_tensor = np.zeros(
+                 (B, C, H, int(math.ceil(H * max_wh))), dtype=np.float32
+             )
+
+             # Normalize and pad each image into the batch tensor
+             for i, img in enumerate(batch):
+                 norm = self.resize_norm_img(img, max_wh)
+                 batch_tensor[i, :, :, : norm.shape[2]] = norm
+
+             # Run inference
+             self.input_tensor.copy_from_cpu(batch_tensor)
+             self.predictor.run()
+
+             # Retrieve and post-process outputs
+             outputs = [t.copy_to_cpu() for t in self.output_tensors]
+             preds = outputs[0] if len(outputs) == 1 else outputs
+
+             rec_batch = self.postprocess_op(
+                 preds,
+                 return_word_box=False,
+                 wh_ratio_list=wh_ratios,
+                 max_wh_ratio=max_wh,
+             )
+             results.extend(rec_batch)
+
+         return results
+
+     def resize_norm_img(self, img: np.ndarray, max_wh_ratio: float) -> np.ndarray:
+         C, H, W0 = self.rec_image_shape
+         if img.ndim == 2:
+             # Convert grayscale images to RGB
+             img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+         assert (
+             img.ndim == 3 and img.shape[2] == C
+         ), f"Expect {C}-channel image, got {img.shape}"
+
+         h, w = img.shape[:2]
+         # Determine new width based on the height and max width-height ratio
+         new_w = min(int(math.ceil(H * (w / h))), int(H * max_wh_ratio))
+         resized = cv2.resize(img, (new_w, H)).astype("float32")
+         # Change to CHW format and scale to [0,1]
+         resized = resized.transpose(2, 0, 1) / 255.0
+         # Normalize to [-1, 1]
+         resized = (resized - 0.5) / 0.5
+
+         return resized
+
+
+ class FontOCRV2:
+     """
+     Version 2 of the FontOCR utility.
+
+     :param use_freq: if True, weight scores by character frequency
+     :param cache_dir: base path to store font-map JSON data
+     :param threshold: minimum confidence threshold [0.0-1.0]
+     :param font_debug: if True, dump per-char debug images under cache_dir
+     """
+
+     # Default constants
+     CHAR_IMAGE_SIZE = 64
+     CHAR_FONT_SIZE = 52
+     _freq_weight = 0.05
+
+     # shared resources
+     _global_char_freq_db: Dict[str, int] = {}
+     _global_ocr: Optional[TextRecognizer] = None
+     _global_vec_db: Optional[np.ndarray] = None
+     _global_vec_label: Tuple[str, ...] = ()
+     _global_vec_shape: Tuple[int, int] = (32, 32)
+
+     def __init__(
+         self,
+         cache_dir: Union[str, Path],
+         use_freq: bool = False,
+         use_ocr: bool = True,
+         use_vec: bool = False,
+         batch_size: int = 32,
+         ocr_weight: float = 0.6,
+         vec_weight: float = 0.4,
+         ocr_version: str = "v1.0",
+         threshold: float = 0.0,
+         font_debug: bool = False,
+         **kwargs: Any,
+     ) -> None:
+         self.use_freq = use_freq
+         self.use_ocr = use_ocr
+         self.use_vec = use_vec
+         self.batch_size = batch_size
+         self.ocr_weight = ocr_weight
+         self.vec_weight = vec_weight
+         self.ocr_version = ocr_version
+         self.threshold = threshold
+         self.font_debug = font_debug
+         self._max_freq = 5
+
+         self._cache_dir = Path(cache_dir)
+         self._cache_dir.mkdir(parents=True, exist_ok=True)
+         self._fixed_map_dir = self._cache_dir / "fixed_font_map"
+         self._fixed_map_dir.mkdir(parents=True, exist_ok=True)
+
+         if font_debug:
+             self._debug_dir = self._cache_dir / "font_debug" / "badcase"
+             self._debug_dir.mkdir(parents=True, exist_ok=True)
+
+         # load shared OCR + frequency DB
+         if self.use_ocr:
+             self._load_ocr_model()
+         if self.use_freq:
+             self._load_char_freq_db()
+         if self.use_vec:
+             self._load_char_vec_db()
+
+     def _load_ocr_model(self) -> None:
+         """
+         Initialize the shared PaddleOCR model if not already loaded.
+         """
+         if FontOCRV2._global_ocr is not None:
+             return
+
+         gpu_available = paddle.device.is_compiled_with_cuda()
+         self._char_model_dir = get_rec_chinese_char_model_dir(self.ocr_version)
+
+         for fname in REC_CHAR_MODEL_FILES:
+             full_path = self._char_model_dir / fname
+             if not full_path.exists():
+                 raise FileNotFoundError(f"[FontOCR] Required file missing: {full_path}")
+
+         char_dict_file = self._char_model_dir / "rec_custom_keys.txt"
+         FontOCRV2._global_ocr = TextRecognizer(
+             rec_model_dir=str(self._char_model_dir),
+             rec_char_dict_path=str(char_dict_file),
+             rec_image_shape=REC_IMAGE_SHAPE_MAP[self.ocr_version],
+             rec_batch_num=self.batch_size,
+             use_space_char=False,
+             use_gpu=gpu_available,
+         )
+
+     def _load_char_freq_db(self) -> bool:
+         """
+         Loads character frequency data from a JSON file and
+         assigns it to the shared class-level cache.
+
+         :return: True if successfully loaded, False otherwise.
+         """
+         if FontOCRV2._global_char_freq_db:
+             return True
+
+         try:
+             char_freq_map_file = self._char_model_dir / "char_freq.json"
+             with char_freq_map_file.open("r", encoding="utf-8") as f:
+                 FontOCRV2._global_char_freq_db = json.load(f)
+             self._max_freq = max(FontOCRV2._global_char_freq_db.values())
+             return True
+         except Exception as e:
+             logger.warning("[FontOCR] Failed to load char freq DB: %s", e)
+             return False
+
+     def _load_char_vec_db(self) -> None:
+         """
+         Initialize the shared Char Vector if not already loaded.
+         """
+         if FontOCRV2._global_vec_db is not None:
+             return
+
+         char_vec_dir = get_rec_char_vector_dir(self.ocr_version)
+         char_vec_npy_file = char_vec_dir / "char_vectors.npy"
+         char_vec_label_file = char_vec_dir / "char_vectors.txt"
+
+         # Load and normalize vector database
+         vec_db = array_backend.load(char_vec_npy_file)
+         _, dim = vec_db.shape
+         side = int(np.sqrt(dim))
+         FontOCRV2._global_vec_shape = (side, side)
+
+         norm = array_backend.linalg.norm(vec_db, axis=1, keepdims=True) + 1e-6
+         FontOCRV2._global_vec_db = vec_db / norm
+
+         # Load corresponding labels
+         with open(char_vec_label_file, "r", encoding="utf-8") as f:
+             FontOCRV2._global_vec_label = tuple(line.strip() for line in f)
+
+     @staticmethod
+     def _generate_char_image(
+         char: str,
+         render_font: ImageFont.FreeTypeFont,
+         is_reflect: bool = False,
+     ) -> Optional[Image.Image]:
+         """
+         Render a single character into a square image.
+         If is_reflect is True, flip horizontally.
+         """
+         size = FontOCRV2.CHAR_IMAGE_SIZE
+         img = Image.new("L", (size, size), color=255)
+         draw = ImageDraw.Draw(img)
+         bbox = draw.textbbox((0, 0), char, font=render_font)
+         w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
+         x = (size - w) // 2 - bbox[0]
+         y = (size - h) // 2 - bbox[1]
+         draw.text((x, y), char, fill=0, font=render_font)
+         if is_reflect:
+             img = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+         img_np = np.array(img)
+         if np.unique(img_np).size == 1:
+             return None
+
+         return img
+
+     def match_text_by_embedding(
+         self,
+         images: Union[Image.Image, List[Image.Image]],
+         top_k: int = 1,
+     ) -> Union[List[Tuple[str, float]], List[List[Tuple[str, float]]]]:
+         """
+         Match input image to precomputed character embeddings using cosine similarity.
+
+         :param images: a PIL.Image or a list of PIL.Image to match
+         :param top_k: int, how many top matches to return
+
+         :return:
+             - If a single Image was passed in,
+               returns a list of (label, score) tuples sorted descending.
+
+             - If a list of Images was passed in, returns a list of such lists.
+         """
+         if self._global_vec_db is None:
+             return []
+         try:
+             imgs: List[Image.Image] = (
+                 [images] if isinstance(images, Image.Image) else images
+             )
+
+             # Convert images to normalized 1D vectors
+             vecs = []
+             for img in imgs:
+                 pil_gray = img.convert("L").resize(self._global_vec_shape)
+                 arr = np.asarray(pil_gray, dtype=np.float32) / 255.0
+                 v = array_backend.asarray(arr).ravel()
+                 v /= array_backend.linalg.norm(v) + 1e-6
+                 vecs.append(v)
+
+             batch = array_backend.stack(vecs, axis=0)  # (N, D)
+             # Compute all cosine similarities in one batch:
+             sims_batch = batch.dot(self._global_vec_db.T)  # (N, num_chars)
+
+             all_results: List[List[Tuple[str, float]]] = []
+             for sims in sims_batch:
+                 k = min(top_k, sims.shape[0])
+                 top_unsorted = array_backend.argpartition(-sims, k - 1)[:k]
+                 top_idx = top_unsorted[array_backend.argsort(-sims[top_unsorted])]
+                 results = [
+                     (self._global_vec_label[int(i)], float(sims[int(i)]))
+                     for i in top_idx
+                 ]
+                 all_results.append(results)
+
+             # Unwrap single-image case
+             return all_results[0] if isinstance(images, Image.Image) else all_results
+         except Exception as e:
+             logger.warning("[FontOCR] Error: %s", e)
+             default = [("", 0.0)]
+             if isinstance(images, Image.Image):
+                 return default
+             else:
+                 return [default for _ in range(len(images))]
+
+     def run_ocr_on_images(
+         self,
+         images: Union[Image.Image, List[Image.Image]],
+     ) -> Union[Tuple[str, float], List[Tuple[str, float]]]:
+         """
+         Run OCR on one or more PIL.Image(s) and return recognized text with confidence
+
+         :param images: A single PIL.Image or list of PIL.Images to recognize.
+         :return:
+             - If a single image is passed, returns Tuple[str, float].
+
+             - If a list is passed, returns List[Tuple[str, float]].
+         """
+         if self._global_ocr is None:
+             return []
+         try:
+             # Normalize input to a list of numpy arrays (RGB)
+             img_list = [images] if isinstance(images, Image.Image) else images
+             np_imgs: List[np.ndarray] = [
+                 np.array(img.convert("RGB")) for img in img_list
+             ]
+
+             # Run OCR
+             ocr_results = self._global_ocr(np_imgs)
+
+             # Return result depending on input type
+             return ocr_results if isinstance(images, list) else ocr_results[0]
+
+         except Exception as e:
+             logger.warning("[FontOCR] OCR failed: %s", e)
+             fallback = ("", 0.0)
+             return (
+                 fallback
+                 if isinstance(images, Image.Image)
+                 else [fallback for _ in images]
+             )
+
+     def query(
+         self,
+         images: Union[Image.Image, List[Image.Image]],
+         top_k: int = 3,
+     ) -> Union[List[Tuple[str, float]], List[List[Tuple[str, float]]]]:
+         """
+         For each input image, run OCR + embedding match, fuse scores,
+         and return a sorted list of (char, score) above self.threshold.
+         """
+         # normalize to list
+         single = isinstance(images, Image.Image)
+         imgs: List[Image.Image] = [images] if single else images
+
+         # try the hash store
+         hash_batch = [img_hash_store.query(img, k=top_k) or [] for img in imgs]
+
+         fallback_indices = [i for i, h in enumerate(hash_batch) if not h]
+         fallback_imgs = [imgs[i] for i in fallback_indices]
+
+         # OCR scores
+         raw_ocr: Union[Tuple[str, float], List[Tuple[str, float]]] = (
+             self.run_ocr_on_images(fallback_imgs)
+             if (self.use_ocr and fallback_imgs)
+             else []
+         )
+         if isinstance(raw_ocr, tuple):
+             ocr_fallback: List[Tuple[str, float]] = [raw_ocr]
+         else:
+             ocr_fallback = raw_ocr
+
+         # Vec-embedding scores
+         raw_vec: Union[List[Tuple[str, float]], List[List[Tuple[str, float]]]] = (
+             self.match_text_by_embedding(fallback_imgs, top_k=top_k)
+             if (self.use_vec and fallback_imgs)
+             else []
+         )
+         if raw_vec and isinstance(raw_vec[0], tuple):
+             vec_fallback: List[List[Tuple[str, float]]] = [raw_vec]  # type: ignore
+         else:
+             vec_fallback = raw_vec  # type: ignore
+
+         # Fuse OCR+vector for the fallback set
+         fused_fallback: List[List[Tuple[str, float]]] = []
+         for ocr_preds, vec_preds in zip(ocr_fallback, vec_fallback):
+             scores: Dict[str, float] = {}
+
+             # OCR weight
+             if ocr_preds:
+                 ch, s = ocr_preds
+                 scores[ch] = scores.get(ch, 0.0) + self.ocr_weight * s
+                 logger.debug(
+                     "[FontOCR] OCR with weight: scores[%s] = %s", ch, scores[ch]
+                 )
+             # Vec weight
+             for ch, s in vec_preds:
+                 scores[ch] = scores.get(ch, 0.0) + self.vec_weight * s
+                 logger.debug(
+                     "[FontOCR] Vec with weight: scores[%s] = %s", ch, scores[ch]
+                 )
+             # Optional frequency
+             if self.use_freq:
+                 for ch in list(scores):
+                     level = self._global_char_freq_db.get(ch, self._max_freq)
+                     freq_score = (self._max_freq - level) / max(1, self._max_freq)
+                     scores[ch] += self._freq_weight * freq_score
+                     logger.debug(
+                         "[FontOCR] After Freq weight: scores[%s] = %s", ch, scores[ch]
+                     )
+
+             # Threshold + sort + top_k
+             filtered = [(ch, sc) for ch, sc in scores.items() if sc >= self.threshold]
+             filtered.sort(key=lambda x: -x[1])
+
+             fused_fallback.append(filtered[:top_k])
+
+         # Recombine hash hits + fallback in original order
+         fused_batch: List[List[Tuple[str, float]]] = []
+         fallback_iter = iter(fused_fallback)
+         for h_preds in hash_batch:
+             if h_preds:
+                 fused_batch.append(h_preds)
+             else:
+                 fused_batch.append(next(fallback_iter))
+
+         # Unwrap single-image case
+         return fused_batch[0] if single else fused_batch
+
+     def _chunked(self, seq: List[T], size: int) -> Generator[List[T], None, None]:
+         """Yield successive chunks of `seq` of length `size`."""
+         for i in range(0, len(seq), size):
+             yield seq[i : i + size]
+
+     def generate_font_map(
+         self,
+         fixed_font_path: Union[str, Path],
+         random_font_path: Union[str, Path],
+         char_set: Set[str],
+         refl_set: Set[str],
+         chapter_id: Optional[str] = None,
+     ) -> Dict[str, str]:
+         """
+         Generates a mapping from encrypted (randomized) font characters to
+         their real recognized characters by rendering and OCR-based matching.
+
+         :param fixed_font_path: Path to the reference (fixed) font.
+         :param random_font_path: Path to the obfuscated (random) font.
+         :param char_set: Characters to process normally.
+         :param refl_set: Characters to process as horizontally flipped.
+         :param chapter_id: Chapter ID
+
+         :returns mapping_result: { obf_char: real_char, ... }
+         """
+         mapping_result: Dict[str, str] = {}
+         fixed_map_file = self._fixed_map_dir / f"{Path(fixed_font_path).stem}.json"
+
+         # load existing cache
+         try:
+             with open(fixed_map_file, "r", encoding="utf-8") as f:
+                 fixed_map = json.load(f)
+         except Exception:
+             fixed_map = {}
+
+         # prepare font renderers and cmap sets
+         try:
+             fixed_ttf = TTFont(fixed_font_path)
+             fixed_chars = set(chr(c) for c in fixed_ttf.getBestCmap().keys())
+             fixed_font = ImageFont.truetype(str(fixed_font_path), self.CHAR_FONT_SIZE)
+
+             random_ttf = TTFont(random_font_path)
+             random_chars = set(chr(c) for c in random_ttf.getBestCmap().keys())
+             random_font = ImageFont.truetype(str(random_font_path), self.CHAR_FONT_SIZE)
+         except Exception as e:
+             logger.error("[FontOCR] Failed to load TTF fonts: %s", e)
+             return mapping_result
+
+         def _render_batch(
+             chars: List[Tuple[str, bool]]
+         ) -> List[Tuple[str, Image.Image]]:
+             out = []
+             for ch, reflect in chars:
+                 if ch in fixed_chars:
+                     font = fixed_font
+                 elif ch in random_chars:
+                     font = random_font
+                 else:
+                     continue
+                 img = self._generate_char_image(ch, font, reflect)
+                 if img is not None:
+                     out.append((ch, img))
+             return out
+
+         # process normal and reflected sets together
+         debug_idx = 1
+         for chars, reflect in [(list(char_set), False), (list(refl_set), True)]:
+             for batch_chars in self._chunked(chars, self.batch_size):
+                 # render all images in this batch
+                 to_render = [(ch, reflect) for ch in batch_chars]
+                 rendered = _render_batch(to_render)
+                 if not rendered:
+                     continue
+
+                 # query OCR+vec simultaneously
+                 imgs_to_query = [img for (ch, img) in rendered]
+                 fused_raw = self.query(imgs_to_query, top_k=3)
+                 if isinstance(fused_raw[0], tuple):
+                     fused: List[List[Tuple[str, float]]] = [fused_raw]  # type: ignore
+                 else:
+                     fused = fused_raw  # type: ignore
+
+                 # pick best per char, apply threshold + cache
+                 for (ch, img), preds in zip(rendered, fused):
+                     if ch in fixed_map:
+                         mapping_result[ch] = fixed_map[ch]
+                         logger.debug(
+                             "[FontOCR] Using cached mapping: '%s' -> '%s'",
+                             ch,
+                             fixed_map[ch],
+                         )
+                         continue
+                     if not preds:
+                         if self.font_debug and chapter_id:
+                             dbg_path = (
+                                 self._debug_dir / f"{chapter_id}_{debug_idx:04d}.png"
+                             )
+                             img.save(dbg_path)
+                             logger.debug(
+                                 "[FontOCR] Saved debug image for '%s': %s", ch, dbg_path
+                             )
+                             debug_idx += 1
+                         continue
+                     real_char, _ = preds[0]
+                     mapping_result[ch] = real_char
+                     fixed_map[ch] = real_char
+                     if self.font_debug:
+                         logger.debug(
+                             "[FontOCR] Prediction for char '%s': top_pred='%s'",
+                             ch,
+                             real_char,
+                         )
+                         logger.debug("[FontOCR] All predictions: %s", preds)
+
+         # persist updated fixed_map
+         try:
+             with open(fixed_map_file, "w", encoding="utf-8") as f:
+                 json.dump(fixed_map, f, ensure_ascii=False, indent=2)
+         except Exception as e:
+             logger.error("[FontOCR] Failed to save fixed map: %s", e)
+
+         return mapping_result
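
For orientation, a minimal usage sketch of the FontOCRV2 class added above. The cache directory, font paths, character set, and chapter id below are hypothetical placeholders, and the recognition model and character-vector assets are assumed to already be available via novel_downloader.utils.model_loader:

from pathlib import Path

from novel_downloader.utils.fontocr.ocr_v2 import FontOCRV2

# Placeholder inputs: the site's reference font and the per-chapter
# obfuscated font that accompanies an encrypted chapter.
ocr = FontOCRV2(
    cache_dir=Path("font_cache"),  # hypothetical cache location
    use_ocr=True,
    use_vec=False,
    use_freq=False,
)

mapping = ocr.generate_font_map(
    fixed_font_path="fixed.ttf",    # placeholder path to the reference font
    random_font_path="random.ttf",  # placeholder path to the randomized font
    char_set={"你", "好"},          # obfuscated characters seen in the chapter text
    refl_set=set(),                 # characters rendered horizontally mirrored
    chapter_id="0001",
)

# mapping is {obfuscated_char: recognized_char, ...}
print(mapping)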