natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md

@@ -0,0 +1,420 @@

# Fine-tuning a PaddleOCR Recognition Model with Your Exported Data

This notebook guides you through fine-tuning a PaddleOCR text recognition model using the dataset you exported from `natural-pdf`.

**Goal:** Improve OCR accuracy on your specific documents (e.g., handle unique fonts, languages, or styles).

**Environment:** This notebook is designed to run on Google Colab with a GPU runtime.

## 1. Setup Environment

First, let's install the necessary libraries: PaddlePaddle (GPU version) and PaddleOCR.

```python
# Check GPU availability (Recommended: Select Runtime -> Change runtime type -> GPU)
!nvidia-smi
```

```python
# Install PaddlePaddle GPU version
# Visit https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/install/pip/linux-pip_en.html
# for the correct command based on your CUDA version.
# CUDA versions are backwards-compatible, so an exact match isn't necessary.
# I mostly just go to https://www.paddlepaddle.org.cn/packages/stable/
# and see what the most recent version that roughly matches mine is,
# e.g. Colab is CUDA 12.4, there's a "cu123" directory, I use that.
!pip install --quiet paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/

# Install PaddleOCR and its dependencies
!pip install --quiet paddleocr
```

```python
# Verify PaddlePaddle installation and GPU detection
import paddle

print("PaddlePaddle version:", paddle.__version__)
print("GPU available:", paddle.device.is_compiled_with_cuda())
if paddle.device.is_compiled_with_cuda():
    print("Number of GPUs:", paddle.device.cuda.device_count())
    print("Current GPU:", paddle.device.get_device())
```

## 2. Upload and Unzip Your Dataset

Use the file browser on the left panel of Colab to upload the `.zip` file you created using the `PaddleOCRRecognitionExporter`. Then unzip it.

```python
# Replace 'your_exported_data.zip' with the actual filename you uploaded
!unzip -q your_exported_data.zip -d finetune_data

# List the contents to verify
!ls finetune_data
```

You should see `images/`, `dict.txt`, `train.txt`, and `val.txt` (or `label.txt`) inside the `finetune_data` directory.
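
As a quick sanity check, you can confirm those files exist and count the labeled samples. A minimal sketch, assuming the layout described above:

```python
import os

base = "finetune_data"
for name in ["images", "dict.txt", "train.txt", "val.txt"]:
    status = "found" if os.path.exists(os.path.join(base, name)) else "MISSING"
    print(f"{name}: {status}")

# Each non-empty line in a label file is one training/validation sample
for split in ["train.txt", "val.txt"]:
    path = os.path.join(base, split)
    if os.path.exists(path):
        with open(path, encoding="utf-8") as f:
            print(split, "samples:", sum(1 for l in f if l.strip()))
```
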

## 3. Prepare Training Configuration

PaddleOCR uses YAML files for configuration. We'll create one based on a standard recognition config, modified for fine-tuning with our dataset.

**Key parameters to potentially adjust:**

* `Global.pretrained_model`: Path or URL to the pre-trained model you want to fine-tune. Using a model pre-trained on a large dataset (like English or multilingual) is crucial. See the PaddleOCR Model List for options.
* `Global.save_model_dir`: Where to save checkpoints during training.
* `Global.epoch_num`: Number of training epochs. Start small (e.g., 10-50) for fine-tuning and increase if needed based on validation performance.
* `Optimizer.lr.learning_rate`: Learning rate. Fine-tuning often requires a smaller learning rate than training from scratch (e.g., 1e-4, 5e-5).
* `Train.dataset.data_dir`: Path to the directory containing the `images/` folder.
* `Train.dataset.label_file_list`: Path to your `train.txt`.
* `Train.loader.batch_size_per_card`: Batch size. Adjust based on GPU memory.
* `Eval.dataset.data_dir`: Path to the directory containing the `images/` folder.
* `Eval.dataset.label_file_list`: Path to your `val.txt`.
* `Eval.loader.batch_size_per_card`: Batch size for evaluation.
* `Architecture...`: Ensure the architecture matches the `pretrained_model`.
* `Loss...`: Ensure the loss function matches the `pretrained_model`.

```python
# Choose a pre-trained model (check PaddleOCR docs for latest/best models)
# PRETRAINED_MODEL_URL = "https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/latin_PP-OCRv4_rec_train.tar"
PRETRAINED_MODEL_URL = "https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar"

# Download and extract the pre-trained model
!wget -q {PRETRAINED_MODEL_URL} -O pretrained_model.tar
!tar -xf pretrained_model.tar

# Find the actual directory name (it might vary slightly)
PRETRAINED_MODEL_DIR = !find . -maxdepth 1 -type d -name '*_rec*' | head -n 1
PRETRAINED_MODEL_DIR = PRETRAINED_MODEL_DIR[0]
print(f"Using Pretrained Model Dir: {PRETRAINED_MODEL_DIR}")
```

Depending on how you train, you may or may not need to know how many characters are in your alphabet.

```python
with open("finetune_data/dict.txt", encoding="utf-8") as f:
    num_classes = len(f.readlines())
num_classes
```

You need to set a maximum length for your pieces of text. If you plan ahead you can cut them up in other ways, but the easiest route is to pick the 99th or 99.9th percentile to avoid outliers. In my first test the 95th percentile was 17, the 99.9th was 41, and the absolute max was 138! It would have wasted a lot of memory and energy if we'd sized everything around 138-character words.

```python
lengths = []
with open("finetune_data/train.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) == 2:
            lengths.append(len(parts[1]))
lengths.sort()

# Basic stats
print("Max length:", max(lengths))
print("95th percentile:", lengths[int(len(lengths) * 0.95)])
print("99th percentile:", lengths[int(len(lengths) * 0.99)])
print("99.9th percentile:", lengths[int(len(lengths) * 0.999)])

# Add a ~10% buffer on top of the 99.9th percentile
buffered_max_length = int(lengths[int(len(lengths) * 0.999)] * 1.1)
buffered_max_length
```

```python
import shutil

MAX_ALLOWED = buffered_max_length
MIN_ALLOWED = 3
removed = 0
cleaned_lines = []

with open("finetune_data/train.txt", encoding="utf-8") as f:
    original_lines = f.readlines()

for i, line in enumerate(original_lines):
    parts = line.strip().split(maxsplit=1)
    if len(parts) != 2:
        removed += 1
        print(f"⚠️ Line {i} is malformed: {line.strip()!r}")
    elif len(parts[1]) > MAX_ALLOWED:
        removed += 1
        print(f"⚠️ Line {i} exceeds max_text_length: {len(parts[1])} chars: {parts[1]}")
    elif len(parts[1]) < MIN_ALLOWED:
        removed += 1
        print(f"⚠️ Line {i} under min_text_length: {len(parts[1])} chars: {parts[1]}")
    elif "Sorry, I can't" in parts[1]:
        removed += 1
        print(f"⚠️ Line {i} was not OCR'd correctly")
    else:
        cleaned_lines.append(line)

if removed > 0:
    print(f"Removed {removed} of {len(original_lines)}. Backing up original, writing clean copy.")
    shutil.copy("finetune_data/train.txt", "finetune_data/train_backup.txt")

    with open("finetune_data/train.txt", "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)
else:
    print("No lines needed to be removed")
```

You'll also notice it catches a lot of "Sorry, I can't process the image. Please upload the image again." and the like.
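
Note that the cell above only cleans `train.txt`, and the same junk can appear in `val.txt`. A minimal sketch, assuming the same `image_path<sep>text` label format, that applies the same filter to the validation labels:

```python
import shutil

def clean_label_file(path, max_len, min_len=3):
    """Drop malformed, too-short, too-long, and refusal-text labels in place."""
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()
    kept = []
    for line in lines:
        parts = line.strip().split(maxsplit=1)
        if len(parts) != 2:
            continue
        text = parts[1]
        if min_len <= len(text) <= max_len and "Sorry, I can't" not in text:
            kept.append(line)
    if len(kept) < len(lines):
        shutil.copy(path, path + ".bak")  # back up before overwriting
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(kept)
    print(f"{path}: kept {len(kept)} of {len(lines)} lines")

clean_label_file("finetune_data/val.txt", buffered_max_length)
```
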

**And now it's configuration time!** We ignore almost all of the [suggestions from PaddleOCR's documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/model_train/finetune.html) because for some reason they get me ~40% accuracy, while copying the [PP-OCRv3 yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) gets me up to ~80%.

This creates a `finetune_rec.yml` file that controls how the training process will go.

```python
yaml_content = f"""
Global:
  use_gpu: true
  epoch_num: 120
  log_smooth_window: 20
  print_batch_step: 50
  save_model_dir: ./output/finetune_rec/
  save_epoch_step: 5
  eval_batch_step: [0, 200] # Evaluate every 200 steps
  cal_metric_during_train: true
  pretrained_model: {PRETRAINED_MODEL_DIR}/best_accuracy
  checkpoints: null
  save_inference_dir: null
  use_visualdl: false
  infer_img: doc/imgs_words/en/word_1.png
  character_dict_path: finetune_data/dict.txt
  max_text_length: {buffered_max_length}
  infer_mode: false
  use_space_char: true
  save_res_path: ./output/rec/predicts_rec.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.00005
    warmup_epoch: 3
  regularizer:
    name: L2
    factor: 0.00005

Architecture:
  model_type: rec
  algorithm: SVTR_LCNet
  Transform: null
  Backbone:
    name: MobileNetV1Enhance
    scale: 0.5
    last_conv_stride: [1, 2]
    last_pool_type: avg
    last_pool_kernel_size: [2, 2]
  Head:
    name: MultiHead
    head_list:
      - CTCHead:
          Neck:
            name: svtr
            dims: 64
            depth: 2
            hidden_dims: 120
            use_guide: True
          Head:
            fc_decay: 0.00001
      - SARHead:
          enc_dim: 512
          max_text_length: {buffered_max_length}

Loss:
  name: MultiLoss
  loss_config_list:
    - CTCLoss:
    - SARLoss:

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  ignore_space: false

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./finetune_data/
    label_file_list: ["./finetune_data/train.txt"]
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: False
      - MultiLabelEncode:
      - SVTRRecResizeImg:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label_ctc", "label_sar", "length", "valid_ratio"]
  loader:
    shuffle: true
    batch_size_per_card: 64
    drop_last: true
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./finetune_data/
    label_file_list: ["./finetune_data/val.txt"]
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: False
      - MultiLabelEncode:
      - SVTRRecResizeImg:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label_ctc", "label_sar", "length", "valid_ratio"]
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 64
    num_workers: 4
"""

with open("finetune_rec.yml", "w", encoding="utf-8") as fp:
    fp.write(yaml_content)
```

## 4. Clone PaddleOCR Repository and Start Training

We need the PaddleOCR repository for its training scripts. Once we have it, we'll point it at our `finetune_rec.yml` and set it in action.

```python
# Clone the PaddleOCR repository (using main branch)
!git clone https://github.com/PaddlePaddle/PaddleOCR.git --depth 1 paddleocr_repo
```

```python
# Remove any existing trained model
!rm -rf output

# Start training!
# -c points to our config file
# -o overrides specific config options if needed (e.g., Global.epoch_num=10)
!python paddleocr_repo/tools/train.py -c finetune_rec.yml
```

Training will begin, printing logs and saving checkpoints to the directory specified in `Global.save_model_dir` (`./output/finetune_rec/` in the example). Monitor the accuracy (`acc`) and loss on the training and validation sets. You can stop training early if validation accuracy plateaus or starts to decrease.
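
If the Colab runtime dies or you stop early, you don't have to start over: PaddleOCR's `tools/train.py` accepts `Global.checkpoints` to resume from a saved checkpoint. A sketch, assuming the default `latest` checkpoint files in the output directory:

```python
# Resume training from the most recent checkpoint instead of from the
# pretrained model (adjust the path if you changed save_model_dir)
!python paddleocr_repo/tools/train.py -c finetune_rec.yml \
    -o Global.checkpoints=./output/finetune_rec/latest
```
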

## 5. Export Best Model for Inference

Once training is complete, find the best checkpoint (usually named `best_accuracy.pdparams`) in the output directory and convert it into an inference model. The cell below points at the default location.

```python
# The best model checkpoint (relative to the current working directory)
BEST_MODEL_PATH = "output/finetune_rec/best_accuracy"

# Export the model for inference
!python paddleocr_repo/tools/export_model.py \
    -c finetune_rec.yml \
    -o Global.pretrained_model="{BEST_MODEL_PATH}" \
       Global.save_inference_dir="inference_model"
```

This will create an `inference_model` directory containing `inference.pdmodel`, `inference.pdiparams`, and potentially other files needed for deployment.
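
A quick listing confirms the export succeeded:

```python
!ls -lh inference_model
```
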

## 6. Test Inference (Optional)

You can use the exported inference model to predict text on new images.

```python
import random

from paddleocr import PaddleOCR
from IPython.display import Image, display

ocr = PaddleOCR(
    use_angle_cls=False,
    lang='en',
    rec_model_dir='inference_model',
    rec_algorithm='SVTR_LCNet',
    rec_image_shape='3,48,320',
    rec_char_dict_path='finetune_data/dict.txt',
    use_gpu=True
)

# Pick one random image from val.txt
with open("finetune_data/val.txt", encoding="utf-8") as f:
    line = random.choice([l.strip() for l in f if l.strip()])
img_path, ground_truth = line.split(maxsplit=1)
img_path = "finetune_data/" + img_path

# Run inference
result = ocr.ocr(img_path, det=False)
prediction = result[0][0][1] if result else '[No result]'

# Display
display(Image(filename=img_path))
print(f"GT: {ground_truth}")
print(f"Pred: {prediction}")
```

Compare the predicted text with the ground truth in your label file.
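
For a number rather than a spot check, here's a small sketch (reusing the `ocr` object and the same result indexing as the cell above) that measures exact-match accuracy over the whole validation set:

```python
# Exact-match accuracy over val.txt; slow on CPU, fine on GPU
total = correct = 0
with open("finetune_data/val.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) != 2:
            continue
        img_path, gt = parts
        result = ocr.ocr("finetune_data/" + img_path, det=False)
        pred = result[0][0][1] if result else ""
        total += 1
        correct += int(pred == gt)

print(f"Exact match: {correct}/{total} = {correct / total:.1%}")
```
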

## 7. Package and Distribute Your Model

Once you have successfully fine-tuned and tested your model, you'll want to package it for easy distribution and use. A properly packaged model should include all necessary files to use it with Natural PDF:

````python
import os
import shutil

# Create a distribution directory
dist_dir = "my_paddleocr_model_distribution"
os.makedirs(dist_dir, exist_ok=True)

# Copy the inference model
shutil.copytree("inference_model", os.path.join(dist_dir, "inference_model"))

# Copy the dictionary file (critical for text recognition)
shutil.copy("finetune_data/dict.txt", os.path.join(dist_dir, "dict.txt"))

# Create a simple README
with open(os.path.join(dist_dir, "README.md"), "w") as f:
    f.write("""# Custom PaddleOCR Model

## Model Information
- Trained for: [describe your document type/language]
- Base model: [e.g., "PaddleOCR v3 Latin"]
- Training date: [date]
- Epochs trained: [number of epochs]
- Final accuracy: [accuracy percentage]

## Usage with Natural PDF

```python
from natural_pdf import PDF
from natural_pdf.ocr import PaddleOCROptions

# Configure OCR with this model
paddle_opts = PaddleOCROptions(
    rec_model_dir="path/to/inference_model",
    rec_char_dict_path="path/to/dict.txt",
)

# Use in your PDF processing
pdf = PDF("your-document.pdf")
page = pdf.pages[0]
ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
```
""")

# Zip everything up
shutil.make_archive(dist_dir, 'zip', dist_dir)
print(f"Model distribution package created: {dist_dir}.zip")
````

### Essential Components

Your distribution package must include:

1. **Inference Model Directory**: Contains the trained model files (`inference.pdmodel`, `inference.pdiparams`, etc.)
2. **Character Dictionary**: The `dict.txt` file used during training that maps character IDs to actual characters
3. **Documentation**: A README with usage instructions and model information

### Usage Notes

When sharing your model with others, advise them to:

1. Extract all files while maintaining the directory structure
2. Use the `PaddleOCROptions` class to configure Natural PDF with the model paths
3. Understand model limitations (specific languages, document types, etc.)
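
Before sharing, a quick standard-library sketch to confirm the zip actually contains the essential components listed above:

```python
import zipfile

with zipfile.ZipFile("my_paddleocr_model_distribution.zip") as zf:
    names = zf.namelist()
    for required in ["inference_model/", "dict.txt", "README.md"]:
        ok = any(n.startswith(required) for n in names)
        print(f"{required}: {'ok' if ok else 'MISSING'}")
```
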

You now have a fine-tuned PaddleOCR recognition model tailored to your data! The model can be distributed and used to improve OCR accuracy on similar documents in your application.

---
natural_pdf/utils/debug.py

@@ -1,6 +1,7 @@
 """
 OCR debug utilities for natural-pdf.
 """
+
 import base64
 import io
 import json
@@ -16,7 +17,8 @@ from PIL import Image
 try:
     from natural_pdf.core.page import Page
 except ImportError:
-    Page = Any # Placeholder
+    Page = Any  # Placeholder
+
 
 def _get_page_image_base64(page: Page) -> str:
     """Generate a base64 encoded image of the page."""
@@ -29,4 +31,4 @@ def _get_page_image_base64(page: Page) -> str:
     # Convert to base64
     buffered = io.BytesIO()
     img.save(buffered, format="PNG")
-    return f"data:image/png;base64,{base64.b64encode(buffered.getvalue()).decode('utf-8')}"
+    return f"data:image/png;base64,{base64.b64encode(buffered.getvalue()).decode('utf-8')}"
natural_pdf/utils/identifiers.py

@@ -1,10 +1,12 @@
 """
 Utilities for generating consistent identifiers.
 """
+
 import hashlib
 import base64
 import os
 
+
 def generate_short_path_hash(path_str: str, length: int = 8) -> str:
     """
     Generates a short, filesystem-safe hash ID from a path string.
@@ -18,12 +20,14 @@ def generate_short_path_hash(path_str: str, length: int = 8) -> str:
     """
     # Ensure consistency by using the absolute path
     normalized_path = os.path.abspath(path_str)
-    path_bytes = normalized_path.encode('utf-8')
+    path_bytes = normalized_path.encode("utf-8")
     # Use SHA-256 for good collision resistance
-    full_hash = hashlib.sha256(path_bytes).digest() # Get binary hash
+    full_hash = hashlib.sha256(path_bytes).digest()  # Get binary hash
     # Encode using URL-safe Base64 and remove padding '=' characters
-    b64_encoded = base64.urlsafe_b64encode(full_hash).decode('ascii').rstrip('=')
+    b64_encoded = base64.urlsafe_b64encode(full_hash).decode("ascii").rstrip("=")
     # Return the first 'length' characters
     if length <= 0 or length > len(b64_encoded):
-        raise ValueError(f"Invalid length specified: {length}. Must be between 1 and {len(b64_encoded)}.")
-    return b64_encoded[:length]
+        raise ValueError(
+            f"Invalid length specified: {length}. Must be between 1 and {len(b64_encoded)}."
+        )
+    return b64_encoded[:length]
natural_pdf/utils/locks.py

@@ -0,0 +1,8 @@
+"""
+Shared locks for thread synchronization across the natural-pdf library.
+"""
+
+import threading
+
+# Global lock for PDF rendering operations to prevent PDFium concurrency issues
+pdf_render_lock = threading.RLock()
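
Per the comment, the lock is meant to serialize PDFium-backed rendering across threads. A hedged sketch of how calling code might use it (the `render_page_image` wrapper is hypothetical, not a natural-pdf API):

```python
from concurrent.futures import ThreadPoolExecutor

from natural_pdf.utils.locks import pdf_render_lock

def render_page_image(page):
    # Hold the shared re-entrant lock for the duration of the render so
    # multiple worker threads never enter PDFium at the same time.
    with pdf_render_lock:
        return page.to_image()  # assumption: a rendering call on a Page

# e.g., rendering pages from several threads without PDFium crashes:
# with ThreadPoolExecutor(max_workers=4) as pool:
#     images = list(pool.map(render_page_image, pdf.pages))
```
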