natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/visual-debugging/index.md
@@ -1,157 +0,0 @@
# Visual Debugging

Sometimes it's hard to understand what's happening when working with PDFs. Natural PDF provides powerful visual debugging tools to help you see what you're extracting.

## Adding Persistent Highlights

Use the `.highlight()` method on `Element` or `ElementCollection` objects to add persistent highlights to a page. These highlights are stored and will appear when viewing the page later.

```python
from natural_pdf import PDF

pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
page = pdf.pages[0]

# Find specific elements and add persistent highlights
page.find_all('text:contains("Summary")').highlight()
page.find_all('text:contains("Date")').highlight()
page.find_all('line').highlight()
page.to_image(width=700)
```

## Customizing Persistent Highlights

Customize the appearance of persistent highlights added with `.highlight()`:

```python
page.clear_highlights()

title = page.find('text:bold[size>=12]')

# Highlight with a specific color (string name, hex, or RGB/RGBA tuple)
# title.highlight(color=(1, 0, 0, 0.3))  # Red with 30% opacity
# title.highlight(color="#FF0000")       # Hex color
title.highlight(color="red")             # Color name

text = page.find('text:contains("Critical")')

# Add a label to the highlight (appears in legend)
text.highlight(label="Critical")

# Combine color and label
rect = page.find('rect')
rect.highlight(color=(0, 0, 1, 0.2), label="Box")

page.to_image(width=700)
```

## Highlighting Multiple Elements

Highlighting an `ElementCollection` applies the highlight to all elements within it. By default, all elements in the collection get the same color and a label based on their type.

```python
# Find and highlight all headings with a single color/label
headings = page.find_all('text[size>=14]:bold')
headings.highlight(color=(0, 0.5, 0, 0.3), label="Headings")

# Find and highlight all tables
tables = page.find_all('region[type=table]')
tables.highlight(color=(0, 0, 1, 0.2), label="Tables")

# View the result
page.viewer()
```

## Highlighting Regions

You can highlight regions to see what area you're working with:

```python
# Find a title and create a region below it
title = page.find('text:contains("Violations")')
content = title.below(height=200)

# Show the region highlighted on the page
content.show()
```

Or look at just the region by itself:

```python
# Find a title and create a region below it
title = page.find('text:contains("Violations")')
content = title.below(height=200)

# Crop to the region
content.to_image(crop_only=True, include_highlights=False)
```

## Working with Text Styles

Visualize text styles to understand the document structure:

```python
# Analyze and highlight text styles
page.clear_highlights()

page.analyze_text_styles()
page.find_all('text').highlight(group_by='style_label')

page.to_image(width=700)
```
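
If you want to see which style groups the analysis produced before highlighting, you can tally the labels yourself. This is a minimal sketch; it assumes each text element exposes the assigned label via a `style_label` attribute (the same key used by `group_by` above):

```python
from collections import Counter

# Tally how many text elements fall into each detected style group
# (assumes elements carry a `style_label` attribute after analyze_text_styles())
counts = Counter(getattr(el, 'style_label', None) for el in page.find_all('text'))
for label, n in counts.most_common():
    print(f"{label}: {n} elements")
```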

## Displaying Attributes

You can display element attributes directly on the highlights:

```python
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf")
page = pdf.pages[0]

lines = page.find_all('line')
lines.highlight(include_attrs=['width', 'color'])

page.to_image(width=700)
```

Does it get busy? YES.

## Clearing Highlights

You can clear persistent highlights from a page:

```python
# Clear all highlights on the page
page.clear_highlights()

# Apply new highlights
page.find_all('text:bold').highlight(label="Bold Text")
page.viewer()
```

## Document QA Visualization

Visualize document QA results:

```python
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42007.pdf")
page = pdf.pages[0]
page.to_image(width=700)
```

```python
response = page.ask("How many votes did Kamala Harris get on Election Day?")
response
```

```python
response['source_elements'].show()
```
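
The response behaves like a dictionary. Besides `source_elements`, you can pull out the extracted answer text and the model's confidence; the exact keys used here (`answer`, `confidence`) are assumptions based on the usual QA result shape, so print `response` to confirm what your version returns:

```python
# Inspect the QA result (key names assumed; print `response` to see what's there)
print("Answer:", response.get('answer'))
print("Confidence:", response.get('confidence'))
```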

## Next Steps

Now that you know how to visualize PDF content, you might want to explore:

- [OCR capabilities](../ocr/index.md) for working with scanned documents
- [Layout analysis](../layout-analysis/index.ipynb) for automatic structure detection
- [Document QA](../document-qa/index.ipynb) for asking questions directly to your documents
Binary files (diffs not shown)
natural_pdf/templates/finetune/fine_tune_paddleocr.md
@@ -1,415 +0,0 @@
# Fine-tuning a PaddleOCR Recognition Model with Your Exported Data

This notebook guides you through fine-tuning a PaddleOCR text recognition model using the dataset you exported from `natural-pdf`.

**Goal:** Improve OCR accuracy on your specific documents (e.g., handle unique fonts, languages, or styles).

**Environment:** This notebook is designed to run on Google Colab with a GPU runtime.

## 1. Setup Environment

First, let's install the necessary libraries: PaddlePaddle (GPU version) and PaddleOCR.

```python
# Check GPU availability (Recommended: Select Runtime -> Change runtime type -> GPU)
!nvidia-smi
```

```python
# Install PaddlePaddle GPU version
# Visit https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/install/pip/linux-pip_en.html
# for the correct command based on your CUDA version.
# CUDA versions are backwards-compatible, so you don't have to worry about an exact match.
# I mostly just go to https://www.paddlepaddle.org.cn/packages/stable/
# and see what the most recent version that kinda matches mine is,
# e.g. Colab is CUDA 12.4, there's a "cu123" directory, I use that.
!pip install --quiet paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/

# Install PaddleOCR and its dependencies
!pip install --quiet paddleocr
```

```python
# Verify PaddlePaddle installation and GPU detection
import paddle
print("PaddlePaddle version:", paddle.__version__)
print("GPU available:", paddle.device.is_compiled_with_cuda())
if paddle.device.is_compiled_with_cuda():
    print("Number of GPUs:", paddle.device.cuda.device_count())
    print("Current GPU:", paddle.device.get_device())
```

## 2. Upload and Unzip Your Dataset

Use the file browser on the left panel of Colab to upload the `.zip` file you created using the `PaddleOCRRecognitionExporter`. Then, unzip it.

```python
# Replace 'your_exported_data.zip' with the actual filename you uploaded
!unzip -q your_exported_data.zip -d finetune_data

# List the contents to verify
!ls finetune_data
```

You should see `images/`, `dict.txt`, `train.txt`, and `val.txt` (or `label.txt`) inside the `finetune_data` directory.
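
As a quick sanity check, you can also count how many labeled samples landed in each split (one labeled image per line):

```python
# Count training and validation samples
!wc -l finetune_data/train.txt finetune_data/val.txt
```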

## 3. Prepare Training Configuration

PaddleOCR uses YAML files for configuration. We'll create one based on a standard recognition config, modified for fine-tuning with our dataset.

**Key Parameters to potentially adjust** (any of these can also be overridden at the command line, as shown after this list):

* `Global.pretrained_model`: Path or URL to the pre-trained model you want to fine-tune. Using a model pre-trained on a large dataset (like English or multilingual) is crucial. See the PaddleOCR Model List for options.
* `Global.save_model_dir`: Where to save checkpoints during training.
* `Global.epoch_num`: Number of training epochs. Start small (e.g., 10-50) for fine-tuning and increase if needed based on validation performance.
* `Optimizer.lr.learning_rate`: Learning rate. Fine-tuning often requires a smaller learning rate than training from scratch (e.g., 1e-4, 5e-5).
* `Train.dataset.data_dir`: Path to the directory containing the `images/` folder.
* `Train.dataset.label_file_list`: Path to your `train.txt`.
* `Train.loader.batch_size_per_card`: Batch size. Adjust based on GPU memory.
* `Eval.dataset.data_dir`: Path to the directory containing the `images/` folder.
* `Eval.dataset.label_file_list`: Path to your `val.txt`.
* `Eval.loader.batch_size_per_card`: Batch size for evaluation.
* `Architecture...`: Ensure the architecture matches the `pretrained_model`.
* `Loss...`: Ensure the loss function matches the `pretrained_model`.

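We'll write the full config file below, but nothing in it is set in stone: the PaddleOCR training script (cloned in Section 4) accepts `-o key=value` overrides for any of these settings, so you can experiment without editing the YAML. A quick illustrative sketch (the values are placeholders, not recommendations):

```python
# Override config values at launch instead of editing finetune_rec.yml
# (illustrative values only - tune for your own dataset)
!python paddleocr_repo/tools/train.py -c finetune_rec.yml \
    -o Global.epoch_num=20 Optimizer.lr.learning_rate=0.0001
```

First, download the pre-trained model we'll fine-tune from: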
74
-
75
- ```python
76
- # Choose a pre-trained model (check PaddleOCR docs for latest/best models)
77
- #PRETRAINED_MODEL_URL = "https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/latin_PP-OCRv4_rec_train.tar"
78
- PRETRAINED_MODEL_URL = "https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar"
79
-
80
- # Download and extract the pre-trained model
81
- !wget -q {PRETRAINED_MODEL_URL} -O pretrained_model.tar
82
- !tar -xf pretrained_model.tar
83
-
84
- # Find the actual directory name (it might vary slightly)
85
- PRETRAINED_MODEL_DIR = !find . -maxdepth 1 -type d -name '*_rec*' | head -n 1
86
- PRETRAINED_MODEL_DIR = PRETRAINED_MODEL_DIR[0]
87
- print(f"Using Pretrained Model Dir: {PRETRAINED_MODEL_DIR}")
88
- ```

Depending on how you train, you may or may not need to know how many characters are in your alphabet.

```python
num_classes = len([line for line in open("finetune_data/dict.txt", encoding="utf-8")])
num_classes
```

You need to set a maximum length for your pieces of text. If you plan ahead you can cut them up in other ways, but the easiest route is to pick the 99th or 99.9th percentile to avoid outliers. In my first test the 95th percentile was 17, the 99.9th was 41, and the absolute max was 138! It would have wasted a lot of memory and energy if we'd centered everything around 138-character words.

```python
lengths = []
with open("finetune_data/train.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) == 2:
            lengths.append(len(parts[1]))

# Basic stats
print("Max length:", max(lengths))
print("95th percentile:", sorted(lengths)[int(len(lengths) * 0.95)])
print("99th percentile:", sorted(lengths)[int(len(lengths) * 0.99)])
print("99.9th percentile:", sorted(lengths)[int(len(lengths) * 0.999)])

buffered_max_length = int(sorted(lengths)[int(len(lengths) * 0.999)] * 1.1)
buffered_max_length
```

```python
import shutil

MAX_ALLOWED = buffered_max_length
removed = 0
cleaned_lines = []

with open("finetune_data/train.txt", encoding="utf-8") as f:
    original_lines = f.readlines()

for i, line in enumerate(original_lines):
    parts = line.strip().split(maxsplit=1)
    if len(parts) == 2 and len(parts[1]) > MAX_ALLOWED:
        removed += 1
        print(f"⚠️ Line {i} exceeds max_text_length: {len(parts[1])} chars: {parts[1]}")
    else:
        cleaned_lines.append(line)

if removed > 0:
    print(f"Removed {removed} of {len(original_lines)}. Backing up original, writing clean copy.")
    shutil.copy("finetune_data/train.txt", "finetune_data/train_backup.txt")

    with open("finetune_data/train.txt", "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)
else:
    print("Found 0 long lines")
```

You'll also notice it catches a lot of "Sorry, I can't process the image. Please upload the image again." and the like.

**And now it's configuration time!** We ignore almost all of the [suggestions from PaddleOCR's documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/model_train/finetune.html) because for some reason they get me ~40% while copying the [PPOCRv3 yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) gets me up to ~80%.

This creates a `finetune_rec.yml` file that controls how the training process will go.

```python
yaml_content = f"""
Global:
  use_gpu: true
  epoch_num: 120
  log_smooth_window: 20
  print_batch_step: 50
  save_model_dir: ./output/finetune_rec/
  save_epoch_step: 5
  eval_batch_step: [0, 200] # Evaluate every 200 steps
  cal_metric_during_train: true
  pretrained_model: {PRETRAINED_MODEL_DIR}/best_accuracy
  checkpoints: null
  save_inference_dir: null
  use_visualdl: false
  infer_img: doc/imgs_words/en/word_1.png
  character_dict_path: finetune_data/dict.txt
  max_text_length: {buffered_max_length}
  infer_mode: false
  use_space_char: true
  save_res_path: ./output/rec/predicts_rec.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.00005
    warmup_epoch: 3
  regularizer:
    name: L2
    factor: 0.00005

Architecture:
  model_type: rec
  algorithm: SVTR_LCNet
  Transform: null
  Backbone:
    name: MobileNetV1Enhance
    scale: 0.5
    last_conv_stride: [1, 2]
    last_pool_type: avg
    last_pool_kernel_size: [2, 2]
  Head:
    name: MultiHead
    head_list:
      - CTCHead:
          Neck:
            name: svtr
            dims: 64
            depth: 2
            hidden_dims: 120
            use_guide: True
          Head:
            fc_decay: 0.00001
      - SARHead:
          enc_dim: 512
          max_text_length: {buffered_max_length}

Loss:
  name: MultiLoss
  loss_config_list:
    - CTCLoss:
    - SARLoss:

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  ignore_space: false

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./finetune_data/
    label_file_list: ["./finetune_data/train.txt"]
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: False
      - MultiLabelEncode:
      - SVTRRecResizeImg:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label_ctc", "label_sar", "length", "valid_ratio"]
  loader:
    shuffle: true
    batch_size_per_card: 64
    drop_last: true
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./finetune_data/
    label_file_list: ["./finetune_data/val.txt"]
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: False
      - MultiLabelEncode:
      - SVTRRecResizeImg:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label_ctc", "label_sar", "length", "valid_ratio"]
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 64
    num_workers: 4
"""

with open("finetune_rec.yml", "w", encoding="utf-8") as fp:
    fp.write(yaml_content)
```
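
Before moving on, it's worth parsing the file back to catch indentation mistakes early. A quick sanity check using PyYAML, which should already be present as a PaddleOCR dependency:

```python
import yaml

# Re-parse the generated config to catch YAML errors before training
with open("finetune_rec.yml", encoding="utf-8") as fp:
    cfg = yaml.safe_load(fp)

print("Sections:", list(cfg.keys()))
print("max_text_length:", cfg["Global"]["max_text_length"])
```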

## 4. Clone PaddleOCR Repository and Start Training

We need the PaddleOCR repository for its training scripts. Once we have it we'll point it at our `finetune_rec.yml` and set it in action.

```python
# Clone the PaddleOCR repository (using main branch)
!git clone https://github.com/PaddlePaddle/PaddleOCR.git --depth 1 paddleocr_repo
```

```python
# Remove any existing trained model
!rm -rf output

# Start training!
# -c points to our config file (written to the current working directory above)
# -o overrides specific config options if needed (e.g., Global.epoch_num=10)
!python paddleocr_repo/tools/train.py -c finetune_rec.yml
```

Training will begin, printing logs and saving checkpoints to the directory specified in `Global.save_model_dir` (`./output/finetune_rec/` in the example). Monitor the accuracy (`acc`) and loss on the training and validation sets. You can stop training early if validation accuracy plateaus or starts to decrease.
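
If a Colab session dies mid-run, you don't have to start over: training can resume from the last saved checkpoint via `Global.checkpoints`. A sketch (the checkpoint filename depends on `save_epoch_step`, so check your `output/finetune_rec/` directory for the actual names):

```python
# Resume training from the most recent checkpoint instead of starting over
# (checkpoint path is an example - check ./output/finetune_rec/ for actual files)
!python paddleocr_repo/tools/train.py -c finetune_rec.yml \
    -o Global.checkpoints=./output/finetune_rec/latest
```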

## 5. Export Best Model for Inference

Once training is complete, find the best checkpoint (usually named `best_accuracy.pdparams`) in the output directory and convert it into an inference model. PaddleOCR always saves the best checkpoint under the same name, so the path below should just work.

```python
# The best checkpoint (path relative to the current working directory)
BEST_MODEL_PATH = "output/finetune_rec/best_accuracy"

# Export the model for inference
!python paddleocr_repo/tools/export_model.py \
    -c finetune_rec.yml \
    -o Global.pretrained_model="{BEST_MODEL_PATH}" \
    Global.save_inference_dir="inference_model"
```

This will create an `inference_model` directory containing `inference.pdmodel`, `inference.pdiparams`, and potentially other files needed for deployment.
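
You can verify the export landed where expected:

```python
# Confirm the exported inference files exist
!ls inference_model
```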

## 6. Test Inference (Optional)

You can use the exported inference model to predict text on new images.

```python
from paddleocr import PaddleOCR
from IPython.display import Image, display
import os
import random

ocr = PaddleOCR(
    use_angle_cls=False,
    lang='en',
    rec_model_dir='inference_model',
    rec_algorithm='SVTR_LCNet',
    rec_image_shape='3,48,320',
    rec_char_dict_path='finetune_data/dict.txt',
    use_gpu=True
)

# Pick one random image from val.txt
with open("finetune_data/val.txt", encoding="utf-8") as f:
    line = random.choice([l.strip() for l in f if l.strip()])
img_path, ground_truth = line.split(maxsplit=1)

# Image paths in val.txt are relative to the dataset directory
img_path = os.path.join("finetune_data", img_path)

# Run inference (recognition only, no detection)
result = ocr.ocr(img_path, det=False)
prediction = result[0][0][0] if result else '[No result]'

# Display
display(Image(filename=img_path))
print(f"GT: {ground_truth}")
print(f"Pred: {prediction}")
```

Compare the predicted text with the ground truth in your label file.
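
To get a rough number instead of eyeballing single samples, you can score exact matches over a slice of the validation set. A minimal sketch, under the same path assumptions as above:

```python
import os

# Score exact-match accuracy on a sample of validation lines
with open("finetune_data/val.txt", encoding="utf-8") as f:
    samples = [l.strip().split(maxsplit=1) for l in f if l.strip()][:100]

correct = 0
for rel_path, gt in samples:
    res = ocr.ocr(os.path.join("finetune_data", rel_path), det=False)
    pred = res[0][0][0] if res else ""
    correct += (pred == gt)

print(f"Exact match: {correct}/{len(samples)} = {correct / len(samples):.1%}")
```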

## 7. Package and Distribute Your Model

Once you have successfully fine-tuned and tested your model, you'll want to package it for easy distribution and use. A properly packaged model should include all necessary files to use it with Natural PDF:

```python
import shutil
import os

# Create a distribution directory
dist_dir = "my_paddleocr_model_distribution"
os.makedirs(dist_dir, exist_ok=True)

# Copy the inference model
shutil.copytree("inference_model", os.path.join(dist_dir, "inference_model"))

# Copy the dictionary file (critical for text recognition)
shutil.copy("finetune_data/dict.txt", os.path.join(dist_dir, "dict.txt"))

# Create a simple README
with open(os.path.join(dist_dir, "README.md"), "w") as f:
    f.write("""# Custom PaddleOCR Model

## Model Information
- Trained for: [describe your document type/language]
- Base model: [e.g., "PaddleOCR v3 Latin"]
- Training date: [date]
- Epochs trained: [number of epochs]
- Final accuracy: [accuracy percentage]

## Usage with Natural PDF

    from natural_pdf import PDF
    from natural_pdf.ocr import PaddleOCROptions

    # Configure OCR with this model
    paddle_opts = PaddleOCROptions(
        rec_model_dir="path/to/inference_model",
        rec_char_dict_path="path/to/dict.txt",
    )

    # Use in your PDF processing
    pdf = PDF("your-document.pdf")
    page = pdf.pages[0]
    ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
""")

# Zip everything up
shutil.make_archive(dist_dir, 'zip', dist_dir)
print(f"Model distribution package created: {dist_dir}.zip")
```

### Essential Components

Your distribution package must include:

1. **Inference Model Directory**: Contains the trained model files (`inference.pdmodel`, `inference.pdiparams`, etc.)
2. **Character Dictionary**: The `dict.txt` file used during training that maps character IDs to actual characters
3. **Documentation**: A README with usage instructions and model information

### Usage Notes

When sharing your model with others, advise them to:

1. Extract all files while maintaining the directory structure
2. Use the `PaddleOCROptions` class to configure Natural PDF with the model paths
3. Understand model limitations (specific languages, document types, etc.)

You now have a fine-tuned PaddleOCR recognition model tailored to your data! The model can be distributed and used to improve OCR accuracy on similar documents in your application.

---