natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/visual-debugging/index.md
DELETED
@@ -1,157 +0,0 @@
# Visual Debugging

Sometimes it's hard to understand what's happening when working with PDFs. Natural PDF provides powerful visual debugging tools to help you see what you're extracting.

## Adding Persistent Highlights

Use the `.highlight()` method on `Element` or `ElementCollection` objects to add persistent highlights to a page. These highlights are stored and will appear when viewing the page later.

```python
from natural_pdf import PDF

pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
page = pdf.pages[0]

# Find a specific element and add a persistent highlight
page.find_all('text:contains("Summary")').highlight()
page.find_all('text:contains("Date")').highlight()
page.find_all('line').highlight()
page.to_image(width=700)
```

## Customizing Persistent Highlights

Customize the appearance of persistent highlights added with `.highlight()`:

```python
page.clear_highlights()

title = page.find('text:bold[size>=12]')

# Highlight with a specific color (string name, hex, or RGB/RGBA tuple)
# title.highlight(color=(1, 0, 0, 0.3)) # Red with 30% opacity
# title.highlight(color="#FF0000") # Hex color
title.highlight(color="red") # Color name

text = page.find('text:contains("Critical")')

# Add a label to the highlight (appears in legend)
text.highlight(label="Critical")

# Combine color and label
rect = page.find('rect')
rect.highlight(color=(0, 0, 1, 0.2), label="Box")

page.to_image(width=700)
```

## Highlighting Multiple Elements

Highlighting an `ElementCollection` applies the highlight to all elements within it. By default, all elements in the collection get the same color and a label based on their type.

```python
# Find and highlight all headings with a single color/label
headings = page.find_all('text[size>=14]:bold')
headings.highlight(color=(0, 0.5, 0, 0.3), label="Headings")

# Find and highlight all tables
tables = page.find_all('region[type=table]')
tables.highlight(color=(0, 0, 1, 0.2), label="Tables")

# View the result
page.viewer()
```

## Highlighting Regions

You can highlight regions to see what area you're working with:

```python
# Find a title and create a region below it
title = page.find('text:contains("Violations")')
content = title.below(height=200)

# Highlight the region
content.show()
```

Or look at just the region by itself:

```python
# Find a title and create a region below it
title = page.find('text:contains("Violations")')
content = title.below(height=200)

# Crop to the region
content.to_image(crop_only=True, include_highlights=False)
```

## Working with Text Styles

Visualize text styles to understand the document structure:

```python
# Analyze and highlight text styles
page.clear_highlights()

page.analyze_text_styles()
page.find_all('text').highlight(group_by='style_label')

page.to_image(width=700)
```

## Displaying Attributes

You can display element attributes directly on the highlights:

```python
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf")
page = pdf.pages[0]

text = page.find_all('line')
text.highlight(include_attrs=['width', 'color'])

page.to_image(width=700)
```

Does it get busy? YES.

## Clearing Highlights

You can clear persistent highlights from a page:

```python
# Clear all highlights on the page
page.clear_highlights()

# Apply new highlights
page.find_all('text:bold').highlight(label="Bold Text")
page.viewer()
```

## Document QA Visualization

Visualize document QA results:

```python
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42007.pdf")
page = pdf.pages[0]
page.to_image(width=700)
```

```python
response = page.ask("How many votes did Kamala Harris get on Election Day?")
response
```

```python
response['source_elements'].show()
```

## Next Steps

Now that you know how to visualize PDF content, you might want to explore:

- [OCR capabilities](../ocr/index.md) for working with scanned documents
- [Layout analysis](../layout-analysis/index.ipynb) for automatic structure detection
- [Document QA](../document-qa/index.ipynb) for asking questions directly to your documents
docs/visual-debugging/region.png
DELETED
Binary file
natural_pdf/templates/finetune/fine_tune_paddleocr.md
DELETED
@@ -1,415 +0,0 @@
# Fine-tuning a PaddleOCR Recognition Model with Your Exported Data

This notebook guides you through fine-tuning a PaddleOCR text recognition model using the dataset you exported from `natural-pdf`.

**Goal:** Improve OCR accuracy on your specific documents (e.g., handle unique fonts, languages, or styles).

**Environment:** This notebook is designed to run on Google Colab with a GPU runtime.

## 1. Setup Environment

First, let's install the necessary libraries: PaddlePaddle (GPU version) and PaddleOCR.

```python
# Check GPU availability (Recommended: Select Runtime -> Change runtime type -> GPU)
!nvidia-smi
```

```python
# Install PaddlePaddle GPU version
# Visit https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/install/pip/linux-pip_en.html
# for the correct command based on your CUDA version.
# CUDA versions are backwards-compatible, so you don't have to worry about an exact match.
# I mostly just go to https://www.paddlepaddle.org.cn/packages/stable/
# and see what the most recent version that kinda matches mine is
# e.g. Colab is CUDA 12.4, there's a "123" directory, I use that.
!pip install --quiet paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/

# Install PaddleOCR and its dependencies
!pip install --quiet paddleocr
```

```python
# Verify PaddlePaddle installation and GPU detection
import paddle
print("PaddlePaddle version:", paddle.__version__)
print("GPU available:", paddle.device.is_compiled_with_cuda())
if paddle.device.is_compiled_with_cuda():
    print("Number of GPUs:", paddle.device.cuda.device_count())
    print("Current GPU:", paddle.device.get_device())
```

## 2. Upload and Unzip Your Dataset

Use the file browser on the left panel of Colab to upload the `.zip` file you created using the `PaddleOCRRecognitionExporter`. Then, unzip it.

```python
# Replace 'your_exported_data.zip' with the actual filename you uploaded
!unzip -q your_exported_data.zip -d finetune_data

# List the contents to verify
!ls finetune_data
```

You should see `images/`, `dict.txt`, `train.txt`, and `val.txt` (or `label.txt`) inside the `finetune_data` directory.
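
Before going further, it's worth a quick sanity check that the export is complete. This is a minimal sketch using only the standard library, assuming the `finetune_data/` layout produced by the unzip step above:

```python
import os

# Count label lines and images in the exported dataset
base = "finetune_data"
for name in ["dict.txt", "train.txt", "val.txt"]:
    path = os.path.join(base, name)
    if os.path.exists(path):
        with open(path, encoding="utf-8") as f:
            print(f"{name}: {sum(1 for _ in f)} lines")
    else:
        print(f"{name}: MISSING")

image_dir = os.path.join(base, "images")
print("images/:", len(os.listdir(image_dir)) if os.path.isdir(image_dir) else "MISSING")
```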
## 3. Prepare Training Configuration

PaddleOCR uses YAML files for configuration. We'll create one based on a standard recognition config, modified for fine-tuning with our dataset.

**Key Parameters to potentially adjust:**

* `Global.pretrained_model`: Path or URL to the pre-trained model you want to fine-tune. Using a model pre-trained on a large dataset (like English or multilingual) is crucial. See the PaddleOCR Model List for options.
* `Global.save_model_dir`: Where to save checkpoints during training.
* `Global.epoch_num`: Number of training epochs. Start small (e.g., 10-50) for fine-tuning and increase if needed based on validation performance.
* `Optimizer.lr.learning_rate`: Learning rate. Fine-tuning often requires a smaller learning rate than training from scratch (e.g., 1e-4, 5e-5).
* `Train.dataset.data_dir`: Path to the directory containing the `images/` folder.
* `Train.dataset.label_file_list`: Path to your `train.txt`.
* `Train.loader.batch_size_per_card`: Batch size. Adjust based on GPU memory.
* `Eval.dataset.data_dir`: Path to the directory containing the `images/` folder.
* `Eval.dataset.label_file_list`: Path to your `val.txt`.
* `Eval.loader.batch_size_per_card`: Batch size for evaluation.
* `Architecture...`: Ensure the architecture matches the `pretrained_model`.
* `Loss...`: Ensure the loss function matches the `pretrained_model`.

```python
# Choose a pre-trained model (check PaddleOCR docs for latest/best models)
#PRETRAINED_MODEL_URL = "https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/latin_PP-OCRv4_rec_train.tar"
PRETRAINED_MODEL_URL = "https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar"

# Download and extract the pre-trained model
!wget -q {PRETRAINED_MODEL_URL} -O pretrained_model.tar
!tar -xf pretrained_model.tar

# Find the actual directory name (it might vary slightly)
PRETRAINED_MODEL_DIR = !find . -maxdepth 1 -type d -name '*_rec*' | head -n 1
PRETRAINED_MODEL_DIR = PRETRAINED_MODEL_DIR[0]
print(f"Using Pretrained Model Dir: {PRETRAINED_MODEL_DIR}")
```

Depending on how you train, you may or may not need to know how many characters are in your alphabet.

```python
num_classes = len([line for line in open("finetune_data/dict.txt", encoding="utf-8")])
num_classes
```

You need to set a maximum length for your pieces of text – if you plan ahead you can cut them up in other ways, but the easiest route is to pick the 99th or 99.9th percentile to avoid outliers. In my first test the 95th percentile was 17, the 99.9th was 41, and the absolute max was 138! It would have wasted a lot of memory and energy if we'd centered everything around 138-character words.

```python
lengths = []
with open("finetune_data/train.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) == 2:
            lengths.append(len(parts[1]))

# Basic stats
print("Max length:", max(lengths))
print("95th percentile:", sorted(lengths)[int(len(lengths) * 0.95)])
print("99th percentile:", sorted(lengths)[int(len(lengths) * 0.99)])
print("99.9th percentile:", sorted(lengths)[int(len(lengths) * 0.999)])

buffered_max_length = int(sorted(lengths)[int(len(lengths) * 0.999)] * 1.1)
buffered_max_length
```

```python
import shutil

MAX_ALLOWED = buffered_max_length
removed = 0
cleaned_lines = []

with open("finetune_data/train.txt", encoding="utf-8") as f:
    original_lines = f.readlines()

for i, line in enumerate(original_lines):
    parts = line.strip().split(maxsplit=1)
    if len(parts) == 2 and len(parts[1]) > MAX_ALLOWED:
        removed += 1
        print(f"⚠️ Line {i} exceeds max_text_length: {len(parts[1])} chars: {parts[1]}")
    else:
        cleaned_lines.append(line)

if removed > 0:
    print(f"Removed {removed} of {len(original_lines)}. Backing up original, writing clean copy.")
    shutil.copy("finetune_data/train.txt", "finetune_data/train_backup.txt")

    with open("finetune_data/train.txt", "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)
else:
    print("Found 0 long lines")
```

You'll also notice it catches a lot of lines like "Sorry, I can't process the image. Please upload the image again." and similar junk.

**And now it's configuration time!** We ignore almost all of the [suggestions from PaddleOCR's documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/model_train/finetune.html) because for some reason they get me ~40% accuracy, while copying the [PP-OCRv3 yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) gets me up to ~80%.

This creates a `finetune_rec.yml` file that controls how the training process will go.

```python
yaml_content = f"""
Global:
  use_gpu: true
  epoch_num: 120
  log_smooth_window: 20
  print_batch_step: 50
  save_model_dir: ./output/finetune_rec/
  save_epoch_step: 5
  eval_batch_step: [0, 200] # Evaluate every 200 steps
  cal_metric_during_train: true
  pretrained_model: {PRETRAINED_MODEL_DIR}/best_accuracy
  checkpoints: null
  save_inference_dir: null
  use_visualdl: false
  infer_img: doc/imgs_words/en/word_1.png
  character_dict_path: finetune_data/dict.txt
  max_text_length: {buffered_max_length}
  infer_mode: false
  use_space_char: true
  save_res_path: ./output/rec/predicts_rec.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.00005
    warmup_epoch: 3
  regularizer:
    name: L2
    factor: 0.00005

Architecture:
  model_type: rec
  algorithm: SVTR_LCNet
  Transform: null
  Backbone:
    name: MobileNetV1Enhance
    scale: 0.5
    last_conv_stride: [1, 2]
    last_pool_type: avg
    last_pool_kernel_size: [2, 2]
  Head:
    name: MultiHead
    head_list:
      - CTCHead:
          Neck:
            name: svtr
            dims: 64
            depth: 2
            hidden_dims: 120
            use_guide: True
          Head:
            fc_decay: 0.00001
      - SARHead:
          enc_dim: 512
          max_text_length: {buffered_max_length}

Loss:
  name: MultiLoss
  loss_config_list:
    - CTCLoss:
    - SARLoss:

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  ignore_space: false

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./finetune_data/
    label_file_list: ["./finetune_data/train.txt"]
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: False
      - MultiLabelEncode:
      - SVTRRecResizeImg:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label_ctc", "label_sar", "length", "valid_ratio"]
  loader:
    shuffle: true
    batch_size_per_card: 64
    drop_last: true
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./finetune_data/
    label_file_list: ["./finetune_data/val.txt"]
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: False
      - MultiLabelEncode:
      - SVTRRecResizeImg:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label_ctc", "label_sar", "length", "valid_ratio"]
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 64
    num_workers: 4
"""

with open("finetune_rec.yml", "w", encoding="utf-8") as fp:
    fp.write(yaml_content)
```

## 4. Clone PaddleOCR Repository and Start Training

We need the PaddleOCR repository for its training scripts. Once we have it we'll point it at our `finetune_rec.yml` and set it in action.

```python
# Clone the PaddleOCR repository (using main branch)
!git clone https://github.com/PaddlePaddle/PaddleOCR.git --depth 1 paddleocr_repo
```

```python
# Remove any existing trained model
!rm -rf output

# Start training!
# -c points to our config file
# -o overrides specific config options if needed (e.g., Global.epoch_num=10)
!python paddleocr_repo/tools/train.py -c finetune_rec.yml
```

Training will begin, printing logs and saving checkpoints to the directory specified in `Global.save_model_dir` (`./output/finetune_rec/` in the example). Monitor the accuracy (`acc`) and loss on the training and validation sets. You can stop training early if validation accuracy plateaus or starts to decrease.
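
If the Colab session dies partway through, you don't have to start from scratch. A hedged sketch of resuming: it assumes PaddleOCR's usual checkpoint naming, where a `latest` checkpoint is written into `Global.save_model_dir`, and simply overrides `Global.checkpoints` (left as `null` in the config above).

```python
# Resume from the most recent checkpoint instead of the pretrained model
# (assumes PaddleOCR wrote a `latest` checkpoint into save_model_dir)
!python paddleocr_repo/tools/train.py -c finetune_rec.yml \
    -o Global.checkpoints=./output/finetune_rec/latest
```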
## 5. Export Best Model for Inference

Once training is complete, find the best checkpoint (usually named `best_accuracy.pdparams`) in the output directory and convert it into an inference model. The cell below points at that checkpoint and exports it.

```python
# Path to the best checkpoint saved during training (relative to the notebook's working directory)
BEST_MODEL_PATH = "output/finetune_rec/best_accuracy"

# Export the model for inference
!python paddleocr_repo/tools/export_model.py \
    -c finetune_rec.yml \
    -o Global.pretrained_model="{BEST_MODEL_PATH}" \
    Global.save_inference_dir="inference_model"
```

This will create an `inference_model` directory containing `inference.pdmodel`, `inference.pdiparams`, and potentially other files needed for deployment.
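
A quick way to confirm the export produced those files before moving on (a plain directory listing, nothing PaddleOCR-specific):

```python
import os

# List the exported inference model files and their sizes
for name in sorted(os.listdir("inference_model")):
    size_kb = os.path.getsize(os.path.join("inference_model", name)) / 1024
    print(f"{name}: {size_kb:,.0f} KB")
```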
## 6. Test Inference (Optional)

You can use the exported inference model to predict text on new images.

```python
from paddleocr import PaddleOCR
from IPython.display import Image, display
import random

ocr = PaddleOCR(
    use_angle_cls=False,
    lang='en',
    rec_model_dir='inference_model',
    rec_algorithm='SVTR_LCNet',
    rec_image_shape='3,48,320',
    rec_char_dict_path='finetune_data/dict.txt',
    use_gpu=True
)

# Pick one random image from val.txt
with open("finetune_data/val.txt", encoding="utf-8") as f:
    line = random.choice([l.strip() for l in f if l.strip()])
img_path, ground_truth = line.split(maxsplit=1)

# Run inference
result = ocr.ocr(img_path, det=False)
prediction = result[0][0][1]['text'] if result else '[No result]'

# Display
display(Image(filename=img_path))
print(f"GT: {ground_truth}")
print(f"Pred: {prediction}")
```

Compare the predicted text with the ground truth in your label file.
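
For a rough overall number rather than a single spot check, you can loop the same recognizer over the whole validation file. The sketch below reuses the `ocr` object from the cell above; the `extract_text` helper is an assumption, since the exact shape of the result varies between PaddleOCR versions (tuple vs. dict), and it runs one image at a time, so it's slow on large validation sets.

```python
def extract_text(result):
    # The rec-only result layout differs across PaddleOCR versions:
    # it may be a (text, score) tuple or a dict (assumption -- adjust as needed).
    item = result[0][0]
    if isinstance(item, dict):
        return item.get("text", "")
    return item[0]

# Read (image_path, ground_truth) pairs from the validation label file
with open("finetune_data/val.txt", encoding="utf-8") as f:
    samples = [p for p in (l.strip().split(maxsplit=1) for l in f) if len(p) == 2]

correct = 0
for img_path, ground_truth in samples:
    result = ocr.ocr(img_path, det=False)
    if result and extract_text(result) == ground_truth:
        correct += 1

print(f"Exact-match accuracy: {correct}/{len(samples)} = {correct / len(samples):.1%}")
```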
## 7. Package and Distribute Your Model

Once you have successfully fine-tuned and tested your model, you'll want to package it for easy distribution and use. A properly packaged model should include all necessary files to use it with Natural PDF:

````python
import shutil
import os

# Create a distribution directory
dist_dir = "my_paddleocr_model_distribution"
os.makedirs(dist_dir, exist_ok=True)

# Copy the inference model
shutil.copytree("inference_model", os.path.join(dist_dir, "inference_model"))

# Copy the dictionary file (critical for text recognition)
shutil.copy("finetune_data/dict.txt", os.path.join(dist_dir, "dict.txt"))

# Create a simple README
with open(os.path.join(dist_dir, "README.md"), "w") as f:
    f.write("""# Custom PaddleOCR Model

## Model Information
- Trained for: [describe your document type/language]
- Base model: [e.g., "PaddleOCR v3 Latin"]
- Training date: [date]
- Epochs trained: [number of epochs]
- Final accuracy: [accuracy percentage]

## Usage with Natural PDF

from natural_pdf import PDF
from natural_pdf.ocr import PaddleOCROptions

# Configure OCR with this model
paddle_opts = PaddleOCROptions(
    rec_model_dir="path/to/inference_model",
    rec_char_dict_path="path/to/dict.txt",
)

# Use in your PDF processing
pdf = PDF("your-document.pdf")
page = pdf.pages[0]
ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
""")

# Zip everything up
shutil.make_archive(dist_dir, 'zip', dist_dir)
print(f"Model distribution package created: {dist_dir}.zip")
````

### Essential Components

Your distribution package must include:

1. **Inference Model Directory**: Contains the trained model files (`inference.pdmodel`, `inference.pdiparams`, etc.)
2. **Character Dictionary**: The `dict.txt` file used during training that maps character IDs to actual characters
3. **Documentation**: A README with usage instructions and model information

### Usage Notes

When sharing your model with others, advise them to:

1. Extract all files while maintaining the directory structure
2. Use the `PaddleOCROptions` class to configure Natural PDF with the model paths
3. Understand model limitations (specific languages, document types, etc.)

You now have a fine-tuned PaddleOCR recognition model tailored to your data! The model can be distributed and used to improve OCR accuracy on similar documents in your application.

---