rostaing-ocr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rostaing_ocr/__init__.py
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
import os
|
2
|
+
import fitz # PyMuPDF
|
3
|
+
import pytesseract
|
4
|
+
from markitdown import MarkItDown
|
5
|
+
import io
|
6
|
+
from PIL import Image
|
7
|
+
# Note: 'ImageOps' was unused, so it was removed to keep the imports clean.
|
8
|
+
from typing import List, Union, Optional, Dict, Tuple # Tuple n'est plus nécessaire mais bonne pratique à connaître
|
9
|
+
|
10
|
+
class RostaingOCR:
    """
    Convert images and scanned PDFs to text and save it as .txt and .md.

    Every input file is rendered page by page, converted to grayscale, run
    through Tesseract to build a temporary searchable PDF, and that PDF is
    then handed to MarkItDown to obtain the Markdown text. The whole
    workflow is executed from the constructor.

    Usage:
        # Default behaviour (save only)
        extractor = RostaingOCR("fichier.pdf", output_basename="resultat")

        # Save AND print to the console
        extractor = RostaingOCR("fichier.pdf", output_basename="resultat", print_to_console=True)
    """

    # Languages used when the caller does not specify any.
    _DEFAULT_LANGUAGES = ('fra', 'eng')

    def __init__(self,
                 input_path_or_paths: Union[str, List[str]],
                 output_basename: str = "output",
                 print_to_console: bool = False,
                 languages: Optional[List[str]] = None,
                 tesseract_cmd: Optional[str] = None):
        """
        Initialise the extractor AND run the complete extraction process.

        Args:
            input_path_or_paths (Union[str, List[str]]):
                Path to a single source file or an iterable of paths.
                A single os.PathLike object is accepted as well.
            output_basename (str):
                Base name for the output files (without extension).
                Produces '{output_basename}.txt' and '{output_basename}.md'.
            print_to_console (bool):
                If True, the extracted Markdown is also printed to the console.
            languages (Optional[List[str]]):
                Tesseract language codes. None or an empty list selects
                ['fra', 'eng']; a single string is treated as one language.
            tesseract_cmd (Optional[str]):
                Path to the Tesseract executable.

        Raises:
            FileNotFoundError: If any of the input paths does not exist.
        """
        # --- 1. Configuration ---
        self.input_paths = self._normalize_input_paths(input_path_or_paths)

        self.output_basename = output_basename
        self.output_txt_path = f"{output_basename}.txt"
        self.output_md_path = f"{output_basename}.md"

        self.print_to_console = print_to_console
        self.tesseract_lang_string = self._build_lang_string(languages)
        self.md_converter = MarkItDown()
        # Maps each input path to its extracted Markdown (None on failure).
        self.results: Dict[str, Optional[str]] = {}

        for path in self.input_paths:
            if not os.path.exists(path):
                raise FileNotFoundError(f"The specified input file does not exist: {path}")

        if tesseract_cmd:
            # pytesseract keeps the executable path as module-level state.
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        print("RostaingOCR initialized. Starting processing...")
        print(f"Output files: '{self.output_txt_path}' and '{self.output_md_path}'")
        if self.print_to_console:
            print("Console display: Enabled")

        # --- 2. Run the processing immediately ---
        self._run_extraction_workflow()

        print("\nProcessing complete.")

    @staticmethod
    def _normalize_input_paths(input_path_or_paths) -> list:
        """(Private) Return the inputs as a list, wrapping a single path."""
        if isinstance(input_path_or_paths, (str, os.PathLike)):
            return [input_path_or_paths]
        # Copy so generators/tuples can be iterated several times and measured.
        return list(input_path_or_paths)

    @classmethod
    def _build_lang_string(cls, languages) -> str:
        """(Private) Build the '+'-separated language string passed to Tesseract."""
        if not languages:
            languages = cls._DEFAULT_LANGUAGES
        elif isinstance(languages, str):
            # Joining a bare string would interleave '+' between its characters.
            languages = [languages]
        return '+'.join(languages)

    def _run_extraction_workflow(self):
        """(Private) Drive the workflow for all input files and save the result."""
        all_final_content = []
        total_files = len(self.input_paths)

        for i, file_path in enumerate(self.input_paths):
            file_name = os.path.basename(file_path)
            print(f"\n--- Processing {file_name} ({i+1}/{total_files}) ---")

            # The extraction returns a single string (the Markdown) or None.
            extracted_content = self._extract_text_from_single_file(file_path)

            self.results[file_path] = extracted_content

            if extracted_content:
                # Prefix a title so files stay distinguishable once consolidated.
                content_with_header = f"# Content from : {file_name}\n\n{extracted_content}"
                all_final_content.append(content_with_header)
                print(f"--- SUCCESS for '{file_name}' ---")

                # Immediate console display for this file.
                if self.print_to_console:
                    print("\n" + "="*20 + f" CONTENT OF {file_name} " + "="*20)
                    print(extracted_content)
                    print("="* (42 + len(file_name)) + "\n")
            else:
                print(f"--- FAILED for '{file_name}' ---")

        if all_final_content:
            # Consolidate the content of every successfully processed file.
            final_output_string = "\n\n---\n\n".join(all_final_content)
            self._save_outputs(final_output_string)

    def _extract_text_from_single_file(self, input_path: str) -> Optional[str]:
        """
        (Private) Process one file and return its Markdown content.

        Returns None when the searchable PDF could not be built or when
        MarkItDown raised during conversion. The temporary PDF is removed
        in every case.
        """
        searchable_pdf_path = self._convert_to_searchable_pdf(input_path)

        if not searchable_pdf_path:
            return None

        extracted_content = None
        try:
            print("\n[Step 2/3] Extracting text and converting to Markdown...")
            result = self.md_converter.convert(searchable_pdf_path)

            # .text_content holds the Markdown produced by MarkItDown.
            extracted_content = result.text_content

            print(" - Extraction and conversion successful.")

        except Exception as e:
            print(f" - ERROR during extraction with MarkItDown : {e}")
        finally:
            print("[Cleanup] Deleting temporary file...")
            if os.path.exists(searchable_pdf_path):
                try:
                    os.remove(searchable_pdf_path)
                    print(f" - '{searchable_pdf_path}' supprimé.")
                except OSError as e:
                    # A cleanup failure must not discard the extracted text.
                    print(f" - Warning: could not delete '{searchable_pdf_path}' : {e}")

        return extracted_content

    def _preprocess_image(self, image: "Image.Image") -> "Image.Image":
        """(Private) Preprocess an image for OCR by converting it to grayscale."""
        return image.convert('L')

    def _convert_to_searchable_pdf(self, input_path: str) -> Optional[str]:
        """
        (Private) Convert a file into a temporary searchable PDF.

        The file is first processed with fitz; if that fails for any reason
        the image is opened with Pillow instead.

        Returns:
            The path of the temporary PDF, or None when no page could be
            generated. The caller is responsible for deleting the file.
        """
        # Local import: a unique temp file avoids overwriting (and later
        # deleting) a user's file named '<base>_temp_searchable.pdf'.
        import tempfile

        print("[Step 1/3] Converting to searchable PDF...")
        output_pdf = fitz.open()

        try:
            try:
                self._append_pages_with_fitz(input_path, output_pdf)
            except Exception as e:
                print(f" - Warning: fitz could not process '{input_path}' ({e}). Attempting with Pillow...")
                # Discard pages added before the failure so the fallback
                # cannot produce duplicated content.
                output_pdf.close()
                output_pdf = fitz.open()
                try:
                    self._append_page_with_pillow(input_path, output_pdf)
                except Exception as e2:
                    print(f" - FATAL ERROR: Could not process file '{input_path}'. Error : {e2}")
                    return None

            if len(output_pdf) == 0:
                print(" - ERROR: No pages were generated.")
                return None

            fd, temp_output_path = tempfile.mkstemp(suffix="_temp_searchable.pdf")
            os.close(fd)  # fitz writes by path; only the reserved name is needed
            try:
                output_pdf.save(temp_output_path, garbage=4, deflate=True, clean=True)
            except Exception:
                # Do not leave an empty temp file behind when saving fails.
                os.remove(temp_output_path)
                raise
            print(f" - Temporary searchable PDF created : '{temp_output_path}'")
            return temp_output_path
        finally:
            output_pdf.close()

    def _append_pages_with_fitz(self, input_path: str, output_pdf: "fitz.Document") -> None:
        """(Private) OCR every page of a fitz-readable document into output_pdf."""
        with fitz.open(input_path) as input_doc:
            page_count = len(input_doc)
            for i, page in enumerate(input_doc):
                print(f" - Processing page {i+1}/{page_count}...")
                pix = page.get_pixmap(dpi=300)
                img_bytes = pix.tobytes("png")

                with Image.open(io.BytesIO(img_bytes)) as img:
                    preprocessed_img = self._preprocess_image(img)

                result = pytesseract.image_to_pdf_or_hocr(preprocessed_img, lang=self.tesseract_lang_string, extension='pdf')

                with fitz.open("pdf", result) as ocr_pdf:
                    new_page = output_pdf.new_page(width=page.rect.width, height=page.rect.height)
                    # Page image first, then the OCR page containing the text layer.
                    new_page.insert_image(page.rect, stream=img_bytes)
                    new_page.show_pdf_page(new_page.rect, ocr_pdf, 0)

    def _append_page_with_pillow(self, input_path: str, output_pdf: "fitz.Document") -> None:
        """(Private) OCR a single image opened with Pillow into output_pdf."""
        with Image.open(input_path) as img:
            # Convert to RGB first for compatibility; the same RGB copy is
            # used for the PNG below because modes such as CMYK cannot be
            # written as PNG.
            rgb_img = img.convert("RGB")
            preprocessed_img = self._preprocess_image(rgb_img)

            pdf_bytes = pytesseract.image_to_pdf_or_hocr(preprocessed_img, lang=self.tesseract_lang_string, extension='pdf')

            with fitz.open("pdf", pdf_bytes) as ocr_pdf:
                # Rebuild a PDF page carrying the visual image.
                img_as_bytes = io.BytesIO()
                rgb_img.save(img_as_bytes, format='PNG')

                page = output_pdf.new_page(width=img.width, height=img.height)
                page.insert_image(page.rect, stream=img_as_bytes.getvalue())
                page.show_pdf_page(page.rect, ocr_pdf, 0)

    def _save_outputs(self, final_content: str):
        """(Private) Write the consolidated Markdown to the .md and .txt files."""
        print("\n[Step 3/3] Saving consolidated content...")

        for path in [self.output_md_path, self.output_txt_path]:
            try:
                # Allow basenames such as 'out/report' whose folder is missing.
                parent_dir = os.path.dirname(path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(final_content)
                print(f" - Successfully saved to '{path}'.")
            except OSError as e:
                print(f" - ERROR: Could not write to file '{path}'. Error : {e}")

    def __str__(self) -> str:
        """Return a textual summary of the results, one entry per input file."""
        summary_lines = ["--- RostaingOCR Extraction Summary ---"]
        summary_lines.append(f"Generated output files: '{self.output_txt_path}' et '{self.output_md_path}'")

        for file_path, text_content in self.results.items():
            status = "✅ Success" if text_content else "❌ Failure"
            line = f"\n - Processed file : {os.path.basename(file_path)}\n"
            line += f" Status : {status}"
            summary_lines.append(line)
        return "\n".join(summary_lines)
|
@@ -0,0 +1,143 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: rostaing-ocr
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Un outil OCR pour extraire du texte d'images et de PDF en utilisant Tesseract et PyMuPDF.
|
5
|
+
Author-email: Davila Rostaing <davilarostaing@gmail.com>
|
6
|
+
Project-URL: Homepage, https://github.com/Rostaing/rostaing-ocr
|
7
|
+
Project-URL: Bug Tracker, https://github.com/Rostaing/rostaing-ocr/issues
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Classifier: Topic :: Text Processing
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
14
|
+
Requires-Python: >=3.7
|
15
|
+
Description-Content-Type: text/markdown
|
16
|
+
License-File: LICENSE
|
17
|
+
Requires-Dist: PyMuPDF
|
18
|
+
Requires-Dist: pytesseract
|
19
|
+
Requires-Dist: markitdown
|
20
|
+
Requires-Dist: Pillow
|
21
|
+
Dynamic: license-file
|
22
|
+
|
23
|
+
# Rostaing OCR, created by Davila Rostaing.
|
24
|
+
|
25
|
+
A simple and powerful Python tool for extracting text from images and scanned PDFs. It leverages Tesseract, PyMuPDF, and MarkItDown to produce clean plain text (`.txt`) and Markdown (`.md`) files.
|
26
|
+
|
27
|
+
## Features
|
28
|
+
|
29
|
+
- Converts images (PNG, JPG, etc.) and PDFs into text.
|
30
|
+
- Includes image preprocessing (grayscale conversion) to improve OCR accuracy.
|
31
|
+
- Processes multiple files in a single run, consolidating the output.
|
32
|
+
- Generates output in both plain text (`.txt`) and Markdown (`.md`) formats.
|
33
|
+
- Optional flag to print extracted content directly to the console for quick inspection.
|
34
|
+
|
35
|
+
## System Prerequisites: Tesseract OCR
|
36
|
+
|
37
|
+
**Important:** This package requires the **Tesseract OCR engine** to be installed on your system. `rostaing-ocr` is a Python wrapper that calls the `tesseract` command-line tool. You must install it and its language packs first.
|
38
|
+
|
39
|
+
### Windows Installation Guide (Recommended)
|
40
|
+
|
41
|
+
1. **Download the Installer**: Go to the official [Tesseract at UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki) repository. They provide the most up-to-date and reliable installers for Windows.
|
42
|
+
2. **Run the Installer**: Start the installation process.
|
43
|
+
3. **Crucial Step - Add to PATH**: On the "Select Additional Tasks" screen, make sure to check the box for **"Add Tesseract to system PATH"**. This is essential for Python to be able to find and execute Tesseract.
|
44
|
+
4. **Select Languages**: On the "Select additional language data" screen, expand the list and select the languages you will need for OCR (e.g., check `French` for `fra`, `English` is usually included by default).
|
45
|
+
5. **Complete Installation**: Finish the installation and, to be safe, restart your command prompt, terminal, or IDE to ensure the system's PATH variable is updated.
|
46
|
+
|
47
|
+
### macOS (via Homebrew)
|
48
|
+
|
49
|
+
```bash
|
50
|
+
brew install tesseract
|
51
|
+
```
|
52
|
+
You can add language packs by installing `tesseract-lang`.
|
53
|
+
|
54
|
+
### Linux (Debian/Ubuntu)
|
55
|
+
|
56
|
+
```bash
|
57
|
+
sudo apt update
|
58
|
+
sudo apt install tesseract-ocr
|
59
|
+
|
60
|
+
# Also install the language packs you need. For French:
|
61
|
+
sudo apt install tesseract-ocr-fra
|
62
|
+
```
|
63
|
+
|
64
|
+
## Installation
|
65
|
+
|
66
|
+
### Best Practice: Use a Virtual Environment
|
67
|
+
|
68
|
+
To keep project dependencies isolated and avoid conflicts with other Python projects on your system, it is highly recommended to use a virtual environment.
|
69
|
+
|
70
|
+
With a standard Python installation, you can create and activate a new environment using the following commands:
|
71
|
+
|
72
|
+
**On macOS/Linux:**
|
73
|
+
```bash
|
74
|
+
# Create an environment named '.venv' in your project directory
|
75
|
+
python3 -m venv .venv
|
76
|
+
|
77
|
+
# Activate the environment
|
78
|
+
source .venv/bin/activate
|
79
|
+
```
|
80
|
+
|
81
|
+
**On Windows:**
|
82
|
+
```bash
|
83
|
+
# Create an environment named '.venv' in your project directory
|
84
|
+
python -m venv .venv
|
85
|
+
|
86
|
+
# Activate the environment
|
87
|
+
.venv\Scripts\activate
|
88
|
+
```
|
89
|
+
|
90
|
+
### Install the Package
|
91
|
+
|
92
|
+
Once Tesseract is set up and your virtual environment is activated, you can install the package from PyPI:
|
93
|
+
|
94
|
+
```bash
|
95
|
+
pip install rostaing-ocr
|
96
|
+
```
|
97
|
+
|
98
|
+
## Usage
|
99
|
+
|
100
|
+
Here is a basic example of how to use the `RostaingOCR` class.
|
101
|
+
|
102
|
+
```python
|
103
|
+
from rostaing_ocr import RostaingOCR
|
104
|
+
|
105
|
+
# --- Example 1: Process a single file ---
|
106
|
+
# This will create 'my_result.txt' and 'my_result.md' in the current directory.
|
107
|
+
extractor = RostaingOCR(
|
108
|
+
input_path_or_paths="path/to/my_document.pdf",
|
109
|
+
output_basename="my_result", # Optional
|
110
|
+
print_to_console=True # Optional
|
111
|
+
)
|
112
|
+
|
113
|
+
# --- Example 2: Process multiple files and print to console ---
|
114
|
+
# This will process both files, save a consolidated output, and also print the results.
|
115
|
+
multi_extractor = RostaingOCR(
|
116
|
+
input_path_or_paths=["document1.png", "scan_page_2.pdf"],
|
117
|
+
output_basename="combined_report", # Optional
|
118
|
+
print_to_console=True, # Optional
|
119
|
+
languages=['fra', 'eng'] # Optional: languages for Tesseract
|
120
|
+
)
|
121
|
+
|
122
|
+
# You can print the object to get a summary of the operation.
|
123
|
+
print(multi_extractor)
|
124
|
+
```
|
125
|
+
|
126
|
+
## Application for LLM and RAG Pipelines
|
127
|
+
|
128
|
+
Large Language Models (LLMs) like GPT-4 or Llama understand text, not images or scanned documents. A vast amount of valuable knowledge is locked away in unstructured formats such as PDFs of research papers, scanned invoices, or legal contracts.
|
129
|
+
|
130
|
+
**`Rostaing OCR` serves as the crucial first step in any data ingestion pipeline for Retrieval-Augmented Generation (RAG) systems.** It bridges the gap by converting this inaccessible visual data into clean, structured text that LLMs can process.
|
131
|
+
|
132
|
+
By using `Rostaing OCR`, you can automate the process of building a knowledge base from your documents:
|
133
|
+
|
134
|
+
1. **Input**: A directory of `Scanned PDFs` or `Images`.
|
135
|
+
2. **Extraction (Rostaing OCR)**: Convert all documents into clean `Markdown/Text`.
|
136
|
+
3. **Processing**: The text output can be fed into text splitters and then embedding models.
|
137
|
+
4. **Indexing**: The resulting vectors are stored in a vector database (e.g., Chroma, Pinecone, FAISS) for efficient retrieval.
|
138
|
+
|
139
|
+
In short, `Rostaing OCR` unlocks your documents, making them ready for any modern AI stack.
|
140
|
+
|
141
|
+
## License
|
142
|
+
|
143
|
+
This project is licensed under the MIT License. See the `LICENSE` file for more details.
|
@@ -0,0 +1,7 @@
|
|
1
|
+
rostaing_ocr/__init__.py,sha256=HBwHqAcgfbxYvLsLkIiei-HEgXZrcHXM675VrLjO2fc,196
|
2
|
+
rostaing_ocr/rostaing_ocr.py,sha256=xp7aP0MiL4vS0KzztPnDCnAsYsPzZ6ZBWZxRnNAEgIs,10746
|
3
|
+
rostaing_ocr-0.1.0.dist-info/licenses/LICENSE,sha256=-mnJp92iFU5D7XvCVxTbqiHQBhIo_LKsH9yaMaPymoY,1087
|
4
|
+
rostaing_ocr-0.1.0.dist-info/METADATA,sha256=YocIRat6BHwcZPAB_Qk92Gep6Kl_SLPVFRb34Oeb-m0,6036
|
5
|
+
rostaing_ocr-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
6
|
+
rostaing_ocr-0.1.0.dist-info/top_level.txt,sha256=2MSL7KmO9dgjq622ZnYtXoYIxsa1VVm0ZAlrj3RKPCg,13
|
7
|
+
rostaing_ocr-0.1.0.dist-info/RECORD,,
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 RostaingOCR
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1 @@
|
|
1
|
+
rostaing_ocr
|