rostaing-ocr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ # File: src/rostaing_ocr/__init__.py
2
+
3
+ from .rostaing_ocr import RostaingOCR
4
+
5
+ # Optional: define what is exported by 'from rostaing_ocr import *'
6
+ __all__ = ['RostaingOCR']
@@ -0,0 +1,223 @@
1
+ import os
2
+ import fitz # PyMuPDF
3
+ import pytesseract
4
+ from markitdown import MarkItDown
5
+ import io
6
+ from PIL import Image
7
+ # Note: 'ImageOps' was unused, so it was removed to keep the imports clean.
8
+ from typing import List, Union, Optional, Dict, Tuple # Tuple n'est plus nécessaire mais bonne pratique à connaître
9
+
10
class RostaingOCR:
    """
    Extract text from images and scanned PDFs and save it as .txt and .md.

    Each input file is rendered page by page, converted to grayscale, run
    through Tesseract to build a temporary searchable PDF, and that PDF is
    converted to Markdown with MarkItDown. The whole workflow runs from the
    constructor; the per-file results are kept in ``self.results``.

    Usage:
        # Default behaviour (save only)
        extractor = RostaingOCR("file.pdf", output_basename="result")

        # Save AND print to the console
        extractor = RostaingOCR("file.pdf", output_basename="result", print_to_console=True)
    """

    # Languages used when the caller does not pass any.
    DEFAULT_LANGUAGES = ('fra', 'eng')

    def __init__(self,
                 input_path_or_paths: Union[str, List[str]],
                 output_basename: str = "output",
                 print_to_console: bool = False,
                 languages: Optional[Union[str, List[str]]] = None,
                 tesseract_cmd: Optional[str] = None):
        """
        Initialise the extractor AND run the complete extraction process.

        Args:
            input_path_or_paths (Union[str, List[str]]):
                Path to a single source file, or a list of paths.
                A single ``os.PathLike`` object is accepted as well.
            output_basename (str):
                Base name for the output files (without extension).
                Produces '{output_basename}.txt' and '{output_basename}.md'.
            print_to_console (bool):
                If True, the extracted Markdown is also printed to the console.
            languages (Optional[Union[str, List[str]]]):
                Tesseract language codes. A list is joined with '+', a string
                is used as-is, and None selects ``DEFAULT_LANGUAGES``.
            tesseract_cmd (Optional[str]):
                Path to the Tesseract executable.

        Raises:
            FileNotFoundError: If any of the input paths does not exist.
        """
        # --- 1. Configuration ---
        # A lone path (str or PathLike) is wrapped in a list; any other
        # iterable is copied so later mutation by the caller cannot affect us.
        if isinstance(input_path_or_paths, (str, os.PathLike)):
            self.input_paths = [os.fspath(input_path_or_paths)]
        else:
            self.input_paths = [os.fspath(p) for p in input_path_or_paths]

        self.output_basename = output_basename
        self.output_txt_path = f"{output_basename}.txt"
        self.output_md_path = f"{output_basename}.md"

        self.print_to_console = print_to_console
        self.tesseract_lang_string = self._join_languages(languages)
        self.md_converter = MarkItDown()
        self.results: Dict[str, Optional[str]] = {}

        for path in self.input_paths:
            if not os.path.exists(path):
                raise FileNotFoundError(f"The specified input file does not exist: {path}")

        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        print(f"RostaingOCR initialized. Starting processing...")
        print(f"Output files: '{self.output_txt_path}' and '{self.output_md_path}'")
        if self.print_to_console:
            print("Console display: Enabled")

        # --- 2. Run the processing immediately ---
        self._run_extraction_workflow()

        print("\nProcessing complete.")

    @staticmethod
    def _join_languages(languages) -> str:
        """(Private) Build the '+'-separated language string passed to Tesseract."""
        if languages is None:
            languages = RostaingOCR.DEFAULT_LANGUAGES
        if isinstance(languages, str):
            # Joining a bare string would split it into characters ("f+r+a").
            return languages
        return '+'.join(languages)

    def _run_extraction_workflow(self):
        """(Private) Process every input file, then save the consolidated output."""
        all_final_content = []
        total = len(self.input_paths)

        for i, file_path in enumerate(self.input_paths):
            name = os.path.basename(file_path)
            print(f"\n--- Processing {name} ({i+1}/{total}) ---")

            # Returns the Markdown content as a single string, or None on failure.
            extracted_content = self._extract_text_from_single_file(file_path)

            self.results[file_path] = extracted_content

            if extracted_content:
                # Add a heading so each file is identifiable in the consolidated output.
                content_with_header = f"# Content from : {name}\n\n{extracted_content}"
                all_final_content.append(content_with_header)
                print(f"--- SUCCESS for '{name}' ---")

                # Immediate console display for this file.
                if self.print_to_console:
                    banner = "="*20 + f" CONTENT OF {name} " + "="*20
                    print("\n" + banner)
                    print(extracted_content)
                    # Footer is sized from the header so both rules line up.
                    print("=" * len(banner) + "\n")
            else:
                print(f"--- FAILED for '{name}' ---")

        if all_final_content:
            # Consolidate the content of every successfully processed file.
            final_output_string = "\n\n---\n\n".join(all_final_content)
            self._save_outputs(final_output_string)

    def _extract_text_from_single_file(self, input_path: str) -> Optional[str]:
        """
        (Private) Run the pipeline for one file and return its Markdown content.

        Returns None when the searchable PDF could not be built or when the
        MarkItDown conversion raised. The temporary PDF is always removed.
        """
        searchable_pdf_path = self._convert_to_searchable_pdf(input_path)

        if not searchable_pdf_path:
            return None

        extracted_content = None
        try:
            print(f"\n[Step 2/3] Extracting text and converting to Markdown...")
            result = self.md_converter.convert(searchable_pdf_path)

            # The converted Markdown is read from the result's text_content attribute.
            extracted_content = result.text_content

            print(" - Extraction and conversion successful.")

        except Exception as e:
            # Report the failure but keep going so other files are still processed.
            print(f" - ERROR during extraction with MarkItDown : {e}")
        finally:
            print("[Cleanup] Deleting temporary file...")
            if searchable_pdf_path and os.path.exists(searchable_pdf_path):
                os.remove(searchable_pdf_path)
                print(f" - '{searchable_pdf_path}' supprimé.")

        return extracted_content

    def _preprocess_image(self, image: "Image.Image") -> "Image.Image":
        """(Private) Preprocess an image for OCR by converting it to grayscale."""
        return image.convert('L')

    def _append_pages_with_fitz(self, input_path: str, output_pdf) -> None:
        """(Private) OCR every page that fitz can open and append it to output_pdf."""
        with fitz.open(input_path) as input_doc:
            page_count = len(input_doc)
            for i, page in enumerate(input_doc):
                print(f" - Processing page {i+1}/{page_count}...")
                pix = page.get_pixmap(dpi=300)
                img_bytes = pix.tobytes("png")

                img = Image.open(io.BytesIO(img_bytes))
                preprocessed_img = self._preprocess_image(img)

                result = pytesseract.image_to_pdf_or_hocr(
                    preprocessed_img, lang=self.tesseract_lang_string, extension='pdf')

                with fitz.open("pdf", result) as ocr_pdf:
                    # Rendered page image first, then the OCR result on top of it.
                    new_page = output_pdf.new_page(width=page.rect.width, height=page.rect.height)
                    new_page.insert_image(page.rect, stream=img_bytes)
                    new_page.show_pdf_page(new_page.rect, ocr_pdf, 0)

    def _append_page_with_pillow(self, input_path: str, output_pdf) -> None:
        """(Private) OCR a single image opened with Pillow and append it to output_pdf."""
        with Image.open(input_path) as img:
            # Convert to RGB first for compatibility; the same RGB image is
            # saved as PNG below, because PNG cannot store every mode (e.g. CMYK).
            rgb_img = img.convert("RGB")
            preprocessed_img = self._preprocess_image(rgb_img)

            pdf_bytes = pytesseract.image_to_pdf_or_hocr(
                preprocessed_img, lang=self.tesseract_lang_string, extension='pdf')

            with fitz.open("pdf", pdf_bytes) as ocr_pdf:
                # Rebuild a PDF page that carries the visual image.
                img_as_bytes = io.BytesIO()
                rgb_img.save(img_as_bytes, format='PNG')

                page = output_pdf.new_page(width=img.width, height=img.height)
                page.insert_image(page.rect, stream=img_as_bytes.getvalue())
                page.show_pdf_page(page.rect, ocr_pdf, 0)

    def _convert_to_searchable_pdf(self, input_path: str) -> Optional[str]:
        """
        (Private) Convert a file into a temporary searchable PDF.

        The file is first processed through fitz; if that raises, a fresh
        attempt is made with Pillow. The result is written to a uniquely named
        file in the system temp directory so that no file in the working
        directory can be overwritten.

        Returns:
            The path of the temporary PDF, or None if no page could be produced.
        """
        import tempfile  # local import: only this method needs it

        print(f"[Step 1/3] Converting to searchable PDF...")
        base_name = os.path.splitext(os.path.basename(input_path))[0]
        output_pdf = fitz.open()

        # try/finally guarantees the in-memory document is closed on every path.
        try:
            try:
                self._append_pages_with_fitz(input_path, output_pdf)
            except Exception as e:
                print(f" - Warning: fitz could not process '{input_path}' ({e}). Attempting with Pillow...")
                # Drop any partially built pages so the fallback starts clean.
                output_pdf.close()
                output_pdf = fitz.open()
                try:
                    self._append_page_with_pillow(input_path, output_pdf)
                except Exception as e2:
                    print(f" - FATAL ERROR: Could not process file '{input_path}'. Error : {e2}")
                    return None

            if len(output_pdf) == 0:
                print(" - ERROR: No pages were generated.")
                return None

            fd, temp_output_path = tempfile.mkstemp(
                prefix=f"{base_name}_", suffix="_temp_searchable.pdf")
            os.close(fd)  # fitz writes by path; release our handle first
            try:
                output_pdf.save(temp_output_path, garbage=4, deflate=True, clean=True)
            except Exception:
                # Do not leave an empty temp file behind when saving fails.
                if os.path.exists(temp_output_path):
                    os.remove(temp_output_path)
                raise
            print(f" - Temporary searchable PDF created : '{temp_output_path}'")
            return temp_output_path
        finally:
            output_pdf.close()

    def _save_outputs(self, final_content: str):
        """(Private) Write the consolidated Markdown content to the .md and .txt files."""
        print(f"\n[Step 3/3] Saving consolidated content...")

        for path in [self.output_md_path, self.output_txt_path]:
            try:
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(final_content)
                print(f" - Successfully saved to '{path}'.")
            except IOError as e:
                # A failure on one file must not prevent writing the other.
                print(f" - ERROR: Could not write to file '{path}'. Error : {e}")

    def __str__(self) -> str:
        """Return a text summary of the results, one entry per processed file."""
        summary_lines = [f"--- RostaingOCR Extraction Summary ---"]
        summary_lines.append(f"Generated output files: '{self.output_txt_path}' et '{self.output_md_path}'")

        for file_path, text_content in self.results.items():
            status = "✅ Success" if text_content else "❌ Failure"
            line = f"\n - Processed file : {os.path.basename(file_path)}\n"
            line += f" Status : {status}"
            summary_lines.append(line)
        return "\n".join(summary_lines)
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: rostaing-ocr
3
+ Version: 0.1.0
4
+ Summary: Un outil OCR pour extraire du texte d'images et de PDF en utilisant Tesseract et PyMuPDF.
5
+ Author-email: Davila Rostaing <davilarostaing@gmail.com>
6
+ Project-URL: Homepage, https://github.com/Rostaing/rostaing-ocr
7
+ Project-URL: Bug Tracker, https://github.com/Rostaing/rostaing-ocr/issues
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Text Processing
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Development Status :: 4 - Beta
14
+ Requires-Python: >=3.7
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: PyMuPDF
18
+ Requires-Dist: pytesseract
19
+ Requires-Dist: markitdown
20
+ Requires-Dist: Pillow
21
+ Dynamic: license-file
22
+
23
+ # Rostaing OCR, created by Davila Rostaing.
24
+
25
+ A simple and powerful Python tool for extracting text from images and scanned PDFs. It leverages Tesseract, PyMuPDF, and MarkItDown to produce clean plain text (`.txt`) and Markdown (`.md`) files.
26
+
27
+ ## Features
28
+
29
+ - Converts images (PNG, JPG, etc.) and PDFs into text.
30
+ - Includes image preprocessing (grayscale conversion) to improve OCR accuracy.
31
+ - Processes multiple files in a single run, consolidating the output.
32
+ - Generates output in both plain text (`.txt`) and Markdown (`.md`) formats.
33
+ - Optional flag to print extracted content directly to the console for quick inspection.
34
+
35
+ ## System Prerequisites: Tesseract OCR
36
+
37
+ **Important:** This package requires the **Tesseract OCR engine** to be installed on your system. `rostaing-ocr` is a Python wrapper that calls the `tesseract` command-line tool. You must install it and its language packs first.
38
+
39
+ ### Windows Installation Guide (Recommended)
40
+
41
+ 1. **Download the Installer**: Go to the official [Tesseract at UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki) repository. They provide the most up-to-date and reliable installers for Windows.
42
+ 2. **Run the Installer**: Start the installation process.
43
+ 3. **Crucial Step - Add to PATH**: On the "Select Additional Tasks" screen, make sure to check the box for **"Add Tesseract to system PATH"**. This is essential for Python to be able to find and execute Tesseract.
44
+ 4. **Select Languages**: On the "Select additional language data" screen, expand the list and select the languages you will need for OCR (e.g., check `French` for `fra`, `English` is usually included by default).
45
+ 5. **Complete Installation**: Finish the installation and, to be safe, restart your command prompt, terminal, or IDE to ensure the system's PATH variable is updated.
46
+
47
+ ### macOS (via Homebrew)
48
+
49
+ ```bash
50
+ brew install tesseract
51
+ ```
52
+ To add language packs beyond English, also install the `tesseract-lang` formula (`brew install tesseract-lang`).
53
+
54
+ ### Linux (Debian/Ubuntu)
55
+
56
+ ```bash
57
+ sudo apt update
58
+ sudo apt install tesseract-ocr
59
+
60
+ # Also install the language packs you need. For French:
61
+ sudo apt install tesseract-ocr-fra
62
+ ```
63
+
64
+ ## Installation
65
+
66
+ ### Best Practice: Use a Virtual Environment
67
+
68
+ To keep project dependencies isolated and avoid conflicts with other Python projects on your system, it is highly recommended to use a virtual environment.
69
+
70
+ With a standard Python installation, you can create and activate a new environment using the following commands:
71
+
72
+ **On macOS/Linux:**
73
+ ```bash
74
+ # Create an environment named '.venv' in your project directory
75
+ python3 -m venv .venv
76
+
77
+ # Activate the environment
78
+ source .venv/bin/activate
79
+ ```
80
+
81
+ **On Windows:**
82
+ ```bash
83
+ # Create an environment named '.venv' in your project directory
84
+ python -m venv .venv
85
+
86
+ # Activate the environment
87
+ .venv\Scripts\activate
88
+ ```
89
+
90
+ ### Install the Package
91
+
92
+ Once Tesseract is set up and your virtual environment is activated, you can install the package from PyPI:
93
+
94
+ ```bash
95
+ pip install rostaing-ocr
96
+ ```
97
+
98
+ ## Usage
99
+
100
+ Here is a basic example of how to use the `RostaingOCR` class.
101
+
102
+ ```python
103
+ from rostaing_ocr import RostaingOCR
104
+
105
+ # --- Example 1: Process a single file ---
106
+ # This will create 'my_result.txt' and 'my_result.md' in the current directory.
107
+ extractor = RostaingOCR(
108
+ input_path_or_paths="path/to/my_document.pdf",
109
+ output_basename="my_result", # Optional
110
+ print_to_console=True # Optional
111
+ )
112
+
113
+ # --- Example 2: Process multiple files and print to console ---
114
+ # This will process both files, save a consolidated output, and also print the results.
115
+ multi_extractor = RostaingOCR(
116
+ input_path_or_paths=["document1.png", "scan_page_2.pdf"],
117
+ output_basename="combined_report", # Optional
118
+ print_to_console=True, # Optional
119
+ languages=['fra', 'eng'] # Optional: languages for Tesseract
120
+ )
121
+
122
+ # You can print the object to get a summary of the operation.
123
+ print(multi_extractor)
124
+ ```
125
+
126
+ ## Application for LLM and RAG Pipelines
127
+
128
+ Large Language Models (LLMs) like GPT-4 or Llama understand text, not images or scanned documents. A vast amount of valuable knowledge is locked away in unstructured formats such as PDFs of research papers, scanned invoices, or legal contracts.
129
+
130
+ **`Rostaing OCR` serves as the crucial first step in any data ingestion pipeline for Retrieval-Augmented Generation (RAG) systems.** It bridges the gap by converting this inaccessible visual data into clean, structured text that LLMs can process.
131
+
132
+ By using `Rostaing OCR`, you can automate the process of building a knowledge base from your documents:
133
+
134
+ 1. **Input**: A directory of `Scanned PDFs` or `Images`.
135
+ 2. **Extraction (Rostaing OCR)**: Convert all documents into clean `Markdown/Text`.
136
+ 3. **Processing**: The text output can be fed into text splitters and then embedding models.
137
+ 4. **Indexing**: The resulting vectors are stored in a vector database (e.g., Chroma, Pinecone, FAISS) for efficient retrieval.
138
+
139
+ In short, `Rostaing OCR` unlocks your documents, making them ready for any modern AI stack.
140
+
141
+ ## License
142
+
143
+ This project is licensed under the MIT License. See the `LICENSE` file for more details.
@@ -0,0 +1,7 @@
1
+ rostaing_ocr/__init__.py,sha256=HBwHqAcgfbxYvLsLkIiei-HEgXZrcHXM675VrLjO2fc,196
2
+ rostaing_ocr/rostaing_ocr.py,sha256=xp7aP0MiL4vS0KzztPnDCnAsYsPzZ6ZBWZxRnNAEgIs,10746
3
+ rostaing_ocr-0.1.0.dist-info/licenses/LICENSE,sha256=-mnJp92iFU5D7XvCVxTbqiHQBhIo_LKsH9yaMaPymoY,1087
4
+ rostaing_ocr-0.1.0.dist-info/METADATA,sha256=YocIRat6BHwcZPAB_Qk92Gep6Kl_SLPVFRb34Oeb-m0,6036
5
+ rostaing_ocr-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ rostaing_ocr-0.1.0.dist-info/top_level.txt,sha256=2MSL7KmO9dgjq622ZnYtXoYIxsa1VVm0ZAlrj3RKPCg,13
7
+ rostaing_ocr-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 RostaingOCR
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ rostaing_ocr