PyPI - io4it - Versions diffs - 0.0.0.9__tar.gz - Mend

io4it 0.0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

io4it-0.0.0.9/License.txt ADDED Viewed

@@ -0,0 +1,6 @@
+THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT ANY WARRANTY WHATSOEVER.
+If you use or redistribute this software, you are permitted to do so
+under the terms of GNU [GPL-3.0]+ license.
+[GPL-3.0]: https://www.gnu.org/licenses/gpl-3.0.en.html

io4it-0.0.0.9/PKG-INFO ADDED Viewed

@@ -0,0 +1,8 @@
+Metadata-Version: 2.1
+Name: io4it
+Version: 0.0.0.9
+Home-page:
+Author:
+Author-email:
+Keywords: orange3 add-on
+License-File: License.txt

io4it-0.0.0.9/io4it.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,8 @@
+Metadata-Version: 2.1
+Name: io4it
+Version: 0.0.0.9
+Home-page:
+Author:
+Author-email:
+Keywords: orange3 add-on
+License-File: License.txt

io4it-0.0.0.9/io4it.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,33 @@
+License.txt
+setup.cfg
+setup.py
+io4it.egg-info/PKG-INFO
+io4it.egg-info/SOURCES.txt
+io4it.egg-info/dependency_links.txt
+io4it.egg-info/entry_points.txt
+io4it.egg-info/namespace_packages.txt
+io4it.egg-info/requires.txt
+io4it.egg-info/top_level.txt
+orangecontrib/__init__.py
+orangecontrib/IO4IT/__init__.py
+orangecontrib/IO4IT/ocr_function/__init__.py
+orangecontrib/IO4IT/ocr_function/word_converter.py
+orangecontrib/IO4IT/widgets/OWMarkdownizer.py
+orangecontrib/IO4IT/widgets/OWPathPropagator.py
+orangecontrib/IO4IT/widgets/OWS3Uploader.py
+orangecontrib/IO4IT/widgets/OWS3downloader.py
+orangecontrib/IO4IT/widgets/OWS3list.py
+orangecontrib/IO4IT/widgets/OWSpeechToText.py
+orangecontrib/IO4IT/widgets/OWwordpdf2docx.py
+orangecontrib/IO4IT/widgets/__init__.py
+orangecontrib/IO4IT/widgets/designer/ow_in_or_out_path.ui
+orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui
+orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui
+orangecontrib/IO4IT/widgets/icons/category.svg
+orangecontrib/IO4IT/widgets/icons/download.png
+orangecontrib/IO4IT/widgets/icons/in_or_out.png
+orangecontrib/IO4IT/widgets/icons/list_aws.png
+orangecontrib/IO4IT/widgets/icons/md.png
+orangecontrib/IO4IT/widgets/icons/speech_to_text.png
+orangecontrib/IO4IT/widgets/icons/upload.png
+orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png

io4it-0.0.0.9/io4it.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

io4it-0.0.0.9/io4it.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [orange.widgets]
2	+ Advanced Artificial Intelligence Tools = orangecontrib.IO4IT.widgets

io4it-0.0.0.9/io4it.egg-info/namespace_packages.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ orangecontrib

io4it-0.0.0.9/io4it.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,10 @@
+boto3
+docling
+docling-core
+speechbrain
+whisper
+whisper-openai
+pyannote.audio
+pyannote.core
+wave
+scikit-learn

io4it-0.0.0.9/io4it.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ orangecontrib

io4it-0.0.0.9/orangecontrib/IO4IT/__init__.py ADDED Viewed

File without changes

io4it-0.0.0.9/orangecontrib/IO4IT/ocr_function/__init__.py ADDED Viewed

File without changes

io4it-0.0.0.9/orangecontrib/IO4IT/ocr_function/word_converter.py ADDED Viewed

@@ -0,0 +1,327 @@
+import os
+import win32com.client
+from pathlib import Path
+import pathlib
+import tempfile
+import shutil
+import time
+import pythoncom
+import fitz
+if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
+    from Orange.widgets.orangecontrib.AAIT.utils.MetManagement import get_local_store_path,reset_folder
+else:
+    from orangecontrib.AAIT.utils.MetManagement import get_local_store_path,reset_folder
+def enable_long_path(path):
+    """Simplify long-path handling on Windows.
+    The literal prefix below is prepended to ``str(path)`` and the result is
+    wrapped in a ``pathlib.Path``. No check is made for a prefix that is
+    already present.
+    NOTE(review): the raw literal ends with two backslashes, whereas the
+    documented extended-length prefix ends with one -- confirm on Windows.
+    """
+    long_path_prefix = r"\\?\\"
+    prefixed = long_path_prefix + str(path)
+    return pathlib.Path(prefixed)
+def convert_pdf_structure(input_dir, output_dir, ignore_exsting_out_put=False, eprogress_callback=None, progress_callback=None):
+    """
+    Convert every PDF found under the input folder(s) to DOCX, replicating the
+    folder structure below the matching output folder.
+    Parameters:
+    input_dir: One folder path, or a sequence of folder paths, searched
+        recursively with the ``*.pdf`` pattern.
+    output_dir: One folder path, or a sequence of folder paths; entry ``i``
+        receives the DOCX files produced from input entry ``i``.
+    ignore_exsting_out_put (bool): When true, PDFs whose target DOCX already
+        exists are skipped.
+    eprogress_callback: Historical (misspelled) name of the progress
+        callback, kept so existing keyword callers still work.
+    progress_callback: Optional callable receiving a percentage (float)
+        after each converted file; takes precedence over
+        ``eprogress_callback``.
+    Returns:
+    str: An empty string on success, otherwise a log describing the error.
+    """
+    if os.name != 'nt':
+        return "version developped for windows computer "
+    # The body used to read an undefined name; accept the callback under
+    # either keyword so no caller breaks.
+    if progress_callback is None:
+        progress_callback = eprogress_callback
+    # A bare string used to be iterated character by character; treat a
+    # single path as a one-element list.
+    if isinstance(input_dir, (str, os.PathLike)):
+        input_dirs = [input_dir]
+    else:
+        input_dirs = list(input_dir)
+    if isinstance(output_dir, (str, os.PathLike)):
+        output_dirs = [output_dir]
+    else:
+        output_dirs = list(output_dir)
+    if len(output_dirs) < len(input_dirs):
+        return "Error: fewer output directories than input directories were provided. "
+    # Total number of PDFs, used only to scale the progress percentage.
+    nbre_file = sum(1 for folder in input_dirs for _ in Path(str(folder)).rglob("*.pdf"))
+    error_log = ""
+    k = 1
+    for in_dir, out_dir in zip(input_dirs, output_dirs):
+        input_path = Path(str(in_dir))
+        output_path = Path(str(out_dir))
+        if not input_path.exists() or not input_path.is_dir():
+            message = f"Error: The input directory '{input_path}' does not exist or is not a directory."
+            print(message)
+            return message + " "
+        for pdf_file in input_path.rglob("*.pdf"):
+            # Mirror the PDF's position relative to the input root.
+            relative_path = pdf_file.relative_to(input_path)
+            new_file_path = output_path / relative_path.with_suffix(".docx")
+            if ignore_exsting_out_put and os.path.exists(enable_long_path(str(new_file_path))):
+                print("ignoring",pdf_file)
+                continue
+            if 0 != convert_pdf_with_temp(str(pdf_file), str(new_file_path)):
+                if error_log != "":
+                    error_log += "\n"
+                error_log += "error -> " + str(pdf_file)
+                # NOTE: processing stops at the first failure; the original
+                # author flagged this early return for removal.
+                return error_log
+            if progress_callback is not None:
+                progress_value = float(100 * k / nbre_file)
+                k += 1
+                progress_callback(progress_value)
+    # Purge the temporary conversion folder only when everything succeeded.
+    if error_log == "":
+        reset_folder(get_local_store_path() + "temp_word_conversion/", attempts=10, delay=0.05, recreate=False)
+    return error_log
+def convert_pdf_to_docx(pdf_path, docx_path):
+    """
+    Convert a PDF file to DOCX by driving Microsoft Word through COM.
+    Args:
+        pdf_path (str): Path of the source PDF file.
+        docx_path (str): Path of the destination DOCX file.
+    Returns:
+        int: 0 if the conversion succeeded, 1 on failure.
+    """
+    if not os.path.exists(pdf_path):
+        print(f"Erreur : Le fichier {pdf_path} n'existe pas.")
+        return 1
+    word = None
+    com_initialized = False
+    try:
+        # Initialise COM for this thread; remember it so the matching
+        # CoUninitialize only runs when initialisation really happened.
+        pythoncom.CoInitialize()
+        com_initialized = True
+        # Start Word
+        word = win32com.client.Dispatch("Word.Application")
+        word.DisplayAlerts = 0  # Disable alert dialogs
+        word.Visible = True  # Word window is shown while converting
+        print(f"Conversion de {pdf_path} en {docx_path}...")
+        # Open the PDF read-only
+        doc = word.Documents.Open(pdf_path, ReadOnly=True, ConfirmConversions=False)
+        # Save as DOCX
+        doc.SaveAs(docx_path, FileFormat=16)  # 16 = wdFormatDocumentDefault
+        doc.Close(False)
+        print(f"Conversion réussie : {docx_path}")
+        return 0
+    except Exception as e:
+        print(f"Erreur lors de la conversion : {e}")
+        return 1
+    finally:
+        if word is not None:
+            # A failing Quit() must not replace the status being returned.
+            try:
+                word.Quit()
+            except Exception as e:
+                print(f"Erreur lors de la fermeture de Word : {e}")
+        # Release COM
+        if com_initialized:
+            pythoncom.CoUninitialize()
+def wait_for_file_access(file_path, timeout=10, interval=0.5):
+    """
+    Poll until the file can be opened for appending, or give up after a timeout.
+    Args:
+        file_path (str): Path of the file to check.
+        timeout (int): Maximum number of seconds to keep trying.
+        interval (float): Pause in seconds between two attempts.
+    Returns:
+        bool: True as soon as the file exists, passes the read/write
+        ``os.access`` check and opens in append mode; False otherwise.
+    """
+    began = time.time()
+    required_mode = os.R_OK | os.W_OK
+    while (time.time() - began) < timeout:
+        reachable = os.path.exists(file_path) and os.access(file_path, required_mode)
+        if reachable:
+            try:
+                # Opening in append mode tests write access without
+                # altering the content.
+                with open(file_path, "a"):
+                    return True
+            except IOError:
+                pass
+        # Pause before the next attempt.
+        time.sleep(interval)
+    print(f"Erreur : Le fichier {file_path} est verrouillé ou inaccessible.")
+    return False
+def is_pdf_a4(pdf_path: str) -> bool:
+    """
+    Check whether every page of the PDF is A4-sized.
+    A page matches when its width and height are within ``TOLERANCE`` points
+    of 595 x 842, in either orientation.
+    Returns True if all pages match; False if any page differs, if the
+    document has no pages, or if it cannot be read.
+    """
+    A4_WIDTH_PTS = 595  # A4 width in points (approx. 210mm)
+    A4_HEIGHT_PTS = 842  # A4 height in points (approx. 297mm)
+    TOLERANCE = 5  # Tolerance margin in points
+    doc = None
+    try:
+        doc = fitz.open(pdf_path)
+        if len(doc) == 0:
+            return False
+        for page in doc:
+            width, height = page.rect.width, page.rect.height
+            portrait = abs(width - A4_WIDTH_PTS) <= TOLERANCE and abs(height - A4_HEIGHT_PTS) <= TOLERANCE
+            landscape = abs(width - A4_HEIGHT_PTS) <= TOLERANCE and abs(height - A4_WIDTH_PTS) <= TOLERANCE
+            if not (portrait or landscape):
+                return False
+        return True
+    except Exception as e:
+        print("is A4?",e)
+        return False
+    finally:
+        # Release the file handle on every path; the caller goes on to
+        # rewrite or convert this same file.
+        if doc is not None:
+            doc.close()
+def convert_pdf_to_a4(input_pdf, output_pdf):
+    """
+    Write a copy of ``input_pdf`` to ``output_pdf`` in which every page is A4.
+    Pages already within 1 point of portrait A4 are copied unchanged. Any
+    other page is rendered to a pixmap, scaled to fit (pages wider than tall
+    are rendered with a 90 degree rotation), and inserted as a centred image
+    on a new A4 page.
+    Returns:
+        int: 0 on success, 1 if any step raised an exception.
+    """
+    doc = None
+    new_doc = None
+    try:
+        # A4 dimensions in points
+        a4_width, a4_height = fitz.paper_size("a4")
+        doc = fitz.open(input_pdf)
+        new_doc = fitz.open()
+        for page in doc:
+            page_w, page_h = page.rect.width, page.rect.height
+            # Page is already A4 (1 point tolerance): copy it as is
+            if abs(page_w - a4_width) < 1 and abs(page_h - a4_height) < 1:
+                new_doc.insert_pdf(doc, from_page=page.number, to_page=page.number)
+                continue
+            # Pick the transformation according to the page orientation
+            if page_w > page_h:  # Landscape
+                # After rotation the dimensions are swapped (width <-> height)
+                effective_scale = min(a4_width / page_h, a4_height / page_w)
+                matrix = fitz.Matrix(effective_scale, effective_scale)
+                # 90 degree rotation plus translation to reposition the content
+                matrix = matrix.prerotate(90).pretranslate(page_h * effective_scale, 0)
+            else:  # Portrait
+                effective_scale = min(a4_width / page_w, a4_height / page_h)
+                matrix = fitz.Matrix(effective_scale, effective_scale)
+            # Render the pixmap at the final resolution
+            pix = page.get_pixmap(matrix=matrix)
+            # Centre the image on the A4 page
+            new_img_w, new_img_h = pix.width, pix.height
+            x_offset = (a4_width - new_img_w) / 2
+            y_offset = (a4_height - new_img_h) / 2
+            # Create the new page and insert the image
+            new_page = new_doc.new_page(width=a4_width, height=a4_height)
+            new_page.insert_image(
+                fitz.Rect(x_offset, y_offset, x_offset + new_img_w, y_offset + new_img_h),
+                pixmap=pix
+            )
+        new_doc.save(output_pdf)
+        return 0
+    except Exception as e:
+        # Report the cause instead of swallowing it; KeyboardInterrupt and
+        # SystemExit are no longer intercepted.
+        print("convert to A4?", e)
+        return 1
+    finally:
+        # Close both documents even when a step above failed.
+        for handle in (new_doc, doc):
+            if handle is not None:
+                handle.close()
+def write_two_strings_to_file(file_path: str,string1: str, string2: str):
+    """
+    Write two strings to a UTF-8 text file, the first followed by a newline.
+    :param file_path: The path where the file should be saved.
+    :param string1: The first string to write.
+    :param string2: The second string to write (no trailing newline added).
+    :return: 0 on success, 1 if the file could not be opened or written.
+    """
+    try:
+        # The context manager closes the file on every path. The former
+        # ``finally: file.close()`` raised when ``open`` itself had failed.
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write(string1 + "\n")
+            file.write(string2 )
+        print(f"Successfully written to {file_path}")
+    except IOError as e:
+        print(f"Error writing to file: {e}")
+        return 1
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return 1
+    return 0
+def convert_pdf_with_temp(temp_pdf, output_path):
+    """
+    Copy the source PDF into the ``temp_word_conversion/`` folder located
+    under ``get_local_store_path()``, convert it to DOCX there, then copy the
+    resulting file to ``output_path``, handling long paths.
+    Returns 0 on success and 1 on any failure.
+    """
+    source_pdf = enable_long_path(os.path.abspath(temp_pdf))
+    target_docx = enable_long_path(os.path.abspath(output_path))
+    target_dir = target_docx.parent
+    if not source_pdf.exists():
+        print(f"Le fichier {source_pdf} n'existe pas.")
+        return 1
+    # Create the output folder when it is missing
+    if not target_dir.exists():
+        target_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        work_dir = get_local_store_path() + "temp_word_conversion/"
+        # presumably empties/recreates the folder -- confirm in MetManagement
+        if reset_folder(work_dir, attempts=10, delay=0.05) != 0:
+            print("impossible to reset " + work_dir)
+            return 1
+        staged_pdf = os.path.join(work_dir, "input_toto.pdf")
+        staged_docx = os.path.join(work_dir, "input_toto.docx")
+        # Marker file recording which source/target pair is being processed
+        marker_file = work_dir + "conversion_en_cours.txt"
+        print(marker_file)
+        print("######################################")
+        if write_two_strings_to_file(marker_file, str(source_pdf), str(target_docx)) != 0:
+            print("error writing ", marker_file)
+            return 1
+        # Copy the source file into the working folder
+        shutil.copy2(source_pdf, staged_pdf)
+        wait_for_file_access(staged_pdf)
+        if not is_pdf_a4(staged_pdf):
+            # Non-A4 documents are first rewritten as A4
+            resized_pdf = os.path.join(work_dir, "input_totoA4.pdf")
+            if convert_pdf_to_a4(staged_pdf, resized_pdf) != 0:
+                print("erreur au resize du pdf")
+                return 1
+            staged_pdf = resized_pdf
+            wait_for_file_access(staged_pdf)
+            time.sleep(1)
+        # PDF -> DOCX conversion, attempted up to four times
+        status = 0
+        for _attempt in range(4):
+            time.sleep(1)
+            status = convert_pdf_to_docx(str(staged_pdf), str(staged_docx))
+            if status == 0:
+                break
+        if status != 0:
+            print("Erreur lors de la conversion.")
+            return 1
+        # Copy the converted file to its final destination
+        shutil.copy2(staged_docx, target_docx)
+        print(f"recopie réussie : {target_docx}")
+        return 0
+    except Exception as e:
+        print(f"Erreur : {e}")
+        return 1

io4it-0.0.0.9/orangecontrib/IO4IT/widgets/OWMarkdownizer.py ADDED Viewed

@@ -0,0 +1,202 @@
+import os
+import logging
+import urllib.parse
+from pathlib import Path
+from PyQt5.QtCore import QThread, pyqtSignal
+from AnyQt.QtWidgets import QApplication, QLabel, QPushButton, QProgressBar, QListWidget, QListWidgetItem
+import Orange.data
+from Orange.widgets import widget
+from Orange.widgets.utils.signals import Input, Output
+from Orange.data import Domain, StringVariable, Table
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+_log = logging.getLogger(__name__)
+IMAGE_RESOLUTION_SCALE = 2.0
+class MarkdownConversionThread(QThread):
+    """Worker thread converting the PDF and DOCX files of one folder to Markdown.
+    ``progress_signal`` is emitted with (markdown file name, percentage) after
+    each file that was converted or found already converted.
+    ``result_signal`` is emitted once at the end with a list of
+    (markdown file name, markdown content) tuples.
+    """
+    result_signal = pyqtSignal(list)
+    progress_signal = pyqtSignal(str, int)
+    def __init__(self, input_dir, output_dir, parent=None):
+        """Store the folder to scan and the folder receiving the Markdown files (both used as ``pathlib.Path``)."""
+        super().__init__(parent)
+        self.input_dir = input_dir
+        self.output_dir = output_dir
+    def run(self):
+        """Convert each ``*.pdf`` / ``*.docx`` directly inside ``input_dir`` (no recursion)."""
+        results = []
+        files = list(self.input_dir.glob("*.pdf")) + list(self.input_dir.glob("*.docx"))
+        total_files = len(files)
+        processed = 0
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
+        pipeline_options.generate_page_images = True
+        pipeline_options.generate_picture_images = True
+        doc_converter_pdf = DocumentConverter(
+            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
+        )
+        # Converter for non-PDF files: built on first use and then shared,
+        # instead of constructing a new one for every file.
+        doc_converter_default = None
+        for file_path in files:
+            try:
+                # Output goes to <output_dir>/<name of the file's parent folder>
+                output_subdir = self.output_dir / file_path.parent.relative_to(file_path.parents[1])
+                output_subdir.mkdir(parents=True, exist_ok=True)
+                doc_filename = file_path.stem + "_md-with-image-refs.md"
+                output_file_path = output_subdir / doc_filename
+                if output_file_path.exists():
+                    # Already converted earlier: reuse the existing Markdown
+                    print(f"🔁 Fichier déjà traité, ignoré : {doc_filename}")
+                    results.append((doc_filename, output_file_path.read_text(encoding='utf-8')))
+                    processed += 1
+                    self.progress_signal.emit(doc_filename, int(processed / total_files * 100))
+                    continue
+                # Conversion
+                if file_path.suffix.lower() == ".pdf":
+                    conv_res = doc_converter_pdf.convert(file_path)
+                else:
+                    if doc_converter_default is None:
+                        doc_converter_default = DocumentConverter()
+                    conv_res = doc_converter_default.convert(file_path)
+                conv_res.document.save_as_markdown(output_file_path, image_mode=ImageRefMode.REFERENCED)
+                # Rewrite the file with percent-escapes decoded
+                with open(output_file_path, 'r', encoding='utf-8') as f:
+                    content = urllib.parse.unquote(f.read())
+                with open(output_file_path, 'w', encoding='utf-8') as f:
+                    f.write(content)
+                results.append((doc_filename, content))
+                processed += 1
+                self.progress_signal.emit(doc_filename, int(processed / total_files * 100))
+            except Exception as e:
+                # A failing file is reported and skipped; the others continue
+                print(f"❌ Erreur lors du traitement de {file_path}: {e}")
+                continue
+        self.result_signal.emit(results)
+class FileProcessorApp(widget.OWWidget):
+    """Orange widget that turns the PDF/DOCX files of a folder into Markdown.
+    The folder is taken from the ``input_dir`` meta column (first row) of the
+    incoming table. A ``MarkdownConversionThread`` is started with
+    ``<input folder>_md`` as output folder, and its results are sent out as a
+    table with the meta columns ``input_dir``, ``output_dir``, ``name`` and
+    ``content``.
+    """
+    name = "Markdownizer"
+    description = "Convert PDFs, DOCX, TXT, CSV to Markdown and store in an output folder"
+    icon = "icons/md.png"
+    if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
+        icon = "icons_dev/md.png"
+    priority = 1001
+    category = "Advanced Artificial Intelligence Tools"
+    want_control_area = False
+    class Inputs:
+        data = Input("Input Directory", Orange.data.Table)
+    class Outputs:
+        data = Output("Markdown Data Table", Orange.data.Table)
+    @Inputs.data
+    def set_data(self, data):
+        """Read ``input_dir`` from the first row of ``data`` and start processing."""
+        self.data = data
+        if self.data is None:
+            return
+        path_index = None
+        for i, meta_var in enumerate(self.data.domain.metas):
+            if meta_var.name.lower() == 'input_dir':
+                path_index = i
+                break
+        if path_index is None:
+            print("No 'input_dir' column found in input data. Available columns:", [m.name for m in self.data.domain.metas])
+            return
+        # An empty table has no first row to read the path from.
+        if len(self.data) == 0:
+            print("Input data has no rows; nothing to process.")
+            return
+        self.input_path = self.data.metas[0][path_index]
+        print("Extracted input_dir:", self.input_path)
+        self.startProcessing()
+    def __init__(self):
+        super().__init__()
+        self.initUI()
+        self.data = None
+        self.input_path = None
+    def initUI(self):
+        """Build the main area: status label, progress bar, start button, file list."""
+        self.setGeometry(200, 200, 600, 400)
+        self.mainArea.layout().setSpacing(10)
+        self.status_label = QLabel("Sélectionnez un dossier contenant des fichiers.")
+        self.progress_bar = QProgressBar()
+        self.progress_bar.setValue(0)
+        self.start_button = QPushButton("Démarrer le traitement")
+        self.start_button.clicked.connect(self.startProcessing)
+        self.file_list = QListWidget()
+        self.mainArea.layout().addWidget(self.status_label)
+        self.mainArea.layout().addWidget(self.progress_bar)
+        self.mainArea.layout().addWidget(self.start_button)
+        self.mainArea.layout().addWidget(self.file_list)
+    def startProcessing(self):
+        """Start a conversion thread for ``self.input_path`` unless one is still running."""
+        if not self.input_path:
+            print("No valid input path found.")
+            return
+        input_dir = Path(self.input_path)
+        if not input_dir.exists():
+            print("Input directory does not exist:", input_dir)
+            return
+        # Replacing a running QThread would drop its last Python reference
+        # while it still runs; refuse to start a second worker instead.
+        # The isinstance test is needed because, before the first run,
+        # ``self.thread`` is not yet a worker object.
+        current = getattr(self, "thread", None)
+        if isinstance(current, QThread) and current.isRunning():
+            print("A conversion is already running; request ignored.")
+            return
+        self.output_dir = input_dir.parent / (input_dir.name + "_md")
+        self.progress_bar.setValue(0)
+        self.start_button.setEnabled(False)
+        self.status_label.setText("Traitement en cours...")
+        self.thread = MarkdownConversionThread(input_dir, self.output_dir)
+        self.thread.result_signal.connect(self.handle_results)
+        self.thread.progress_signal.connect(self.update_progress)
+        self.thread.start()
+        self.progressBarInit()
+    def update_progress(self, filename, progress):
+        """Append the processed file to the list and update the progress bar."""
+        print(f"File processed: {filename}")
+        self.file_list.addItem(f"✅ {filename}")
+        self.progress_bar.setValue(progress)
+    def handle_results(self, results):
+        """Slot for the worker's ``result_signal``."""
+        self.processingComplete(results)
+        self.progressBarFinished()
+    def processingComplete(self, results):
+        """Restore the idle UI state and publish the results."""
+        self.status_label.setText("Traitement terminé.")
+        self.start_button.setEnabled(True)
+        self.send_output(results)
+    def send_output(self, results):
+        """Send one row per (name, content) result; a single all-empty row when there are none."""
+        domain = Domain([], metas=[
+            StringVariable('input_dir'),
+            StringVariable('output_dir'),
+            StringVariable('name'),
+            StringVariable('content')
+        ])
+        metas = [[
+            str(self.input_path),
+            str(self.output_dir),
+            name,
+            content
+        ] for name, content in results] if results else [["", "", "", ""]]
+        table = Table(domain, [[] for _ in metas])
+        for i, meta in enumerate(metas):
+            table.metas[i] = meta
+        self.Outputs.data.send(table)
+if __name__ == "__main__":
+    # Stand-alone launch: show the widget in its own Qt application and
+    # exit with the event loop's return code.
+    import sys
+    qt_app = QApplication(sys.argv)
+    main_widget = FileProcessorApp()
+    main_widget.show()
+    exit_code = qt_app.exec_()
+    sys.exit(exit_code)