io4it 2.1.0.4__tar.gz → 2.1.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- io4it-2.1.0.6/PKG-INFO +7 -0
- io4it-2.1.0.6/io4it.egg-info/PKG-INFO +7 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/io4it.egg-info/SOURCES.txt +16 -2
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/ocr_function/word_converter.py +12 -12
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +1 -1
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/OWDoclingMarkdownizerSimple.py +280 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +1 -1
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +114 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +2 -2
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +183 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/OWPdfType.py +193 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +87 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/designer/owdoclingmarkdownizersimple.ui +54 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +54 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +54 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +54 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +100 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +16 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
- io4it-2.1.0.6/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/setup.py +1 -1
- io4it-2.1.0.4/PKG-INFO +0 -33
- io4it-2.1.0.4/io4it.egg-info/PKG-INFO +0 -33
- io4it-2.1.0.4/orangecontrib/IO4IT/widgets/designer/chart.html +0 -281
- io4it-2.1.0.4/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -56
- {io4it-2.1.0.4 → io4it-2.1.0.6}/io4it.egg-info/dependency_links.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/io4it.egg-info/entry_points.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/io4it.egg-info/namespace_packages.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/io4it.egg-info/requires.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/io4it.egg-info/top_level.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/utils/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/utils/mail.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/utils/offuscation_basique.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWChatGpt.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/orangecontrib/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.6}/setup.cfg +0 -0
io4it-2.1.0.6/PKG-INFO
ADDED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
setup.cfg
|
|
1
2
|
setup.py
|
|
2
3
|
io4it.egg-info/PKG-INFO
|
|
3
4
|
io4it.egg-info/SOURCES.txt
|
|
@@ -17,8 +18,13 @@ orangecontrib/IO4IT/utils/pool_exec_utils.py
|
|
|
17
18
|
orangecontrib/IO4IT/utils/utils_md.py
|
|
18
19
|
orangecontrib/IO4IT/widgets/OWChatGpt.py
|
|
19
20
|
orangecontrib/IO4IT/widgets/OWDeep_Search.py
|
|
21
|
+
orangecontrib/IO4IT/widgets/OWDoclingMarkdownizerSimple.py
|
|
20
22
|
orangecontrib/IO4IT/widgets/OWExportMarkdown.py
|
|
23
|
+
orangecontrib/IO4IT/widgets/OWMarkdownLoader.py
|
|
21
24
|
orangecontrib/IO4IT/widgets/OWMarkdownizer.py
|
|
25
|
+
orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py
|
|
26
|
+
orangecontrib/IO4IT/widgets/OWPdfType.py
|
|
27
|
+
orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py
|
|
22
28
|
orangecontrib/IO4IT/widgets/OWS3Uploader.py
|
|
23
29
|
orangecontrib/IO4IT/widgets/OWS3downloader.py
|
|
24
30
|
orangecontrib/IO4IT/widgets/OWS3list.py
|
|
@@ -28,28 +34,36 @@ orangecontrib/IO4IT/widgets/OWmailSender.py
|
|
|
28
34
|
orangecontrib/IO4IT/widgets/OWwordpdf2docx.py
|
|
29
35
|
orangecontrib/IO4IT/widgets/__init__.py
|
|
30
36
|
orangecontrib/IO4IT/widgets/designer/__init__.py
|
|
31
|
-
orangecontrib/IO4IT/widgets/designer/chart.html
|
|
32
37
|
orangecontrib/IO4IT/widgets/designer/nogui.ui
|
|
33
38
|
orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui
|
|
34
39
|
orangecontrib/IO4IT/widgets/designer/owchatgpt.ui
|
|
35
40
|
orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui
|
|
41
|
+
orangecontrib/IO4IT/widgets/designer/owdoclingmarkdownizersimple.ui
|
|
36
42
|
orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui
|
|
37
43
|
orangecontrib/IO4IT/widgets/designer/owmailloader.ui
|
|
38
44
|
orangecontrib/IO4IT/widgets/designer/owmailsender.ui
|
|
39
45
|
orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui
|
|
46
|
+
orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui
|
|
47
|
+
orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui
|
|
48
|
+
orangecontrib/IO4IT/widgets/designer/owpdftype.ui
|
|
49
|
+
orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui
|
|
40
50
|
orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui
|
|
41
|
-
orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui
|
|
42
51
|
orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui
|
|
43
52
|
orangecontrib/IO4IT/widgets/icons/__init__.py
|
|
44
53
|
orangecontrib/IO4IT/widgets/icons/chatgpt.png
|
|
54
|
+
orangecontrib/IO4IT/widgets/icons/check_pdf.png
|
|
45
55
|
orangecontrib/IO4IT/widgets/icons/deepsearch.svg
|
|
56
|
+
orangecontrib/IO4IT/widgets/icons/dep_md_old.png
|
|
46
57
|
orangecontrib/IO4IT/widgets/icons/download.png
|
|
47
58
|
orangecontrib/IO4IT/widgets/icons/export_md.png
|
|
48
59
|
orangecontrib/IO4IT/widgets/icons/file_extensor.png
|
|
49
60
|
orangecontrib/IO4IT/widgets/icons/list_aws.png
|
|
61
|
+
orangecontrib/IO4IT/widgets/icons/load_md.png
|
|
50
62
|
orangecontrib/IO4IT/widgets/icons/mail_loader.png
|
|
51
63
|
orangecontrib/IO4IT/widgets/icons/mail_writer.png
|
|
52
64
|
orangecontrib/IO4IT/widgets/icons/md.png
|
|
65
|
+
orangecontrib/IO4IT/widgets/icons/office_normalizer.png
|
|
66
|
+
orangecontrib/IO4IT/widgets/icons/process_pool_executor.png
|
|
53
67
|
orangecontrib/IO4IT/widgets/icons/speech_to_text.png
|
|
54
68
|
orangecontrib/IO4IT/widgets/icons/upload.png
|
|
55
69
|
orangecontrib/IO4IT/widgets/icons/visualizationer.png
|
|
@@ -16,7 +16,7 @@ def enable_long_path(path):
|
|
|
16
16
|
"""Simplifie la gestion des chemins longs sous Windows."""
|
|
17
17
|
return pathlib.Path(r"\\?\\" + str(path))
|
|
18
18
|
|
|
19
|
-
def convert_pdf_structure(input_dir: str, output_dir: str,ignore_exsting_out_put=False,progress_callback=None):
|
|
19
|
+
def convert_pdf_structure(input_dir: str, output_dir: str,ignore_exsting_out_put=False,forceBasicConvertion=False,progress_callback=None):
|
|
20
20
|
"""
|
|
21
21
|
return a string with log in case of error
|
|
22
22
|
Recursively lists all .pdf and .PDF files in the input directory,
|
|
@@ -60,7 +60,7 @@ def convert_pdf_structure(input_dir: str, output_dir: str,ignore_exsting_out_put
|
|
|
60
60
|
continue
|
|
61
61
|
|
|
62
62
|
|
|
63
|
-
if 0!= convert_pdf_with_temp(str(pdf_file),str(new_file_path)):#convert_pdf_with_temp #convert_pdf_to_docx
|
|
63
|
+
if 0!= convert_pdf_with_temp(str(pdf_file),str(new_file_path),forceBasicConvertion):#convert_pdf_with_temp #convert_pdf_to_docx
|
|
64
64
|
if error_log!="":
|
|
65
65
|
error_log+="\n"
|
|
66
66
|
error_log+="error -> "+str(pdf_file)
|
|
@@ -259,7 +259,7 @@ def write_two_strings_to_file(file_path: str,string1: str, string2: str):
|
|
|
259
259
|
return 0
|
|
260
260
|
|
|
261
261
|
|
|
262
|
-
def convert_pdf_with_temp(temp_pdf, output_path):
|
|
262
|
+
def convert_pdf_with_temp(temp_pdf, output_path,forceBasicConvertion=False):
|
|
263
263
|
"""
|
|
264
264
|
Copie le PDF source dans un dossier temporaire, le convertit en DOCX,
|
|
265
265
|
puis copie le fichier résultant vers le chemin de sortie spécifié,
|
|
@@ -294,15 +294,15 @@ def convert_pdf_with_temp(temp_pdf, output_path):
|
|
|
294
294
|
# Copie du fichier source vers le dossier temporaire
|
|
295
295
|
shutil.copy2(pdf_path, temp_pdf)
|
|
296
296
|
wait_for_file_access(temp_pdf)
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
297
|
+
if forceBasicConvertion==False:
|
|
298
|
+
if is_pdf_a4(temp_pdf)==False:
|
|
299
|
+
temp_pdf2 = os.path.join(dest_dir, "input_totoA4.pdf")
|
|
300
|
+
if 0!=convert_pdf_to_a4(temp_pdf,temp_pdf2):
|
|
301
|
+
print("erreur au resize du pdf")
|
|
302
|
+
return 1
|
|
303
|
+
temp_pdf=temp_pdf2
|
|
304
|
+
wait_for_file_access(temp_pdf)
|
|
305
|
+
time.sleep(1)
|
|
306
306
|
result=0
|
|
307
307
|
# Conversion du PDF en DOCX
|
|
308
308
|
for _ in range(4):
|
|
@@ -30,7 +30,7 @@ class OWDeep_Search(widget.OWWidget):
|
|
|
30
30
|
category = "AAIT - LLM INTEGRATION"
|
|
31
31
|
icon = "icons/deepsearch.svg"
|
|
32
32
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
33
|
-
icon = "icons_dev/
|
|
33
|
+
icon = "icons_dev/deepsearch.svg"
|
|
34
34
|
gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owdeepsearch.ui")
|
|
35
35
|
want_control_area = True
|
|
36
36
|
priority = 1089
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
import os, time, datetime
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from concurrent.futures import as_completed
|
|
4
|
+
|
|
5
|
+
from AnyQt.QtWidgets import QLabel, QTextEdit
|
|
6
|
+
from AnyQt.QtCore import pyqtSignal
|
|
7
|
+
from Orange.widgets import widget
|
|
8
|
+
from Orange.widgets.utils.signals import Input, Output
|
|
9
|
+
from Orange.data import Domain, StringVariable, Table, DiscreteVariable
|
|
10
|
+
|
|
11
|
+
# --- Docling (unique lib utilisée pour la conversion) ---
|
|
12
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
13
|
+
from docling.datamodel.base_models import InputFormat
|
|
14
|
+
from docling.document_converter import (
|
|
15
|
+
DocumentConverter,
|
|
16
|
+
PdfFormatOption,
|
|
17
|
+
WordFormatOption,
|
|
18
|
+
)
|
|
19
|
+
from docling.pipeline.simple_pipeline import SimplePipeline
|
|
20
|
+
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|
21
|
+
|
|
22
|
+
# --- Orange contrib Imports ---
|
|
23
|
+
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
24
|
+
from Orange.widgets.orangecontrib.AAIT.utils.thread_management import Thread
|
|
25
|
+
from Orange.widgets.orangecontrib.AAIT.utils.import_uic import uic
|
|
26
|
+
else:
|
|
27
|
+
from orangecontrib.AAIT.utils.thread_management import Thread
|
|
28
|
+
from orangecontrib.AAIT.utils.import_uic import uic
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# --------- worker stateless : convertit 1 fichier avec Docling ----------
|
|
32
|
+
def _convert_one_file(file_path_str: str):
    """Convert one file (PDF/DOCX/PPTX) to Markdown with Docling.

    The Markdown is written to ``<parent>/conversion_markdown/<stem>.md``.
    When that file already exists the conversion is skipped and the row is
    reported as ``"ok"``.

    The function is stateless so it can be called directly or submitted to
    a ``ProcessPoolExecutor``.

    Args:
        file_path_str: Path of the document to convert.

    Returns:
        A list ``[input_path, output_md, status, duration_sec, message]``
        where ``status`` is ``"ok"`` or ``"nok"`` and ``duration_sec`` is a
        string formatted with two decimals.
    """
    t0 = time.time()
    src = Path(file_path_str)
    out_dir = src.parent / "conversion_markdown"
    out_md = out_dir / f"{src.stem}.md"

    # Already converted: do not redo the work.
    # NOTE(review): the error trace written below also lands in out_md, so a
    # file that failed once is reported as "ok" on the next run -- confirm
    # this is the intended behavior.
    if out_md.exists():
        duration = time.time() - t0
        return [str(src), str(out_md), "ok", f"{duration:.2f}", "existant: deja converti"]

    try:
        # Creating the directory inside the try block turns a filesystem
        # failure into a "nok" row instead of an uncaught exception.
        out_dir.mkdir(parents=True, exist_ok=True)

        # Minimal Docling configuration.
        doc_converter = DocumentConverter(
            allowed_formats=[InputFormat.PDF, InputFormat.DOCX, InputFormat.PPTX],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=StandardPdfPipeline,
                    backend=PyPdfiumDocumentBackend
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline
                ),
                # PPTX: no specific option, handled by Docling defaults.
            },
        )
        doc = doc_converter.convert(str(src)).document
        md = doc.export_to_markdown()
        out_md.write_text(md, encoding="utf-8")
        status, message = "ok", ""
    except Exception as e:
        status = "nok"
        message = f"{type(e).__name__}: {e}"
        # Best effort: still leave a trace .md next to the other outputs.
        try:
            out_md.write_text(f"[Erreur conversion] {message}", encoding="utf-8")
        except Exception as trace_err:
            # Keep the best-effort semantics, but report the failure instead
            # of hiding it.
            message += f" | trace write failed: {type(trace_err).__name__}: {trace_err}"

    duration = time.time() - t0
    return [str(src), str(out_md), status, f"{duration:.2f}", message]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class OWDoclingMarkdownizerSimple(widget.OWWidget):
    """Orange widget converting PDF/DOCX/PPTX files to Markdown via Docling.

    Inputs:
        Files Table: table with a ``file_path`` column listing the documents.
        ProcessPoolExecutor: optional executor; when connected, files are
            submitted to it instead of being converted one after another.

    Outputs:
        Markdown Table: one row per file
            (input_path, output_md, status, duration_sec, message).
        Status Table: per-file status, re-sent after every status change.
    """

    name = "Docling To Markdown"
    description = "Convert DOCX/PPTX/PDF to Markdown via Docling"
    icon = "icons/md.png"
    if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
        icon = "icons_dev/md.png"
    gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owdoclingmarkdownizersimple.ui")
    want_main_area = False
    want_control_area = True
    priority = 1004

    # Carries [path, status, message] from the worker to handle_status_update.
    status_update_signal = pyqtSignal(list)

    class Inputs:
        data = Input("Files Table", Table)
        executor = Input("ProcessPoolExecutor", object)

    class Outputs:
        data = Output("Markdown Table", Table)
        status_data = Output("Status Table", Table)

    def __init__(self):
        super().__init__()
        self.data = None
        self.external_executor = None
        self.thread = None
        self.result = None
        self.exec_info = QLabel("Exécution: séquentielle (aucun executor connecté).", self)
        self.processed_statuses = {}  # path -> [status, message]

        uic.loadUi(self.gui, self)

        # Connect once here. Connecting on every run would stack duplicate
        # connections and invoke the handler several times per emitted status.
        self.status_update_signal.connect(self.handle_status_update)

        self.error("")
        self.warning("")

    @Inputs.data
    def set_data(self, in_data: Table | None):
        """Store the input table, validate it and start the conversion."""
        self.data = in_data
        self.error("")
        self.warning("")

        if not in_data:
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        try:
            _ = in_data.domain["file_path"]
        except Exception:
            self.error('Colonne "file_path" (Text) requise.')
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        self._convert_now()

    @Inputs.executor
    def set_executor(self, executor_obj):
        """Remember the optional external executor and update the info label."""
        self.external_executor = executor_obj
        if executor_obj is not None:
            self.exec_info.setText("Exécution: via ProcessPoolExecutor externe (parallèle).")
        else:
            self.exec_info.setText("Exécution: séquentielle (aucun executor connecté).")

    def _convert_now(self):
        """Collect the supported files from the input and launch the worker thread."""
        if self.thread is not None and self.thread.isRunning():
            self.thread.safe_quit()

        if not self.data:
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        # Start progress bar
        self.progressBarInit()

        # Read the paths, then keep only existing files with a supported extension.
        try:
            paths = [Path(str(x)) for x in self.data.get_column("file_path")]
        except Exception as e:
            self.error(f"Lecture de 'file_path' impossible: {e}")
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            # The progress bar was already initialised: close it, otherwise
            # the widget stays shown as busy.
            self.progressBarFinished()
            return

        files = [p for p in paths if p.exists() and p.suffix.lower() in (".pdf", ".docx", ".pptx")]
        if not files:
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            self.progressBarFinished()
            return

        # Initialize status dictionary for incremental updates
        self.processed_statuses = {str(p): ["pending", ""] for p in files}
        self.update_status_output()

        # Run the conversion in the background thread helper.
        self.thread = Thread(self._run_conversion, files)
        self.thread.progress.connect(self.handle_progress)
        self.thread.result.connect(self.handle_result)
        self.thread.finish.connect(self.handle_finish)
        self.thread.start()

    def update_status_output(self):
        """Build the status table from ``processed_statuses`` and send it."""
        status_domain = Domain(
            [],  # The table has no attributes, only metas.
            metas=[
                StringVariable("input_path"),
                DiscreteVariable("status", values=["pending", "in_progress", "ok", "nok"]),
                StringVariable("message"),
            ],
        )

        # One flat row per file, in the domain's meta order.
        status_rows = [
            [path_str, status, message]
            for path_str, (status, message) in self.processed_statuses.items()
        ]

        status_table = Table.from_list(status_domain, status_rows)
        self.Outputs.status_data.send(status_table)

    def _run_conversion(self, files, progress_callback):
        """Convert ``files`` sequentially or through the external executor.

        Args:
            files: paths of the documents to convert.
            progress_callback: callable receiving the progress in percent.

        Returns:
            Table with metas input_path, output_md, status, duration_sec, message.
        """
        results = []
        total = len(files)

        if self.external_executor is None:
            # --- Sequential mode ---
            for i, p in enumerate(files):
                path_str = str(p)
                self.status_update_signal.emit([path_str, "in_progress", ""])

                row = _convert_one_file(path_str)
                results.append(row)

                # Publish the result of this file immediately.
                self.status_update_signal.emit([row[0], row[2], row[4]])

                progress_callback((i + 1) / total * 100)
        else:
            # --- Parallel mode through the external executor ---
            fut_map = {self.external_executor.submit(_convert_one_file, str(p)): str(p) for p in files}

            for i, fut in enumerate(as_completed(fut_map), 1):
                file_path_str = fut_map[fut]
                # NOTE(review): the future is already complete at this point,
                # so this "in_progress" status is overwritten right away.
                self.status_update_signal.emit([file_path_str, "in_progress", ""])

                try:
                    row = fut.result()
                    results.append(row)
                    self.status_update_signal.emit([row[0], row[2], row[4]])
                except Exception as e:
                    # The worker itself failed (not the conversion): report
                    # the same output path the worker would have used.
                    src = Path(file_path_str)
                    expected_md = src.parent / "conversion_markdown" / f"{src.stem}.md"
                    row = [file_path_str, str(expected_md), "nok", "0.00", f"FutureError: {e}"]
                    results.append(row)
                    self.status_update_signal.emit([row[0], "nok", f"FutureError: {e}"])

                progress_callback(i / total * 100)

        # The final result is the table built from all results
        domain = Domain([], metas=[
            StringVariable("input_path"),
            StringVariable("output_md"),
            StringVariable("status"),
            StringVariable("duration_sec"),
            StringVariable("message"),
        ])
        return Table.from_list(domain, results)

    def handle_progress(self, value: float) -> None:
        """Forward the worker progress (percent) to the progress bar."""
        self.progressBarSet(value)

    def handle_status_update(self, status_info: list):
        """Record one ``[path, status, message]`` update and resend the status table."""
        path_str, status, message = status_info
        self.processed_statuses[path_str] = [status, message]
        self.update_status_output()

    def handle_result(self, result: Table):
        """Store the final table and send it on the Markdown output."""
        try:
            self.result = result
            self.Outputs.data.send(result)
        except Exception as e:
            print("An error occurred when sending out_data:", e)
            self.Outputs.data.send(None)

    def handle_finish(self):
        """Close the progress bar once the worker thread has finished."""
        self.progressBarFinished()
|
|
@@ -29,7 +29,7 @@ class OWExportMarkdown(widget.OWWidget):
|
|
|
29
29
|
icon = "icons/export_md.png"
|
|
30
30
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
31
31
|
icon = "icons_dev/export_md.png"
|
|
32
|
-
want_control_area =
|
|
32
|
+
want_control_area = False
|
|
33
33
|
priority = 9999
|
|
34
34
|
category = "AAIT - TOOLBOX"
|
|
35
35
|
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from Orange.widgets import widget
|
|
6
|
+
from Orange.widgets.utils.signals import Input, Output
|
|
7
|
+
from Orange.data import Domain, StringVariable, Table
|
|
8
|
+
from AnyQt.QtWidgets import QCheckBox
|
|
9
|
+
|
|
10
|
+
# Prefer the layout where the add-ons are embedded under Orange/widgets and
# fall back to the regular namespace-package layout otherwise.
try:
    from Orange.widgets.orangecontrib.AAIT.utils.import_uic import uic
    # Fixed typo: the package is "Orange", not "range". With the typo this
    # import always failed, so the embedded layout could never be used.
    from Orange.widgets.orangecontrib.IO4IT.utils import utils_md
except ImportError:
    from orangecontrib.IO4IT.utils import utils_md
    from orangecontrib.AAIT.utils.import_uic import uic
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class OWMarkdownLoader(widget.OWWidget):
    """Orange widget loading every ``*.md`` file of a folder into a table.

    The folder is read from the first row of the ``input_dir`` column of the
    input table. The search is recursive when the ``checkBoxRecursive``
    checkbox is ticked.

    Outputs:
        Markdown Files: table with metas (file_path, content).
        Data: the input table, passed through unchanged.
    """

    name = "Markdown Loader"
    description = "Charge tous les fichiers Markdown d’un dossier (récursif)"
    icon = "icons/load_md.png"
    if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
        icon = "icons_dev/load_md.png"
    gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owmarkdownloader.ui")
    want_control_area = False
    priority = 1001

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        md_files = Output("Markdown Files", Table)  # -> (file_path, content)
        data = Output("Data", Table)  # pass-through of the input

    def __init__(self):
        super().__init__()

        self.in_data = None
        self.input_dir = None
        uic.loadUi(self.gui, self)
        self.checkBoxRecursive = self.findChild(QCheckBox, 'checkBoxRecursive')
        # These lines MUST be after super().__init__()
        self.recursive = self.checkBoxRecursive.isChecked()
        self.checkBoxRecursive.stateChanged.connect(self._on_recursive_toggled)

        self.warning("")

    def _on_recursive_toggled(self, _state):
        """Refresh the output when the recursive checkbox changes."""
        self.recursive = self.checkBoxRecursive.isChecked()
        # If a directory is already set, re-run the production
        if self.input_dir:
            self._produce()

    @Inputs.data
    def set_data(self, in_data: Table | None):
        """Pass the input through and load the Markdown files of its folder."""
        self.in_data = in_data
        self.warning("")
        # Forget the folder of the previous input. Otherwise toggling the
        # checkbox after the input was removed or became invalid would
        # reload a stale directory.
        self.input_dir = None

        # Always pass through the input data
        self.Outputs.data.send(in_data)

        if not in_data:
            # If no input data, send an empty table
            self.Outputs.md_files.send(self._empty_md_table())
            self.Description.setText(
                "This widget loads Markdown files from a folder. The path must be in a column named 'input_dir'.")
            return

        # Look for the 'input_dir' column and get the first folder
        try:
            input_dir_column = in_data.domain["input_dir"]
            self.input_dir = str(in_data[0, input_dir_column].value)
        except (KeyError, IndexError, AttributeError):
            self.warning('"input_dir" (Text) is required in the input data.')
            self.Outputs.md_files.send(self._empty_md_table())
            self.Description.setText("Error: 'input_dir' (Text) column not found or is empty.")
            return

        self.Description.setText(f"Dossier : {self.input_dir}")
        self._produce()

    def _empty_md_table(self) -> Table:
        """Return a zero-row table with the (file_path, content) metas."""
        domain = Domain([], metas=[StringVariable("file_path"), StringVariable("content")])
        X = np.empty((0, 0))
        metas = np.empty((0, 2), dtype=object)
        return Table.from_numpy(domain, X, metas=metas)

    def _produce(self):
        """Read the Markdown files under ``input_dir`` and send them as a table."""
        if not self.input_dir or not os.path.isdir(self.input_dir):
            self.warning(f"Invalid directory path: '{self.input_dir}'")
            self.Outputs.md_files.send(self._empty_md_table())
            return

        # The directory is valid: drop any earlier "invalid directory" warning.
        self.warning("")

        base = Path(self.input_dir)
        patterns = ["*.md"]
        paths = []

        for patt in patterns:
            if self.recursive:
                paths.extend(base.rglob(patt))
            else:
                paths.extend(base.glob(patt))

        md_rows = []
        for p in sorted(set(paths)):
            try:
                md_rows.append([str(p), utils_md.try_read_text(p)])
            except Exception:
                # Keep the file listed even when its content cannot be read.
                md_rows.append([str(p), ""])

        domain = Domain([], metas=[StringVariable("file_path"), StringVariable("content")])
        X = np.empty((len(md_rows), 0))
        metas = np.array(md_rows, dtype=object) if md_rows else np.empty((0, 2), dtype=object)
        md_table = Table.from_numpy(domain, X, metas=metas)

        self.Outputs.md_files.send(md_table)
|
|
@@ -522,9 +522,9 @@ class MarkdownConversionThread(QThread):
|
|
|
522
522
|
class FileProcessorApp(widget.OWWidget):
|
|
523
523
|
name = "Markdownizer"
|
|
524
524
|
description = "Convert PDFs, DOCX, PPTX to Markdown (texte seul & word only)"
|
|
525
|
-
icon = "icons/
|
|
525
|
+
icon = "icons/dep_md_old.png"
|
|
526
526
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
527
|
-
icon = "icons_dev/
|
|
527
|
+
icon = "icons_dev/dep_md_old.png"
|
|
528
528
|
want_control_area = False
|
|
529
529
|
priority = 1001
|
|
530
530
|
gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owmarkdownizer.ui")
|