io4it 2.1.0.4__tar.gz → 2.1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- io4it-2.1.0.5/PKG-INFO +7 -0
- io4it-2.1.0.5/io4it.egg-info/PKG-INFO +7 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/io4it.egg-info/SOURCES.txt +15 -2
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/ocr_function/word_converter.py +12 -12
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +1 -1
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/OWDoclingMarkdownizerSimple.py +281 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +1 -1
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +112 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +2 -2
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +183 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/OWPdfType.py +193 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +87 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/designer/owdoclingmarkdownizersimple.ui +54 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +54 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +54 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +54 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +100 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +16 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/icons/md_old.png +0 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
- io4it-2.1.0.5/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/setup.py +1 -1
- io4it-2.1.0.4/PKG-INFO +0 -33
- io4it-2.1.0.4/io4it.egg-info/PKG-INFO +0 -33
- io4it-2.1.0.4/orangecontrib/IO4IT/widgets/designer/chart.html +0 -281
- io4it-2.1.0.4/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -56
- {io4it-2.1.0.4 → io4it-2.1.0.5}/io4it.egg-info/dependency_links.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/io4it.egg-info/entry_points.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/io4it.egg-info/namespace_packages.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/io4it.egg-info/requires.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/io4it.egg-info/top_level.txt +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/utils/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/utils/mail.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/utils/offuscation_basique.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWChatGpt.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/orangecontrib/__init__.py +0 -0
- {io4it-2.1.0.4 → io4it-2.1.0.5}/setup.cfg +0 -0
io4it-2.1.0.5/PKG-INFO
ADDED
|
@@ -17,8 +17,13 @@ orangecontrib/IO4IT/utils/pool_exec_utils.py
|
|
|
17
17
|
orangecontrib/IO4IT/utils/utils_md.py
|
|
18
18
|
orangecontrib/IO4IT/widgets/OWChatGpt.py
|
|
19
19
|
orangecontrib/IO4IT/widgets/OWDeep_Search.py
|
|
20
|
+
orangecontrib/IO4IT/widgets/OWDoclingMarkdownizerSimple.py
|
|
20
21
|
orangecontrib/IO4IT/widgets/OWExportMarkdown.py
|
|
22
|
+
orangecontrib/IO4IT/widgets/OWMarkdownLoader.py
|
|
21
23
|
orangecontrib/IO4IT/widgets/OWMarkdownizer.py
|
|
24
|
+
orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py
|
|
25
|
+
orangecontrib/IO4IT/widgets/OWPdfType.py
|
|
26
|
+
orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py
|
|
22
27
|
orangecontrib/IO4IT/widgets/OWS3Uploader.py
|
|
23
28
|
orangecontrib/IO4IT/widgets/OWS3downloader.py
|
|
24
29
|
orangecontrib/IO4IT/widgets/OWS3list.py
|
|
@@ -28,28 +33,36 @@ orangecontrib/IO4IT/widgets/OWmailSender.py
|
|
|
28
33
|
orangecontrib/IO4IT/widgets/OWwordpdf2docx.py
|
|
29
34
|
orangecontrib/IO4IT/widgets/__init__.py
|
|
30
35
|
orangecontrib/IO4IT/widgets/designer/__init__.py
|
|
31
|
-
orangecontrib/IO4IT/widgets/designer/chart.html
|
|
32
36
|
orangecontrib/IO4IT/widgets/designer/nogui.ui
|
|
33
37
|
orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui
|
|
34
38
|
orangecontrib/IO4IT/widgets/designer/owchatgpt.ui
|
|
35
39
|
orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui
|
|
40
|
+
orangecontrib/IO4IT/widgets/designer/owdoclingmarkdownizersimple.ui
|
|
36
41
|
orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui
|
|
37
42
|
orangecontrib/IO4IT/widgets/designer/owmailloader.ui
|
|
38
43
|
orangecontrib/IO4IT/widgets/designer/owmailsender.ui
|
|
39
44
|
orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui
|
|
45
|
+
orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui
|
|
46
|
+
orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui
|
|
47
|
+
orangecontrib/IO4IT/widgets/designer/owpdftype.ui
|
|
48
|
+
orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui
|
|
40
49
|
orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui
|
|
41
|
-
orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui
|
|
42
50
|
orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui
|
|
43
51
|
orangecontrib/IO4IT/widgets/icons/__init__.py
|
|
44
52
|
orangecontrib/IO4IT/widgets/icons/chatgpt.png
|
|
53
|
+
orangecontrib/IO4IT/widgets/icons/check_pdf.png
|
|
45
54
|
orangecontrib/IO4IT/widgets/icons/deepsearch.svg
|
|
46
55
|
orangecontrib/IO4IT/widgets/icons/download.png
|
|
47
56
|
orangecontrib/IO4IT/widgets/icons/export_md.png
|
|
48
57
|
orangecontrib/IO4IT/widgets/icons/file_extensor.png
|
|
49
58
|
orangecontrib/IO4IT/widgets/icons/list_aws.png
|
|
59
|
+
orangecontrib/IO4IT/widgets/icons/load_md.png
|
|
50
60
|
orangecontrib/IO4IT/widgets/icons/mail_loader.png
|
|
51
61
|
orangecontrib/IO4IT/widgets/icons/mail_writer.png
|
|
52
62
|
orangecontrib/IO4IT/widgets/icons/md.png
|
|
63
|
+
orangecontrib/IO4IT/widgets/icons/md_old.png
|
|
64
|
+
orangecontrib/IO4IT/widgets/icons/office_normalizer.png
|
|
65
|
+
orangecontrib/IO4IT/widgets/icons/process_pool_executor.png
|
|
53
66
|
orangecontrib/IO4IT/widgets/icons/speech_to_text.png
|
|
54
67
|
orangecontrib/IO4IT/widgets/icons/upload.png
|
|
55
68
|
orangecontrib/IO4IT/widgets/icons/visualizationer.png
|
|
@@ -16,7 +16,7 @@ def enable_long_path(path):
|
|
|
16
16
|
"""Simplifie la gestion des chemins longs sous Windows."""
|
|
17
17
|
return pathlib.Path(r"\\?\\" + str(path))
|
|
18
18
|
|
|
19
|
-
def convert_pdf_structure(input_dir: str, output_dir: str,ignore_exsting_out_put=False,progress_callback=None):
|
|
19
|
+
def convert_pdf_structure(input_dir: str, output_dir: str,ignore_exsting_out_put=False,forceBasicConvertion=False,progress_callback=None):
|
|
20
20
|
"""
|
|
21
21
|
return a string with log in case of error
|
|
22
22
|
Recursively lists all .pdf and .PDF files in the input directory,
|
|
@@ -60,7 +60,7 @@ def convert_pdf_structure(input_dir: str, output_dir: str,ignore_exsting_out_put
|
|
|
60
60
|
continue
|
|
61
61
|
|
|
62
62
|
|
|
63
|
-
if 0!= convert_pdf_with_temp(str(pdf_file),str(new_file_path)):#convert_pdf_with_temp #convert_pdf_to_docx
|
|
63
|
+
if 0!= convert_pdf_with_temp(str(pdf_file),str(new_file_path),forceBasicConvertion):#convert_pdf_with_temp #convert_pdf_to_docx
|
|
64
64
|
if error_log!="":
|
|
65
65
|
error_log+="\n"
|
|
66
66
|
error_log+="error -> "+str(pdf_file)
|
|
@@ -259,7 +259,7 @@ def write_two_strings_to_file(file_path: str,string1: str, string2: str):
|
|
|
259
259
|
return 0
|
|
260
260
|
|
|
261
261
|
|
|
262
|
-
def convert_pdf_with_temp(temp_pdf, output_path):
|
|
262
|
+
def convert_pdf_with_temp(temp_pdf, output_path,forceBasicConvertion=False):
|
|
263
263
|
"""
|
|
264
264
|
Copie le PDF source dans un dossier temporaire, le convertit en DOCX,
|
|
265
265
|
puis copie le fichier résultant vers le chemin de sortie spécifié,
|
|
@@ -294,15 +294,15 @@ def convert_pdf_with_temp(temp_pdf, output_path):
|
|
|
294
294
|
# Copie du fichier source vers le dossier temporaire
|
|
295
295
|
shutil.copy2(pdf_path, temp_pdf)
|
|
296
296
|
wait_for_file_access(temp_pdf)
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
297
|
+
if forceBasicConvertion==False:
|
|
298
|
+
if is_pdf_a4(temp_pdf)==False:
|
|
299
|
+
temp_pdf2 = os.path.join(dest_dir, "input_totoA4.pdf")
|
|
300
|
+
if 0!=convert_pdf_to_a4(temp_pdf,temp_pdf2):
|
|
301
|
+
print("erreur au resize du pdf")
|
|
302
|
+
return 1
|
|
303
|
+
temp_pdf=temp_pdf2
|
|
304
|
+
wait_for_file_access(temp_pdf)
|
|
305
|
+
time.sleep(1)
|
|
306
306
|
result=0
|
|
307
307
|
# Conversion du PDF en DOCX
|
|
308
308
|
for _ in range(4):
|
|
@@ -30,7 +30,7 @@ class OWDeep_Search(widget.OWWidget):
|
|
|
30
30
|
category = "AAIT - LLM INTEGRATION"
|
|
31
31
|
icon = "icons/deepsearch.svg"
|
|
32
32
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
33
|
-
icon = "icons_dev/
|
|
33
|
+
icon = "icons_dev/deepsearch.svg"
|
|
34
34
|
gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owdeepsearch.ui")
|
|
35
35
|
want_control_area = True
|
|
36
36
|
priority = 1089
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
import os, time, datetime
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from concurrent.futures import as_completed
|
|
4
|
+
|
|
5
|
+
from AnyQt.QtWidgets import QLabel, QTextEdit
|
|
6
|
+
from AnyQt.QtCore import pyqtSignal
|
|
7
|
+
from Orange.widgets import widget
|
|
8
|
+
from Orange.widgets.utils.signals import Input, Output
|
|
9
|
+
from Orange.data import Domain, StringVariable, Table, DiscreteVariable
|
|
10
|
+
|
|
11
|
+
# --- Docling (unique lib utilisée pour la conversion) ---
|
|
12
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
13
|
+
from docling.datamodel.base_models import InputFormat
|
|
14
|
+
from docling.document_converter import (
|
|
15
|
+
DocumentConverter,
|
|
16
|
+
PdfFormatOption,
|
|
17
|
+
WordFormatOption,
|
|
18
|
+
)
|
|
19
|
+
from docling.pipeline.simple_pipeline import SimplePipeline
|
|
20
|
+
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|
21
|
+
|
|
22
|
+
# --- Orange contrib Imports ---
|
|
23
|
+
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
24
|
+
from Orange.widgets.orangecontrib.AAIT.utils.thread_management import Thread
|
|
25
|
+
from Orange.widgets.orangecontrib.AAIT.utils.import_uic import uic
|
|
26
|
+
else:
|
|
27
|
+
from orangecontrib.AAIT.utils.thread_management import Thread
|
|
28
|
+
from orangecontrib.AAIT.utils.import_uic import uic
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# --------- worker stateless : convertit 1 fichier avec Docling ----------
|
|
32
|
+
def _convert_one_file(file_path_str: str):
|
|
33
|
+
"""Convertit un fichier (PDF/DOCX/PPTX) en Markdown via Docling.
|
|
34
|
+
Écrit <parent>/a_md/<stem>.md et renvoie [input_path, output_md, status, duration_sec, message].
|
|
35
|
+
Pensé pour être appelé soit directement, soit via ProcessPoolExecutor.
|
|
36
|
+
"""
|
|
37
|
+
t0 = time.time()
|
|
38
|
+
src = Path(file_path_str)
|
|
39
|
+
out_dir = src.parent / "conversion_markdown"
|
|
40
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
41
|
+
out_md = out_dir / f"{src.stem}.md"
|
|
42
|
+
|
|
43
|
+
# Si déjà converti : on ne refait pas
|
|
44
|
+
if out_md.exists():
|
|
45
|
+
status = "ok"
|
|
46
|
+
message = "existant: deja converti"
|
|
47
|
+
duration = time.time() - t0
|
|
48
|
+
return [str(src), str(out_md), status, f"{duration:.2f}", message]
|
|
49
|
+
|
|
50
|
+
try:
|
|
51
|
+
# Docling minimal config (inspiré du snippet)
|
|
52
|
+
doc_converter = DocumentConverter(
|
|
53
|
+
allowed_formats=[InputFormat.PDF, InputFormat.DOCX, InputFormat.PPTX],
|
|
54
|
+
format_options={
|
|
55
|
+
InputFormat.PDF: PdfFormatOption(
|
|
56
|
+
pipeline_cls=StandardPdfPipeline,
|
|
57
|
+
backend=PyPdfiumDocumentBackend
|
|
58
|
+
),
|
|
59
|
+
InputFormat.DOCX: WordFormatOption(
|
|
60
|
+
pipeline_cls=SimplePipeline
|
|
61
|
+
),
|
|
62
|
+
# PPTX: pas d'option spécifique; géré par défaut
|
|
63
|
+
},
|
|
64
|
+
)
|
|
65
|
+
doc = doc_converter.convert(str(src)).document
|
|
66
|
+
md = doc.export_to_markdown()
|
|
67
|
+
out_md.write_text(md, encoding="utf-8")
|
|
68
|
+
status, message = "ok", ""
|
|
69
|
+
except Exception as e:
|
|
70
|
+
status = "nok"
|
|
71
|
+
message = f"{type(e).__name__}: {e}"
|
|
72
|
+
# on écrit quand même un trace .md
|
|
73
|
+
try:
|
|
74
|
+
out_md.write_text(f"[Erreur conversion] {message}", encoding="utf-8")
|
|
75
|
+
except Exception:
|
|
76
|
+
pass
|
|
77
|
+
|
|
78
|
+
duration = time.time() - t0
|
|
79
|
+
return [str(src), str(out_md), status, f"{duration:.2f}", message]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class OWDoclingMarkdownizerSimple(widget.OWWidget):
    """Orange widget converting DOCX/PPTX/PDF files to Markdown via Docling.

    Reads the ``file_path`` column of the input table, converts every
    supported file with ``_convert_one_file`` in a background thread, and
    publishes a result table plus a status table that is refreshed after
    each file. When an executor object is connected on the second input,
    the files are submitted to it instead of being converted one by one.
    """

    name = "Docling To Markdown"
    description = "Convert DOCX/PPTX/PDF to Markdown via Docling"
    icon = "icons/md.png"
    if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
        icon = "icons_dev/md.png"
    gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owdoclingmarkdownizersimple.ui")
    want_main_area = False
    want_control_area = True
    priority = 1004

    # Signal carrying one [input_path, status, message] update from the
    # worker thread to the widget.
    status_update_signal = pyqtSignal(list)

    class Inputs:
        data = Input("Files Table", Table)
        executor = Input("ProcessPoolExecutor", object)

    class Outputs:
        data = Output("Markdown Table", Table)
        status_data = Output("Status Table", Table)

    def __init__(self):
        super().__init__()
        self.data = None
        self.external_executor = None
        self.thread = None
        self.result = None
        self.exec_info = QLabel("Exécution: séquentielle (aucun executor connecté).", self)
        # Maps input path -> [status, message] for every file of the current run.
        self.processed_statuses = {}

        uic.loadUi(self.gui, self)

        # Connect once here. Connecting on every run would stack duplicate
        # connections, so each emit would call the slot several times.
        self.status_update_signal.connect(self.handle_status_update)

        self.error("")
        self.warning("")

    @Inputs.data
    def set_data(self, in_data: Table | None):
        """Store the input table, validate it and start the conversion."""
        self.data = in_data
        self.error("")
        self.warning("")

        if not in_data:
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        try:
            _ = in_data.domain["file_path"]
        except Exception:
            self.error('Colonne "file_path" (Text) requise.')
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        self._convert_now()

    @Inputs.executor
    def set_executor(self, executor_obj):
        """Remember the external executor (or None) and update the info label."""
        self.external_executor = executor_obj
        if executor_obj is not None:
            self.exec_info.setText("Exécution: via ProcessPoolExecutor externe (parallèle).")
        else:
            self.exec_info.setText("Exécution: séquentielle (aucun executor connecté).")

    def _convert_now(self):
        """Collect the supported files and launch the conversion thread."""
        if self.thread is not None and self.thread.isRunning():
            self.thread.safe_quit()

        if not self.data:
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            return

        # Start progress bar
        self.progressBarInit()

        # Read the paths, then keep only existing files with a supported extension.
        try:
            paths = [Path(str(x)) for x in self.data.get_column("file_path")]
        except Exception as e:
            self.error(f"Lecture de 'file_path' impossible: {e}")
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            # The progress bar was started above; close it on this exit path too.
            self.progressBarFinished()
            return

        files = [p for p in paths if p.exists() and p.suffix.lower() in (".pdf", ".docx", ".pptx")]
        if not files:
            self.Outputs.data.send(None)
            self.Outputs.status_data.send(None)
            self.progressBarFinished()
            return

        # Every file starts as "pending"; publish that initial state right away.
        self.processed_statuses = {str(p): ["pending", ""] for p in files}
        self.update_status_output()

        # Run the conversion in a background thread and wire its signals.
        self.thread = Thread(self._run_conversion, files)
        self.thread.progress.connect(self.handle_progress)
        self.thread.result.connect(self.handle_result)
        self.thread.finish.connect(self.handle_finish)
        self.thread.start()

    def update_status_output(self):
        """Build the status table from ``processed_statuses`` and send it."""
        status_domain = Domain(
            [],  # No attributes: every column is a meta.
            metas=[
                StringVariable("input_path"),
                DiscreteVariable("status", values=["pending", "in_progress", "ok", "nok"]),
                StringVariable("message"),
            ],
        )

        # One flat row per file, in the order of the domain's metas.
        status_rows = [
            [path_str, status, message]
            for path_str, (status, message) in self.processed_statuses.items()
        ]

        status_table = Table.from_list(status_domain, status_rows)
        self.Outputs.status_data.send(status_table)

    def _run_conversion(self, files, progress_callback):
        """Convert ``files`` and return the result table.

        Runs sequentially when no executor is connected, otherwise submits
        every file to the external executor. ``progress_callback`` receives
        a percentage after each processed file.
        """
        results = []

        if self.external_executor is None:
            # --- Sequential mode ---
            for i, p in enumerate(files):
                path_str = str(p)
                self.status_update_signal.emit([path_str, "in_progress", ""])

                row = _convert_one_file(path_str)
                results.append(row)

                # Publish the outcome of this file immediately.
                self.status_update_signal.emit([row[0], row[2], row[4]])

                progress_callback((i + 1) / len(files) * 100)
        else:
            # --- Parallel mode through the external executor ---
            fut_map = {self.external_executor.submit(_convert_one_file, str(p)): str(p) for p in files}

            for i, fut in enumerate(as_completed(fut_map), 1):
                file_path_str = fut_map[fut]
                # NOTE(review): the future is already done at this point, so
                # this "in_progress" update is immediately followed by the
                # final one; kept to preserve the emitted status sequence.
                self.status_update_signal.emit([file_path_str, "in_progress", ""])

                try:
                    row = fut.result()
                    results.append(row)
                    self.status_update_signal.emit([row[0], row[2], row[4]])
                except Exception as e:
                    # The worker itself failed (not a conversion error):
                    # build a "nok" row pointing at the path the worker
                    # would have written.
                    src = Path(file_path_str)
                    expected_md = src.parent / "conversion_markdown" / f"{src.stem}.md"
                    row = [file_path_str, str(expected_md),
                           "nok", "0.00", f"FutureError: {e}"]
                    results.append(row)
                    self.status_update_signal.emit([row[0], "nok", f"FutureError: {e}"])

                progress_callback(i / len(files) * 100)

        # The final result is the table built from all rows.
        domain = Domain([], metas=[
            StringVariable("input_path"),
            StringVariable("output_md"),
            StringVariable("status"),
            StringVariable("duration_sec"),
            StringVariable("message"),
        ])
        final_table = Table.from_list(domain, results)
        return final_table

    def handle_progress(self, value: float) -> None:
        """Forward the thread's progress percentage to the progress bar."""
        self.progressBarSet(value)

    def handle_status_update(self, status_info: list):
        """Record one ``[path, status, message]`` update and resend the status table."""
        path_str, status, message = status_info
        self.processed_statuses[path_str] = [status, message]
        self.update_status_output()

    def handle_result(self, result: Table):
        """Store the final table and send it on the data output."""
        try:
            self.result = result
            self.Outputs.data.send(result)
        except Exception as e:
            print("An error occurred when sending out_data:", e)
            self.Outputs.data.send(None)

    def handle_finish(self):
        """Close the progress bar when the thread finishes."""
        self.progressBarFinished()
|
|
@@ -29,7 +29,7 @@ class OWExportMarkdown(widget.OWWidget):
|
|
|
29
29
|
icon = "icons/export_md.png"
|
|
30
30
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
31
31
|
icon = "icons_dev/export_md.png"
|
|
32
|
-
want_control_area =
|
|
32
|
+
want_control_area = False
|
|
33
33
|
priority = 9999
|
|
34
34
|
category = "AAIT - TOOLBOX"
|
|
35
35
|
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# ow_markdown_loader.py
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from AnyQt.QtWidgets import QLabel, QCheckBox
|
|
7
|
+
from Orange.widgets import widget
|
|
8
|
+
from Orange.widgets.utils.signals import Input, Output
|
|
9
|
+
from Orange.data import Domain, StringVariable, Table
|
|
10
|
+
|
|
11
|
+
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
12
|
+
from Orange.widgets.orangecontrib.IO4IT.utils import utils_md
|
|
13
|
+
else:
|
|
14
|
+
from orangecontrib.IO4IT.utils import utils_md
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class OWMarkdownLoader(widget.OWWidget):
    """Orange widget loading every Markdown file found in a folder.

    The folder is the first value of the ``input_dir`` column of the input
    table. The widget outputs a table of ``(file_path, content)`` rows and
    passes the input table through unchanged on a second output.
    """

    name = "Markdown Loader"
    description = "Charge tous les fichiers Markdown d’un dossier (récursif)"
    icon = "icons/load_md.png"
    if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
        icon = "icons_dev/load_md.png"
    gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owmarkdownloader.ui")
    want_control_area = False
    priority = 1001

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        md_files = Output("Markdown Files", Table)  # -> (file_path, content)
        data = Output("Data", Table)  # passthrough of the input

    def __init__(self):
        super().__init__()
        self.in_data = None
        self.input_dir = None
        self.recursive = True

        # Minimal UI: a label showing the folder and a "recursive" checkbox.
        self.label = QLabel(self)
        self.checkbox = QCheckBox("Recherche récursive", self)
        self.checkbox.setChecked(True)
        self.checkbox.stateChanged.connect(self._on_recursive_toggled)

        self.layout().addWidget(self.label)
        self.layout().addWidget(self.checkbox)
        self.warning("")

    def _on_recursive_toggled(self, _state):
        """Update the recursive flag and reload if a folder is already set."""
        self.recursive = self.checkbox.isChecked()
        if self.input_dir:
            self._produce()

    def _forget_input_dir(self):
        """Drop the remembered folder so a later toggle cannot reload stale data."""
        self.input_dir = None
        self.label.setText("")

    @Inputs.data
    def set_data(self, in_data: Table | None):
        """Receive the input table, forward it and load the Markdown files."""
        self.in_data = in_data
        self.warning("")

        # Always emit the passthrough (even when it is None).
        self.Outputs.data.send(in_data)

        if not in_data:
            # Nothing to load: emit an empty table on "Markdown Files".
            self._forget_input_dir()
            self.Outputs.md_files.send(self._empty_md_table())
            return

        # Look up the 'input_dir' column and take the first folder.
        try:
            _ = in_data.domain["input_dir"]
            self.input_dir = str(in_data.get_column("input_dir")[0])
        except Exception:
            self._forget_input_dir()
            self.warning('"input_dir" (Text) est requis en entrée')
            self.Outputs.md_files.send(self._empty_md_table())
            return

        self.label.setText(f"Dossier : {self.input_dir}")
        self._produce()

    def _empty_md_table(self) -> Table:
        """Return a zero-row table with the ``file_path``/``content`` metas."""
        domain = Domain([], metas=[StringVariable("file_path"), StringVariable("content")])
        X = np.empty((0, 0))
        metas = np.empty((0, 2), dtype=object)
        return Table.from_numpy(domain, X, metas=metas)

    def _produce(self):
        """Scan ``input_dir`` for ``*.md`` files and send them as a table."""
        base = Path(self.input_dir)
        patterns = ["*.md"]
        paths = []

        for patt in patterns:
            if self.recursive:
                paths.extend(base.rglob(patt))
            else:
                paths.extend(base.glob(patt))

        md_rows = []
        for p in sorted(set(paths)):
            try:
                md_rows.append([str(p), utils_md.try_read_text(p)])
            except Exception:
                # Best effort: an unreadable file is kept with empty content.
                md_rows.append([str(p), ""])

        # Build the table for "Markdown Files".
        domain = Domain([], metas=[StringVariable("file_path"), StringVariable("content")])
        X = np.empty((len(md_rows), 0))
        metas = np.array(md_rows, dtype=object) if md_rows else np.empty((0, 2), dtype=object)
        md_table = Table.from_numpy(domain, X, metas=metas)

        self.Outputs.md_files.send(md_table)
        # The passthrough was already sent in set_data; it is not resent here.
|
|
@@ -522,9 +522,9 @@ class MarkdownConversionThread(QThread):
|
|
|
522
522
|
class FileProcessorApp(widget.OWWidget):
|
|
523
523
|
name = "Markdownizer"
|
|
524
524
|
description = "Convert PDFs, DOCX, PPTX to Markdown (texte seul & word only)"
|
|
525
|
-
icon = "icons/
|
|
525
|
+
icon = "icons/md_old.png"
|
|
526
526
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
527
|
-
icon = "icons_dev/
|
|
527
|
+
icon = "icons_dev/md_old.png"
|
|
528
528
|
want_control_area = False
|
|
529
529
|
priority = 1001
|
|
530
530
|
gui = os.path.join(os.path.dirname(os.path.abspath(__file__)), "designer/owmarkdownizer.ui")
|