io4it 2.1.2.1__tar.gz → 2.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- io4it-2.1.3/PKG-INFO +7 -0
- io4it-2.1.3/io4it.egg-info/PKG-INFO +7 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/io4it.egg-info/SOURCES.txt +0 -1
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWChatGpt.py +45 -44
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWMarkdownizer.py +1 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWOfficeNormalizer.py +46 -103
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWPdfType.py +35 -23
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owmailsender.ui +31 -96
- io4it-2.1.3/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +86 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/setup.py +1 -1
- io4it-2.1.2.1/PKG-INFO +0 -30
- io4it-2.1.2.1/io4it.egg-info/PKG-INFO +0 -30
- io4it-2.1.2.1/orangecontrib/IO4IT/widgets/designer/OWmailSender.py +0 -155
- io4it-2.1.2.1/orangecontrib/IO4IT/widgets/designer/owofficenormalizer.ui +0 -124
- {io4it-2.1.2.1 → io4it-2.1.3}/io4it.egg-info/dependency_links.txt +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/io4it.egg-info/entry_points.txt +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/io4it.egg-info/namespace_packages.txt +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/io4it.egg-info/requires.txt +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/io4it.egg-info/top_level.txt +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/ocr_function/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/ocr_function/word_converter.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/utils/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/utils/mail.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/utils/offuscation_basique.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/utils/pool_exec_utils.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/utils/utils_md.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWDeep_Search.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWDoclingMarkdownizerSimple.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWExportMarkdown.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWExtractTablesDocxToCSV.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWInboxMailMonitoring.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWMarkdownLoader.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWProcessPoolExecutor.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWS3Uploader.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWS3downloader.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWS3list.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWSpeechToText.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWmailLoader.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWmailSender.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/OWwordpdf2docx.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/nogui.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owchatgpt.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owdeepsearch.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owdoclingasr.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owdoclingmarkdownizersimple.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owdocxtocsv.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owexportmarkdown.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owinboxmailmonitoring.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owmailloader.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owmarkdownizer.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owmarkdownloader.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owpdftype.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owprocesspoolexecutor.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owspeechtotext.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/owvisualizationer.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/designer/wordpdf2docx.ui +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/chatgpt.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/check_pdf.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/deepsearch.svg +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/dep_md_old.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/download.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/export_md.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/extract_table.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/file_extensor.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/list_aws.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/load_md.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/mail_loader.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/mail_writer.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/md.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/monitor-email.svg +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/office_normalizer.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/process_pool_executor.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/speech_to_text.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/upload.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/visualizationer.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons/wordpdf2docx.png +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/IO4IT/widgets/icons_dev/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/orangecontrib/__init__.py +0 -0
- {io4it-2.1.2.1 → io4it-2.1.3}/setup.cfg +0 -0
io4it-2.1.3/PKG-INFO
ADDED
|
@@ -34,7 +34,6 @@ orangecontrib/IO4IT/widgets/OWmailLoader.py
|
|
|
34
34
|
orangecontrib/IO4IT/widgets/OWmailSender.py
|
|
35
35
|
orangecontrib/IO4IT/widgets/OWwordpdf2docx.py
|
|
36
36
|
orangecontrib/IO4IT/widgets/__init__.py
|
|
37
|
-
orangecontrib/IO4IT/widgets/designer/OWmailSender.py
|
|
38
37
|
orangecontrib/IO4IT/widgets/designer/__init__.py
|
|
39
38
|
orangecontrib/IO4IT/widgets/designer/nogui.ui
|
|
40
39
|
orangecontrib/IO4IT/widgets/designer/ow_file_ext_selector.ui
|
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
import sys
|
|
3
3
|
import base64
|
|
4
4
|
import ast
|
|
5
|
-
|
|
5
|
+
import openai
|
|
6
6
|
import Orange
|
|
7
7
|
from Orange.data import StringVariable
|
|
8
8
|
from Orange.widgets.widget import OWWidget, Input, Output
|
|
@@ -98,53 +98,30 @@ class ChatGpt(OWWidget):
|
|
|
98
98
|
|
|
99
99
|
def generate_answers(self):
|
|
100
100
|
try:
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
filename = os.path.basename(img_path)
|
|
119
|
-
user_content.append({"type": "input_text", "text": f"Photo : {filename}"})
|
|
120
|
-
|
|
121
|
-
with open(img_path, "rb") as f:
|
|
122
|
-
b64_img = base64.b64encode(f.read()).decode("utf-8")
|
|
123
|
-
|
|
124
|
-
mime = "image/png" if filename.lower().endswith(".png") else "image/jpeg"
|
|
125
|
-
user_content.append({
|
|
126
|
-
"type": "input_image",
|
|
127
|
-
"image_url": f"data:{mime};base64,{b64_img}",
|
|
128
|
-
})
|
|
129
|
-
response = client.responses.create(
|
|
130
|
-
model=self.model,
|
|
131
|
-
input=[
|
|
132
|
-
{"role": "system", "content": system_content},
|
|
133
|
-
{"role": "user", "content": user_content},
|
|
134
|
-
],
|
|
135
|
-
max_output_tokens=self.max_tokens,
|
|
136
|
-
# temperature=self.temperature,
|
|
137
|
-
)
|
|
138
|
-
self.text_response = response.output_text
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
if self.text_response is None:
|
|
142
|
-
self.error("No response from model.")
|
|
143
|
-
|
|
101
|
+
openai.api_key = self.api_keys
|
|
102
|
+
response = openai.chat.completions.create(
|
|
103
|
+
model=self.model,
|
|
104
|
+
messages=[
|
|
105
|
+
{
|
|
106
|
+
"role": "system",
|
|
107
|
+
"content": self.system_prompt
|
|
108
|
+
},
|
|
109
|
+
{
|
|
110
|
+
"role": "user",
|
|
111
|
+
"content": self.prompt
|
|
112
|
+
}
|
|
113
|
+
],
|
|
114
|
+
max_tokens=self.max_tokens,
|
|
115
|
+
temperature=self.temperature
|
|
116
|
+
)
|
|
117
|
+
self.text_response = response.choices[0].message.content
|
|
144
118
|
except Exception as e:
|
|
145
119
|
print(e)
|
|
146
120
|
self.error(f"Error: {e}")
|
|
147
121
|
return
|
|
122
|
+
if self.text_response is None:
|
|
123
|
+
self.error("No response from chatgpt.")
|
|
124
|
+
|
|
148
125
|
|
|
149
126
|
def run(self):
|
|
150
127
|
self.error("")
|
|
@@ -160,6 +137,30 @@ class ChatGpt(OWWidget):
|
|
|
160
137
|
self.error("No api keys provided.")
|
|
161
138
|
return
|
|
162
139
|
|
|
140
|
+
# si on relance la génération par le bouton le prompt est déjà rempli (déjà une liste)
|
|
141
|
+
# mais on peut changer le model, la temp ou le nombre max de token
|
|
142
|
+
if isinstance(self.prompt, list):
|
|
143
|
+
self.prompt = [{"type": "text", "text": self.prompt}]
|
|
144
|
+
if self.image_paths is not None and self.image_paths != []:
|
|
145
|
+
if type(self.image_paths) == str:
|
|
146
|
+
self.image_paths = ast.literal_eval(self.image_paths)
|
|
147
|
+
for img_path in self.image_paths:
|
|
148
|
+
filename = os.path.basename(img_path)
|
|
149
|
+
|
|
150
|
+
# Ajoute une mention de l’image avant
|
|
151
|
+
self.prompt.append({
|
|
152
|
+
"type": "text",
|
|
153
|
+
"text": f"Photo : {filename}"
|
|
154
|
+
})
|
|
155
|
+
|
|
156
|
+
with open(img_path, "rb") as f:
|
|
157
|
+
b64_img = base64.b64encode(f.read()).decode("utf-8")
|
|
158
|
+
self.prompt.append({
|
|
159
|
+
"type": "image_url",
|
|
160
|
+
"image_url": {
|
|
161
|
+
"url": f"data:image/jpeg;base64,{b64_img}"
|
|
162
|
+
}
|
|
163
|
+
})
|
|
163
164
|
self.progressBarInit()
|
|
164
165
|
self.thread = thread_management.Thread(self.generate_answers)
|
|
165
166
|
self.thread.progress.connect(self.handle_progress)
|
|
@@ -16,6 +16,7 @@ import easyocr
|
|
|
16
16
|
|
|
17
17
|
from AnyQt.QtCore import QThread, pyqtSignal
|
|
18
18
|
from AnyQt.QtWidgets import QApplication, QLabel, QSpinBox, QTextEdit, QPushButton
|
|
19
|
+
from AnyQt import uic
|
|
19
20
|
|
|
20
21
|
from Orange.widgets import widget
|
|
21
22
|
from Orange.widgets.utils.signals import Input, Output
|
|
@@ -2,18 +2,15 @@ import os
|
|
|
2
2
|
import sys
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
import shutil
|
|
5
|
-
|
|
5
|
+
|
|
6
6
|
from AnyQt.QtWidgets import QApplication
|
|
7
7
|
from Orange.widgets import widget
|
|
8
8
|
from Orange.widgets.utils.signals import Input, Output
|
|
9
9
|
from Orange.data import Domain, StringVariable, Table, DiscreteVariable
|
|
10
|
+
|
|
11
|
+
# --- Ajout pour l'écriture Excel ---
|
|
10
12
|
from openpyxl import Workbook
|
|
11
|
-
import docx
|
|
12
|
-
import aspose.words as aw
|
|
13
|
-
import multiprocessing
|
|
14
|
-
import queue
|
|
15
13
|
|
|
16
|
-
# Les imports sont adaptés pour correspondre au style de l'autre script
|
|
17
14
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
18
15
|
from Orange.widgets.orangecontrib.IO4IT.utils import utils_md
|
|
19
16
|
from Orange.widgets.orangecontrib.AAIT.utils.import_uic import uic
|
|
@@ -22,24 +19,6 @@ else:
|
|
|
22
19
|
from orangecontrib.AAIT.utils.import_uic import uic
|
|
23
20
|
|
|
24
21
|
|
|
25
|
-
def _convert_file_process(src_path: Path, dst_dir: Path, file_type: str, result_queue: multiprocessing.Queue):
|
|
26
|
-
"""
|
|
27
|
-
Fonction de conversion exécutée dans un processus séparé.
|
|
28
|
-
Place le résultat (statut, chemin, détails) dans une file d'attente.
|
|
29
|
-
"""
|
|
30
|
-
try:
|
|
31
|
-
if file_type == "doc":
|
|
32
|
-
dst = utils_md.convert_doc_to_docx(src_path, dst_dir)
|
|
33
|
-
result_queue.put(("ok", str(dst), "doc->docx"))
|
|
34
|
-
elif file_type == "ppt":
|
|
35
|
-
dst = utils_md.convert_ppt_to_pptx(src_path, dst_dir)
|
|
36
|
-
result_queue.put(("ok", str(dst), "ppt->pptx"))
|
|
37
|
-
except comtypes.COMError:
|
|
38
|
-
result_queue.put(("ko", "", "conversion failed: COM error"))
|
|
39
|
-
except Exception as e:
|
|
40
|
-
result_queue.put(("ko", "", f"conversion failed: {e}"))
|
|
41
|
-
|
|
42
|
-
|
|
43
22
|
class OWOfficeNormalizer(widget.OWWidget):
|
|
44
23
|
name = "Office Normalizer"
|
|
45
24
|
description = "Convertit .doc→.docx et .ppt→.pptx via COM (Windows + Office)"
|
|
@@ -68,11 +47,6 @@ class OWOfficeNormalizer(widget.OWWidget):
|
|
|
68
47
|
self.autorun = True
|
|
69
48
|
self.result = None
|
|
70
49
|
self.processed_statuses = []
|
|
71
|
-
|
|
72
|
-
# Connecter la case à cocher pour activer/désactiver le spinbox
|
|
73
|
-
self.checkBox_timeout.toggled.connect(self.spinBox_timeout.setEnabled)
|
|
74
|
-
self.spinBox_timeout.setEnabled(self.checkBox_timeout.isChecked())
|
|
75
|
-
|
|
76
50
|
self.post_initialized()
|
|
77
51
|
|
|
78
52
|
@Inputs.data
|
|
@@ -98,41 +72,13 @@ class OWOfficeNormalizer(widget.OWWidget):
|
|
|
98
72
|
self.processed_statuses = []
|
|
99
73
|
self.Outputs.status_data.send(None)
|
|
100
74
|
|
|
101
|
-
#
|
|
102
|
-
self.timeout_value = None
|
|
103
|
-
if self.checkBox_timeout.isChecked():
|
|
104
|
-
self.timeout_value = self.spinBox_timeout.value()
|
|
105
|
-
|
|
75
|
+
# Process files directly without a separate thread
|
|
106
76
|
result_table = self._normalize_files(self.data)
|
|
107
77
|
|
|
78
|
+
# Send the final results to the primary output
|
|
108
79
|
self.Outputs.data.send(result_table)
|
|
109
80
|
self.progressBarFinished()
|
|
110
81
|
|
|
111
|
-
def _check_file_status(self, file_path: Path):
|
|
112
|
-
"""
|
|
113
|
-
Vérifie si un fichier est accessible, non corrompu et non protégé par un mot de passe.
|
|
114
|
-
Retourne un tuple : (statut_court, détails)
|
|
115
|
-
"""
|
|
116
|
-
if not file_path.exists():
|
|
117
|
-
return "ko", "not found"
|
|
118
|
-
try:
|
|
119
|
-
with open(file_path, 'rb'):
|
|
120
|
-
pass
|
|
121
|
-
except IOError as e:
|
|
122
|
-
return "ko", f"locked or permission denied: {e}"
|
|
123
|
-
try:
|
|
124
|
-
file_info = aw.FileFormatUtil.detect_file_format(str(file_path))
|
|
125
|
-
if file_info.is_encrypted:
|
|
126
|
-
return "ko", "password protected"
|
|
127
|
-
except Exception:
|
|
128
|
-
pass
|
|
129
|
-
if file_path.suffix.lower() == ".docx":
|
|
130
|
-
try:
|
|
131
|
-
docx.Document(file_path)
|
|
132
|
-
except Exception:
|
|
133
|
-
return "ko", "corrupted"
|
|
134
|
-
return "ok", "ready"
|
|
135
|
-
|
|
136
82
|
def _normalize_files(self, in_data: Table) -> Table:
|
|
137
83
|
rows = []
|
|
138
84
|
file_paths = [str(x) for x in in_data.get_column("file_path")]
|
|
@@ -147,6 +93,7 @@ class OWOfficeNormalizer(widget.OWWidget):
|
|
|
147
93
|
output_base_dir = common_path / "office_normalisation"
|
|
148
94
|
output_base_dir.mkdir(parents=True, exist_ok=True)
|
|
149
95
|
|
|
96
|
+
# Gère le nom du fichier Excel avec incrémentation
|
|
150
97
|
base_name = "normalization_results"
|
|
151
98
|
excel_path = output_base_dir / f"{base_name}.xlsx"
|
|
152
99
|
counter = 1
|
|
@@ -154,6 +101,7 @@ class OWOfficeNormalizer(widget.OWWidget):
|
|
|
154
101
|
excel_path = output_base_dir / f"{base_name}_{counter}.xlsx"
|
|
155
102
|
counter += 1
|
|
156
103
|
|
|
104
|
+
# Initialise le classeur Excel
|
|
157
105
|
wb = Workbook()
|
|
158
106
|
ws = wb.active
|
|
159
107
|
ws.title = "Normalization Results"
|
|
@@ -162,72 +110,67 @@ class OWOfficeNormalizer(widget.OWWidget):
|
|
|
162
110
|
|
|
163
111
|
for i, path_str in enumerate(file_paths):
|
|
164
112
|
self.progressBarSet(i / total_files * 100)
|
|
113
|
+
|
|
165
114
|
src = Path(path_str)
|
|
166
115
|
dst_path = ""
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
116
|
+
status_text = ""
|
|
117
|
+
status_short = ""
|
|
118
|
+
details = ""
|
|
119
|
+
|
|
120
|
+
if not src.exists():
|
|
121
|
+
status_short = "ko"
|
|
122
|
+
details = "not found"
|
|
123
|
+
status_text = f"ko: {details}"
|
|
124
|
+
else:
|
|
170
125
|
try:
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
dst = dst_dir / src.name
|
|
175
|
-
shutil.copy(src, dst)
|
|
176
|
-
dst_path = str(dst)
|
|
177
|
-
details = "docx - unchanged"
|
|
178
|
-
|
|
179
|
-
elif src.suffix.lower() in [".doc", ".ppt"]:
|
|
180
|
-
dst_dir = output_base_dir / src.parent.relative_to(common_path)
|
|
181
|
-
dst_dir.mkdir(parents=True, exist_ok=True)
|
|
182
|
-
|
|
183
|
-
result_queue = multiprocessing.Queue()
|
|
184
|
-
p = multiprocessing.Process(
|
|
185
|
-
target=_convert_file_process,
|
|
186
|
-
args=(src, dst_dir, src.suffix.lower().lstrip("."), result_queue)
|
|
187
|
-
)
|
|
188
|
-
p.start()
|
|
189
|
-
|
|
190
|
-
try:
|
|
191
|
-
# Utilisation de la valeur de timeout sélectionnée
|
|
192
|
-
p.join(timeout=self.timeout_value)
|
|
193
|
-
|
|
194
|
-
if p.is_alive():
|
|
195
|
-
p.terminate()
|
|
196
|
-
status_short = "ko"
|
|
197
|
-
details = "conversion timed out"
|
|
198
|
-
else:
|
|
199
|
-
status_short, dst_path, details = result_queue.get(timeout=1)
|
|
200
|
-
except queue.Empty:
|
|
201
|
-
status_short = "ko"
|
|
202
|
-
details = "conversion process failed silently"
|
|
203
|
-
except Exception as e:
|
|
204
|
-
status_short = "ko"
|
|
205
|
-
details = f"conversion failed: {e}"
|
|
126
|
+
relative_path_from_common = src.parent.relative_to(common_path)
|
|
127
|
+
dst_dir = output_base_dir / relative_path_from_common
|
|
128
|
+
dst_dir.mkdir(parents=True, exist_ok=True)
|
|
206
129
|
|
|
130
|
+
if src.suffix.lower() == ".doc":
|
|
131
|
+
dst = utils_md.convert_doc_to_docx(src, dst_dir)
|
|
132
|
+
dst_path = str(dst)
|
|
133
|
+
status_short = "ok"
|
|
134
|
+
details = "doc->docx"
|
|
135
|
+
status_text = f"ok: {details}"
|
|
136
|
+
elif src.suffix.lower() == ".ppt":
|
|
137
|
+
dst = utils_md.convert_ppt_to_pptx(src, dst_dir)
|
|
138
|
+
dst_path = str(dst)
|
|
139
|
+
status_short = "ok"
|
|
140
|
+
details = "ppt->pptx"
|
|
141
|
+
status_text = f"ok: {details}"
|
|
207
142
|
else:
|
|
208
|
-
dst_dir = output_base_dir / src.parent.relative_to(common_path)
|
|
209
|
-
dst_dir.mkdir(parents=True, exist_ok=True)
|
|
210
143
|
dst = dst_dir / src.name
|
|
211
144
|
if not dst.exists():
|
|
212
145
|
shutil.copy(src, dst)
|
|
213
146
|
dst_path = str(dst)
|
|
147
|
+
status_short = "ok"
|
|
214
148
|
details = "unchanged"
|
|
215
|
-
|
|
149
|
+
status_text = f"ok: {details}"
|
|
216
150
|
except Exception as e:
|
|
151
|
+
error_msg = str(e)
|
|
217
152
|
status_short = "ko"
|
|
218
|
-
details = f"error: {
|
|
153
|
+
details = f"error: {error_msg}"
|
|
154
|
+
status_text = f"ko: {details}"
|
|
219
155
|
|
|
156
|
+
# Ajoute la ligne de résultat à la table Excel et la sauvegarde
|
|
220
157
|
result_row = [path_str, dst_path, status_short, details]
|
|
221
158
|
ws.append(result_row)
|
|
222
159
|
wb.save(excel_path)
|
|
223
|
-
rows.append([path_str, dst_path, status_short])
|
|
224
160
|
|
|
161
|
+
# Append to the final results list for Orange table
|
|
162
|
+
rows.append([path_str, dst_path, status_text])
|
|
163
|
+
|
|
164
|
+
# Append to the status update list and send the incremental table
|
|
225
165
|
self.processed_statuses.append([path_str, status_short, details])
|
|
226
166
|
self._send_status_table()
|
|
227
167
|
|
|
168
|
+
# This is crucial for UI updates, including the progress bar
|
|
228
169
|
QApplication.processEvents()
|
|
229
170
|
|
|
230
171
|
self.progressBarSet(100)
|
|
172
|
+
|
|
173
|
+
# Create and return the final output table
|
|
231
174
|
domain = Domain([], metas=[
|
|
232
175
|
StringVariable("src_path"),
|
|
233
176
|
StringVariable("dst_path"),
|
|
@@ -236,6 +179,7 @@ class OWOfficeNormalizer(widget.OWWidget):
|
|
|
236
179
|
return Table.from_list(domain, rows)
|
|
237
180
|
|
|
238
181
|
def _send_status_table(self):
|
|
182
|
+
"""Sends an incremental table to the status_data output."""
|
|
239
183
|
domain = Domain([], metas=[
|
|
240
184
|
StringVariable("src_path"),
|
|
241
185
|
DiscreteVariable("status", values=["ok", "ko"]),
|
|
@@ -252,7 +196,6 @@ class OWOfficeNormalizer(widget.OWWidget):
|
|
|
252
196
|
|
|
253
197
|
|
|
254
198
|
if __name__ == "__main__":
|
|
255
|
-
multiprocessing.freeze_support()
|
|
256
199
|
app = QApplication(sys.argv)
|
|
257
200
|
my_widget = OWOfficeNormalizer()
|
|
258
201
|
my_widget.show()
|
|
@@ -10,7 +10,6 @@ from Orange.widgets.utils.signals import Input, Output
|
|
|
10
10
|
|
|
11
11
|
# --- Ajout pour l'écriture Excel ---
|
|
12
12
|
from openpyxl import Workbook
|
|
13
|
-
import pypdf
|
|
14
13
|
|
|
15
14
|
# Les imports sont adaptés pour correspondre au style de l'autre script
|
|
16
15
|
if "site-packages/Orange/widgets" in os.path.dirname(os.path.abspath(__file__)).replace("\\", "/"):
|
|
@@ -57,7 +56,7 @@ class OWPdfType(widget.OWWidget):
|
|
|
57
56
|
self.thread = None
|
|
58
57
|
self.autorun = True
|
|
59
58
|
self.result = None
|
|
60
|
-
self.processed_statuses = []
|
|
59
|
+
self.processed_statuses = [] # List to accumulate statuses
|
|
61
60
|
self.post_initialized()
|
|
62
61
|
|
|
63
62
|
@Inputs.data
|
|
@@ -74,7 +73,6 @@ class OWPdfType(widget.OWWidget):
|
|
|
74
73
|
self.Outputs.text_data.send(None)
|
|
75
74
|
self.Outputs.image_data.send(None)
|
|
76
75
|
self.Outputs.status_data.send(None)
|
|
77
|
-
QApplication.quit()
|
|
78
76
|
return
|
|
79
77
|
|
|
80
78
|
self.error("")
|
|
@@ -82,17 +80,19 @@ class OWPdfType(widget.OWWidget):
|
|
|
82
80
|
self.data.domain["file_path"]
|
|
83
81
|
except KeyError:
|
|
84
82
|
self.error("You need a 'file_path' column in input data.")
|
|
85
|
-
QApplication.quit()
|
|
86
83
|
return
|
|
87
84
|
|
|
88
85
|
if type(self.data.domain["file_path"]).__name__ != 'StringVariable':
|
|
89
86
|
self.error("'file_path' column needs to be a Text.")
|
|
90
|
-
QApplication.quit()
|
|
91
87
|
return
|
|
92
88
|
|
|
93
89
|
self.progressBarInit()
|
|
94
|
-
self.processed_statuses = []
|
|
90
|
+
self.processed_statuses = [] # Reset status list for a new run
|
|
91
|
+
|
|
92
|
+
# Connect the internal status update signal to a new handler
|
|
95
93
|
self.status_update_signal.connect(self.handle_status_update)
|
|
94
|
+
|
|
95
|
+
# Pass the status update signal's emit method to the thread
|
|
96
96
|
self.thread = Thread(self._process_pdfs, self.data, status_callback=self.status_update_signal.emit)
|
|
97
97
|
self.thread.progress.connect(self.handle_progress)
|
|
98
98
|
self.thread.result.connect(self.handle_result)
|
|
@@ -102,8 +102,10 @@ class OWPdfType(widget.OWWidget):
|
|
|
102
102
|
def _process_pdfs(self, in_data: Table, progress_callback: callable, status_callback: callable) -> tuple[
|
|
103
103
|
Table | None, Table | None]:
|
|
104
104
|
|
|
105
|
+
# Extraction des chemins de fichiers avant de commencer le traitement
|
|
105
106
|
paths = [str(x) for x in in_data.get_column("file_path")]
|
|
106
107
|
|
|
108
|
+
# --- Gérer le nom du fichier Excel avec incrémentation ---
|
|
107
109
|
excel_output_dir = Path.cwd() / "pdf_check_results"
|
|
108
110
|
if paths:
|
|
109
111
|
first_file_path = Path(paths[0])
|
|
@@ -118,6 +120,7 @@ class OWPdfType(widget.OWWidget):
|
|
|
118
120
|
excel_path = excel_output_dir / f"{base_name}_{counter}.xlsx"
|
|
119
121
|
counter += 1
|
|
120
122
|
|
|
123
|
+
# --- Initialiser le classeur Excel ---
|
|
121
124
|
wb = Workbook()
|
|
122
125
|
ws = wb.active
|
|
123
126
|
ws.title = "PDF Check Results"
|
|
@@ -130,8 +133,9 @@ class OWPdfType(widget.OWWidget):
|
|
|
130
133
|
total_files = len(paths)
|
|
131
134
|
for i, p in enumerate(paths):
|
|
132
135
|
progress_callback(i / total_files * 100)
|
|
136
|
+
|
|
133
137
|
fp = Path(p)
|
|
134
|
-
result_row = [p, "", ""]
|
|
138
|
+
result_row = [p, "", ""] # Initialisation de la ligne de résultat
|
|
135
139
|
|
|
136
140
|
if not fp.exists() or fp.suffix.lower() != ".pdf":
|
|
137
141
|
result_row[1] = "ko"
|
|
@@ -142,56 +146,64 @@ class OWPdfType(widget.OWWidget):
|
|
|
142
146
|
continue
|
|
143
147
|
|
|
144
148
|
try:
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
result_row[1] = "
|
|
149
|
-
result_row[2] = "
|
|
149
|
+
is_text = utils_md.is_pdf_text_based(fp)
|
|
150
|
+
if is_text:
|
|
151
|
+
text_indices.append(i)
|
|
152
|
+
result_row[1] = "ok"
|
|
153
|
+
result_row[2] = "Text-based PDF"
|
|
150
154
|
else:
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
result_row[1] = "ok"
|
|
159
|
-
result_row[2] = "Image-based PDF"
|
|
155
|
+
image_indices.append(i)
|
|
156
|
+
result_row[1] = "ok"
|
|
157
|
+
result_row[2] = "Image-based PDF"
|
|
158
|
+
|
|
159
|
+
status_callback(result_row)
|
|
160
|
+
ws.append(result_row)
|
|
161
|
+
wb.save(excel_path)
|
|
160
162
|
except Exception as e:
|
|
161
163
|
result_row[1] = "ko"
|
|
162
164
|
result_row[2] = f"Error: {str(e)}"
|
|
163
|
-
finally:
|
|
164
165
|
status_callback(result_row)
|
|
165
166
|
ws.append(result_row)
|
|
166
167
|
wb.save(excel_path)
|
|
167
168
|
|
|
168
169
|
progress_callback(100)
|
|
169
170
|
|
|
171
|
+
# Create table for text PDFs
|
|
170
172
|
if not text_indices:
|
|
171
173
|
text_table = None
|
|
172
174
|
else:
|
|
173
175
|
text_table = in_data[text_indices]
|
|
174
176
|
|
|
177
|
+
# Create table for image PDFs
|
|
175
178
|
if not image_indices:
|
|
176
179
|
image_table = None
|
|
177
180
|
else:
|
|
178
181
|
image_table = in_data[image_indices]
|
|
179
182
|
|
|
183
|
+
# The final result is still returned here
|
|
180
184
|
return text_table, image_table
|
|
181
185
|
|
|
182
186
|
def handle_progress(self, value: float) -> None:
|
|
183
187
|
self.progressBarSet(value)
|
|
184
188
|
|
|
185
189
|
def handle_status_update(self, new_status: list):
|
|
190
|
+
"""
|
|
191
|
+
Receives a single status update from the thread, appends it to the list,
|
|
192
|
+
and sends a new, updated status table.
|
|
193
|
+
"""
|
|
186
194
|
self.processed_statuses.append(new_status)
|
|
195
|
+
|
|
196
|
+
# Correct Domain creation: move "file_path" to metas
|
|
187
197
|
status_domain = Domain(
|
|
188
|
-
[],
|
|
198
|
+
[], # The variables list should be empty
|
|
189
199
|
metas=[
|
|
190
200
|
StringVariable("file_path"),
|
|
191
201
|
DiscreteVariable("status", values=["ok", "ko"]),
|
|
192
202
|
StringVariable("details")
|
|
193
203
|
]
|
|
194
204
|
)
|
|
205
|
+
|
|
206
|
+
# Now, the data is correctly structured for the new domain
|
|
195
207
|
status_table = Table.from_list(status_domain, self.processed_statuses)
|
|
196
208
|
self.Outputs.status_data.send(status_table)
|
|
197
209
|
|